Browse Source

1. 新增wps类型
2. 新增ofd类型
3. 新增两列无边框表格识别
4. 修复ocr爆显存
5. pdf处理速度优化
6. 特殊康熙字体处理
7. 新增监控平均处理时间

fangjiasheng 9 months ago
parent
commit
ab202ff1fc
58 changed files with 12210 additions and 640 deletions
  1. 2452 16
      botr/extract_table.py
  2. 5 0
      botr/utils.py
  3. 1 1
      config/interface_new.yml
  4. 287 118
      format_convert/convert.py
  5. 94 22
      format_convert/convert_doc.py
  6. 205 18
      format_convert/convert_docx.py
  7. 819 25
      format_convert/convert_image.py
  8. 26 9
      format_convert/convert_need_interface.py
  9. 75 0
      format_convert/convert_ofd.py
  10. 75 0
      format_convert/convert_ofd_test.py
  11. 352 51
      format_convert/convert_pdf.py
  12. 30 36
      format_convert/convert_test.py
  13. 91 4
      format_convert/convert_tree.py
  14. 61 0
      format_convert/convert_wps.py
  15. 6 0
      format_convert/easyofd/easyofd/__init__.py
  16. 474 0
      format_convert/easyofd/easyofd/chinese_characters.txt
  17. 23 0
      format_convert/easyofd/easyofd/draw/__init__.py
  18. 290 0
      format_convert/easyofd/easyofd/draw/draw_ofd.py
  19. 1178 0
      format_convert/easyofd/easyofd/draw/draw_pdf.py
  20. 113 0
      format_convert/easyofd/easyofd/draw/find_seal_img.py
  21. 216 0
      format_convert/easyofd/easyofd/draw/font_tools.py
  22. 666 0
      format_convert/easyofd/easyofd/draw/ofdtemplate.py
  23. 966 0
      format_convert/easyofd/easyofd/draw/pdf_parse.py
  24. BIN
      format_convert/easyofd/easyofd/draw/simsun.ttc
  25. 301 0
      format_convert/easyofd/easyofd/ofd.py
  26. 37 0
      format_convert/easyofd/easyofd/parser_ofd/__init__.py
  27. 145 0
      format_convert/easyofd/easyofd/parser_ofd/file_annotation_parser.py
  28. 7 0
      format_convert/easyofd/easyofd/parser_ofd/file_attachment_parser.py
  29. 140 0
      format_convert/easyofd/easyofd/parser_ofd/file_content_parser.py
  30. 7 0
      format_convert/easyofd/easyofd/parser_ofd/file_customtag_parser.py
  31. 104 0
      format_convert/easyofd/easyofd/parser_ofd/file_deal.py
  32. 99 0
      format_convert/easyofd/easyofd/parser_ofd/file_doc_parser.py
  33. 36 0
      format_convert/easyofd/easyofd/parser_ofd/file_docres_parser.py
  34. 41 0
      format_convert/easyofd/easyofd/parser_ofd/file_ofd_parser.py
  35. 58 0
      format_convert/easyofd/easyofd/parser_ofd/file_parser.py
  36. 63 0
      format_convert/easyofd/easyofd/parser_ofd/file_parser_base.py
  37. 52 0
      format_convert/easyofd/easyofd/parser_ofd/file_publicres_parser.py
  38. 63 0
      format_convert/easyofd/easyofd/parser_ofd/file_signature_parser.py
  39. 100 0
      format_convert/easyofd/easyofd/parser_ofd/find_seal_img.py
  40. 35 0
      format_convert/easyofd/easyofd/parser_ofd/img_deal.py
  41. 607 0
      format_convert/easyofd/easyofd/parser_ofd/ofd_parser.py
  42. 31 0
      format_convert/easyofd/easyofd/parser_ofd/parameter_parser.py
  43. 61 0
      format_convert/easyofd/easyofd/parser_ofd/path_parser.py
  44. 7 0
      format_convert/easyofd/easyofd/template_ofd/__init__.py
  45. 53 0
      format_convert/font_map/extend_to_normal_dict.txt
  46. 214 0
      format_convert/font_map/kangxi_to_normal
  47. 154 0
      format_convert/font_map/kangxi_to_normal_dict.txt
  48. 327 0
      format_convert/ofd/ofd_parser.py
  49. 320 12
      format_convert/utils.py
  50. 3 0
      monitor/watch_10_minutes_process.sh
  51. 8 26
      ocr/ocr_interface.py
  52. 6 2
      ocr/ppocr/data/__init__.py
  53. 39 0
      ocr/test_lock.py
  54. 36 52
      ocr/tools/infer/predict_det_pytorch.py
  55. 188 173
      ocr/tools/infer/predict_rec_pytorch.py
  56. 106 45
      ocr/tools/infer/predict_system.py
  57. 1 0
      start_and_stop/kill_convert.sh
  58. 256 30
      tika_/tika_interface.py

+ 2452 - 16
botr/extract_table.py

@@ -1,29 +1,37 @@
+import copy
+import math
+import os
 import re
 import re
 import time
 import time
 import traceback
 import traceback
+from glob import glob
+import numpy as np
 import cv2
 import cv2
+import wcwidth
 from pdfminer.layout import LTLine
 from pdfminer.layout import LTLine
 # from botr.nsp.predict import nsp_predict
 # from botr.nsp.predict import nsp_predict
+from sklearn.cluster import KMeans
+
 from botr.rules.get_table_by_rules import get_table_by_rule
 from botr.rules.get_table_by_rules import get_table_by_rule
 from botr.utils import line_iou, get_table_iou
 from botr.utils import line_iou, get_table_iou
 from format_convert.convert_need_interface import from_yolo_interface
 from format_convert.convert_need_interface import from_yolo_interface
-from format_convert.utils import log, np2bytes
+from format_convert.utils import log, np2bytes, text_bbox_to_lt, pil_resize, memory_decorator
 
 
 
 
 def b_table_process(list_line, list_text_boxes, list_cell, table_location):
 def b_table_process(list_line, list_text_boxes, list_cell, table_location):
     def merge_textbox(textbox_list, in_objs):
     def merge_textbox(textbox_list, in_objs):
         delete_obj = []
         delete_obj = []
         threshold = 5
         threshold = 5
-        textbox_list.sort(key=lambda x:x.bbox[0])
+        textbox_list.sort(key=lambda x: x.bbox[0])
         for k in range(len(textbox_list)):
         for k in range(len(textbox_list)):
             tb1 = textbox_list[k]
             tb1 = textbox_list[k]
             if tb1 not in in_objs and tb1 not in delete_obj:
             if tb1 not in in_objs and tb1 not in delete_obj:
-                for m in range(k+1, len(textbox_list)):
+                for m in range(k + 1, len(textbox_list)):
                     tb2 = textbox_list[m]
                     tb2 = textbox_list[m]
                     if tb2 in in_objs:
                     if tb2 in in_objs:
                         continue
                         continue
-                    if abs(tb1.bbox[1]-tb2.bbox[1]) <= threshold \
-                            and abs(tb1.bbox[3]-tb2.bbox[3]) <= threshold:
+                    if abs(tb1.bbox[1] - tb2.bbox[1]) <= threshold \
+                            and abs(tb1.bbox[3] - tb2.bbox[3]) <= threshold:
                         if tb1.bbox[0] <= tb2.bbox[0]:
                         if tb1.bbox[0] <= tb2.bbox[0]:
                             tb1.text = tb1.text + tb2.text
                             tb1.text = tb1.text + tb2.text
                         else:
                         else:
@@ -35,6 +43,7 @@ def b_table_process(list_line, list_text_boxes, list_cell, table_location):
             if _obj in textbox_list:
             if _obj in textbox_list:
                 textbox_list.remove(_obj)
                 textbox_list.remove(_obj)
         return textbox_list
         return textbox_list
+
     try:
     try:
         if list_line:
         if list_line:
             from format_convert.convert_tree import TableLine
             from format_convert.convert_tree import TableLine
@@ -55,7 +64,7 @@ def b_table_process(list_line, list_text_boxes, list_cell, table_location):
             current_y = area_list_text_boxes[0].bbox[1]
             current_y = area_list_text_boxes[0].bbox[1]
             current_y2 = area_list_text_boxes[0].bbox[3]
             current_y2 = area_list_text_boxes[0].bbox[3]
             # threshold = 2.
             # threshold = 2.
-            threshold = max(2., 1/3 * abs(current_y2 - current_y))
+            threshold = max(2., 1 / 3 * abs(current_y2 - current_y))
             for t_b in area_list_text_boxes:
             for t_b in area_list_text_boxes:
                 bbox = t_b.bbox
                 bbox = t_b.bbox
                 if current_y - threshold <= bbox[1] <= current_y + threshold:
                 if current_y - threshold <= bbox[1] <= current_y + threshold:
@@ -69,6 +78,11 @@ def b_table_process(list_line, list_text_boxes, list_cell, table_location):
             obj_in_table = []
             obj_in_table = []
             table_dict = {'bbox': table_location}
             table_dict = {'bbox': table_location}
             row_list = []
             row_list = []
+
+            # yolo检测出的表格,忽略两列的,因为已经补充了两列的新规则 250529
+            if list_cell and len(list_cell[0]) == 2:
+                return list_text_boxes, [], set()
+
             for row in list_cell:
             for row in list_cell:
                 col_list = []
                 col_list = []
                 for col in row:
                 for col in row:
@@ -112,17 +126,19 @@ def get_text_box_obj(_text_list, _bbox_list):
     return _text_box_list
     return _text_box_list
 
 
 
 
-def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
+def get_table(img, table_list, text_list, bbox_list, text_box_list, from_pdf=False, show=0):
     log('start')
     log('start')
     # 检测无边框表格
     # 检测无边框表格
     start_time_all = time.time()
     start_time_all = time.time()
     start_time = time.time()
     start_time = time.time()
     img_bytes = np2bytes(img)
     img_bytes = np2bytes(img)
     b_table_list = from_yolo_interface(img_bytes)
     b_table_list = from_yolo_interface(img_bytes)
-    log('yolo detect cost: ' + str(time.time()-start_time))
+    log('yolo detect cost: ' + str(time.time() - start_time))
     b_table_list = b_table_list[0]
     b_table_list = b_table_list[0]
     if not b_table_list:
     if not b_table_list:
         log('detect not b_table_list')
         log('detect not b_table_list')
+        if from_pdf:
+            save_b_table(img)
         return [], [], []
         return [], [], []
 
 
     # if show:
     # if show:
@@ -156,8 +172,9 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
         b_loc = [min_x, min_y, max_x, max_y, b_table[4]]
         b_loc = [min_x, min_y, max_x, max_y, b_table[4]]
         inter_flag = False
         inter_flag = False
         for table in table_list:
         for table in table_list:
-            loc = table.get('bbox')
-            rows = table.get('table')
+            # loc = table.get('bbox')
+            loc = table.bbox
+            # rows = table.get('table')
             iou = line_iou([[0, loc[1]], [0, loc[3]]], [[0, b_loc[1]], [0, b_loc[3]]], axis=1)
             iou = line_iou([[0, loc[1]], [0, loc[3]]], [[0, b_loc[1]], [0, b_loc[3]]], axis=1)
             if iou > 0.3:
             if iou > 0.3:
                 # if len(rows) <= 1:
                 # if len(rows) <= 1:
@@ -190,7 +207,7 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
             if b_loc1 in used_b_loc:
             if b_loc1 in used_b_loc:
                 continue
                 continue
             inter_flag = False
             inter_flag = False
-            for j in range(i+1, len(b_table_location_list)):
+            for j in range(i + 1, len(b_table_location_list)):
                 b_loc2 = b_table_location_list[j]
                 b_loc2 = b_table_location_list[j]
                 iou = line_iou([[0, b_loc1[1]], [0, b_loc1[3]]], [[0, b_loc2[1]], [0, b_loc2[3]]], axis=1)
                 iou = line_iou([[0, b_loc1[1]], [0, b_loc1[3]]], [[0, b_loc2[1]], [0, b_loc2[3]]], axis=1)
                 if show:
                 if show:
@@ -230,7 +247,8 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
 
 
         # 根据ocr bbox,规则生成表格线
         # 根据ocr bbox,规则生成表格线
         start_time = time.time()
         start_time = time.time()
-        line_list, cell_list, table_location, bbox_text_dict = get_table_by_rule(img, area_text_list, area_bbox_list, b_loc, show=show)
+        line_list, cell_list, table_location, bbox_text_dict = get_table_by_rule(img, area_text_list, area_bbox_list,
+                                                                                 b_loc, show=show)
         if not table_location:
         if not table_location:
             log('get_table_by_rule not table_location')
             log('get_table_by_rule not table_location')
             continue
             continue
@@ -240,14 +258,15 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
             area_bbox_list.append(eval(key))
             area_bbox_list.append(eval(key))
             area_text_list.append(bbox_text_dict.get(key))
             area_text_list.append(bbox_text_dict.get(key))
         b_text_box_list = get_text_box_obj(area_text_list, area_bbox_list)
         b_text_box_list = get_text_box_obj(area_text_list, area_bbox_list)
-        log('get_table_by_rule cost: ' + str(time.time()-start_time))
+        log('get_table_by_rule cost: ' + str(time.time() - start_time))
 
 
         # 根据表格线生成单元格
         # 根据表格线生成单元格
         start_time = time.time()
         start_time = time.time()
-        b_text_box_list, _table_list, _obj_in_table_list = b_table_process(line_list, b_text_box_list, cell_list, table_location)
+        b_text_box_list, _table_list, _obj_in_table_list = b_table_process(line_list, b_text_box_list, cell_list,
+                                                                           table_location)
         table_list += _table_list
         table_list += _table_list
         obj_in_table_list += _obj_in_table_list
         obj_in_table_list += _obj_in_table_list
-        log('b_table_process cost: ' + str(time.time()-start_time))
+        log('b_table_process cost: ' + str(time.time() - start_time))
 
 
         # if not table_list:
         # if not table_list:
         #     log('table_process not table_list')
         #     log('table_process not table_list')
@@ -317,4 +336,2421 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
         # _table_list[0]['table'] = new_table
         # _table_list[0]['table'] = new_table
 
 
     log('get_table finish ' + str(time.time() - start_time_all))
     log('get_table finish ' + str(time.time() - start_time_all))
-    return text_box_list, table_list, obj_in_table_list
+    return text_box_list, table_list, obj_in_table_list
+
+
def save_b_table(image_np):
    """Persist a page image in which YOLO failed to detect a borderless table.

    Files are written as ``<index>-<md5>.png`` so successive runs continue the
    numbering; saving stops after ``max_index`` images.  Does nothing when the
    target directory is absent (e.g. on machines other than the data host).

    :param image_np: page image (numpy array accepted by ``cv2.imwrite``)
    """
    _path = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table_not_detect'
    # _path = 'D:/Project/format_conversion_maxcompute/save_b_table_not_detect'
    max_index = 20000
    if not os.path.exists(_path):
        return

    # Recover the running index from existing names: splitting on / . \ -
    # makes the third-from-last piece the numeric index.  Skip any file that
    # does not follow the naming scheme instead of crashing the caller.
    index = 0
    file_list = glob(_path + '/*')
    if file_list:
        file_index_list = []
        for file_path in file_list:
            try:
                file_index_list.append(int(re.split('[/.\\\\-]', file_path)[-3]))
            except (ValueError, IndexError):
                continue
        if file_index_list:
            index = max(file_index_list) + 1
    if index > max_index:
        return

    # md5 of the document currently being converted (set per request)
    from format_convert import _global
    _md5 = _global.get("md5")

    _image_path = _path + '/' + str(index) + '-' + str(_md5) + '.png'
    cv2.imwrite(_image_path, image_np)
    log('save yolo not detect b_table image success!')
+
@memory_decorator
def get_b_table_by_blank_colon(lt_text_list, table_list, layout_bbox, image_np=None, show=0):
    """Detect borderless tables on a page from blank gaps and colons.

    Works on text boxes only: rows are formed from vertical blanks, candidate
    tables from consecutive multi-column rows, and cells are finally derived
    from colon-separated key/value texts.  When ``image_np`` is given the
    input comes from OCR and gets extra bbox cleanup; otherwise the input is
    assumed to come from pdfminer-style layout objects.

    :param lt_text_list: text-box objects with ``.bbox`` and ``.get_text()``
        -- presumably pdfminer ``LTTextLine``-like; confirm with callers
    :param table_list: already-detected bordered tables (objects with
        ``.bbox``); the page is partitioned around them
    :param layout_bbox: page bbox; only indexes 2 (width) and 3 (height) used
    :param image_np: page image for the OCR case, or None for the PDF case
    :param show: debug flag; opens OpenCV windows and prints internals
    :return: ``(result_table_list, not_b_table_list)`` where each result is
        ``[table_dict, table_bbox]`` and the second list holds regions
        confirmed as non-tables
    """
    start_time = time.time()

    # print('len(lt_text_list)', len(lt_text_list))
    # for lt_text in lt_text_list:
    #     print('lt_text', lt_text)

    # Early exit: a colon-style key/value table needs a minimum number of
    # texts containing a (full- or half-width) colon.
    colon_cnt = 0
    for lt_text in lt_text_list:
        if re.search('[::]', lt_text.get_text()):
            colon_cnt += 1
    if colon_cnt <= 6:
        log('pre judge colon_cnt <= 6')
        return [], []

    # Image (OCR) case: bail out when there are many boxes and most of them
    # are single characters -- such pages are noise for this detector.
    if image_np is not None and len(lt_text_list) >= 60:
        single_char_cnt = 0
        for lt_text in lt_text_list:
            if len(lt_text.get_text()) <= 1:
                single_char_cnt += 1
        # log('len(lt_text_list), single_char_cnt ' + str(len(lt_text_list)) + ' ' + str(single_char_cnt))
        if single_char_cnt > 50 or single_char_cnt > 1/3 * len(lt_text_list):
            return [], []

    # raise
    # Regions confirmed as NON-tables are also returned, so a later YOLO pass
    # does not mislabel them as tables and corrupt the output.
    not_b_table_list = []

    layout_h = int(layout_bbox[3])
    layout_w = int(layout_bbox[2])

    if show:
        print('layout_w, layout_h', layout_w, layout_h)
        show_image = np.full((layout_h, layout_w, 3), 255, dtype=np.uint8)

    if show and image_np is not None:
        image_np_show = copy.copy(image_np)
        for lt_text in lt_text_list:
            bbox = [int(x) for x in lt_text.bbox]
            cv2.rectangle(image_np_show, bbox[:2], bbox[2:4], (0, 0, 255))
        cv2.imshow('image origin', image_np_show)
        cv2.waitKey(0)

    # Preprocessing for the PDF case
    start_time1 = time.time()
    if image_np is None:
        # Split a single lt_text wherever runs of multiple inner spaces occur
        lt_text_list = split_lt_text_by_many_space(lt_text_list)

        if show:
            for lt_text in lt_text_list:
                bbox = [int(x) for x in lt_text.bbox]
                cv2.rectangle(show_image, bbox[:2], bbox[2:4], (0, 0, 255))
            cv2.imshow('pdf preprocess', show_image)
            cv2.waitKey(0)
        # log('get_b_table_by_blank_colon pdf preprocess cost: ' + str(time.time()-start_time1))

    # Preprocessing for the image (OCR) case
    start_time1 = time.time()
    if image_np is not None:
        # Drop empty boxes
        start_time2 = time.time()
        lt_text_list = delete_empty_bbox(lt_text_list)
        # print('delete_empty_bbox cost: ', time.time()-start_time2)

        # OCR boxes must be shrunk to hug the text so rows can later be
        # separated on blank gaps.
        start_time2 = time.time()
        new_bbox_list = shrink_bbox(image_np, [x.bbox for x in lt_text_list])
        # print('shrink_bbox cost: ', time.time()-start_time2)
        start_time2 = time.time()
        for i, lt_text in enumerate(lt_text_list):
            lt_text.bbox = new_bbox_list[i]
        # print('lt_text.bbox = new_bbox_list[i] cost: ', time.time()-start_time2)
        # log('get_b_table_by_blank_colon image preprocess1 cost: ' + str(time.time()-start_time1))

    # Average width of a single character (total text width / char count)
    start_time1 = time.time()
    all_char_cnt = 0
    all_text_width = 0
    for lt_text in lt_text_list:
        all_char_cnt += len(lt_text.get_text())
        all_text_width += abs(lt_text.bbox[2] - lt_text.bbox[0])
    if all_char_cnt == 0:
        return [], not_b_table_list
    avg_char_width = all_text_width / all_char_cnt

    # Second preprocessing pass for the image case
    if image_np is not None:
        # OCR may split one table value on spaces; merge such fragments
        lt_text_list = merge_same_bbox(lt_text_list, avg_char_width)

        # Repair overlapping (crossing) bboxes
        lt_text_list = fix_cross_bbox(lt_text_list)
        # log('get_b_table_by_blank_colon image preprocess2 cost: ' + str(time.time()-start_time1))

    if show and image_np is not None:
        image_np_show = copy.copy(image_np)
        for lt_text in lt_text_list:
            bbox = [int(x) for x in lt_text.bbox]
            cv2.rectangle(image_np_show, bbox[:2], bbox[2:4], (0, 0, 255))
        cv2.imshow('image preprocess', image_np_show)
        cv2.waitKey(0)

    if show:
        for lt_text in lt_text_list:
            print('lt_text', lt_text)

    # Filter out boxes whose coordinates are negative or absurdly large
    temp_list = []
    for lt_text in lt_text_list:
        if min(lt_text.bbox) < 0 or max(lt_text.bbox) > 10000:
            continue
        temp_list.append(lt_text)
    lt_text_list = temp_list

    if show:
        for lt_text in lt_text_list:
            cv2.rectangle(show_image,
                          (int(lt_text.bbox[0]), int(lt_text.bbox[1])),
                          (int(lt_text.bbox[2]), int(lt_text.bbox[3])),
                          (0, 0, 255)
                          )
        for table in table_list:
            cv2.rectangle(show_image,
                          (int(table.bbox[0]), int(table.bbox[1])),
                          (int(table.bbox[2]), int(table.bbox[3])),
                          (0, 255, 0)
                          )

    # Recompute the average character width on the filtered list
    all_char_cnt = 0
    all_text_width = 0
    for lt_text in lt_text_list:
        all_char_cnt += len(lt_text.get_text())
        all_text_width += abs(lt_text.bbox[2] - lt_text.bbox[0])
    if all_char_cnt == 0:
        return [], not_b_table_list
    avg_char_width = all_text_width / all_char_cnt
    if show:
        print('avg_char_width', avg_char_width)

    # NOTE(review): both branches use the same factor; kept as-is in case the
    # per-source factor is tuned later.
    if image_np is None:
        blank_width = 1 * avg_char_width
    else:
        blank_width = 1 * avg_char_width
    if show:
        print('blank_width', blank_width)

    # Split the page into horizontal areas around the bordered tables
    table_h_list = []
    area_h_list = []
    area_start_h = 0
    table_list.sort(key=lambda x: (x.bbox[1], x.bbox[0], x.bbox[3]))
    for table in table_list:
        table_h_list.append([table.bbox[1], table.bbox[3]])
        area_h_list.append([area_start_h, table.bbox[1]])
        area_start_h = table.bbox[3]
    area_h_list.append([area_start_h, layout_h])

    if show:
        for min_h, max_h in area_h_list:
            print('area_h_list', min_h, max_h)
            cv2.rectangle(show_image,
                          (0, int(min_h)),
                          (layout_w, int(max_h)),
                          (255, 0, 0)
                          )

    # Assign each text box to the area that fully contains it vertically
    lt_text_area_list = []
    for area_min_h, area_max_h in area_h_list:
        sub_area = []
        for lt_text in lt_text_list:
            if area_min_h <= lt_text.bbox[1] <= lt_text.bbox[3] <= area_max_h:
                sub_area.append(lt_text)
        lt_text_area_list.append(sub_area)
    if show:
        print('len(lt_text_area_list)', len(lt_text_area_list))

    # Detect borderless tables in each area independently
    result_table_list = []
    start_time1 = time.time()
    for sub_lt_text_list in lt_text_area_list:
        start_time2 = time.time()
        lt_text_row_list = get_text_row_by_blank(sub_lt_text_list, layout_h)
        # log('get_text_row_by_blank cost: ' + str(time.time()-start_time2))

        # Placeholder lt_texts created while building rows must also join
        # lt_text_list so later passes can see them.
        for row in lt_text_row_list:
            for lt_text in row:
                if lt_text not in lt_text_list:
                    lt_text_list.append(lt_text)

        if show:
            for row in lt_text_row_list:
                print('row', row)

        start_time2 = time.time()
        b_table_list1, b_table_bbox_list1 = get_b_table_by_lt_text_row(lt_text_row_list)
        # log('get_b_table_by_lt_text_row cost: ' + str(time.time()-start_time2))

        # With the rough table areas fixed, re-split rows inside each area
        # for better precision.
        start_time2 = time.time()
        table_lt_text_row_list = []
        for bi, b_table in enumerate(b_table_list1):
            b_table_bbox = b_table_bbox_list1[bi]
            sub_lt_text_list = []
            for lt_text in lt_text_list:
                if b_table_bbox[1] <= lt_text.bbox[1] <= lt_text.bbox[3] <= b_table_bbox[3]:
                    sub_lt_text_list.append(lt_text)
            _lt_text_row_list, center_blank_row = get_text_row_by_center_blank(b_table, sub_lt_text_list, blank_width,
                                                                               layout_h)
            table_lt_text_row_list += _lt_text_row_list
        # log('get_text_row_by_center_blank cost: ' + str(time.time()-start_time2))

        start_time2 = time.time()
        b_table_list3, b_table_bbox_list3 = get_b_table_by_lt_text_row(table_lt_text_row_list)
        # log('get_b_table_by_lt_text_row cost: ' + str(time.time()-start_time2))

        if show:
            for b_table in b_table_list3:
                print('b_table3', b_table)

        # Column check on the rough tables: boxes in different columns must
        # not cross; alignment is fine, and some blank gap is required.
        # Consecutive rows stay in the same table only when their blank gaps
        # overlap horizontally.
        start_time2 = time.time()
        b_table_list2 = []
        for b_table in b_table_list3:

            blank_row_list = get_blank_row(b_table, blank_width)
            if show:
                print('b_table get_blank_row b_table_list3', b_table)
                print('blank_row_list b_table_list3', blank_row_list)

            b_table2 = []
            for bi, lt_text_row1 in enumerate(b_table[:-1]):
                lt_text_row2 = b_table[bi + 1]
                # if row1_row2_has_same_col(lt_text_row1, lt_text_row2):
                if row1_row2_has_same_blank(blank_row_list[bi], blank_row_list[bi + 1]):
                    if lt_text_row1 not in b_table2:
                        b_table2.append(lt_text_row1)
                    if lt_text_row2 not in b_table2:
                        b_table2.append(lt_text_row2)
                else:
                    # print('not cross blank', blank_row_list[bi], blank_row_list[bi + 1])
                    if len(b_table2) >= 2:
                        b_table_list2.append(b_table2)
                    b_table2 = []
            if len(b_table2) >= 2:
                b_table_list2.append(b_table2)
        # log('get_blank_row cost: ' + str(time.time()-start_time2))

        if show:
            for b_table2 in b_table_list2:
                print('b_table2')
                for lt_text_row in b_table2:
                    print('b_table2 lt_text_row', lt_text_row)

        start_time2 = time.time()
        for bi, b_table2 in enumerate(b_table_list2):
            # Derive the actual table cells from the colons
            start_time3 = time.time()
            table2, center_blank_row, _not_b_table_bbox_list, table_bbox \
                = get_b_table_by_colon(b_table2, blank_width)
            log('get_b_table_by_colon cost: ' + str(time.time()-start_time3))
            not_b_table_list += [[[], x] for x in _not_b_table_bbox_list]

            if show and center_blank_row:
                print('show center_blank_row', center_blank_row)
                bx = int((center_blank_row[2] + center_blank_row[0]) / 2)
                by = int((center_blank_row[3] + center_blank_row[1]) / 2)
                br = int((center_blank_row[2] - center_blank_row[0]) / 2)
                if br <= 5:
                    br = 5
                print('bx, by, br', bx, by, br)
                cv2.circle(show_image, (bx, by), br, (0, 255, 0))

            if show:
                min_w, min_h, max_w, max_h = table_bbox
                cv2.rectangle(show_image,
                              (int(min_w), int(min_h)),
                              (int(max_w), int(max_h)),
                              (0, 255, 0)
                              )

            # Fix row-spanning in the final row
            # table2 = fix_final_row(table2)

            # Some single-column rows at the table end need supplementing
            table2 = add_last_rows(table2, table_bbox, center_blank_row, lt_text_row_list, b_table2)

            table2 = add_first_rows(table2, table_bbox, center_blank_row, lt_text_row_list, b_table2)

            # Convert the table to its dict representation
            table2 = table_list_to_dict(table2)

            # Standardize the table, e.g. strip placeholder cells
            table2 = standard_table(table2)

            if table2:
                result_table_list.append([table2, table_bbox])
        # log('colon, add, standard cost: ' + str(time.time()-start_time2))

    # log('get_b_table_by_blank_colon area get b_table cost: ' + str(time.time()-start_time1))

    if show:
        cv2.namedWindow("final result", cv2.WINDOW_NORMAL)
        cv2.resizeWindow("final result", 768, 1024)
        cv2.imshow('final result', show_image)
        cv2.waitKey(0)

    if show:
        for table in result_table_list:
            print('get_b_table_by_bbox table ', table)

        for not_table_bbox in not_b_table_list:
            print('not_table bbox ', not_table_bbox)

    # log('get_b_table_by_blank_colon cost: ' + str(time.time()-start_time))
    return result_table_list, not_b_table_list
+
+
def get_b_table_by_lt_text_row(lt_text_row_list, show=0):
    """Group consecutive multi-element rows into rough table candidates.

    A candidate table is a run of at least two consecutive rows that each
    contain at least two text boxes.  Also returns the bounding box of every
    candidate.

    :param lt_text_row_list: rows of text-box objects carrying ``.bbox``
    :param show: debug flag; print the candidates when truthy
    :return: ``(candidate_tables, bbox_list)`` with bboxes as [x1, y1, x2, y2]
    """
    candidate_tables = []
    run = []
    for row in lt_text_row_list:
        if len(row) >= 2:
            run.append(row)
            continue
        # A narrow row breaks the run; keep it only if long enough.
        if len(run) >= 2:
            candidate_tables.append(run)
        run = []
    if len(run) >= 2:
        candidate_tables.append(run)

    # Bounding box over every text box of every row of each candidate
    bbox_list = []
    for candidate in candidate_tables:
        boxes = [cell.bbox for row in candidate for cell in row]
        bbox_list.append([min(b[0] for b in boxes),
                          min(b[1] for b in boxes),
                          max(b[2] for b in boxes),
                          max(b[3] for b in boxes)])

    if show:
        for candidate in candidate_tables:
            print('b_table')
            for row in candidate:
                print('b_table lt_text_row', row)
    return candidate_tables, bbox_list
+
+
def row1_row2_has_same_col(row1, row2):
    """Return True when the boxes of two rows are column-compatible.

    Every pair (one box from each row) must either be horizontally separated
    by at least ``blank_len`` pixels, or one box must lie within the other's
    horizontal span (with ``threshold`` pixels of slack).  Any other overlap
    counts as a column crossing and fails the check.
    """
    threshold = 5
    blank_len = 2
    for box_a in row1:
        for box_b in row2:
            separated = (box_b.bbox[0] - box_a.bbox[2] >= blank_len
                         or box_a.bbox[0] - box_b.bbox[2] >= blank_len)
            a_holds_b = (box_a.bbox[0] - threshold <= box_b.bbox[0]
                         < box_b.bbox[2] <= box_a.bbox[2] + threshold)
            b_holds_a = (box_b.bbox[0] - threshold <= box_a.bbox[0]
                         < box_a.bbox[2] <= box_b.bbox[2] + threshold)
            if not (separated or a_holds_b or b_holds_a):
                # Partial overlap: the rows cannot share a column layout.
                return False
    return True
+
+
def get_blank_row(lt_text_row_list, blank_min_width, show=0):
    """Collect the horizontal blank gaps inside each text row.

    For every row, each ordered (left, right) pair of non-overlapping boxes
    yields a candidate blank span; only the narrowest blank per left box is
    kept.  A row contributes its blanks only if at least one of them is wider
    than ``blank_min_width``; otherwise an empty list is appended, keeping
    the result aligned index-for-index with ``lt_text_row_list``.

    :param lt_text_row_list: rows of text-box objects with ``.bbox`` and
        ``.get_text()`` -- presumably pdfminer-like; confirm with callers
    :param blank_min_width: minimum blank width for a row to qualify
    :param show: debug flag; print candidate pairs when truthy
    :return: list of blank spans per row, same length as the input
    """
    blank_row_list = []
    # blank_min_width = avg_char_width * 3
    for lt_text_row in lt_text_row_list:
        lt_text_row.sort(key=lambda x: x.bbox[0])
        blank_row = []
        if len(lt_text_row) < 2:
            # A single box has no inner gap
            blank_row_list.append([])
        else:
            # Build blanks from every ordered pair of boxes in the row
            for lt_text1 in lt_text_row:
                sub_row = []
                for lt_text2 in lt_text_row:
                    if lt_text1 == lt_text2:
                        continue
                    # Only left-to-right pairs
                    if lt_text1.bbox[2] > lt_text2.bbox[0]:
                        continue
                    line1 = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0))
                    line2 = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0))
                    # Skip horizontally overlapping boxes
                    if line_iou(line1, line2) > 0:
                        continue
                    # NOTE(review): the y-range mixes bbox[3]/bbox[1] of the
                    # two boxes -- looks intentional for row blanks; verify.
                    sub_row.append([min(lt_text1.bbox[2], lt_text2.bbox[0]),
                                    min(lt_text1.bbox[3], lt_text2.bbox[1]),
                                    max(lt_text1.bbox[2], lt_text2.bbox[0]),
                                    max(lt_text1.bbox[3], lt_text2.bbox[1]),
                                    ])
                    if show:
                        print('sub_row', lt_text1.get_text(), lt_text2.get_text(), sub_row[-1])

                # Keep only the narrowest blank for each left box
                if not sub_row:
                    continue
                sub_row.sort(key=lambda x: abs(x[0] - x[2]))
                if show:
                    print('sub_row[-1]', lt_text1.get_text(), sub_row[-1])

                blank_row.append(sub_row[0])

            # The row qualifies only if at least one blank reaches the
            # minimum width; otherwise record no blanks for this row.
            match_flag = 0
            for r in blank_row:
                if abs(r[2] - r[0]) >= blank_min_width:
                    match_flag = 1
                    break
            if match_flag:
                blank_row_list.append(blank_row)
            else:
                blank_row_list.append([])

    return blank_row_list
+
+
def row1_row2_has_same_blank(row1, row2):
    """Return True when any blank of ``row1`` overlaps any blank of ``row2``.

    Blanks are [x1, y1, x2, y2] spans; two blanks match when either endpoint
    of one falls within the horizontal range of the other.  Used to decide
    whether two consecutive rows share a column gap.
    """
    for blank_a in row1:
        for blank_b in row2:
            overlaps = (blank_a[0] <= blank_b[0] <= blank_a[2]
                        or blank_a[0] <= blank_b[2] <= blank_a[2]
                        or blank_b[0] <= blank_a[0] <= blank_b[2]
                        or blank_b[0] <= blank_a[2] <= blank_b[2])
            if overlaps:
                # One shared gap is enough
                return True
    return False
+
+
@memory_decorator
def get_b_table_by_colon(b_table, blank_width, show=0):
    """Parse a borderless 2-column "head: value" table from grouped text rows.

    :param b_table: rows of LTText-like boxes (have .bbox and .get_text())
    :param blank_width: minimum width of the blank strip separating the columns
    :param show: debug printing switch
    :return: tuple of
        - table rows as [head1, value1, head2, value2] ([] when rejected)
        - bbox of the center blank strip (None when rejected)
        - bboxes judged as definitely not tables (returned so a later YOLO
          table detector does not mistake them for tables and corrupt data)
        - bbox of the whole candidate region
    """
    table_bbox = get_table_bbox(b_table)
    not_table_bbox_list = []

    # Every row must resolve to 2..4 columns; boxes overlapping horizontally
    # (line IoU >= 0.5) count as the same column. Also count colon heads per row.
    row_cnt_list = []
    head_cnt_list = []
    for row in b_table:
        if not row:
            continue
        row.sort(key=lambda x: (x.bbox[0]))
        col_cnt = 1
        head_cnt = 0
        if re.search('[::]', row[0].get_text()):
            head_cnt += 1
        for ci, col in enumerate(row):
            if ci == 0:
                continue
            col1 = row[ci - 1]
            col2 = row[ci]
            line1 = [(col1.bbox[0], 0), (col1.bbox[2], 0)]
            line2 = [(col2.bbox[0], 0), (col2.bbox[2], 0)]
            if line_iou(line1, line2) >= 0.5:
                continue
            else:
                col_cnt += 1
                if re.search('[::]', col2.get_text()):
                    head_cnt += 1
        row_cnt_list.append(col_cnt in [2, 3, 4])
        head_cnt_list.append(head_cnt)

    if show:
        print('row_cnt_list', row_cnt_list)
        print('head_cnt_list', head_cnt_list)

    # Fix: guard against an empty (or all-empty-rows) b_table, which previously
    # made max(head_cnt_list) raise ValueError and row_cnt_list[-1] IndexError.
    if not head_cnt_list:
        return [], None, not_table_bbox_list, table_bbox

    # More than 2 colon heads in a single row: not a 2-column key-value table.
    if max(head_cnt_list) > 2:
        if show:
            for row in b_table:
                print('head_cnt_list row', row)
        return [], None, not_table_bbox_list, table_bbox

    # The last row (often a date line) may break the 2/3/4-column rule; drop it.
    if row_cnt_list[-1] is False:
        row_cnt_list = row_cnt_list[:-1]
        b_table = b_table[:-1]
        table_bbox = get_table_bbox(b_table)

    row_cnt_list = list(set(row_cnt_list))
    if not (len(row_cnt_list) == 1 and row_cnt_list[0] is True):
        return [], None, not_table_bbox_list, table_bbox

    # At least half the rows must contain a text with both a colon and Chinese.
    colon_cnt = 0
    for lt_text_row in b_table:
        for lt_text in lt_text_row:
            if re.search('[::]', lt_text.get_text()) and re.search('[\u4e00-\u9fff]', lt_text.get_text()):
                colon_cnt += 1
    if show:
        print('colon_cnt, len(table)', colon_cnt, len(b_table))
    if colon_cnt < len(b_table) / 2:
        return [], None, not_table_bbox_list, table_bbox

    blank_row_list = get_blank_row(b_table, blank_width)
    if show:
        print('b_table get_blank_row colon', b_table)
        print('blank_row_list colon', blank_row_list)

    # The widest blank strip shared across rows is the column separator.
    center_blank_row = choose_center_blank(blank_row_list, blank_width)
    if show:
        print('center_blank_row', center_blank_row)
    if not center_blank_row:
        return [], None, not_table_bbox_list, table_bbox

    # Split the region into two text columns along the center blank.
    col_list1, col_list2 = divide_2_col_by_center_blank(b_table, center_blank_row)
    # Both empty: usually a single column whose key and value sit far apart.
    # Record it as a non-table so later YOLO detection ignores it as well.
    if not col_list1 and not col_list2:
        not_table_bbox = get_table_bbox(b_table)
        not_table_bbox_list.append(not_table_bbox)
        return [], None, not_table_bbox_list, table_bbox

    # Within each column, split cells into [head, value] pairs around the colon.
    col_key_value_list1 = set_head_value_in_col(col_list1, col_list2)
    col_key_value_list2 = set_head_value_in_col(col_list2, col_list1)

    # Zip the two columns' pairs into 4-column rows, padding the shorter column.
    b_table_row_list = []
    for i in range(max(len(col_key_value_list1), len(col_key_value_list2))):
        col1 = col_key_value_list1[i] if i < len(col_key_value_list1) else ["", ""]
        col2 = col_key_value_list2[i] if i < len(col_key_value_list2) else ["", ""]
        b_table_row_list.append(col1[:2] + col2[:2])

    # Fix rows where head and value were stacked vertically in the same column.
    b_table_row_list = fix_head_value_match(b_table_row_list)

    if show:
        print('b_table_row_list', b_table_row_list)
    return b_table_row_list, center_blank_row, not_table_bbox_list, table_bbox
+
+
@memory_decorator
def get_text_row_by_blank(lt_text_list, layout_h, show=0):
    """Group text boxes into rows using the blank space above/below each box."""
    if show:
        for item in lt_text_list:
            print('lt_text_111', item)

    # Compute the nearest blank gap above and below every box, then cluster
    # boxes whose gaps contain each other into the same row.
    blanks = get_up_down_blank(lt_text_list)
    rows = get_contain_blank_row(blanks, layout_h)

    if show:
        for row in rows:
            print('lt_text_row', row)
    return rows
+
+
def get_text_row_by_center_blank(b_table, lt_text_list, blank_width, layout_h, show=0):
    """Group text boxes into rows, restricting neighbor search to each side of
    the table's shared center blank strip."""
    # Per-row blank spans of the candidate table.
    blank_row_list = get_blank_row(b_table, blank_width)
    if show:
        print('b_table get_blank_row center_blank', b_table)
        print('blank_row_list center_blank', blank_row_list)

    # The blank strip shared by the rows; without one there is nothing to split on.
    center_blank_row = choose_center_blank(blank_row_list, blank_width)
    if show:
        print('center_blank_row center', center_blank_row)
    if not center_blank_row:
        return [], []

    # Boxes are only compared with boxes on the same side of this x.
    center_x = (center_blank_row[2] + center_blank_row[0]) / 2
    blanks = get_up_down_blank(lt_text_list, center_x=center_x)
    rows = get_contain_blank_row(blanks, layout_h)

    if show:
        for row in rows:
            print('lt_text_row center', row)
    return rows, center_blank_row
+
+
def table_list_to_dict(table):
    """Wrap every cell text into a dict with default row/column spans of 1."""
    return [
        [{'rowspan': 1, 'columnspan': 1, 'text': cell} for cell in row]
        for row in table
    ]
+
+
@memory_decorator
def get_up_down_blank(lt_text_list, center_x=None, show=0):
    """For every text box, find the nearest blank gap above and below it.

    :param lt_text_list: LTText-like boxes (have .bbox and .get_text())
    :param center_x: optional x of the column separator; when given, only boxes
        on the same side of it count as vertical neighbors (horizontal overlap
        is then not required)
    :param show: debug printing switch
    :return: list of [lt_text, up_blank, down_blank], blanks as [y0, y1] spans
    """
    # Sort top-to-bottom (y ascending), then left-to-right; the nearest-gap
    # picks at the bottom of the loop rely on this ordering.
    lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
    lt_text_blank_list = []
    for i in range(len(lt_text_list)):
        lt_text1 = lt_text_list[i]
        line1 = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0))
        if center_x is not None:
            # 0 = box center left of the separator, 1 = right of it.
            left_or_right1 = 0 if (lt_text1.bbox[0] + lt_text1.bbox[2]) / 2 <= center_x else 1

        up_blank_list = []
        down_blank_list = []
        for j in range(len(lt_text_list)):
            lt_text2 = lt_text_list[j]
            if lt_text1 == lt_text2:
                continue

            # No center-column split: a neighbor must overlap horizontally (IoU > 0).
            if center_x is None:
                line2 = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0))
                iou = line_iou(line1, line2)
                if lt_text2.bbox[1] > lt_text1.bbox[3] and iou > 0:
                    down_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]])
                if lt_text2.bbox[3] < lt_text1.bbox[1] and iou > 0:
                    up_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]])
                # if lt_text1.bbox[1] > lt_text2.bbox[3] and iou > 0:
                #     down_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]])
                # if lt_text1.bbox[3] < lt_text2.bbox[1] and iou > 0:
                #     up_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]])
            # With a center-column split: a neighbor must be on the same side.
            else:
                left_or_right2 = 0 if (lt_text2.bbox[0] + lt_text2.bbox[2]) / 2 <= center_x else 1
                if lt_text2.bbox[1] > lt_text1.bbox[3] and left_or_right1 == left_or_right2:
                    down_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]])
                if lt_text2.bbox[3] < lt_text1.bbox[1] and left_or_right1 == left_or_right2:
                    up_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]])
                # if lt_text1.bbox[1] > lt_text2.bbox[3] and left_or_right1 == left_or_right2:
                #     down_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]])
                # if lt_text1.bbox[3] < lt_text2.bbox[1] and left_or_right1 == left_or_right2:
                #     up_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]])

        # No neighbor found: fall back to a blank of the text's own height.
        text_h = abs(lt_text1.bbox[3] - lt_text1.bbox[1])
        if not up_blank_list:
            up_blank_list.append([max(0, lt_text1.bbox[1] - text_h), lt_text1.bbox[1]])
        if not down_blank_list:
            down_blank_list.append([lt_text1.bbox[3], lt_text1.bbox[3] + text_h])

        # With the y-ascending sort, the first entry below is the nearest gap
        # under the box and the last entry above the nearest gap over it.
        # NOTE(review): the sort key is the TOP edge (bbox[1]); for boxes with
        # very different heights the "nearest above" pick may be off — confirm.
        down_blank = down_blank_list[0]
        up_blank = up_blank_list[-1]

        if show:
            print('lt_text1.get_text()', lt_text1.get_text(), lt_text1.bbox)
            if center_x is not None:
                print('center_x', center_x)
            print('up_blank', up_blank)
            print('down_blank', down_blank)

        lt_text_blank_list.append([lt_text1, up_blank, down_blank])
    return lt_text_blank_list
+
+
@memory_decorator
def filter_large_blank_row(lt_text_blank_list, layout_h, show=0):
    """Put text boxes surrounded by an oversized vertical blank on their own row.

    :param lt_text_blank_list: entries of [lt_text, up_blank, down_blank]
    :param layout_h: layout height; a blank taller than layout_h / 6 is "large"
    :param show: debug printing switch
    :return: (rows already decided as single-text rows, those single texts)
    """
    lt_text_row_list = []
    single_lt_text_list = []
    # Isolation threshold: one sixth of the layout height.
    max_blank_h = layout_h / 6
    index = 0
    threshold = 20
    lt_text_blank_list.sort(key=lambda x: (x[0].bbox[1], x[0].bbox[0]))
    for lt_text1, up_blank1, down_blank1 in lt_text_blank_list:
        row = []
        # A blank taller than max_blank_h makes the text a row of its own.
        match_flag = 0
        # Boxes near the bottom (last 4 after the sort): judge by the gap above.
        # NOTE(review): for short lists these index ranges overlap; the first
        # matching branch wins — confirm that is intended.
        if index >= len(lt_text_blank_list) - 4 \
                and abs(up_blank1[0] - up_blank1[1]) >= max_blank_h:
            if show:
                print('match single lt_text 1')
            match_flag = 1
        # Boxes near the top (first 3): judge by the gap below.
        elif index <= 2 \
                and abs(down_blank1[0] - down_blank1[1]) >= max_blank_h:
            if show:
                print('match single lt_text 2')
            match_flag = 1
        # Boxes in the middle: judge by the span from the top of the upper gap
        # down to the bottom of the lower gap.
        elif 2 <= index <= len(lt_text_blank_list) - 4 \
                and abs(up_blank1[0] - down_blank1[1]) >= max_blank_h:
            # Only isolate when no other box sits on (roughly) the same line.
            has_same_row_flag = 0
            for lt_text2, _, _ in lt_text_blank_list:
                if lt_text1 == lt_text2:
                    continue
                if lt_text1.bbox[1] - threshold <= lt_text2.bbox[1] <= lt_text2.bbox[3] <= lt_text1.bbox[3] + threshold:
                    has_same_row_flag = 1
                    break
            if has_same_row_flag:
                match_flag = 0
            else:
                match_flag = 1
            if show:
                print('match single lt_text 3')

        if match_flag:
            row.append(lt_text1)
            lt_text_row_list.append(row)
            single_lt_text_list.append(lt_text1)
        index += 1

    if show:
        print('single_lt_text_list', single_lt_text_list)
    return lt_text_row_list, single_lt_text_list
+
+
@memory_decorator
def get_contain_blank_row(lt_text_blank_list, layout_h, show=0):
    """Cluster text boxes into rows by matching the blank gaps around them.

    :param lt_text_blank_list: entries of [lt_text, up_blank, down_blank]
        as produced by get_up_down_blank (blanks are [y0, y1] spans)
    :param layout_h: layout height, used to isolate texts with huge blanks
    :param show: debug printing switch
    :return: list of rows (each a list of lt_text), sorted top-to-bottom
    """
    from format_convert.convert_tree import TextBox
    # Boxes with oversized blanks are forced onto rows of their own first.
    lt_text_row_list, single_lt_text_list = filter_large_blank_row(lt_text_blank_list, layout_h)
    single_lt_text_list = set(single_lt_text_list)

    # Boxes whose blanks contain each other belong to the same row.
    time1 = time.time()
    threshold = 5
    used_lt_text_list = set([])
    another_used_lt_text_list = set([])
    for i1 in range(len(lt_text_blank_list)):
        time2 = time.time()
        lt_text1, up_blank1, down_blank1 = lt_text_blank_list[i1]
        row = []
        if lt_text1 in single_lt_text_list:
            continue
        for i2 in range(len(lt_text_blank_list)):
            lt_text2, up_blank2, down_blank2 = lt_text_blank_list[i2]
            if lt_text1 == lt_text2:
                continue
            if lt_text2 in another_used_lt_text_list:
                continue
            # An already-placed box can only re-match boxes above it.
            if lt_text2 in used_lt_text_list and lt_text1.bbox[1] >= lt_text2.bbox[3]:
                continue
            if lt_text2 in single_lt_text_list:
                continue

            # Same row when this box's up-blank contains the other's up-blank,
            # or its down-blank contains the other's down-blank (with slack).
            if (up_blank1[0] - threshold <= up_blank2[0] <= up_blank2[1] <= up_blank1[1] + threshold) \
                    or (down_blank1[0] - threshold <= down_blank2[0] <= down_blank2[1] <= down_blank1[1] + threshold):
                    # or (up_blank2[0] - threshold <= up_blank1[0] <= up_blank1[1] <= up_blank2[1] + threshold) \
                    # or (down_blank2[0] - threshold <= down_blank1[0] <= down_blank1[1] <= down_blank2[1] + threshold):
                if lt_text2 not in row:
                    row.append(lt_text2)
                    used_lt_text_list.add(lt_text2)

            # (disabled) also match when the up/down blanks contain the other text itself
            # if up_blank1[0] <= lt_text2.bbox[1] <= lt_text2.bbox[3] <= down_blank1[1]:
            #     if lt_text2 not in row:
            #         row.append(lt_text2)
            #         used_lt_text_list.append(lt_text2)



        if lt_text1 not in row:
            row.append(lt_text1)

        if show:
            print('get_contain_blank_row loop2 cost:', time.time()-time2)

        # Three or more colon cells in one row means an independent line was
        # merged in by mistake; split it back out.
        time2 = time.time()
        colon_cnt = 0
        colon_lt_text = []
        for lt in row:
            if re.search('[::]', lt.get_text()):
                colon_cnt += 1
                colon_lt_text.append(lt)
        if colon_cnt >= 3:
            if show:
                print('colon_cnt >= 3 row', row)

            another_lt_text_list = find_outline_lt_text(row)

            # # Put the lt_text with the largest y on its own row
            # colon_lt_text.sort(key=lambda x: x.bbox[1])
            # # All but the first two each get their own row
            # another_lt_text_list = colon_lt_text[2:]
            for lt_text in another_lt_text_list:
                if lt_text in row:
                    row.remove(lt_text)
                if lt_text in colon_lt_text:
                    colon_lt_text.remove(lt_text)

            if show:
                print('another_lt_text_list', another_lt_text_list)
                print('colon_lt_text', colon_lt_text)

            if not colon_lt_text:
                continue

            # Pair each split-out text with a placeholder head ("@@:") whose
            # bbox is aligned under the nearer of the row's leftmost/rightmost
            # colon texts, so the new row keeps a head slot.
            colon_lt_text.sort(key=lambda x: x.bbox[0])
            lt_text_row_list.append(row)
            for another_lt_text in another_lt_text_list:
                if abs(another_lt_text.bbox[0] - colon_lt_text[0].bbox[0]) > abs(
                        another_lt_text.bbox[0] - colon_lt_text[-1].bbox[0]):
                    new_bbox = [colon_lt_text[0].bbox[0], another_lt_text.bbox[1],
                                colon_lt_text[0].bbox[2], another_lt_text.bbox[3]]
                    another_row = [TextBox(text="@@:", bbox=new_bbox), another_lt_text]
                else:
                    new_bbox = [colon_lt_text[-1].bbox[0], another_lt_text.bbox[1],
                                colon_lt_text[-1].bbox[2], another_lt_text.bbox[3]]
                    # Add a placeholder column.
                    another_row = [another_lt_text, TextBox(text="@@:", bbox=new_bbox)]
                if show:
                    print('another_row', another_row)
                for lt_text3 in another_row:
                    another_used_lt_text_list.add(lt_text3)
                lt_text_row_list.append(another_row)
        else:
            lt_text_row_list.append(row)

        if show:
            print('get_contain_blank_row judge colon cost:', time.time()-time2)

    if show:
        print('get_contain_blank_row double loop cost: ', time.time()-time1)

    # Deduplicate: merge rows that share members.
    lt_text_row_list.sort(key=lambda x: len(x), reverse=True)
    if show:
        for lt_text_row in lt_text_row_list:
            print('before dedup lt_text_row', lt_text_row)

    lt_text_row_list = merge_intersecting_lists(lt_text_row_list)

    if show:
        for lt_text_row in lt_text_row_list:
            print('after dedup lt_text_row', lt_text_row)

    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])

    # Drop rows whose concatenated text is pure whitespace.
    temp_list = []
    for lt_text_row in lt_text_row_list:
        row_text = ""
        for lt_text in lt_text_row:
            row_text += lt_text.get_text()
        if re.sub('\s+', '', row_text) == "":
            continue
        temp_list.append(lt_text_row)
    lt_text_row_list = temp_list
    return lt_text_row_list
+
+
def choose_center_blank(blank_row_list, blank_width, show=0):
    """Pick the vertical blank strip shared by the rows (the column divider).

    :param blank_row_list: per-row lists of blank bboxes [x0, y0, x1, y1]
    :param blank_width: minimum width the widest blank must exceed to qualify
    :param show: debug printing switch
    :return: bbox of the shared center blank, or [] when none qualifies
    """
    if not blank_row_list:
        return []

    # Flatten all per-row blanks.
    blank_list = [y for x in blank_row_list for y in x]
    if not blank_list:
        return []

    # The overall widest blank is the candidate column separator.
    blank_list.sort(key=lambda x: abs(x[0] - x[2]), reverse=True)
    max_blank = blank_list[0]
    if show:
        print('max_blank', max_blank)
    if abs(max_blank[0] - max_blank[2]) <= blank_width:
        return []

    # Invariant across the loop below: only depends on the candidate.
    line1 = ([max_blank[0], 0], [max_blank[2], 0])

    # Collect, from each row, its widest blank — kept only when it overlaps
    # the candidate enough (line IoU >= 0.5).
    max_col = []
    for blank_row in blank_row_list:
        if not blank_row:
            continue

        # Widest blank of this row.
        max_blank_bbox = blank_row[0]
        for blank_bbox in blank_row[1:]:
            if abs(blank_bbox[0] - blank_bbox[2]) > abs(max_blank_bbox[0] - max_blank_bbox[2]):
                max_blank_bbox = blank_bbox

        if show:
            print('max_blank_bbox, blank_row', max_blank_bbox, blank_row)

        line2 = ([max_blank_bbox[0], 0], [max_blank_bbox[2], 0])
        iou = line_iou(line1, line2)
        if iou >= 0.5:
            max_col.append(max_blank_bbox)
    if show:
        print('max_col', max_col)
    if not max_col:
        return []

    # The shared center strip is the intersection of all matching blanks.
    center_blank_row = get_inter_part(max_col)
    return center_blank_row
+
+
def set_head_value_in_col(col_list1, col_list2, show=0):
    """Split each cell of a column into a [head, value] pair around the colon.

    A cell without a colon is either a dangling head (when the same text also
    appears in the sibling column) that is prepended to the next head, or a
    continuation that is glued onto the previous value.
    """
    pairs = []
    pending_head = ""
    for cell in col_list1:
        match = re.search('[::]+', cell)
        # Cell with a colon: text up to the colon is the head, rest the value.
        if match:
            head = cell[:match.end()]
            if pending_head:
                head = pending_head + head
                pending_head = ""
            pairs.append([head, cell[match.end():]])
        # No colon but the same text appears in the sibling column: it is a
        # head that spans rows — remember it for the next colon cell.
        elif cell in col_list2:
            if show:
                print('col1 in col_list2')
            # A previously pending colon-less head becomes its own value row.
            if pending_head:
                pairs.append(["", pending_head])
            pending_head = cell
        # Otherwise it continues the previous value (only when that value
        # itself still contains a colon), or starts a head-less row.
        else:
            if pairs and re.search('[::]', pairs[-1][1]):
                pairs[-1][1] += cell
            else:
                pairs.append(["", cell])

    # A head still pending at the end of the column becomes a value-only row.
    if pending_head:
        pairs.append(["", pending_head])

    if show:
        print('col_key_value_list', pairs)

    return pairs
+
+
def divide_2_col_by_center_blank(b_table, center_blank_row, show=0):
    """Split every row of *b_table* into a left and a right text column.

    A text box belongs to the left column when its horizontal center lies on or
    left of the center of *center_blank_row* (the shared vertical blank strip),
    otherwise to the right. Each side is concatenated in reading order.

    :param b_table: rows of LTText-like boxes (have .bbox and .get_text())
    :param center_blank_row: bbox [x0, y0, x1, y1] of the center blank strip
    :param show: debug printing switch
    :return: (col_list1, col_list2) — left/right text per row; both are
        emptied when either side lacks colons in a third of its cells, which
        marks the region as not being a 2-column key-value table.
    """
    # Improvements over previous revision: removed the unused col_box_dict
    # local and dead commented-out code; hoisted the loop-invariant center x.
    col_list1 = []
    col_list2 = []
    center_x = abs(center_blank_row[0] + center_blank_row[2]) / 2
    for lt_text_row in b_table:
        lt_text_row.sort(key=lambda x: x.bbox[0])

        left_col = []
        right_col = []
        for lt_text in lt_text_row:
            if (lt_text.bbox[2] + lt_text.bbox[0]) / 2 <= center_x:
                left_col.append(lt_text)
            else:
                right_col.append(lt_text)

        # Concatenate each side in reading order.
        left_col = sort_by_read_order(left_col)
        left_text = ''.join(x.get_text() for x in left_col)
        right_col = sort_by_read_order(right_col)
        right_text = ''.join(x.get_text() for x in right_col)

        col_list1.append(left_text.strip())
        col_list2.append(right_text.strip())

    if show:
        print('col_list1', col_list1)
        print('col_list2', col_list2)

    # Both columns must contain colons in at least a third of their cells;
    # otherwise this is not a 2-column key-value table.
    colon_cnt1 = sum(1 for col in col_list1 if re.search('[::]', col))
    colon_cnt2 = sum(1 for col in col_list2 if re.search('[::]', col))

    if colon_cnt1 < len(col_list1) / 3 or colon_cnt2 < len(col_list2) / 3:
        col_list1 = []
        col_list2 = []
        if show:
            print('col_list1 colon_cnt1 less', colon_cnt1)
            print('col_list2 colon_cnt2 less', colon_cnt2)

    return col_list1, col_list2
+
+
def delete_blank_col(b_table_row_list):
    """Drop every column whose cells are all empty strings."""
    # Gather the cell values present at each column index (rows may be ragged).
    col_values = {}
    for row in b_table_row_list:
        for idx, cell in enumerate(row):
            col_values.setdefault(idx, []).append(cell)

    # A column is blank when its only distinct value is ''.
    blank_cols = {idx for idx, cells in col_values.items() if set(cells) == {''}}

    return [
        [cell for idx, cell in enumerate(row) if idx not in blank_cols]
        for row in b_table_row_list
    ]
+
+
def fix_head_value_match(b_table, show=0):
    """Merge vertically split head/value rows of a 4-column table.

    A row like ['h1:', '', 'h2:', ''] followed by rows like ['', 'v1', '', 'v2']
    means heads and values were laid out on separate lines; the values are
    concatenated back into the head row and the value rows are removed.

    Fix over previous revision: a new head row appearing directly after a
    head/value group used to be silently dropped (the state was only reset to
    None); it now starts a new group.

    :param b_table: rows of 4 strings ('@@:' is a placeholder cell)
    :param show: debug printing switch
    :return: the repaired table (head rows are mutated in place)
    """
    if not b_table:
        return b_table
    if len(b_table[0]) != 4:
        return b_table

    def _is_head_row(row):
        # Heads (with colons) in columns 0/2, columns 1/3 still empty.
        return (row[1] in ["", '@@:'] and row[3] in ["", '@@:']
                and re.search("[::]", row[0]) and re.search("[::]", row[2]))

    def _is_value_row(row):
        # Values in columns 1/3 only.
        return (row[0] in ["", '@@:'] and row[2] in ["", '@@:']
                and row[1] not in ["", '@@:'] and row[3] not in ["", '@@:'])

    maybe_head_index = None
    match_head_value_dict = {}
    for row_i, row in enumerate(b_table):
        if maybe_head_index is None:
            if _is_head_row(row):
                maybe_head_index = row_i
        else:
            if _is_value_row(row):
                match_head_value_dict.setdefault(maybe_head_index, []).append(row_i)
            elif _is_head_row(row):
                # Fix: start a new head/value group instead of dropping the row.
                maybe_head_index = row_i
            else:
                maybe_head_index = None

    if show:
        print('match_head_value_dict', match_head_value_dict)

    add_row_dict = {}
    delete_head_index_list = []
    delete_value_index_list = []
    for row_index, value_index_list in match_head_value_dict.items():
        head_row = b_table[row_index]
        delete_head_index_list.append(row_index)
        left_value_text = ""
        right_value_text = ""
        for value_index in value_index_list:
            value_row = b_table[value_index]
            delete_value_index_list.append(value_index)
            left_value_text += ''.join(value_row[:2])
            right_value_text += ''.join(value_row[2:])
        head_row[1] = left_value_text
        head_row[3] = right_value_text
        add_row_dict[row_index] = head_row

    # Rebuild: head rows replaced by the merged rows, value rows dropped.
    temp_list = []
    for row_i, row in enumerate(b_table):
        if row_i in delete_head_index_list:
            temp_list.append(add_row_dict.get(row_i))
            continue
        if row_i in delete_value_index_list:
            continue
        temp_list.append(row)
    return temp_list
+
+
def add_last_rows(b_table, table_bbox, center_blank_bbox, lt_text_row_list,
                  table_lt_text_row_list, show=0):
    """Absorb text rows sitting just BELOW a borderless 4-column table.

    Rows found within one (growing) average row gap below table_bbox[3] are
    parsed into (head, value) pairs, appended to b_table, and table_bbox[3]
    is extended down to the last row whose head contained a colon.
    Counterpart of add_first_rows.

    :param b_table: table rows, each [head, value, head, value].
    :param table_bbox: [x1, y1, x2, y2] of the table; item 3 may be mutated.
    :param center_blank_bbox: bbox of the blank band between the two
        head/value column pairs.
    :param lt_text_row_list: candidate text rows outside the table (lists
        of objects with .bbox and .get_text()).
    :param table_lt_text_row_list: text rows already inside the table, used
        to estimate the average inter-row gap.
    :return: b_table with the matched rows appended.
    """
    if not b_table:
        return b_table
    if len(b_table[0]) not in [4]:
        return b_table

    # Estimate the average vertical gap between consecutive text rows that
    # are already inside the table.
    blank_h_list = []
    max_h_list = []
    for lt_text_row in table_lt_text_row_list:
        if not lt_text_row:
            continue
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        max_h_list.append(max_h)
    max_h_list.sort(key=lambda x: x)
    for i in range(1, len(max_h_list)):
        blank_h_list.append(max_h_list[i] - max_h_list[i - 1])
    mean_blank_h = np.mean(blank_h_list)
    if show:
        print('add_last_rows blank_width_list', blank_h_list)
        print('add_last_rows mean_blank_h', mean_blank_h)

    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])
    match_row_list = []
    threshold = 5
    add_blank_h = mean_blank_h + threshold
    for li, lt_text_row in enumerate(lt_text_row_list):
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        if show:
            print('max_h > table_bbox[3]', lt_text_row, max_h, table_bbox[3])
        # The row's bottom must lie between the table's y2 and y2 plus one
        # (growing) average row gap.
        if table_bbox[3] < max_h < table_bbox[3] + add_blank_h:
            # Skip rows spanning across the central blank band.
            # NOTE(review): the prints below are not guarded by `show`.
            if min_w <= center_blank_bbox[0] <= center_blank_bbox[2] <= max_w:
                print('continue1', min_w, center_blank_bbox[0], center_blank_bbox[2], max_w)
                continue

            # Left side: must start between the table's x1 and the center x1.
            if table_bbox[0] - threshold <= min_w < center_blank_bbox[0]:
                match_row_list.append([lt_text_row, 0, max_h])
            # Right side: must end between the center x2 and the table's x2.
            elif center_blank_bbox[2] < max_w < table_bbox[2] + threshold * 3:
                match_row_list.append([lt_text_row, 1, max_h])
            else:
                print('center_blank_bbox[2] < max_w < table_bbox[2] + threshold * 3')
                break

            # Extend the acceptance window by one more row gap per match.
            add_blank_h = add_blank_h + mean_blank_h + threshold

    if show:
        print('add_last_rows match_row_list', match_row_list)

    add_b_table = []
    real_max_h = None
    for mi, match_row in enumerate(match_row_list):
        lt_text_row, is_right, max_h = match_row
        lt_text_row.sort(key=lambda x: (x.bbox[0], x.bbox[1]))
        # Single text cell: split it at the first colon run into head/value.
        if len(lt_text_row) == 1:
            text = lt_text_row[0].get_text()
            match = re.search('[::]+', text)
            real_max_h = max_h
            if not match:
                head = ""
                value = text
            else:
                head = text[:match.end()]
                value = text[match.end():]
        # Or two cells that are really one head label split by whitespace
        # (second fragment ends with a colon).
        elif len(lt_text_row) == 2 and len(lt_text_row[0].get_text()) \
                and lt_text_row[1].get_text()[-1] in [':', ":"]:
            text = lt_text_row[0].get_text() + lt_text_row[1].get_text()
            head = text
            value = ''
        # Genuine two cells: head (must contain a colon) and value.
        elif len(lt_text_row) == 2:
            text1 = lt_text_row[0].get_text()
            match = re.search('[::]+', text1)
            if not match:
                break
            real_max_h = max_h
            head = text1
            value = lt_text_row[1].get_text()
        else:
            if show:
                print('add_last_rows len(lt_text_row) break', len(lt_text_row))
            break

        # Fetch the previous row: a head-less value may need to be folded
        # into it.
        if mi == 0 or len(add_b_table) == 0:
            last_row = b_table[-1]
            last_flag = 0
        else:
            last_row = add_b_table[-1]
            last_flag = 1

        if is_right:
            if last_row[2] and not last_row[3] and not head and value:
                b_table[-1][3] = value
                current_row = ["", "", last_row[2], value]
            else:
                current_row = ["", "", head, value]
        else:
            if last_row[0] and not last_row[1] and not head and value:
                current_row = [last_row[0], value, "", ""]
            else:
                current_row = [head, value, "", ""]

        # if last_flag == 0:
        #     b_table = b_table[:-1]
        add_b_table.append(current_row)

        if show:
            print('current_row', current_row)

    if show:
        print('add_b_table', add_b_table)

    b_table += add_b_table
    if real_max_h is not None:
        table_bbox[3] = real_max_h
    return b_table
+
+
def add_first_rows(b_table, table_bbox, center_blank_bbox, lt_text_row_list,
                   table_lt_text_row_list, show=0):
    """Absorb text rows overlapping the TOP edge of a borderless 4-column
    table: a single-cell text straddling table_bbox[1] is split at its
    first colon run, a head-less value is prepended to the table's first
    row, and table_bbox[1] is pulled up to the matched text's top.
    Counterpart of add_last_rows; parameters have the same meaning.
    """
    if not b_table:
        return b_table
    if len(b_table[0]) not in [4]:
        return b_table

    # Estimate the average vertical gap between consecutive text rows that
    # are already inside the table.
    blank_h_list = []
    max_h_list = []
    for lt_text_row in table_lt_text_row_list:
        if not lt_text_row:
            continue
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        max_h_list.append(max_h)
    max_h_list.sort(key=lambda x: x)
    for i in range(1, len(max_h_list)):
        blank_h_list.append(max_h_list[i] - max_h_list[i - 1])
    mean_blank_h = np.mean(blank_h_list)
    if show:
        print('add_first_rows blank_width_list', blank_h_list)
        print('add_first_rows mean_blank_h', mean_blank_h)

    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])
    match_row_list = []
    threshold = 5
    # NOTE(review): add_blank_h is computed but never used in this function
    # (unlike add_last_rows, where it widens the acceptance window).
    add_blank_h = mean_blank_h + threshold
    for li, lt_text_row in enumerate(lt_text_row_list):
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        if show:
            print('min_h < table_bbox[3]', lt_text_row, min_h, table_bbox[3])
        # The row must straddle the table's top edge.
        if min_h <= table_bbox[1] < max_h:
            # Skip rows spanning across the central blank band.
            # NOTE(review): this print is not guarded by `show`.
            if min_w <= center_blank_bbox[0] <= center_blank_bbox[2] <= max_w:
                print('continue1', min_w, center_blank_bbox[0], center_blank_bbox[2], max_w)
                continue
            # match_row_list.append([lt_text_row, 1, min_h])

            # Left of the central gap -> left head/value pair.
            if min_w < center_blank_bbox[0]:
                match_row_list.append([lt_text_row, 0, min_h])
            # Right of the central gap -> right head/value pair.
            elif center_blank_bbox[2] < max_w:
                match_row_list.append([lt_text_row, 1, min_h])
            else:
                break

    if show:
        print('add_first_rows match_row_list', match_row_list)

    real_min_h = None
    for mi, match_row in enumerate(match_row_list):
        lt_text_row, is_right, min_h = match_row
        lt_text_row.sort(key=lambda x: (x.bbox[0], x.bbox[1]))
        # Single text cell: split it at the first colon run into head/value.
        if len(lt_text_row) == 1:
            text = lt_text_row[0].get_text()
            match = re.search('[::]+', text)
            real_min_h = min_h
            if not match:
                head = ""
                value = text
            else:
                head = text[:match.end()]
                value = text[match.end():]
        # # 或 两列,其实是表头由于空白被隔开
        # elif len(lt_text_row) == 2 and len(lt_text_row[0].get_text()) \
        #         and lt_text_row[1].get_text()[-1] in [':', ":"]:
        #     text = lt_text_row[0].get_text() + lt_text_row[1].get_text()
        #     head = text
        #     value = ''
        # # 两列
        # elif len(lt_text_row) == 2:
        #     text1 = lt_text_row[0].get_text()
        #     match = re.search('[::]+', text1)
        #     if not match:
        #         break
        #     real_max_h = max_h
        #     head = text1
        #     value = lt_text_row[1].get_text()
        else:
            if show:
                print('add_first_rows len(lt_text_row) break', len(lt_text_row))
            break

        # Prepend an orphan value onto the table's first row.
        if not head and value:
            if is_right:
                b_table[0][3] = value + b_table[0][3]
            else:
                b_table[0][1] = value + b_table[0][1]

    if real_min_h is not None:
        table_bbox[1] = real_min_h
    return b_table
+
+
def get_row_bbox(row, mode='list'):
    """Return the bounding box enclosing every element of *row*.

    :param row: non-empty sequence of boxes; each element is either a
        4-sequence ``[x1, y1, x2, y2]`` (``mode='list'``) or an object
        exposing a ``.bbox`` attribute of that shape (``mode='.bbox'``).
    :param mode: ``'list'`` or ``'.bbox'``.
    :return: tuple ``(min_x, min_y, max_x, max_y)``.
    :raises ValueError: if *mode* is not supported (previously an unknown
        mode surfaced as an opaque ``UnboundLocalError``).
    """
    if mode == 'list':
        boxes = row
    elif mode == '.bbox':
        boxes = [x.bbox for x in row]
    else:
        raise ValueError("mode must be 'list' or '.bbox', got %r" % (mode,))

    min_x = min(b[0] for b in boxes)
    min_y = min(b[1] for b in boxes)
    max_x = max(b[2] for b in boxes)
    max_y = max(b[3] for b in boxes)
    return min_x, min_y, max_x, max_y
+
+
def shrink_bbox(img, bbox_list):
    """Tighten each bbox in *bbox_list* around its non-background content.

    The dominant color of a down-sampled copy of *img* is taken as the
    background; inside each box the first/last rows and columns that differ
    from that color bound the new, shrunken bbox. Empty crops and boxes
    with no foreground pixels are returned unchanged.

    :param img: HxWx3 image as a numpy array.
    :param bbox_list: list of [x1, y1, x2, y2] boxes.
    :return: list of shrunken [x1, y1, x2, y2] boxes, same order.
    """
    # NOTE(review): this slow variant is currently unused — only the
    # *_fast version below is called.
    def return_not_most_color_index(image_np, match_color):
        # Per-pixel distance to the background color (note: sqrt is applied
        # per channel before summing, so this is not a true euclidean norm).
        diff = np.sum(np.sqrt((image_np.astype(np.int32) - match_color.astype(np.int32)) ** 2), axis=2)
        threshold = 100  # distance threshold; tune as needed
        diff_mask = diff > threshold
        # Indices of pixels that differ markedly from the background.
        diff_index = np.where(diff_mask)
        # print('diff_index.size', diff_index[0].size)
        return diff_index

    def return_not_most_color_index_fast(image_np, match_color):
        # Inputs are already int32 (converted once by the caller).
        # image_int = image_np.astype(np.int32)
        # match_color_int = match_color.astype(np.int32)

        # Squared euclidean distance of every pixel to the background color.
        diff = np.sum((image_np - match_color) ** 2, axis=2)
        threshold = 20 # distance threshold (compared squared); tune as needed
        threshold = threshold ** 2
        diff_mask = diff > threshold
        # Indices of pixels that differ markedly from the background.
        diff_index = np.where(diff_mask)
        # print('diff_index.size', diff_index[0].size)
        return diff_index

    # (Removed dead commented-out experiments: histogram / KMeans / bincount
    # color counting.)

    # Down-sample the image, then count colors to find the most frequent
    # one — assumed to be the page background.
    time0 = time.time()
    down_sample_factor = 8
    down_sampled_img = img[::down_sample_factor, ::down_sample_factor, :]
    down_sampled_img_color = down_sampled_img.reshape(-1, 3)
    colors, counts = np.unique(down_sampled_img_color, return_counts=True, axis=0)
    log('shrink_bbox 0 ' + str(time.time()-time0))

    # Pick the most frequent color as the background color.
    time0 = time.time()
    max_count_index = np.argmax(counts)
    most_frequent_color = colors[max_count_index]
    most_frequent_color = most_frequent_color.astype(np.int32)
    log('shrink_bbox 1 ' + str(time.time()-time0))

    new_bbox_list = []
    img_int = img.astype(np.int32)
    time0 = time.time()
    for bbox in bbox_list:
        # img_bbox = img[int(bbox[0][1]):int(bbox[2][1]), int(bbox[0][0]):int(bbox[2][0]), :]
        # img_bbox = img[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2]), :]
        img_bbox_int = img_int[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2]), :]

        # Degenerate crop: keep the original bbox.
        if 0 in img_bbox_int.shape:
            new_bbox_list.append(bbox)
            continue

        # Scan rows for foreground pixels; first/last hits bound the height.
        # index_list = return_first_black_index(img_bbox[:, :, :])
        index_list = return_not_most_color_index_fast(img_bbox_int, most_frequent_color)

        if index_list[0].size == 0 or index_list[1].size == 0:
            new_bbox_list.append(bbox)
            continue
        min_h = index_list[0][0]
        max_h = index_list[0][-1]

        # Transpose and repeat the scan to bound the width.
        img_bbox1 = np.swapaxes(img_bbox_int, 0, 1)
        # index_list = return_first_black_index(img_bbox1[:, :, :])
        index_list = return_not_most_color_index_fast(img_bbox1, most_frequent_color)

        if index_list[0].size == 0 or index_list[1].size == 0:
            new_bbox_list.append(bbox)
            continue
        min_w = index_list[0][0]
        max_w = index_list[0][-1]

        # Translate crop-local bounds back to full-image coordinates.
        real_min_w = bbox[0] + min_w
        real_max_w = bbox[0] + max_w
        real_min_h = bbox[1] + min_h
        real_max_h = bbox[1] + max_h
        new_bbox = [real_min_w, real_min_h, real_max_w, real_max_h]
        new_bbox_list.append(new_bbox)

        # cv2.imshow('img', img_bbox)
        # cv2.imshow('shrink', img[int(new_bbox[0][1]):int(new_bbox[2][1]), int(new_bbox[0][0]):int(new_bbox[2][0]), :])
        # cv2.waitKey(0)
    log('shrink_bbox 2 ' + str(time.time() - time0))
    return new_bbox_list
+
+
+def shrink_bbox_by_pixel(lt_text_list):
+    for lt_text in lt_text_list:
+        bbox = lt_text.bbox
+        bbox_h = abs(bbox[3] - bbox[1])
+        shrink_h = bbox_h / 2
+        new_bbox = [bbox[0], int(bbox[1] + shrink_h / 2),
+                    bbox[2], int(bbox[3] - shrink_h / 2)
+                    ]
+        lt_text.bbox = new_bbox
+    return lt_text_list
+
+
def get_inter_part(bbox_list, show=0):
    """Axis-aligned intersection of all boxes in *bbox_list*.

    Sorts bbox_list in place by (x1, x2), then narrows an accumulator box
    to the region common to every input box. The result's coordinates are
    re-ordered so x1 <= x2 and y1 <= y2 even when the boxes are disjoint.

    :param bbox_list: list of [x1, y1, x2, y2] boxes.
    :return: [x1, y1, x2, y2] of the common region, or None if empty input.
    """
    if not bbox_list:
        return None

    bbox_list.sort(key=lambda b: (b[0], b[2]))

    left, top, right, bottom = bbox_list[0]
    for b in bbox_list:
        left = max(left, b[0])
        top = max(top, b[1])
        right = min(right, b[2])
        bottom = min(bottom, b[3])

    # Normalize ordering (disjoint inputs can invert the bounds).
    result = [min(left, right), min(top, bottom),
              max(left, right), max(top, bottom)]
    if show:
        print('get_inter_part', result)
    return result
+
+
def get_inter_part_250530(bbox_list, show=0):
    """Unfinished draft (dated 2025-05-30) of get_inter_part.

    NOTE(review): this stub sorts the coordinate lists but never combines
    them or returns a value — every non-empty input falls through and
    returns None, the same as an empty input. Kept for reference only.
    """
    if not bbox_list:
        return None

    x1_list = [x[0] for x in bbox_list]
    x2_list = [x[2] for x in bbox_list]
    y1_list = [x[1] for x in bbox_list]
    y2_list = [x[3] for x in bbox_list]

    x1_list.sort(key=lambda x: x, reverse=True)
    x2_list.sort(key=lambda x: x)
+
+
def get_straight_lines_from_image(image_np, threshold=50):
    """Debug helper: detect straight line segments with Canny edges plus the
    probabilistic Hough transform, draw them in red and show the result in
    blocking OpenCV windows.

    :param image_np: BGR image as a numpy array, or None.
    :param threshold: Hough accumulator vote threshold.
    :return: False when image_np is None; otherwise None (implicit).

    NOTE(review): cv2.HoughLinesP returns None when no line is found, which
    would raise a TypeError in the loop below — interactive use only.
    """
    if image_np is None:
        print("无法读取图像")
        return False

    # Grayscale conversion for edge detection.
    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)

    # Canny edge map (low threshold 20, high threshold 150).
    edges = cv2.Canny(gray, 20, 150)

    cv2.imshow('edges', edges)

    # Probabilistic Hough transform on the edge map.
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold,
                            minLineLength=50, maxLineGap=2)

    for line in lines:
        line = line[0]
        print('line', line)
        cv2.line(image_np, line[:2], line[2:], (0, 0, 255))

    cv2.imshow('img', image_np)
    cv2.waitKey(0)

    print('lines', lines)
+
+
def get_table_bbox(table):
    """Bounding box [x1, y1, x2, y2] covering every cell bbox in *table*,
    a 2-D list of objects exposing a .bbox attribute."""
    cells = [cell for row in table for cell in row]
    return [min(c.bbox[0] for c in cells),
            min(c.bbox[1] for c in cells),
            max(c.bbox[2] for c in cells),
            max(c.bbox[3] for c in cells)]
+
+
@memory_decorator
def merge_intersecting_lists(lists):
    """Union-merge lists that share at least one element.

    Each input list is folded into the FIRST already-merged group it
    intersects. NOTE(review): this is a single pass, so merging is not
    transitive — a later list bridging two previously-disjoint groups joins
    only the first; element order inside merged groups is not preserved
    (set union).
    """
    merged_lists = []
    for current_list in lists:
        # Work with a set for cheap intersection tests.
        current_set = set(current_list)
        merged = False
        # Look for an existing merged group sharing any element.
        for i in range(len(merged_lists)):
            merged_set = set(merged_lists[i])
            # Overlap found: union into that group and stop searching.
            if current_set & merged_set:
                merged_lists[i] = list(merged_set.union(current_set))
                merged = True
                break
        # No overlap with any existing group: start a new one.
        if not merged:
            merged_lists.append(current_list.copy())
    return merged_lists
+
+
def merge_same_bbox(lt_text_list, avg_char_width, show=0):
    """Merge a short (<=2 chars), colon-less text box with a nearby box to
    its right containing a colon — typically one head label broken in two
    by whitespace. Both list slots are replaced with the merged TextBox,
    then the list is de-duplicated (via set()) and re-sorted by (x1, y1).
    """
    from format_convert.convert_tree import TextBox
    for i in range(len(lt_text_list)):
        lt_text1 = lt_text_list[i]
        line1_x = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0))
        line1_y = ((lt_text1.bbox[1], 0), (lt_text1.bbox[3], 0))

        for j in range(i+1, len(lt_text_list)):
            lt_text2 = lt_text_list[j]
            # if lt_text1 == lt_text2:
            #     continue
            # Candidate must start strictly to the right of box1's end.
            if lt_text1.bbox[2] >= lt_text2.bbox[0]:
                continue

            # Must not overlap on the x axis.
            line2_x = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0))
            if line_iou(line1_x, line2_x) > 0:
                continue

            # Strong y overlap, horizontally close (< 5 avg char widths),
            # right part has a colon, left part is a short colon-less
            # fragment: treat the pair as one split head label.
            line2_y = ((lt_text2.bbox[1], 0), (lt_text2.bbox[3], 0))
            if line_iou(line1_y, line2_y) > 0.9 \
                    and abs(lt_text1.bbox[2] - lt_text2.bbox[0]) < avg_char_width * 5 \
                    and re.search('[::]', lt_text2.get_text()) \
                    and not re.search('[::]', lt_text1.get_text()) \
                    and len(lt_text1.get_text()) <= 2:
                new_lt_text = TextBox(text=lt_text1.get_text() + lt_text2.get_text(),
                                      bbox=[lt_text1.bbox[0], min(lt_text1.bbox[1], lt_text2.bbox[1]),
                                            lt_text2.bbox[2], max(lt_text1.bbox[3], lt_text2.bbox[3])
                                            ])
                lt_text_list[i] = new_lt_text
                lt_text_list[j] = new_lt_text
                if show:
                    print('new_lt_text', new_lt_text)

    lt_text_list = list(set(lt_text_list))
    lt_text_list.sort(key=lambda x: (x.bbox[0], x.bbox[1]))

    return lt_text_list
+
+
def sort_by_read_order(lt_text_list, threshold=10):
    """Order text boxes in natural reading order.

    Boxes whose y1 values differ by less than *threshold* (comparing each
    box to its predecessor after a y1 sort) form one visual row; each row
    is then sorted left-to-right by x1. Sorts the input list in place by
    y1 and returns a new, fully ordered list.
    """
    if not lt_text_list:
        return lt_text_list

    lt_text_list.sort(key=lambda box: box.bbox[1])

    ordered = []
    row = [lt_text_list[0]]
    for prev, cur in zip(lt_text_list, lt_text_list[1:]):
        if abs(cur.bbox[1] - prev.bbox[1]) < threshold:
            row.append(cur)
        else:
            # Row break: flush the finished row left-to-right.
            ordered.extend(sorted(row, key=lambda box: box.bbox[0]))
            row = [cur]
    ordered.extend(sorted(row, key=lambda box: box.bbox[0]))
    return ordered
+
+
def delete_empty_bbox(lt_text_list, show=0):
    """Drop text boxes whose text is a lone colon/semicolon or whitespace
    only; returns a new filtered list."""
    punctuation_only = (':', ":", ";", ";")
    return [box for box in lt_text_list
            if box.get_text() not in punctuation_only
            and re.sub(r'\s', '', box.get_text()) != ""]
+
+
def standard_table(table, show=0):
    """Normalize a 4-column head/value table whose cells are dicts with a
    'text' key: strip the '@@:' placeholder, move colon-less heads parked
    in the value column back to the head column, merge head rows with the
    value rows below them, and fold continuation values into the row above.
    Cell dicts are mutated in place; a (possibly shorter) table is returned.
    """
    if not table:
        return table

    # Strip the placeholder token from every cell.
    for row in table:
        for cell in row:
            if '@@:' in cell.get('text'):
                cell['text'] = re.sub('@@:', '', cell.get('text'))

    # A head whose colon OCR missed lands in the value column: shift it back.
    for ri, row in enumerate(table):
        if row[0].get('text') == '' and row[1].get('text') != '' and row[2].get('text') != '' and row[3].get('text') == '':
            row[0]['text'] = row[1].get('text')
            row[1]['text'] = ''
            if show:
                print('standard_table, add colon head', table[ri])

    # Heads on one line, values on the next:
    # head          head
    #       value           value
    drop = []
    for ri in range(1, len(table)):
        prev, cur = table[ri - 1], table[ri]
        if prev[0].get('text') != '' and prev[1].get('text') == '' \
                and cur[0].get('text') == '' and cur[1].get('text') != '' \
                and prev[2].get('text') != '' and prev[3].get('text') == '' \
                and cur[2].get('text') == '' and cur[3].get('text') != '':
            # Copy the heads down, then drop the now-redundant head row.
            cur[0]['text'] = prev[0].get('text')
            cur[2]['text'] = prev[2].get('text')
            drop.append(ri - 1)
            if show:
                print('standard_table, fix head value 1', table[ri])
    table = [row for ri, row in enumerate(table) if ri not in drop]

    # Continuation values that belong to the row above:
    # head  value   head    value
    #       value           value
    drop = []
    for ri in range(1, len(table)):
        prev, cur = table[ri - 1], table[ri]
        if prev[0].get('text') != '' and prev[1].get('text') != '' \
                and cur[0].get('text') == '' and cur[1].get('text') != '' \
                and prev[2].get('text') != '' and prev[3].get('text') != '' \
                and cur[2].get('text') == '' and cur[3].get('text') != '':
            prev[1]['text'] += cur[1]['text']
            prev[3]['text'] += cur[3]['text']
            drop.append(ri)
    table = [row for ri, row in enumerate(table) if ri not in drop]
    return table
+
+
@memory_decorator
def find_outline_lt_text(lt_text_list, show=0):
    """Return text boxes that sit alone on their visual line, i.e. no other
    box overlaps them on the y axis — text lying outside any multi-column
    row. Sorts lt_text_list in place by (y1, x1).

    NOTE(review): O(n^2) pairwise scan with list-based membership checks;
    rows are grown greedily from the first unused box.
    """
    lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
    used_lt_text_list = []
    row_list = []
    for lt_text1 in lt_text_list:
        if lt_text1 in used_lt_text_list:
            continue
        row = [lt_text1]
        used_lt_text_list.append(lt_text1)
        for lt_text2 in lt_text_list:
            if lt_text2 in used_lt_text_list:
                continue
            line1 = [(lt_text1.bbox[1], 0), (lt_text1.bbox[3], 0)]
            line2 = [(lt_text2.bbox[1], 0), (lt_text2.bbox[3], 0)]
            # Any y overlap puts the two boxes on the same visual line.
            if line_iou(line1, line2) > 0:
                row.append(lt_text2)
                used_lt_text_list.append(lt_text2)
        row_list.append(row)

    # Keep only single-box rows.
    outline_lt_text_list = []
    for row in row_list:
        if len(row) >= 2:
            continue
        outline_lt_text_list += row

    if show:
        print('outline_lt_text_list', outline_lt_text_list)
    return outline_lt_text_list
+
+
def get_iou(bbox1, bbox2):
    """Intersection-over-union of two inclusive pixel boxes (x1, y1, x2, y2).

    Full containment in either direction is treated as a perfect match and
    returns 1.0; otherwise the standard IoU is computed with +1 inclusive
    widths/heights. Returns 0 for disjoint boxes.
    """
    ax1, ay1, ax2, ay2 = bbox1
    bx1, by1, bx2, by2 = bbox2

    # Containment check (either box inside the other).
    a_contains_b = ax1 <= bx1 and ay1 <= by1 and ax2 >= bx2 and ay2 >= by2
    b_contains_a = bx1 <= ax1 and by1 <= ay1 and bx2 >= ax2 and by2 >= ay2
    if a_contains_b or b_contains_a:
        return 1.0

    # Inclusive overlap extents, clamped to zero when the boxes miss.
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1) + 1)
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1) + 1)
    overlap = overlap_w * overlap_h

    area_a = (ax2 - ax1 + 1) * (ay2 - ay1 + 1)
    area_b = (bx2 - bx1 + 1) * (by2 - by1 + 1)
    union = area_a + area_b - overlap

    return overlap / union if union != 0 else 0
+
+
def fix_cross_bbox(lt_text_list, show=0):
    """Separate slightly-overlapping text boxes in place.

    For each intersecting pair (per the sibling get_iou helper), a small
    overlap on the right or bottom edge is resolved by clipping both boxes
    at the shared band. Overlaps larger than half of either extent are
    assumed to belong to the other axis and left untouched.

    NOTE(review): O(n^2) over ordered pairs, so every pair is examined
    twice; the second visit is a no-op once the boxes no longer intersect.
    """
    for lt_text1 in lt_text_list:
        for lt_text2 in lt_text_list:
            if lt_text1 == lt_text2:
                continue
            if get_iou(lt_text1.bbox, lt_text2.bbox) > 0:
                if show:
                    print('fix_cross_bbox1', lt_text1, lt_text2)
                x10, x11, x12, x13 = lt_text1.bbox
                x20, x21, x22, x23 = lt_text2.bbox

                # Overlap on the right edge; the shared width must be small,
                # otherwise the pair really intersects on the other axis.
                if x10 < x20 < x12 and x12 - x20 < max(abs(x12 - x10), abs(x20 - x22)) / 2:
                    x12 = min(lt_text1.bbox[2], lt_text2.bbox[0])
                    x20 = max(lt_text1.bbox[2], lt_text2.bbox[0])

                # Overlap on the bottom edge; same smallness constraint.
                if x11 < x21 < x13 and x13 - x21 < max(abs(x13 - x11), abs(x21 - x23)) / 2:
                    x13 = min(lt_text1.bbox[3], lt_text2.bbox[1])
                    x21 = max(lt_text1.bbox[3], lt_text2.bbox[1])

                lt_text1.bbox = [x10, x11, x12, x13]
                lt_text2.bbox = [x20, x21, x22, x23]
                if show:
                    print('fix_cross_bbox2', lt_text1, lt_text2)
    return lt_text_list
+
+
+def split_lt_text_by_many_space(lt_text_list, show=0):
+    from format_convert.convert_tree import TextBox
+
+    # 先处理前后空格
+    add_lt_text_list = []
+    delete_lt_text_list = []
+    for lt_text in lt_text_list:
+        text = lt_text.get_text()
+        bbox = lt_text.bbox
+
+        if len(text) == 0:
+            continue
+        text_unicode_len = get_char_unicode_length(text)
+        if text_unicode_len == 0:
+            continue
+        ratio = abs(bbox[2] - bbox[0]) / text_unicode_len
+
+        space1 = re.findall('^[  ]+', text)
+        if space1:
+            space1 = ''.join(space1)
+            space1_unicode_len = get_char_unicode_length(space1)
+            space1_pixel_len = space1_unicode_len * ratio
+            text = re.sub('^[  ]+', '', text)
+            bbox = [bbox[0] + space1_pixel_len, bbox[1], bbox[2], bbox[3]]
+            if len(text) == 0:
+                continue
+            text_unicode_len = get_char_unicode_length(text)
+            if text_unicode_len == 0:
+                continue
+            ratio = abs(bbox[2] - bbox[0]) / text_unicode_len
+
+        space2 = re.findall('[  ]+$', text)
+        if space2:
+            space2 = ''.join(space2)
+            space2_unicode_len = get_char_unicode_length(space2)
+            space2_pixel_len = space2_unicode_len * ratio
+            text = re.sub('[  ]+$', '', text)
+            bbox = [bbox[0], bbox[1], bbox[2] - space2_pixel_len, bbox[3]]
+            if len(text) == 0:
+                continue
+            text_unicode_len = get_char_unicode_length(text)
+            if text_unicode_len == 0:
+                continue
+            ratio = abs(bbox[2] - bbox[0]) / text_unicode_len
+
+        if space1 or space2:
+            new_lt_text = TextBox(text=text, bbox=bbox)
+            add_lt_text_list.append(new_lt_text)
+            delete_lt_text_list.append(lt_text)
+
+    for lt_text in delete_lt_text_list:
+        if lt_text in lt_text_list:
+            lt_text_list.remove(lt_text)
+    lt_text_list += add_lt_text_list
+
+    # 处理表头中间隔着几个空格 电  话:        电  话:
+    add_lt_text_list = []
+    delete_lt_text_list = []
+    for lt_text in lt_text_list:
+        text = lt_text.get_text()
+        bbox = lt_text.bbox
+
+        if len(text) == 0:
+            continue
+
+        space_list = re.findall('[  ]+', text)
+        if len(space_list) >= 2:
+            space_list.sort(key=lambda x: len(x))
+            max_space = space_list[-1]
+            match = re.search(max_space, text)
+            if show:
+                print('max_space', max_space)
+                print('space_list', space_list)
+            if match:
+                part1 = text[:match.start()]
+                part2 = text[match.end():]
+                ss1 = re.split('[  ]+', part1)
+                ss2 = re.split('[  ]+', part2)
+
+                if len(ss1) == 2 and len(ss1[0]) == 1 and len(ss1[1]) == 2 and ss1[1][-1] in [':', ':'] \
+                        and len(ss2) == 2 and len(ss2[0]) == 1 and len(ss2[1]) == 2 and ss2[1][-1] in [':', ':']:
+                    new_text = ''.join(ss1) + max_space + ''.join(ss2)
+                    new_lt_text = TextBox(text=new_text, bbox=bbox)
+                    add_lt_text_list.append(new_lt_text)
+                    delete_lt_text_list.append(lt_text)
+
+    if show:
+        print('split_lt_text_by_many_space add_lt_text_list222', add_lt_text_list)
+        print('split_lt_text_by_many_space delete_lt_text_list222', delete_lt_text_list)
+
+    for lt_text in delete_lt_text_list:
+        if lt_text in lt_text_list:
+            lt_text_list.remove(lt_text)
+    lt_text_list += add_lt_text_list
+
+    # 处理中间多个空格,并拆分为两个
+    add_lt_text_list = []
+    delete_lt_text_list = []
+    for lt_text in lt_text_list:
+        text = lt_text.get_text()
+        bbox = lt_text.bbox
+
+        if len(text) == 0:
+            continue
+
+        text_unicode_len = get_char_unicode_length(text)
+        if text_unicode_len == 0:
+            continue
+        ratio = abs(bbox[2] - bbox[0]) / text_unicode_len
+
+        # 中间有多个空格,且空格分割为两部分
+        match = re.search('[  ]{4,}', text)
+        ss = re.split('[  ]+', text)
+        if match and len(ss) == 2:
+            # if match:
+            part1 = text[:match.start()]
+            part2 = text[match.end():]
+
+            l1 = re.findall('[a-zA-Z0-9\u4e00-\u9fff]', part1)
+            l2 = re.findall('[a-zA-Z0-9\u4e00-\u9fff]', part2)
+            # 两边字符数都足够
+            if len(l1) >= 2 and len(l2) >= 2:
+                part1_unicode_len = get_char_unicode_length(part1)
+                part2_unicode_len = get_char_unicode_length(part2)
+
+                part1_pixel_len = ratio * part1_unicode_len
+                part2_pixel_len = ratio * part2_unicode_len
+
+                # avg_char_w = abs(bbox[0] - bbox[2]) / len(text)
+                bbox1 = [bbox[0], bbox[1], bbox[0] + part1_pixel_len, bbox[3]]
+                bbox2 = [bbox[2] - part2_pixel_len, bbox[1], bbox[2], bbox[3]]
+                # 用自己的对象新增
+                new_lt_text1 = TextBox(text=part1, bbox=bbox1)
+                new_lt_text2 = TextBox(text=part2, bbox=bbox2)
+                add_lt_text_list += [new_lt_text1, new_lt_text2]
+                delete_lt_text_list.append(lt_text)
+
+    for lt_text in delete_lt_text_list:
+        if lt_text in lt_text_list:
+            lt_text_list.remove(lt_text)
+    lt_text_list += add_lt_text_list
+
+    if show:
+        print('split_lt_text_by_many_space add_lt_text_list333', add_lt_text_list)
+        print('split_lt_text_by_many_space delete_lt_text_list333', delete_lt_text_list)
+
+    return lt_text_list
+
+
def get_char_unicode_length(text, show=0):
    """
    Return the display width of *text* in terminal cells.

    Full-width characters (CJK ideographs, full-width punctuation, ideographic
    space) count as 2 cells and regular ASCII as 1, so the result approximates
    the rendered width of the string. Callers divide a bbox pixel width by this
    value to get a pixels-per-cell ratio.

    :param text: string to measure
    :param show: when truthy, print the text and its computed width (debug)
    :return: int display width of the string

    NOTE(review): wcwidth.wcswidth returns -1 when *text* contains a
    non-printable character, but callers only guard against a result of 0,
    which would make the pixel ratio negative — confirm inputs are printable.
    """
    width = wcwidth.wcswidth(text)
    if show:
        print('text unicode_length', text, width)
    return width
+
+
def fix_final_row(table, show=0):
    """
    Merge a dangling final row of a 4-column table into the row above it.

    Table extraction sometimes emits a last row in which only one cell
    (column index 1 or 3) carries text while the other three hold empty
    placeholders ('' or the '@@:' marker). Such a row is really the
    continuation of the previous row: its single value is copied up into
    the same column of the second-to-last row and the final row is dropped.

    :param table: list of rows, each a list of >= 4 cell strings;
                  the second-to-last row is mutated in place on merge
    :param show: when truthy, print debug info about the merge
    :return: the (possibly shortened) table
    """
    if len(table) < 2:
        return table
    last_row = table[-2]
    final_row = table[-1]
    # Guard against malformed rows without 4 columns (would raise IndexError)
    if len(last_row) < 4 or len(final_row) < 4:
        return table
    # Debug prints are gated by `show`, consistent with the rest of the file
    if show:
        print('final_row', final_row)
        print('last_row', last_row)

    empty_values = ['', '@@:']
    delete_final_flag = 0
    # Only column 3 holds text -> merge it rightward into the row above
    if final_row[0] in empty_values and final_row[1] in empty_values \
            and final_row[2] in empty_values and final_row[3] not in empty_values:
        table[-2][3] = final_row[3]
        delete_final_flag = 1
        if show:
            print('fix_final_row right', table[-2])

    # Only column 1 holds text -> merge it leftward into the row above
    if final_row[0] in empty_values and final_row[1] not in empty_values \
            and final_row[2] in empty_values and final_row[3] in empty_values:
        table[-2][1] = final_row[1]
        delete_final_flag = 1
        if show:
            print('fix_final_row left', table[-2])

    if delete_final_flag:
        table = table[:-1]

    return table
+
+
if __name__ == '__main__':
    # Debug harness: run the straight-line detector on one saved table image.
    #
    # Batch variant kept for reference:
    # from format_convert.convert_pdf import PDFConvert
    # pdf_c = PDFConvert(None, None, None)
    # from format_convert.convert_image import ImageProcess
    # img_p = ImageProcess(None, None)
    #
    # ps = glob(r'D:\Project\format_conversion_maxcompute\save_b_table_not_detect\*')
    # image_np_list = [[x, cv2.imread(x)] for x in ps]
    # for p, image_np in image_np_list:
    #     # limit overall resolution
    #     image_np = img_p.resize_process(image_np)
    #     # OCR the image
    #     text_list, box_list = img_p.ocr_process(image_np)
    #     # convert to lt_text_box objects
    #     _lt_text_list = text_bbox_to_lt(text_list, box_list)
    #     # pre-judge from the bboxes whether a borderless table may exist
    #     _flag = judge_has_b_table_by_bbox(_lt_text_list, [], 0)
    #     print('path', p, 'has b table', _flag)

    _image_path = r'D:\Project\format_conversion_maxcompute\save_b_table\15-8292f767be81f404b813c119058a8a75.png'
    _image = cv2.imread(_image_path)
    _image = pil_resize(_image, 1024, 768)
    get_straight_lines_from_image(_image)

+ 5 - 0
botr/utils.py

@@ -38,6 +38,11 @@ def request_post(url, param, time_out=1000, use_zlib=False):
 
 
 
 
 def line_iou(line1, line2, axis=0):
 def line_iou(line1, line2, axis=0):
+    if line1[0][axis] <= line2[0][axis] <= line2[1][axis] <= line1[1][axis]:
+        return 1.
+    if line2[0][axis] <= line1[0][axis] <= line1[1][axis] <= line2[1][axis]:
+        return 1.
+
     inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
     inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
     # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
     # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
     union = min(abs(line1[0][axis]-line1[1][axis]), abs(line2[0][axis]-line2[1][axis]))
     union = min(abs(line1[0][axis]-line1[1][axis]), abs(line2[0][axis]-line2[1][axis]))

+ 1 - 1
config/interface_new.yml

@@ -58,7 +58,7 @@
 
 
     "tika": {
     "tika": {
       "port": [ 16020 ],
       "port": [ 16020 ],
-      "port_num": [ 2 ],
+      "port_num": [ 1 ],
       "gpu": [ -1 ]
       "gpu": [ -1 ]
     }
     }
   },
   },

+ 287 - 118
format_convert/convert.py

@@ -1,4 +1,4 @@
-#-*- coding: utf-8 -*-
+# -*- coding: utf-8 -*-
 import gc
 import gc
 import json
 import json
 import sys
 import sys
@@ -6,8 +6,20 @@ import os
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 # 强制tf使用cpu
 # 强制tf使用cpu
 os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
 os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+
+# 动态添加 VERSION 属性到 Image 类
+import PIL
+from PIL import Image
+Image.VERSION = PIL.__version__
+
 from format_convert.utils import judge_error_code, request_post, get_intranet_ip, get_ip_port, get_logger, log, \
 from format_convert.utils import judge_error_code, request_post, get_intranet_ip, get_ip_port, get_logger, log, \
-    set_flask_global, get_md5_from_bytes, memory_decorator
+    set_flask_global, get_md5_from_bytes, memory_decorator, register_all_fonts
+
+# 调用函数注册字体
+# register_all_fonts("/usr/share/fonts/opentype/noto/")
+# register_all_fonts("/usr/share/fonts/truetype/arphic")
+# register_all_fonts("/usr/share/fonts/")
+
 from format_convert.convert_doc import doc2text, DocConvert
 from format_convert.convert_doc import doc2text, DocConvert
 from format_convert.convert_docx import docx2text, DocxConvert
 from format_convert.convert_docx import docx2text, DocxConvert
 from format_convert.convert_image import picture2text, ImageConvert
 from format_convert.convert_image import picture2text, ImageConvert
@@ -18,6 +30,8 @@ from format_convert.convert_txt import txt2text, TxtConvert
 from format_convert.convert_xls import xls2text, XlsConvert
 from format_convert.convert_xls import xls2text, XlsConvert
 from format_convert.convert_xlsx import xlsx2text, XlsxConvert
 from format_convert.convert_xlsx import xlsx2text, XlsxConvert
 from format_convert.convert_zip import zip2text, ZipConvert
 from format_convert.convert_zip import zip2text, ZipConvert
+from format_convert.convert_wps import WpsConvert
+from format_convert.convert_ofd import OfdConvert
 from format_convert.convert_need_interface import from_atc_interface
 from format_convert.convert_need_interface import from_atc_interface
 
 
 import hashlib
 import hashlib
@@ -33,12 +47,28 @@ import logging
 from bs4 import BeautifulSoup
 from bs4 import BeautifulSoup
 from flask import Flask, request, g
 from flask import Flask, request, g
 import inspect
 import inspect
+
 logging.getLogger("pdfminer").setLevel(logging.WARNING)
 logging.getLogger("pdfminer").setLevel(logging.WARNING)
 from format_convert.table_correct import *
 from format_convert.table_correct import *
 from format_convert.wrapt_timeout_decorator import *
 from format_convert.wrapt_timeout_decorator import *
 from format_convert import _global
 from format_convert import _global
 from config.max_compute_config import MAX_COMPUTE
 from config.max_compute_config import MAX_COMPUTE
 
 
+support_file_types = [
+    'txt',
+    'pdf',
+    'doc',
+    'docx',
+    'xls',
+    'xlsx',
+    'zip',
+    'rar',
+    'jpg',
+    'png',
+    'jpeg',
+    'swf',
+    'wps',
+]
 
 
 if get_platform() == "Windows":
 if get_platform() == "Windows":
     globals().update({"time_out": 1000})
     globals().update({"time_out": 1000})
@@ -64,6 +94,9 @@ def getText(_type, path_or_stream, _page_no=None, time_out=300):
     except:
     except:
         unique_type_dir = path_or_stream + "_" + _type + os.sep
         unique_type_dir = path_or_stream + "_" + _type + os.sep
 
 
+    if not os.path.exists(unique_type_dir):
+        os.mkdir(unique_type_dir)
+
     if _type == "pdf":
     if _type == "pdf":
         if MAX_COMPUTE:
         if MAX_COMPUTE:
             return PDFConvert(path_or_stream, unique_type_dir, _page_no).get_html()
             return PDFConvert(path_or_stream, unique_type_dir, _page_no).get_html()
@@ -102,11 +135,19 @@ def getText(_type, path_or_stream, _page_no=None, time_out=300):
         if MAX_COMPUTE:
         if MAX_COMPUTE:
             return TxtConvert(path_or_stream, unique_type_dir).get_html()
             return TxtConvert(path_or_stream, unique_type_dir).get_html()
         return get_html_1(TxtConvert(path_or_stream, unique_type_dir))
         return get_html_1(TxtConvert(path_or_stream, unique_type_dir))
+    if _type == "wps":
+        if MAX_COMPUTE:
+            return WpsConvert(path_or_stream, unique_type_dir).get_html()
+        return get_html_1(WpsConvert(path_or_stream, unique_type_dir))
+    if _type == "ofd":
+        if MAX_COMPUTE:
+            return OfdConvert(path_or_stream, unique_type_dir).get_html()
+        return get_html_1(OfdConvert(path_or_stream, unique_type_dir))
     return [""]
     return [""]
 
 
 
 
 def to_html(path, text):
 def to_html(path, text):
-    with open(path, 'w',encoding="utf8") as f:
+    with open(path, 'w', encoding="utf8") as f:
         f.write("<!DOCTYPE HTML>")
         f.write("<!DOCTYPE HTML>")
         f.write('<head><meta charset="UTF-8"></head>')
         f.write('<head><meta charset="UTF-8"></head>')
         f.write("<body>")
         f.write("<body>")
@@ -154,6 +195,11 @@ def unique_temp_file_process(stream, _type, _md5, _page_no, time_out=300, save_m
     if get_platform() == "Windows":
     if get_platform() == "Windows":
         _global._init()
         _global._init()
 
 
+    if MAX_COMPUTE:
+        _path = "/home/admin"
+    else:
+        _path = os.path.dirname(os.path.abspath(__file__))
+
     globals().update({"md5": _md5})
     globals().update({"md5": _md5})
     _global.update({"md5": _md5})
     _global.update({"md5": _md5})
     log("into unique_temp_file_process")
     log("into unique_temp_file_process")
@@ -247,7 +293,7 @@ def cut_str(text_list, only_text_list, max_bytes_length=2000000):
             return only_text_list
             return only_text_list
 
 
         # 截取字符
         # 截取字符
-        all_text = all_text[:int(max_bytes_length/3)]
+        all_text = all_text[:int(max_bytes_length / 3)]
         return [all_text]
         return [all_text]
     except Exception as e:
     except Exception as e:
         log("cut_str " + str(e))
         log("cut_str " + str(e))
@@ -336,7 +382,7 @@ def convert_maxcompute(data, ocr_model, otr_model):
             print({"md5: ": str(_md5), "finished result": ["", 0], "is_success": 1}, time.time() - start_time)
             print({"md5: ": str(_md5), "finished result": ["", 0], "is_success": 1}, time.time() - start_time)
         else:
         else:
             print("md5: " + str(_md5), {"finished result": [str(only_text)[:20], len(str(text))],
             print("md5: " + str(_md5), {"finished result": [str(only_text)[:20], len(str(text))],
-                  "is_success": 1}, time.time() - start_time)
+                                        "is_success": 1}, time.time() - start_time)
         return {"result_html": text, "result_text": only_text, "is_success": 1}
         return {"result_html": text, "result_text": only_text, "is_success": 1}
     except Exception as e:
     except Exception as e:
         print({"md5: ": str(_md5), "failed result": [-1], "is_success": 0}, time.time() - start_time)
         print({"md5: ": str(_md5), "failed result": [-1], "is_success": 0}, time.time() - start_time)
@@ -350,6 +396,20 @@ app = Flask(__name__)
 
 
 @app.route('/convert', methods=['POST'])
 @app.route('/convert', methods=['POST'])
 def _convert():
 def _convert():
+    try:
+        data = request.form
+    except Exception:
+        log_convert_result("1" + "0" * 15, [-1], "", 0,
+                           None, None, time.time())
+        traceback.print_exc()
+        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
+                           "is_success": 0, "swf_images": str([]),
+                           "classification": ""})
+    result = convert(data)
+    return result
+
+
+def _convert_old_250613():
     """
     """
     接口返回值:
     接口返回值:
     {[str], 1}: 处理成功
     {[str], 1}: 处理成功
@@ -377,11 +437,11 @@ def _convert():
     # snapshot = tracemalloc.take_snapshot()
     # snapshot = tracemalloc.take_snapshot()
 
 
     _global._init()
     _global._init()
-    _global.update({"md5": "1"+"0"*15})
+    _global.update({"md5": "1" + "0" * 15})
     set_flask_global()
     set_flask_global()
     # _global.update({"port": str(port)})
     # _global.update({"port": str(port)})
 
 
-    log("into convert")
+    log("into _convert")
     start_time = time.time()
     start_time = time.time()
     _md5 = _global.get("md5")
     _md5 = _global.get("md5")
     _type = None
     _type = None
@@ -395,12 +455,12 @@ def _convert():
         file_path = data.get("file_path")
         file_path = data.get("file_path")
         if file_path is None:
         if file_path is None:
             stream = base64.b64decode(data.get("file"))
             stream = base64.b64decode(data.get("file"))
-            log("get bytes from file " + str(time.time()-_time))
+            log("get bytes from file " + str(time.time() - _time))
         # 有路径则直接取路径打开文件
         # 有路径则直接取路径打开文件
         else:
         else:
             with open(file_path, "rb") as f:
             with open(file_path, "rb") as f:
                 stream = f.read()
                 stream = f.read()
-            log("get bytes from file_path " + str(time.time()-_time))
+            log("get bytes from file_path " + str(time.time() - _time))
         _type = data.get("type")
         _type = data.get("type")
         _md5 = get_md5_from_bytes(stream)
         _md5 = get_md5_from_bytes(stream)
         _md5 = _md5[0]
         _md5 = _md5[0]
@@ -427,7 +487,8 @@ def _convert():
             # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
             # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
             # text, swf_images = origin_unique_temp_file_process(stream, _type)
             # text, swf_images = origin_unique_temp_file_process(stream, _type)
             try:
             try:
-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'), save_middle=save_middle)
+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
+                                                            time_out=globals().get('time_out'), save_middle=save_middle)
             except TimeoutError:
             except TimeoutError:
                 log("convert time out! 300 sec")
                 log("convert time out! 300 sec")
                 text = [-5]
                 text = [-5]
@@ -435,7 +496,8 @@ def _convert():
         else:
         else:
             # Linux 通过装饰器设置整个转换超时时间
             # Linux 通过装饰器设置整个转换超时时间
             try:
             try:
-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'), save_middle=save_middle)
+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
+                                                            time_out=globals().get('time_out'), save_middle=save_middle)
             except TimeoutError:
             except TimeoutError:
                 log("convert time out! 300 sec")
                 log("convert time out! 300 sec")
                 text = [-5]
                 text = [-5]
@@ -447,11 +509,12 @@ def _convert():
                 is_success = 1
                 is_success = 1
             else:
             else:
                 is_success = 0
                 is_success = 0
-            log("md5: " + str(_md5)
-                         + " finished result: " + str(text)
-                         + " is_success: " + str(is_success) + " "
-                         + str(_type) + " "
-                         + " " + str(time.time() - start_time))
+            log("md5: " + str(_md5) + " "
+                + "finished result: " + str(text) + " "
+                + "is_success: " + str(is_success) + " "
+                + str(_type) + " "
+                + 'None '
+                + str(round(time.time() - start_time, 2)))
             return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
             return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
                                "is_success": is_success, "swf_images": str(swf_images)})
                                "is_success": is_success, "swf_images": str(swf_images)})
 
 
@@ -484,16 +547,17 @@ def _convert():
         if only_text[0] == '' and len(only_text) <= 1:
         if only_text[0] == '' and len(only_text) <= 1:
             print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
             print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
             log("md5: " + str(_md5) + " "
             log("md5: " + str(_md5) + " "
-                + " finished result: ['', 0] is_success: 1 "
+                + "finished result: ['', 0] is_success: 1 "
                 + str(_type) + " "
                 + str(_type) + " "
-                + str(time.time() - start_time))
+                + 'None '
+                + str(round(time.time() - start_time, 2)))
         else:
         else:
-            log("md5: " + str(_md5) +
-                " finished result: " + str(only_text)[:20] + " "
+            log("md5: " + str(_md5) + " "
+                + "finished result: " + str(only_text)[:20] + " "
                 + str(len(str(text))) + " is_success: 1 "
                 + str(len(str(text))) + " is_success: 1 "
                 + str(_type) + " "
                 + str(_type) + " "
                 + str(classification) + " "
                 + str(classification) + " "
-                + str(time.time() - start_time))
+                + str(round(time.time() - start_time, 2)))
 
 
         # log("growth end" + str(objgraph.growth()))
         # log("growth end" + str(objgraph.growth()))
         # log("most_common_types end" + str(objgraph.most_common_types(20)))
         # log("most_common_types end" + str(objgraph.most_common_types(20)))
@@ -502,15 +566,24 @@ def _convert():
                            "classification": classification})
                            "classification": classification})
 
 
     except ConnectionError:
     except ConnectionError:
-        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
-            + str(time.time() - start_time))
+        # log("convert post has no data!" + " failed result: [-2] is_success: 0 "
+        #     + str(round(time.time() - start_time, 2)))
+        log("md5: " + str(_md5) + " "
+            + "failed result: [-2] is_success: 0 "
+            + str(_type) + " "
+            + "None "
+            + str(round(time.time() - start_time, 2))
+            )
         return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
         return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
                            "is_success": 0, "swf_images": str([]),
                            "is_success": 0, "swf_images": str([]),
                            "classification": ""})
                            "classification": ""})
     except Exception as e:
     except Exception as e:
-        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
-            + str(_type) + " " +
-            str(time.time() - start_time))
+        log("md5: " + str(_md5) + " "
+            + "failed result: [-1] is_success: 0 "
+            + str(_type) + " "
+            + "None "
+            + str(round(time.time() - start_time, 2))
+            )
         traceback.print_exc()
         traceback.print_exc()
         return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
         return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
                            "is_success": 0, "swf_images": str([]),
                            "is_success": 0, "swf_images": str([]),
@@ -545,6 +618,146 @@ def _convert():
 
 
 
 
 def convert(data):
 def convert(data):
+    """
+    接口返回值:
+    :return: {"result_html": [str], "result_text": [str],
+              "is_success": int, "swf_images": str(list)}
+    """
+    log("into convert")
+    start_time = time.time()
+
+    # 初始化
+    _global._init()
+    _global.update({"md5": "1" + "0" * 15})
+    set_flask_global()
+    # 文件md5
+    _md5 = _global.get("md5")
+    # 文件类型
+    _type = None
+    try:
+        if not data:
+            log("convert no data!")
+            raise ConnectionError
+
+        file_path = data.get("file_path")
+        if file_path is None:
+            stream = base64.b64decode(data.get("file"))
+            log("get bytes from file " + str(time.time() - start_time))
+        # 有路径则直接取路径打开文件
+        else:
+            with open(file_path, "rb") as f:
+                stream = f.read()
+            log("get bytes from file_path " + str(time.time() - start_time))
+
+        # 获取真实值
+        _type = data.get("type")
+        _md5 = get_md5_from_bytes(stream)
+        _md5 = _md5[0]
+        _global.update({"md5": _md5})
+
+        # 指定页码范围
+        _page_no = data.get('page_no')
+
+        # 指定timeout
+        _timeout = data.get('timeout')
+        if _timeout is not None:
+            globals().update({"time_out": _timeout})
+
+        # 是否保留中间文件
+        save_middle = data.get('save_middle')
+
+        # 最终结果截取的最大字节数
+        max_bytes = data.get("max_bytes")
+
+        # 开始转换,并且控制时间
+        try:
+            text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
+                                                        time_out=globals().get('time_out'), save_middle=save_middle)
+        except TimeoutError:
+            log("convert time out! 300 sec")
+            text = [-5]
+            swf_images = []
+
+        # 报错依然成功的
+        still_success_code = [-3, -4, -7]
+        if judge_error_code(text):
+            if judge_error_code(text, still_success_code):
+                is_success = 1
+            else:
+                is_success = 0
+            log_convert_result(_md5, text, "", is_success,
+                               _type, None, start_time)
+            return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
+                               "is_success": is_success, "swf_images": str(swf_images)})
+
+        # 结果保存result.html
+        text_str = ""
+        for t in text:
+            text_str += t
+        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
+
+        # 取纯文本
+        only_text = []
+        for t in text:
+            new_t = BeautifulSoup(t, "lxml").get_text()
+            new_t = re.sub("\n", "", new_t)
+            only_text.append(new_t)
+
+        # 判断附件类型
+        classification = from_atc_interface(' '.join(only_text))
+        if judge_error_code(classification):
+            classification = [str(classification[0])]
+
+        # 判断长度,过长截取
+        text = cut_str(text, only_text, max_bytes)
+        only_text = cut_str(only_text, only_text)
+
+        if len(only_text) == 0:
+            only_text = [""]
+
+        if only_text[0] == '' and len(only_text) <= 1:
+            log_convert_result(_md5, '', '', 1,
+                               _type, None, start_time)
+        else:
+            log_convert_result(_md5, only_text, text, 1,
+                               _type, classification, start_time)
+        return json.dumps({"result_html": text, "result_text": only_text,
+                           "is_success": 1, "swf_images": str(swf_images),
+                           "classification": classification})
+
+    except ConnectionError:
+        log_convert_result(_md5, [-2], "", 0,
+                           _type, None, start_time)
+        return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
+                           "is_success": 0, "swf_images": str([]),
+                           "classification": ""})
+    except Exception:
+        log_convert_result(_md5, [-1], "", 0,
+                           _type, None, start_time)
+        traceback.print_exc()
+        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
+                           "is_success": 0, "swf_images": str([]),
+                           "classification": ""})
+    finally:
+        pass
+        # log("finally")
+
+
+def log_convert_result(_md5, only_text, text, is_success, _type, _attach_class, start_time):
+    str_list = [
+        "md5: " + str(_md5),
+        "finished result: " + re.sub(' ', '', str(only_text)[:20]),
+        str(len(str(text))),
+        "is_success: " + str(is_success),
+        str(_type),
+        str(_attach_class),
+        str(round(time.time()-start_time, 3)),
+    ]
+    info = ' '.join(str_list)
+    log(info)
+
+
+def convert_old_250613(data):
     """
     """
     接口返回值:
     接口返回值:
     {[str], 1}: 处理成功
     {[str], 1}: 处理成功
@@ -558,7 +771,7 @@ def convert(data):
     :return: {"result_html": str([]), "result_text":str([]) "is_success": int}
     :return: {"result_html": str([]), "result_text":str([]) "is_success": int}
     """
     """
     _global._init()
     _global._init()
-    _global.update({"md5": "1"+"0"*15})
+    _global.update({"md5": "1" + "0" * 15})
     set_flask_global()
     set_flask_global()
 
 
     log("into convert")
     log("into convert")
@@ -584,7 +797,8 @@ def convert(data):
             # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
             # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
             # text, swf_images = origin_unique_temp_file_process(stream, _type)
             # text, swf_images = origin_unique_temp_file_process(stream, _type)
             try:
             try:
-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'))
+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
+                                                            time_out=globals().get('time_out'))
             except TimeoutError:
             except TimeoutError:
                 log("convert time out! 300 sec")
                 log("convert time out! 300 sec")
                 text = [-5]
                 text = [-5]
@@ -592,7 +806,8 @@ def convert(data):
         else:
         else:
             # Linux 通过装饰器设置整个转换超时时间
             # Linux 通过装饰器设置整个转换超时时间
             try:
             try:
-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'))
+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
+                                                            time_out=globals().get('time_out'))
             except TimeoutError:
             except TimeoutError:
                 log("convert time out! 300 sec")
                 log("convert time out! 300 sec")
                 text = [-5]
                 text = [-5]
@@ -604,11 +819,12 @@ def convert(data):
                 is_success = 1
                 is_success = 1
             else:
             else:
                 is_success = 0
                 is_success = 0
-            log("md5: " + str(_md5)
-                + " finished result: " + str(text)
-                + " is_success: " + str(is_success) + " "
+            log("md5: " + str(_md5) + " "
+                + "finished result: " + str(text) + " "
+                + "is_success: " + str(is_success) + " "
                 + str(_type) + " "
                 + str(_type) + " "
-                + " " + str(time.time() - start_time))
+                + "None "
+                + str(round(time.time() - start_time, 2)))
             return {"result_html": [str(text[0])], "result_text": [str(text[0])],
             return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                     "is_success": is_success, "swf_images": str(swf_images)}
                     "is_success": is_success, "swf_images": str(swf_images)}
 
 
@@ -639,18 +855,19 @@ def convert(data):
             only_text = [""]
             only_text = [""]
 
 
         if only_text[0] == '' and len(only_text) <= 1:
         if only_text[0] == '' and len(only_text) <= 1:
-            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
+            # print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
             log("md5: " + str(_md5) + " "
             log("md5: " + str(_md5) + " "
-                + " finished result: ['', 0] is_success: 1 "
+                + "finished result: ['', 0] is_success: 1 "
                 + str(_type) + " "
                 + str(_type) + " "
-                + str(time.time() - start_time))
+                + "None "
+                + str(round(time.time() - start_time, 2)))
         else:
         else:
-            log("md5: " + str(_md5) +
-                " finished result: " + str(only_text)[:20] + " "
+            log("md5: " + str(_md5) + " "
+                + "finished result: " + str(only_text)[:20] + " "
                 + str(len(str(text))) + " is_success: 1 "
                 + str(len(str(text))) + " is_success: 1 "
                 + str(_type) + " "
                 + str(_type) + " "
                 + str(classification) + " "
                 + str(classification) + " "
-                + str(time.time() - start_time))
+                + str(round(time.time() - start_time, 2)))
 
 
         return {"result_html": text, "result_text": only_text,
         return {"result_html": text, "result_text": only_text,
                 "is_success": 1, "swf_images": str(swf_images),
                 "is_success": 1, "swf_images": str(swf_images),
@@ -658,7 +875,7 @@ def convert(data):
 
 
     except ConnectionError:
     except ConnectionError:
         log("convert post has no data!" + " failed result: [-2] is_success: 0 "
         log("convert post has no data!" + " failed result: [-2] is_success: 0 "
-            + str(time.time() - start_time))
+            + str(round(time.time() - start_time, 2)))
         return {"result_html": ["-2"], "result_text": ["-2"],
         return {"result_html": ["-2"], "result_text": ["-2"],
                 "is_success": 0, "swf_images": str([]),
                 "is_success": 0, "swf_images": str([]),
                 "classification": ""}
                 "classification": ""}
@@ -689,7 +906,7 @@ def convert_old(data, ocr_model, otr_model):
     """
     """
     log("into convert")
     log("into convert")
     _global._init()
     _global._init()
-    _global.update({"md5": "1"+"0"*15})
+    _global.update({"md5": "1" + "0" * 15})
     # set_flask_global()
     # set_flask_global()
 
 
     start_time = time.time()
     start_time = time.time()
@@ -706,7 +923,7 @@ def convert_old(data, ocr_model, otr_model):
         _md5 = get_md5_from_bytes(stream)
         _md5 = get_md5_from_bytes(stream)
         _md5 = _md5[0]
         _md5 = _md5[0]
         _global.update({"md5": _md5})
         _global.update({"md5": _md5})
-        log("get bytes from file " + str(time.time()-_time))
+        log("get bytes from file " + str(time.time() - _time))
 
 
         if get_platform() == "Windows":
         if get_platform() == "Windows":
             try:
             try:
@@ -730,11 +947,12 @@ def convert_old(data, ocr_model, otr_model):
                 is_success = 1
                 is_success = 1
             else:
             else:
                 is_success = 0
                 is_success = 0
-            log("md5: " + str(_md5)
-                + " finished result: " + str(text)
-                + " is_success: " + str(is_success) + " "
+            log("md5: " + str(_md5) + " "
+                + "finished result: " + str(text) + " "
+                + "is_success: " + str(is_success) + " "
                 + str(_type) + " "
                 + str(_type) + " "
-                + " " + str(time.time() - start_time))
+                + "None "
+                + str(round(time.time() - start_time, 2)))
             return {"result_html": [str(text[0])], "result_text": [str(text[0])],
             return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                     "is_success": is_success, "swf_images": str(swf_images)}
                     "is_success": is_success, "swf_images": str(swf_images)}
 
 
@@ -761,22 +979,24 @@ def convert_old(data, ocr_model, otr_model):
         if only_text[0] == '' and len(only_text) <= 1:
         if only_text[0] == '' and len(only_text) <= 1:
             print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
             print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
             log("md5: " + str(_md5) + " "
             log("md5: " + str(_md5) + " "
-                + " finished result: ['', 0] is_success: 1 "
+                + "finished result: ['', 0] is_success: 1 "
                 + str(_type) + " "
                 + str(_type) + " "
-                + str(time.time() - start_time))
+                + "None "
+                + str(round(time.time() - start_time, 2)))
         else:
         else:
-            log("md5: " + str(_md5) +
-                " finished result: " + str(only_text)[:20] + " "
+            log("md5: " + str(_md5) + " "
+                + "finished result: " + str(only_text)[:20] + " "
                 + str(len(str(text))) + " is_success: 1 "
                 + str(len(str(text))) + " is_success: 1 "
                 + str(_type) + " "
                 + str(_type) + " "
-                + str(time.time() - start_time))
+                + "None "
+                + str(round(time.time() - start_time, 2)))
 
 
         return {"result_html": text, "result_text": only_text,
         return {"result_html": text, "result_text": only_text,
                 "is_success": 1, "swf_images": str(swf_images)}
                 "is_success": 1, "swf_images": str(swf_images)}
 
 
     except ConnectionError:
     except ConnectionError:
         log("convert post has no data!" + " failed result: [-2] is_success: 0 "
         log("convert post has no data!" + " failed result: [-2] is_success: 0 "
-            + str(time.time() - start_time))
+            + str(round(time.time() - start_time, 2)))
         return {"result_html": ["-2"], "result_text": ["-2"],
         return {"result_html": ["-2"], "result_text": ["-2"],
                 "is_success": 0, "swf_images": str([])}
                 "is_success": 0, "swf_images": str([])}
     except Exception as e:
     except Exception as e:
@@ -801,9 +1021,9 @@ def test_more(_dir, process_no=None):
     for p in file_path_list:
     for p in file_path_list:
         if i % 10 == 0:
         if i % 10 == 0:
             if process_no is not None:
             if process_no is not None:
-                print("Process", process_no, i, time.time()-start_time)
+                print("Process", process_no, i, time.time() - start_time)
             else:
             else:
-                print("Loop", i, time.time()-start_time)
+                print("Loop", i, time.time() - start_time)
         test_one(p, from_remote=True)
         test_one(p, from_remote=True)
         i += 1
         i += 1
 
 
@@ -847,79 +1067,28 @@ def test_duplicate(path_list, process_no=None):
     for i in range(500):
     for i in range(500):
         if i % 10 == 0:
         if i % 10 == 0:
             if process_no is not None:
             if process_no is not None:
-                print("Process", process_no, i*len(path_list), time.time()-start_time)
+                print("Process", process_no, i * len(path_list), time.time() - start_time)
             else:
             else:
-                print("Loop", i*len(path_list), time.time()-start_time)
+                print("Loop", i * len(path_list), time.time() - start_time)
         for p in path_list:
         for p in path_list:
             test_one(p, from_remote=True)
             test_one(p, from_remote=True)
 
 
 
 
-global_type = ""
-local_url = "http://127.0.0.1"
-if get_platform() == "Windows":
-    _path = os.path.abspath(os.path.dirname(__file__))
-else:
-    _path = "/home/admin"
-    if not os.path.exists(_path):
-        _path = os.path.dirname(os.path.abspath(__file__))
+# global_type = ""
+# local_url = "http://127.0.0.1"
+# if get_platform() == "Windows":
+#     _path = os.path.abspath(os.path.dirname(__file__))
+# else:
+#     _path = "/home/admin"
+#     if not os.path.exists(_path):
+#         _path = os.path.dirname(os.path.abspath(__file__))
 
 
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    # convert interface
-    if len(sys.argv) == 2:
-        port = int(sys.argv[1])
-    else:
-        port = 15010
-
-    globals().update({"md5": "1"+"0"*15})
+    port = 15010
+    globals().update({"md5": "1" + "0" * 15})
     globals().update({"port": str(port)})
     globals().update({"port": str(port)})
-    # _global._init()
-    # _global.update({"md5": "1"+"0"*15})
-    # _global.update({"port": str(port)})
-
-    # ip = get_intranet_ip()
-    # log("my ip"+str(ip))
-    # ip = "http://" + ip
     ip_port_dict = get_ip_port()
     ip_port_dict = get_ip_port()
-
     set_flask_global()
     set_flask_global()
+    app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
 
 
-    if get_platform() == "Windows":
-        app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
-    else:
-        # app.run(host='0.0.0.0', port=port, processes=processes, threaded=False, debug=False)
-        app.run(port=15011)
-
-    # if get_platform() == "Windows":
-    #     file_path = "C:/Users/Administrator/Desktop/test_image/error29.png"
-    #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
-    #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
-    #     # file_path = "C:/Users/Administrator/Downloads/1650967920520.pdf"
-    # else:
-    #     file_path = "test1.doc"
-    # test_one(file_path, from_remote=True)
-
-    # if get_platform() == "Windows":
-    #     file_dir = "D:/BIDI_DOC/比地_文档/table_images/"
-    # else:
-    #     file_dir = "../table_images/"
-    #
-    # for j in range(10):
-    #     p = Process(target=test_more, args=(file_dir, j, ))
-    #     p.start()
-    # p.join()
-
-    # if get_platform() == "Windows":
-    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
-    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls",
-    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/11111111.rar"]
-    #     file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
-    #                       "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls"]
-    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc"]
-    #
-    # else:
-    #     file_path_list = ["test1.pdf"]
-    # for j in range(10):
-    #     p = Process(target=test_duplicate, args=(file_path_list, j, ))
-    #     p.start()
-    # p.join()

+ 94 - 22
format_convert/convert_doc.py

@@ -6,7 +6,7 @@ import sys
 import chardet
 import chardet
 from bs4 import BeautifulSoup
 from bs4 import BeautifulSoup
 sys.path.append(os.path.dirname(__file__) + "/../")
 sys.path.append(os.path.dirname(__file__) + "/../")
-from format_convert.convert_tree import _Document, _Sentence, _Page
+from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
 import logging
 import logging
 import traceback
 import traceback
 from format_convert import get_memory_info
 from format_convert import get_memory_info
@@ -35,11 +35,71 @@ def doc2text(path, unique_type_dir):
 class DocConvert:
 class DocConvert:
     def __init__(self, path, unique_type_dir):
     def __init__(self, path, unique_type_dir):
         self._doc = _Document(path)
         self._doc = _Document(path)
+        self._page = _Page(None, 0)
         self.path = path
         self.path = path
         self.unique_type_dir = unique_type_dir
         self.unique_type_dir = unique_type_dir
         self.tika_html = None
         self.tika_html = None
+        print('into DocConvert __init__')
 
 
     def convert(self):
     def convert(self):
+        print('into DocConvert convert')
+        # 先判断特殊doc文件,可能是html文本
+        # is_html_doc = False
+        # try:
+        #     try:
+        #         with open(self.path, 'r') as f:
+        #             html_str = f.read()
+        #     except UnicodeDecodeError:
+        #         with open(self.path, 'r', errors='ignore') as f:
+        #             html_str = f.read()
+        #     # if re.search('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span', html_str):
+        #     if len(re.findall('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span', html_str)) >= 10:
+        #         log('doc as html!')
+        #         soup = BeautifulSoup(html_str, 'lxml')
+        #         text = soup.text
+        #         is_html_doc = True
+        # except:
+        #     pass
+        #
+        # if is_html_doc:
+        #     self._page = _Page(None, 0)
+        #     _sen = _Sentence(text, (0, 0, 0, 0))
+        #     self._page.add_child(_sen)
+        #     self._doc.add_child(self._page)
+
+        # 先判断特殊doc文件,可能是html文本
+        is_html_doc = self.maybe_html()
+
+        if not is_html_doc:
+            # 调用office格式转换
+            file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
+            if judge_error_code(file_path):
+                # office转换失败,调用tika,提取各个类型对象
+                try:
+                    self.use_tika(self.path)
+                except:
+                    traceback.print_exc()
+                    self._doc.error_code = [-17]
+                    log('doc tika failed too')
+                return
+
+            _docx = DocxConvert(file_path, self.unique_type_dir)
+            _docx.convert()
+            self._doc = _docx._doc
+            # if self._doc.error_code is not None:
+            #     # docx提取失败,调用tika,提取各个类型对象
+            #     print('DocxConvert failed use_tika')
+            #     self.use_tika(self.path)
+            #     self._doc.error_code = None
+            #     # # 调用tika提取
+            #     # html = from_tika_interface(self.path)
+            #     # if judge_error_code(html):
+            #     #     self._doc.error_code = html
+            #     # self.tika_html = html
+            #     # self._doc.error_code = None
+            #     return
+
+    def maybe_html(self):
         # 先判断特殊doc文件,可能是html文本
         # 先判断特殊doc文件,可能是html文本
         is_html_doc = False
         is_html_doc = False
         try:
         try:
@@ -63,27 +123,39 @@ class DocConvert:
             _sen = _Sentence(text, (0, 0, 0, 0))
             _sen = _Sentence(text, (0, 0, 0, 0))
             self._page.add_child(_sen)
             self._page.add_child(_sen)
             self._doc.add_child(self._page)
             self._doc.add_child(self._page)
-        else:
-            # 调用office格式转换
-            file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
-            if judge_error_code(file_path):
-                # 调用tika提取
-                html = from_tika_interface(self.path)
-                if judge_error_code(html):
-                    self._doc.error_code = html
-                self.tika_html = html
-                return
-            _docx = DocxConvert(file_path, self.unique_type_dir)
-            _docx.convert()
-            self._doc = _docx._doc
-            if self._doc.error_code is not None:
-                # 调用tika提取
-                html = from_tika_interface(self.path)
-                if judge_error_code(html):
-                    self._doc.error_code = html
-                self.tika_html = html
-                self._doc.error_code = None
-                return
+
+        return is_html_doc
+
+    def use_tika(self, _path):
+        # 调用tika提取
+        # html = from_tika_interface(self.path)
+        # if judge_error_code(html):
+        #     self._doc.error_code = html
+        # self.tika_html = html
+        data = from_tika_interface(_path)
+        if judge_error_code(data):
+            self._doc.error_code = data
+            return
+        current_y = 5
+        for di, d in enumerate(data):
+            data_type, value = d
+            bbox = [0, current_y, 20, current_y+10]
+            current_y += 20
+            if data_type == 'text':
+                _sen = _Sentence(value, bbox)
+                _sen.combine = False
+                self._page.add_child(_sen)
+            elif data_type == 'img':
+                with open(value, "rb") as f:
+                    img = f.read()
+                _img = _Image(img, value, bbox)
+                _img.is_from_docx = True
+                self._page.add_child(_img)
+            elif data_type == 'table':
+                _table = _Table(value, bbox)
+                _table.is_html = True
+                self._page.add_child(_table)
+        self._doc.add_child(self._page)
 
 
     def get_html(self):
     def get_html(self):
         try:
         try:

+ 205 - 18
format_convert/convert_docx.py

@@ -10,7 +10,8 @@ import xml
 import zipfile
 import zipfile
 import docx
 import docx
 from bs4 import BeautifulSoup
 from bs4 import BeautifulSoup
-from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code
+from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code, \
+    get_table_html
 from format_convert.wrapt_timeout_decorator import timeout
 from format_convert.wrapt_timeout_decorator import timeout
 from format_convert.convert_image import ImageConvert
 from format_convert.convert_image import ImageConvert
 from format_convert.convert_need_interface import from_tika_interface
 from format_convert.convert_need_interface import from_tika_interface
@@ -313,7 +314,7 @@ def read_xml_order(unique_type_dir, document_xml, numbering_xml, document_xml_re
 
 
 @timeout(50, timeout_exception=TimeoutError)
 @timeout(50, timeout_exception=TimeoutError)
 def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_rels):
 def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_rels):
-    def recursion_read_table(table):
+    def recursion_read_table(table, show=0):
         table_text = '<table border="1">'
         table_text = '<table border="1">'
         tr_index = 0
         tr_index = 0
         tr_text_list = []
         tr_text_list = []
@@ -349,6 +350,7 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
                             if is_merge == "continue":
                             if is_merge == "continue":
                                 row_span_dict[tc_index][0] += 1
                                 row_span_dict[tc_index][0] += 1
                                 tc_index += col_span
                                 tc_index += col_span
+                                tc_text_list.append([tc_text, col_span])
                                 # 跳过,不增加td
                                 # 跳过,不增加td
                                 continue
                                 continue
                                 # col_span_index = 0
                                 # col_span_index = 0
@@ -403,6 +405,11 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
                 tr_index += 1
                 tr_index += 1
                 tr_text_list.append(tc_text_list)
                 tr_text_list.append(tc_text_list)
 
 
+        if show:
+            for row in tr_text_list:
+                print('row', row)
+                print('len(row)', len(row))
+
         # 替换所有row_span
         # 替换所有row_span
         for key in row_span_dict.keys():
         for key in row_span_dict.keys():
             row_span, finish_row_span_flag = row_span_dict.get(key)
             row_span, finish_row_span_flag = row_span_dict.get(key)
@@ -420,7 +427,8 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
         for node in body_nodes:
         for node in body_nodes:
             if 'w:tbl' in str(node).split(' '):
             if 'w:tbl' in str(node).split(' '):
                 _table = node
                 _table = node
-                _table_text = recursion_read_table(_table)
+                # _table_text = recursion_read_table(_table)
+                _table_text = xml_table_to_html(_table, unique_type_dir, numbering_xml, document_xml_rels)
                 table_text_list.append(_table_text)
                 table_text_list.append(_table_text)
         return table_text_list
         return table_text_list
 
 
@@ -430,6 +438,146 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
         return [-1]
         return [-1]
 
 
 
 
+def xml_table_to_html(table, unique_type_dir, numbering_xml, document_xml_rels, show=0):
+    tr_index = 0
+    tr_text_list = []
+    last_node_level = 0
+    num_pr_dict = {}
+
+    # 直接子节点用child表示,所有子节点用all表示
+    for table_child in table.childNodes:
+        if 'w:tr' in str(table_child):
+            tr = table_child
+            tr_child_nodes = tr.childNodes
+            tc_index = 0
+            tc_text_list = []
+            for tr_child in tr_child_nodes:
+                if 'w:tc' in str(tr_child).split(' '):
+                    tc_text = ""
+                    tc = tr_child
+                    # 获取一格占多少列,相当于colspan
+                    col_span = tc.getElementsByTagName("w:gridSpan")
+                    if col_span:
+                        col_span = int(col_span[0].getAttribute("w:val"))
+                    else:
+                        col_span = 1
+                    # 获取是否是合并单元格的下一个空单元格,相当于rowspan
+                    is_merge = tc.getElementsByTagName("w:vMerge")
+                    if is_merge:
+                        is_merge = is_merge[0].getAttribute("w:val")
+                        if is_merge == "continue":
+                            tc_index += col_span
+                            tc_text = '@continue@'
+                            tc_text_list.append([tc_text, col_span])
+                            # 跳过,不增加td
+                            continue
+
+                    # 放入文本
+                    tc_child_nodes = tc.childNodes
+                    for tc_child in tc_child_nodes:
+                        # 处理嵌套在tc中的表格
+                        if 'w:tbl' in str(tc_child).split(' '):
+                            tc_text += xml_table_to_html(tc_child, unique_type_dir, numbering_xml, document_xml_rels)
+                        # 处理编号
+                        if 'w:p' in str(tc_child).split(' '):
+                            _t_list, _, num_pr_dict, last_node_level = read_p_text(unique_type_dir,
+                                                                                   tc_child,
+                                                                                   last_node_level,
+                                                                                   num_pr_dict,
+                                                                                   numbering_xml,
+                                                                                   document_xml_rels)
+                            tc_text += ''.join(_t_list)
+                    # 结束该tc
+                    tc_index += col_span
+                    tc_text_list.append([tc_text, col_span])
+            # 结束该tr
+            tr_index += 1
+            tr_text_list.append(tc_text_list)
+
+    if show:
+        for row in tr_text_list:
+            print('row', row)
+            print('len(row)', len(row))
+
+    table_html = row_list_to_table(tr_text_list)
+    return table_html
+
+
+def row_list_to_table(row_list, show=0):
+    if show:
+        print('='*50)
+
+    # 复制合并列
+    new_row_list = []
+    for row in row_list:
+        new_row = []
+        for col, col_span in row:
+            new_row += [[col, col_span]]
+            if col_span > 1:
+                new_row += [[col, 0]] * (col_span - 1)
+        new_row_list.append(new_row)
+    row_list = new_row_list
+
+    if show:
+        for row in row_list:
+            print('copy row', row)
+
+    # 计算是不是每行都有相等列数
+    row_cnt_list = []
+    for row in row_list:
+        row_cnt_list.append(len(row))
+
+    if len(set(row_cnt_list)) != 1:
+        log('表格有列数不同,直接返回text' + str(row_cnt_list))
+        # 直接返回所有col的text
+        text = ''
+        for row in row_list:
+            for col, col_span in row:
+                text += col
+        return text
+
+    new_row_list = []
+    for ri, row in enumerate(row_list):
+        new_row = []
+        for ci, col in enumerate(row):
+            col, col_span = col
+            row_span = 1
+            # 判断下面行同列有没有需合并的
+            for ri2 in range(ri+1, len(row_list)):
+                col2, col_span2 = row_list[ri2][ci]
+                if col2 == '@continue@':
+                    row_span += 1
+                else:
+                    break
+
+            # 需跳过的列
+            if col == '@continue@' or col_span == 0:
+                delete = 1
+            else:
+                delete = 0
+
+            col_dict = {
+                'text': col,
+                'rowspan': row_span,
+                'columnspan': col_span,
+                'delete': delete,
+            }
+            new_row.append(col_dict)
+        new_row_list.append(new_row)
+
+    if show:
+        for new_row in new_row_list:
+            print('new_row', new_row)
+
+    table_html = get_table_html(new_row_list)
+
+    # soup = BeautifulSoup(table_html, 'lxml')
+    # print(soup.prettify())
+    if show:
+        print('-' * 50)
+    return table_html
+
+
 @timeout(25, timeout_exception=TimeoutError)
 @timeout(25, timeout_exception=TimeoutError)
 def parse_xml(path):
 def parse_xml(path):
     # 解析xml
     # 解析xml
@@ -449,6 +597,7 @@ def parse_xml2(path):
 class DocxConvert:
 class DocxConvert:
     def __init__(self, path, unique_type_dir):
     def __init__(self, path, unique_type_dir):
         self._doc = _Document(path)
         self._doc = _Document(path)
+        self._page = _Page(None, 0)
         self.path = path
         self.path = path
         self.unique_type_dir = unique_type_dir
         self.unique_type_dir = unique_type_dir
 
 
@@ -497,8 +646,6 @@ class DocxConvert:
             self._doc.error_code = [-3]
             self._doc.error_code = [-3]
 
 
     def convert(self):
     def convert(self):
-        self._page = _Page(None, 0)
-
         # 先判断特殊doc文件,可能是html文本
         # 先判断特殊doc文件,可能是html文本
         is_html_doc = False
         is_html_doc = False
         try:
         try:
@@ -630,23 +777,62 @@ class DocxConvert:
     def get_doc_object(self):
     def get_doc_object(self):
         return self._doc
         return self._doc
 
 
+    def use_tika(self, _path):
+        # 调用tika提取
+        # html = from_tika_interface(self.path)
+        # if judge_error_code(html):
+        #     self._doc.error_code = html
+        # self.tika_html = html
+        data = from_tika_interface(_path)
+        if judge_error_code(data):
+            self._doc.error_code = data
+            return
+        current_y = 5
+        for di, d in enumerate(data):
+            data_type, value = d
+            bbox = [0, current_y, 20, current_y+10]
+            current_y += 20
+            if data_type == 'text':
+                _sen = _Sentence(value, bbox)
+                _sen.combine = False
+                self._page.add_child(_sen)
+            elif data_type == 'img':
+                with open(value, "rb") as f:
+                    img = f.read()
+                _img = _Image(img, value, bbox)
+                _img.is_from_docx = True
+                self._page.add_child(_img)
+            elif data_type == 'table':
+                _table = _Table(value, bbox)
+                _table.is_html = True
+                self._page.add_child(_table)
+        self._doc.add_child(self._page)
+
     def get_html(self):
     def get_html(self):
         if self._doc.error_code is not None:
         if self._doc.error_code is not None:
             return self._doc.error_code
             return self._doc.error_code
         try:
         try:
+            # raise
             self.convert()
             self.convert()
         except:
         except:
             traceback.print_exc()
             traceback.print_exc()
             self._doc.error_code = [-1]
             self._doc.error_code = [-1]
         # log('docx error code ' + str(self._doc.error_code))
         # log('docx error code ' + str(self._doc.error_code))
         if self._doc.error_code is not None:
         if self._doc.error_code is not None:
-            # 调用tika提取
-            html = from_tika_interface(self.path)
-            if judge_error_code(html):
-                self._doc.error_code = html
-                return self._doc.error_code
-            else:
-                return [html]
+            # # 调用tika提取
+            # html = from_tika_interface(self.path)
+            # if judge_error_code(html):
+            #     self._doc.error_code = html
+            #     return self._doc.error_code
+            # else:
+            #     return [html]
+            try:
+                self.use_tika(self.path)
+                self._doc.error_code = None
+            except:
+                traceback.print_exc()
+                log('docx tika failed too')
+                self._doc.error_code = [-17]
         return self._doc.get_html()
         return self._doc.get_html()
 
 
 
 
@@ -791,9 +977,10 @@ class DocxConvertNew:
 
 
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    c = DocxConvert("C:/Users/Administrator/Downloads/dsdsd.docx", "C:/Users/Administrator/Downloads/1/")
-    print(c.get_html())
-
-    # c = DocxConvertNew()
-    # # c.read_docx(r'C:\Users\Administrator\Desktop\test_doc\error14.docx')
-    # c.read_docx(r'C:/Users/Administrator/Downloads/dsdsd.docx')
+    _p = r'C:/Users/Administrator/Downloads/1723004790329.docx'
+    # _p = "C:/Users/Administrator/Desktop/test_doc/error14.docx"
+    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp" + '/'
+    c = DocxConvert(_p, save_dir)
+    _html = c.get_html()
+    with open('../result.html', 'w', encoding='utf-8') as f:
+        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + str(_html[0]))

+ 819 - 25
format_convert/convert_image.py

@@ -21,7 +21,7 @@ from format_convert.utils import judge_error_code, add_div, LineTable, get_table
 from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface, from_gpu_interface_redis, \
 from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface, from_gpu_interface_redis, \
     from_idc_interface, from_isr_interface
     from_idc_interface, from_isr_interface
 from format_convert.table_correct import get_rotated_image
 from format_convert.table_correct import get_rotated_image
-from botr.extract_table import get_table
+from botr.extract_table import get_table, get_b_table_by_blank_colon
 
 
 
 
 def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
@@ -66,7 +66,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
     def merge_textbox(textbox_list, in_objs):
     def merge_textbox(textbox_list, in_objs):
         delete_obj = []
         delete_obj = []
         threshold = 5
         threshold = 5
-        textbox_list.sort(key=lambda x:x.bbox[0])
+        textbox_list.sort(key=lambda x: x.bbox[0])
         for k in range(len(textbox_list)):
         for k in range(len(textbox_list)):
             tb1 = textbox_list[k]
             tb1 = textbox_list[k]
             if tb1 not in in_objs and tb1 not in delete_obj:
             if tb1 not in in_objs and tb1 not in delete_obj:
@@ -74,6 +74,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                     tb2 = textbox_list[m]
                     tb2 = textbox_list[m]
                     if tb2 in in_objs:
                     if tb2 in in_objs:
                         continue
                         continue
+                    # print('tb1 tb2', tb1, tb2)
                     if abs(tb1.bbox[1]-tb2.bbox[1]) <= threshold \
                     if abs(tb1.bbox[1]-tb2.bbox[1]) <= threshold \
                             and abs(tb1.bbox[3]-tb2.bbox[3]) <= threshold:
                             and abs(tb1.bbox[3]-tb2.bbox[3]) <= threshold:
                         if tb1.bbox[0] <= tb2.bbox[0]:
                         if tb1.bbox[0] <= tb2.bbox[0]:
@@ -88,9 +89,9 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                 textbox_list.remove(_obj)
                 textbox_list.remove(_obj)
         return textbox_list
         return textbox_list
 
 
-    def resize_process(_image_np):
+    def resize_process(_image_np, threshold=2048):
+    # def resize_process(_image_np, threshold=1280):
         # 整体分辨率限制
         # 整体分辨率限制
-        threshold = 2048
         if _image_np.shape[0] > threshold or _image_np.shape[1] > threshold:
         if _image_np.shape[0] > threshold or _image_np.shape[1] > threshold:
             h, w = get_best_predict_size2(_image_np, threshold=threshold)
             h, w = get_best_predict_size2(_image_np, threshold=threshold)
             log("global image resize " + str(_image_np.shape[:2]) + " -> " + str(h) + "," + str(w))
             log("global image resize " + str(_image_np.shape[:2]) + " -> " + str(h) + "," + str(w))
@@ -169,14 +170,24 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
         log("isr total time "+str(time.time()-_isr_time))
         log("isr total time "+str(time.time()-_isr_time))
         return _image_np
         return _image_np
 
 
-    def ocr_process(_image_np, _threshold=2048):
+    # def ocr_process(_image_np, _threshold=2048):
+    def ocr_process(_image_np, _threshold=1080):
         log("ocr_process image shape " + str(_image_np.shape))
         log("ocr_process image shape " + str(_image_np.shape))
 
 
+        # 过小直接返回
+        if _image_np.shape[0] <= 10 or _image_np.shape[1] <= 10:
+            return [], []
+        if _image_np.shape[0] < 50 and _image_np.shape[1] / _image_np.shape[0] > 20:
+            return [], []
+        if _image_np.shape[1] < 50 and _image_np.shape[0] / _image_np.shape[1] > 20:
+            return [], []
+
         # ocr图片过大内存溢出,需resize
         # ocr图片过大内存溢出,需resize
         # 大图按比例缩小,小图维持不变;若统一拉伸成固定大小如1024会爆显存
         # 大图按比例缩小,小图维持不变;若统一拉伸成固定大小如1024会爆显存
         ratio = (1, 1)
         ratio = (1, 1)
         if _image_np.shape[0] > _threshold or _image_np.shape[1] > _threshold:
         if _image_np.shape[0] > _threshold or _image_np.shape[1] > _threshold:
-            best_h, best_w = get_best_predict_size2(_image_np, _threshold)
+            # best_h, best_w = get_best_predict_size2(_image_np, _threshold)
+            best_h, best_w = get_best_predict_size_by_area(_image_np, _threshold)
             _image_np = pil_resize(_image_np, best_h, best_w)
             _image_np = pil_resize(_image_np, best_h, best_w)
             log("ocr_process image resize " + str(_image_np.shape))
             log("ocr_process image resize " + str(_image_np.shape))
             ratio = (image_np.shape[0]/best_h, image_np.shape[1]/best_w)
             ratio = (image_np.shape[0]/best_h, image_np.shape[1]/best_w)
@@ -189,7 +200,13 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
 
         # 调用ocr模型接口
         # 调用ocr模型接口
         image_bytes = np2bytes(_image_np)
         image_bytes = np2bytes(_image_np)
-        text_list, bbox_list = from_ocr_interface(image_bytes, is_table=1)
+        result = from_ocr_interface(image_bytes, is_table=1)
+        # print('from_ocr_interface result ', result)
+        if len(result) != 2:
+            return result, result
+
+        text_list, bbox_list = result
+        # text_list, bbox_list = from_ocr_interface(image_bytes, is_table=1)
         if judge_error_code(text_list):
         if judge_error_code(text_list):
             return text_list, text_list
             return text_list, text_list
 
 
@@ -264,6 +281,13 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
 
     def botr_process(_image_np, table_list2, text_list2, box_list2, text_box_list2, obj_in_table_list2,
     def botr_process(_image_np, table_list2, text_list2, box_list2, text_box_list2, obj_in_table_list2,
                      from_pdf=False, pdf_obj_list=[], pdf_layout_size=()):
                      from_pdf=False, pdf_obj_list=[], pdf_layout_size=()):
+
+        temp_list = []
+        for _table2 in table_list2:
+            _table2 = _Table(_table2["table"], _table2["bbox"])
+            temp_list.append(_table2)
+        table_list2 = temp_list
+
         if from_pdf:
         if from_pdf:
             # 交叉验证 ocr结果与pdf obj,暂时使用pdf提取的
             # 交叉验证 ocr结果与pdf obj,暂时使用pdf提取的
             h_ratio = _image_np.shape[0] / pdf_layout_size[1]
             h_ratio = _image_np.shape[0] / pdf_layout_size[1]
@@ -300,14 +324,55 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
             box_list2 = pdf_box_list
             box_list2 = pdf_box_list
             text_box_list2 = pdf_text_box_list
             text_box_list2 = pdf_text_box_list
 
 
-        _text_box_list, _table_list, _obj_in_table_list = get_table(_image_np, table_list2, text_list2, box_list2, text_box_list2)
-
-        # 保存无边框表格文件
-        if _table_list:
+            _b_table_list = []
+            _not_b_table_list = []
+        else:
+            # 无边框新规则,补充添加 2505015
+            # 根据text规律,判断该页是否可能有无边框表格
             try:
             try:
-                save_b_table(_image_np, text_box_list2, from_pdf)
+                _b_table_list, _not_b_table_list = get_b_table_by_blank_colon(text_box_list2, table_list2, (
+                0, 0, _image_np.shape[1], _image_np.shape[0]), _image_np)
             except:
             except:
-                pass
+                traceback.print_exc()
+                return [-23], [], []
+
+            # print('_b_table_list111', _b_table_list)
+            if _b_table_list:
+                temp_list = []
+                for _b_table in _b_table_list:
+                    _b_table = _Table(_b_table[0], _b_table[1])
+                    # table_list2 += [_b_table]
+                    temp_list.append(_b_table)
+                _b_table_list = temp_list
+            if _not_b_table_list:
+                temp_list = []
+                for _b_table in _not_b_table_list:
+                    _b_table = _Table(_b_table[0], _b_table[1])
+                    temp_list.append(_b_table)
+                _not_b_table_list = temp_list
+
+        ignore_table_list = table_list2 + _b_table_list + _not_b_table_list
+        # yolo检测出的表格,忽略两列的,因为已经补充了两列的新规则 250529
+        _text_box_list, _table_list, _obj_in_table_list = get_table(_image_np, ignore_table_list, text_list2, box_list2, text_box_list2, from_pdf=from_pdf)
+        # print('_table_list', _table_list)
+        # print('_b_table_list222', _b_table_list)
+
+        # 无边框新规则,补充添加 2505015
+        _table_list = [_Table(x.get('table'), x.get('bbox')) for x in _table_list]
+        _table_list += _b_table_list
+        for _b_table in _b_table_list:
+            for _text_box in text_box_list2:
+                if _b_table.bbox[1] <= _text_box.bbox[1] <= _text_box.bbox[3] <= _b_table.bbox[3]:
+                    # print('add _obj_in_table_list 250515', _text_box)
+                    _obj_in_table_list.append(_text_box)
+        # print('_b_table_list233', _table_list)
+
+        # 保存无边框表格文件
+        # if _table_list:
+        #     try:
+        #         save_b_table(_image_np, text_box_list2, from_pdf)
+        #     except:
+        #         pass
 
 
         # print('_text_box_list', _text_box_list)
         # print('_text_box_list', _text_box_list)
         # print('_table_list', _table_list)
         # print('_table_list', _table_list)
@@ -496,7 +561,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
             else:
             else:
                 # 根据index拆开图片,重新ocr
                 # 根据index拆开图片,重新ocr
                 split_index_list.insert(0, 0)
                 split_index_list.insert(0, 0)
-                print('split_index_list1', split_index_list)
+                # print('split_index_list1', split_index_list)
                 for _i, index in enumerate(split_index_list):
                 for _i, index in enumerate(split_index_list):
                     if _i == len(split_index_list) - 1:
                     if _i == len(split_index_list) - 1:
                         split_image_np = sub_image_np[:, index:, :]
                         split_image_np = sub_image_np[:, index:, :]
@@ -602,12 +667,12 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                 # 生成TextBox对象
                 # 生成TextBox对象
                 text_box_list = get_text_box_obj(text_list, box_list)
                 text_box_list = get_text_box_obj(text_list, box_list)
                 # for t in text_box_list:
                 # for t in text_box_list:
-                #     print('text_box0', t.get_text())
+                #     print('text_box0', t)
 
 
                 # 表格生成
                 # 表格生成
                 text_box_list, table_list, obj_in_table_list = table_process(line_list, text_box_list, image_np)
                 text_box_list, table_list, obj_in_table_list = table_process(line_list, text_box_list, image_np)
                 # for t in text_box_list:
                 # for t in text_box_list:
-                #     print('text_box1', t.get_text())
+                #     print('text_box1', t)
                 # print('table_list', table_list)
                 # print('table_list', table_list)
                 # for t in obj_in_table_list:
                 # for t in obj_in_table_list:
                 #     print('obj_text_box2', t.get_text())
                 #     print('obj_text_box2', t.get_text())
@@ -625,10 +690,20 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                                                                                 pdf_layout_size,
                                                                                 pdf_layout_size,
                                                                                 )
                                                                                 )
                 log('botr process cost: ' + str(time.time()-start_time))
                 log('botr process cost: ' + str(time.time()-start_time))
+                if judge_error_code(text_box_list):
+                    return text_box_list
+
+                # print('b_table_list333', b_table_list)
+                obj_in_table_list.update(set(b_obj_in_table_list))
+                # for t in text_box_list:
+                #     print('text_box2', t)
 
 
                 # 合并非表格的同一行TextBox
                 # 合并非表格的同一行TextBox
                 text_box_list = merge_textbox(text_box_list, obj_in_table_list)
                 text_box_list = merge_textbox(text_box_list, obj_in_table_list)
 
 
+                # for t in text_box_list:
+                #     print('text_box3', t)
+                # print('table_list, b_table_list', table_list, b_table_list)
                 table_textbox_list.append([table_list, b_table_list, obj_in_table_list, text_box_list])
                 table_textbox_list.append([table_list, b_table_list, obj_in_table_list, text_box_list])
 
 
             if reverse_flag:
             if reverse_flag:
@@ -649,16 +724,21 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
             _add_y = 0
             _add_y = 0
             for table_list, b_table_list, obj_in_table_list, text_box_list in table_textbox_list:
             for table_list, b_table_list, obj_in_table_list, text_box_list in table_textbox_list:
                 obj_list = []
                 obj_list = []
+                # print('obj_in_table_list', obj_in_table_list)
                 for table in table_list:
                 for table in table_list:
-                    _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
+                    _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y,
+                                   table["bbox"][2], table["bbox"][3] + _add_y]
                     _table = _Table(table["table"], _table_bbox)
                     _table = _Table(table["table"], _table_bbox)
+                    # print('_table.bbo2x', _table.bbox)
                     obj_list.append(_table)
                     obj_list.append(_table)
                 for table in b_table_list:
                 for table in b_table_list:
-                    _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
-                    _table = _Table(table["table"], _table_bbox)
-                    obj_list.append(_table)
+                    # _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
+                    # _table = _Table(table["table"], _table_bbox)
+                    # print('table.bbo1x', table.bbox)
+                    obj_list.append(table)
                 for text_box in text_box_list:
                 for text_box in text_box_list:
                     if text_box not in obj_in_table_list:
                     if text_box not in obj_in_table_list:
+                        # print('text_box',  text_box)
                         text_box.bbox[1] += _add_y
                         text_box.bbox[1] += _add_y
                         obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
                         obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
 
 
@@ -707,6 +787,8 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                                                                         pdf_layout_size,
                                                                         pdf_layout_size,
                                                                         )
                                                                         )
             log('botr process cost: ' + str(time.time()-start_time))
             log('botr process cost: ' + str(time.time()-start_time))
+            if judge_error_code(text_box_list):
+                return text_box_list
 
 
             # 合并非表格的同一行TextBox
             # 合并非表格的同一行TextBox
             text_box_list = merge_textbox(text_box_list, obj_in_table_list)
             text_box_list = merge_textbox(text_box_list, obj_in_table_list)
@@ -715,8 +797,10 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
             obj_list = []
             obj_list = []
             # print('table_list', table_list)
             # print('table_list', table_list)
             for table in table_list:
             for table in table_list:
-                _table = _Table(table["table"], table["bbox"])
-                obj_list.append(_table)
+                # print('type(table)', type(table))
+                # _table = _Table(table["table"], table["bbox"])
+                # print('table.bbox', table.bbox)
+                obj_list.append(table)
             for text_box in text_box_list:
             for text_box in text_box_list:
                 if text_box not in obj_in_table_list:
                 if text_box not in obj_in_table_list:
                     obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
                     obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
@@ -732,6 +816,690 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
         return [-1]
         return [-1]
 
 
 
 
+# class ImageProcess:
+#     def __init__(self, image_np, image_path, is_from_pdf=False, is_from_docx=False,
+#                  b_table_from_text=False, pdf_obj_list=[], pdf_layout_size=(),
+#                  is_reverse=False):
+#
+#         self.image_np = image_np
+#         self.image_path = image_path
+#         self.is_from_pdf = is_from_pdf
+#         self.is_from_docx = is_from_docx
+#         self.b_table_from_text = b_table_from_text
+#         self.pdf_obj_list = pdf_obj_list
+#         self.pdf_layout_size = pdf_layout_size
+#         self.is_reverse = is_reverse
+#
+#     def merge_textbox(self, textbox_list, in_objs):
+#         delete_obj = []
+#         threshold = 5
+#         textbox_list.sort(key=lambda x:x.bbox[0])
+#         for k in range(len(textbox_list)):
+#             tb1 = textbox_list[k]
+#             if tb1 not in in_objs and tb1 not in delete_obj:
+#                 for m in range(k+1, len(textbox_list)):
+#                     tb2 = textbox_list[m]
+#                     if tb2 in in_objs:
+#                         continue
+#                     if abs(tb1.bbox[1]-tb2.bbox[1]) <= threshold \
+#                             and abs(tb1.bbox[3]-tb2.bbox[3]) <= threshold:
+#                         if tb1.bbox[0] <= tb2.bbox[0]:
+#                             tb1.text = tb1.text + tb2.text
+#                         else:
+#                             tb1.text = tb2.text + tb1.text
+#                         tb1.bbox[0] = min(tb1.bbox[0], tb2.bbox[0])
+#                         tb1.bbox[2] = max(tb1.bbox[2], tb2.bbox[2])
+#                         delete_obj.append(tb2)
+#         for _obj in delete_obj:
+#             if _obj in textbox_list:
+#                 textbox_list.remove(_obj)
+#         return textbox_list
+#
+#     def resize_process(self, _image_np):
+#         # 整体分辨率限制
+#         threshold = 2048
+#         if _image_np.shape[0] > threshold or _image_np.shape[1] > threshold:
+#             h, w = get_best_predict_size2(_image_np, threshold=threshold)
+#             log("global image resize " + str(_image_np.shape[:2]) + " -> " + str(h) + "," + str(w))
+#             _image_np = pil_resize(_image_np, h, w)
+#         return _image_np
+#
+#     def idc_process(self, _image_np, return_angle=False):
+#         # 图片倾斜校正,写入原来的图片路径
+#         # print("image_process", image_path)
+#         # g_r_i = get_rotated_image(_image_np, image_path)
+#         # if judge_error_code(g_r_i):
+#         #     if is_from_docx:
+#         #         return []
+#         #     else:
+#         #         return g_r_i
+#         # _image_np = cv2.imread(image_path)
+#         # if _image_np is None:
+#         #     return []
+#         # return _image_np
+#
+#         # if _image_np is None:
+#         #     return []
+#
+#         # idc模型实现图片倾斜校正
+#         h, w = get_best_predict_size2(_image_np, 1080)
+#         image_resize = pil_resize(_image_np, h, w)
+#         # image_resize_path = image_path.split(".")[0] + "_resize_idc." + image_path.split(".")[-1]
+#         # cv2.imwrite(image_resize_path, image_resize)
+#
+#         # with open(image_resize_path, "rb") as f:
+#         #     image_bytes = f.read()
+#         image_bytes = np2bytes(image_resize)
+#         angle = from_idc_interface(image_bytes)
+#         log('idc_process angle ' + str(angle))
+#         if judge_error_code(angle):
+#             if return_angle:
+#                 if self.is_from_docx:
+#                     return [], []
+#                 else:
+#                     return angle, angle
+#             else:
+#                 if self.is_from_docx:
+#                     return []
+#                 else:
+#                     return angle
+#         # 根据角度旋转
+#         # _image_pil = Image.fromarray(_image_np)
+#         # _image_np = np.array(_image_pil.rotate(angle, expand=1))
+#         _image_np = image_rotate(_image_np, angle)
+#
+#         # 写入
+#         # idc_path = image_path.split(".")[0] + "_idc." + image_path.split(".")[-1]
+#         # cv2.imwrite(idc_path, image_np)
+#         if return_angle:
+#             return _image_np, angle
+#         return _image_np
+#
+#     def isr_process(self, _image_np):
+#         log("isr_process image shape " + str(_image_np.shape))
+#         image_np_copy = copy.deepcopy(_image_np)
+#         # isr模型去除印章
+#         _isr_time = time.time()
+#         if count_red_pixel(_image_np):
+#             # 红色像素达到一定值才过模型
+#             image_bytes = np2bytes(_image_np)
+#             _image_np = from_isr_interface(image_bytes)
+#             if judge_error_code(_image_np):
+#                 if self.is_from_docx:
+#                     return []
+#                 else:
+#                     return _image_np
+#             # [1]代表检测不到印章,直接返回
+#             if isinstance(_image_np, list) and _image_np == [1]:
+#                 log("no seals detected!")
+#                 _image_np = image_np_copy
+#         log("isr total time "+str(time.time()-_isr_time))
+#         return _image_np
+#
+#     def ocr_process(self, _image_np, _threshold=2048):
+#         log("ocr_process image shape " + str(_image_np.shape))
+#
+#         # ocr图片过大内存溢出,需resize
+#         # 大图按比例缩小,小图维持不变;若统一拉伸成固定大小如1024会爆显存
+#         ratio = (1, 1)
+#         h, w = _image_np.shape[:2]
+#         if _image_np.shape[0] > _threshold or _image_np.shape[1] > _threshold:
+#             best_h, best_w = get_best_predict_size2(_image_np, _threshold)
+#             _image_np = pil_resize(_image_np, best_h, best_w)
+#             log("ocr_process image resize " + str(_image_np.shape))
+#             ratio = (h/best_h, w/best_w)
+#
+#         # 大图片ocr加锁,防止爆显存
+#         # if _image_np.shape[0] >= 1024 and _image_np.shape[1] >= 1024:
+#         #     file_lock = True
+#         # else:
+#         #     file_lock = False
+#
+#         # 调用ocr模型接口
+#         image_bytes = np2bytes(_image_np)
+#         text_list, bbox_list = from_ocr_interface(image_bytes, is_table=1)
+#         if judge_error_code(text_list):
+#             return text_list, text_list
+#
+#         for i in range(len(bbox_list)):
+#             point = bbox_list[i]
+#             bbox_list[i] = [[int(point[0][0]*ratio[0]), int(point[0][1]*ratio[1])],
+#                             [int(point[1][0]*ratio[0]), int(point[1][1]*ratio[1])],
+#                             [int(point[2][0]*ratio[0]), int(point[2][1]*ratio[1])],
+#                             [int(point[3][0]*ratio[0]), int(point[3][1]*ratio[1])]]
+#
+#         # 去除水印字 根据识别是否为矩形框
+#         temp_text_list = []
+#         temp_bbox_list = []
+#         water_mark_dict = {}
+#         for i in range(len(bbox_list)):
+#             bbox = bbox_list[i]
+#             text = text_list[i]
+#             if len(re.findall('[\u4e00-\u9fa5]', text)) == len(text):
+#                 if (abs(bbox[0][1] - bbox[1][1]) <= 2 and abs(bbox[2][1] - bbox[3][1]) <= 2) \
+#                         or (abs(bbox[0][0] - bbox[3][0]) <= 4 and abs(bbox[2][0] - bbox[1][0]) <= 4):
+#                     temp_text_list.append(text)
+#                     temp_bbox_list.append(bbox)
+#                 else:
+#                     if text in water_mark_dict.keys():
+#                         water_mark_dict[text] += [bbox]
+#                     else:
+#                         water_mark_dict[text] = [bbox]
+#             else:
+#                 temp_text_list.append(text)
+#                 temp_bbox_list.append(bbox)
+#
+#         # 数量多的才算水印
+#         for text in water_mark_dict.keys():
+#             bbox_list = water_mark_dict.get(text)
+#             if len(bbox_list) < 3:
+#                 for bbox in bbox_list:
+#                     temp_text_list.append(text)
+#                     temp_bbox_list.append(bbox)
+#
+#         text_list = temp_text_list
+#         bbox_list = temp_bbox_list
+#         return text_list, bbox_list
+#
+#     def otr_process(self, _image_np):
+#         log("otr_process image shape " + str(_image_np.shape))
+#         # otr模型识别表格,需要图片resize成模型所需大小, 写入另一个路径
+#         best_h, best_w = get_best_predict_size(_image_np)
+#         image_resize = pil_resize(_image_np, best_h, best_w)
+#         # image_resize_path = image_path.split(".")[0] + "_resize_otr." + image_path.split(".")[-1]
+#         # cv2.imwrite(image_resize_path, image_resize)
+#
+#         # 调用otr模型接口
+#         # with open(image_resize_path, "rb") as f:
+#         #     image_bytes = f.read()
+#         image_bytes = np2bytes(image_resize)
+#         list_line = from_otr_interface(image_bytes, self.is_from_pdf)
+#         if judge_error_code(list_line):
+#             if self.is_from_docx:
+#                 return []
+#             else:
+#                 return list_line
+#
+#         # otr resize后得到的bbox根据比例还原
+#         start_time = time.time()
+#         ratio = (_image_np.shape[0]/best_h, _image_np.shape[1]/best_w)
+#         for i in range(len(list_line)):
+#             point = list_line[i]
+#             list_line[i] = [int(point[0]*ratio[1]), int(point[1]*ratio[0]),
+#                             int(point[2]*ratio[1]), int(point[3]*ratio[0])]
+#         log("otr resize bbox recover " + str(time.time()-start_time))
+#         return list_line
+#
+#     def botr_process(self, _image_np, table_list2, text_list2, box_list2, text_box_list2, obj_in_table_list2,
+#                      from_pdf=False, pdf_obj_list=[], pdf_layout_size=()):
+#         if from_pdf:
+#             # 交叉验证 ocr结果与pdf obj,暂时使用pdf提取的
+#             h_ratio = _image_np.shape[0] / pdf_layout_size[1]
+#             w_ratio = _image_np.shape[1] / pdf_layout_size[0]
+#             pdf_text_list = []
+#             pdf_box_list = []
+#             for obj in pdf_obj_list:
+#                 if obj.get_text() in ["", " "]:
+#                     continue
+#
+#                 # pdf坐标是上下颠倒的
+#                 # obj.bbox = (obj.bbox[0], pdf_layout_size[1]-obj.bbox[3],
+#                 #             obj.bbox[2], pdf_layout_size[1]-obj.bbox[1])
+#
+#                 # 根据两个页面大小比例调整坐标
+#                 obj.bbox = (obj.bbox[0]*w_ratio, obj.bbox[1]*h_ratio,
+#                             obj.bbox[2]*w_ratio, obj.bbox[3]*h_ratio)
+#
+#                 # 剔除水印字
+#                 text = re.sub('[\n ]', '', obj.get_text())
+#                 if len(text) == 1 and abs(obj.bbox[0] - obj.bbox[2]) >= 70:
+#                     continue
+#
+#                 pdf_box_list.append([[int(obj.bbox[0]), int(obj.bbox[1])],
+#                                      [],
+#                                      [int(obj.bbox[2]), int(obj.bbox[3])],
+#                                      []
+#                                      ])
+#                 pdf_text_list.append(re.sub('[\n]', '', obj.get_text()))
+#
+#             pdf_text_box_list = self.get_text_box_obj(pdf_text_list, pdf_box_list)
+#
+#             text_list2 = pdf_text_list
+#             box_list2 = pdf_box_list
+#             text_box_list2 = pdf_text_box_list
+#
+#         _text_box_list, _table_list, _obj_in_table_list = get_table(_image_np, table_list2, text_list2, box_list2, text_box_list2, from_pdf=from_pdf)
+#
+#         # 保存无边框表格文件
+#         if _table_list:
+#             try:
+#                 self.save_b_table(_image_np, text_box_list2, from_pdf)
+#             except:
+#                 pass
+#
+#         # print('_text_box_list', _text_box_list)
+#         # print('_table_list', _table_list)
+#         if from_pdf:
+#             text_box_list2 = []
+#             table_list2 = []
+#
+#         if _table_list and _text_box_list:
+#             text_box_list2 += _text_box_list
+#             text_box_list2 = list(set(text_box_list2))
+#             # table_list2 += _table_list
+#             # obj_in_table_list2 = obj_in_table_list2.union(_obj_in_table_list)
+#         return text_box_list2, _table_list, _obj_in_table_list
+#
+#     def table_process(self, list_line, list_text_boxes, _image_np):
+#         # 调用现成方法形成表格
+#         try:
+#             if list_line:
+#
+#                 # 排除掉短且经过文字bbox中间的竖线
+#                 temp_list = []
+#                 for line in list_line:
+#                     find_cnt = 0
+#                     if abs(line[0]-line[2]) < abs(line[1]-line[3]) and abs(line[1] - line[3]) <= _image_np.shape[0] / 20:
+#                         for t_obj in list_text_boxes:
+#                             # if not (t_obj.bbox[1] <= line[1] <= t_obj.bbox[3] or t_obj.bbox[1] <= line[3] <= t_obj.bbox[3]):
+#                             #     continue
+#                             if line_iou([[t_obj.bbox[1], 0], [t_obj.bbox[3], 0]], [[line[1], 0], [line[3], 0]]) < 0.3:
+#                                 continue
+#                             if abs(t_obj.bbox[0]-t_obj.bbox[2])/5 + min(t_obj.bbox[0], t_obj.bbox[2]) <= line[0] <= abs(t_obj.bbox[0]-t_obj.bbox[2])/5*4 + min(t_obj.bbox[0], t_obj.bbox[2]) and (t_obj.bbox[0]-t_obj.bbox[2]) <= 60:
+#                                 # print('match', line[0], t_obj.bbox[0], t_obj.bbox[2], t_obj.get_text())
+#                                 find_cnt += 1
+#                                 if find_cnt >= 2:
+#                                     break
+#                     if find_cnt >= 2:
+#                         continue
+#                     temp_list.append(line)
+#                 list_line = temp_list
+#
+#                 from format_convert.convert_tree import TableLine
+#                 list_lines = []
+#                 for line in list_line:
+#                     list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3])))
+#
+#                 lt = LineTable()
+#                 tables, obj_in_table, _, connect_textbox_list = lt.recognize_table(list_text_boxes, list_lines,
+#                                                                                    sourceP_LB=False, splited=False,
+#                                                                                    from_pdf=self.is_from_pdf,
+#                                                                                    is_reverse=self.is_reverse)
+#                 # 需分割textbox
+#                 if connect_textbox_list:
+#                     list_text_boxes = self.table_textbox_split(_image_np, connect_textbox_list, list_text_boxes)
+#                     # 新的textbox,重新做表格
+#                     tables, obj_in_table, _, connect_textbox_list = lt.recognize_table(list_text_boxes, list_lines,
+#                                                                                        sourceP_LB=False, splited=True,
+#                                                                                        from_pdf=self.is_from_pdf,
+#                                                                                        is_reverse=self.is_reverse)
+#
+#                 if not tables:
+#                     return list_text_boxes, tables, obj_in_table
+#                 return list_text_boxes, tables, obj_in_table
+#             else:
+#                 return list_text_boxes, [], set()
+#         except:
+#             traceback.print_exc()
+#             return [-8], [-8], [-8]
+#
+#     def slice_process(self, _image_np):
+#         slice_flag = need_image_slice(_image_np)
+#         log("need_image_slice " + str(slice_flag) + " " + str(_image_np.shape))
+#         _image_np_list = [_image_np]
+#         if slice_flag:
+#             # 长图分割
+#             _image_np_list = image_slice_new(_image_np)
+#             angle_dict = {}
+#             for im in _image_np_list:
+#                 _, angle = self.idc_process(im, return_angle=True)
+#                 if angle in [0, 360]:
+#                     angle = 0
+#                 if angle in angle_dict.keys():
+#                     angle_dict[angle] += 1
+#                 else:
+#                     angle_dict[angle] = 1
+#
+#             # idc不太准,有0度就直接使用
+#             if 0 in angle_dict.keys():
+#                 log('image_slice 0 in angle_dict')
+#                 angle = 0
+#             else:
+#                 angle_list = [[key, value] for key, value in angle_dict.items()]
+#                 angle_list.sort(key=lambda x: x[1])
+#                 log('image_slice angle_list ' + str(angle_list))
+#                 angle = angle_list[-1][0]
+#             for i in range(len(_image_np_list)):
+#                 _image_np_list[i] = image_rotate(_image_np_list[i], angle)
+#             if angle in [180]:
+#                 _image_np_list.reverse()
+#
+#         if len(_image_np_list) < 1:
+#             log("image_slice failed!")
+#             _image_np_list = [_image_np]
+#         return _image_np_list
+#
+#     def get_text_box_obj(self, _text_list, _bbox_list):
+#         from format_convert.convert_tree import TextBox
+#         _text_box_list = []
+#         for i in range(len(_bbox_list)):
+#             bbox = _bbox_list[i]
+#             b_text = _text_list[i]
+#             _text_box_list.append(TextBox([bbox[0][0], bbox[0][1],
+#                                            bbox[2][0], bbox[2][1]], b_text))
+#         return _text_box_list
+#
+#     def save_b_table(self, image_np2, text_box_list2, from_pdf=False):
+#         _start_time = time.time()
+#         _path = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table'
+#         # _path = 'D:/Project/format_conversion_maxcompute/save_b_table'
+#         max_index = 20000
+#         if os.path.exists(_path):
+#             file_list = glob(_path + '/*')
+#             if file_list:
+#                 file_index_list = [int(re.split('[/.\\\\-]', x)[-3]) for x in file_list]
+#                 file_index_list.sort(key=lambda x: x)
+#                 index = file_index_list[-1] + 1
+#             else:
+#                 index = 0
+#             if index > max_index:
+#                 return
+#
+#             # 文件md5
+#             from format_convert import _global
+#             _md5 = _global.get("md5")
+#
+#             _image_path = _path + '/' + str(index) + '-' + str(_md5) + '.png'
+#             cv2.imwrite(_image_path, image_np2)
+#             log('save b_table image success!')
+#
+#             # if from_pdf:
+#             #     _file_path = _path + '/' + str(_md5) + '-' + str(index) + '.txt'
+#             #     new_text_box_list2 = [str(x) + '\n' for x in text_box_list2]
+#             #     with open(_file_path, 'w') as f:
+#             #         f.writelines(new_text_box_list2)
+#             #     log('save b_table txt success!')
+#
+#         log('save_b_table cost: ' + str(time.time()-_start_time))
+#
+#     def table_textbox_split(self, image_np2, connect_textbox_list, textbox_list):
+#         """
+#         两个单元格里的文本被ocr识别为一个,需分开才能准确放进表格
+#
+#         :return:
+#         """
+#         split_bbox_list = []
+#         split_text_list = []
+#         splited_textbox_list = []
+#         for textbox in connect_textbox_list:
+#             bbox = textbox.bbox
+#             bbox = [[bbox[0], bbox[1]], [], [bbox[2], bbox[3]], []]
+#             sub_image_np = image_np2[int(bbox[0][1]):int(bbox[2][1]), int(bbox[0][0]):int(bbox[2][0]), :]
+#             split_index_list = []
+#             # 从左到右遍历img
+#             for i in range(5, sub_image_np.shape[1]-5):
+#                 # 找表格分割线,这一列都为黑色像素
+#                 if np.where(sub_image_np[:, i, 0] < 200)[0].size >= sub_image_np.shape[0]:
+#                     split_index_list.append(i)
+#
+#             # 判断两线之间宽度,去重
+#             if len(split_index_list) > 1:
+#                 last_index = split_index_list[0]
+#                 temp_list = []
+#                 delete_list = []
+#                 for index in split_index_list[1:]:
+#                     if index in delete_list:
+#                         continue
+#                     if index - last_index <= 5:
+#                         delete_list.append(index)
+#                     else:
+#                         last_index = index
+#                     temp_list.append(last_index)
+#                 split_index_list = temp_list
+#
+#             # n条以上分割线,有问题
+#             if len(split_index_list) == 0 or len(split_index_list) >= 2:
+#                 # print('len(split_index_list)', len(split_index_list), split_index_list)
+#                 continue
+#             else:
+#                 # 根据index拆开图片,重新ocr
+#                 split_index_list.insert(0, 0)
+#                 print('split_index_list1', split_index_list)
+#                 for _i, index in enumerate(split_index_list):
+#                     if _i == len(split_index_list) - 1:
+#                         split_image_np = sub_image_np[:, index:, :]
+#                         split_bbox_list.append([[bbox[0][0]+index, bbox[0][1]], [], [bbox[2][0], bbox[2][1]], []])
+#                     else:
+#                         next_index = split_index_list[_i+1]
+#                         split_image_np = sub_image_np[:, index:next_index, :]
+#                         split_bbox_list.append([[bbox[0][0]+index, bbox[0][1]], [], [bbox[0][0]+next_index, bbox[2][1]], []])
+#
+#                     # ocr
+#                     split_image_bytes = np2bytes(split_image_np)
+#                     text_list2, bbox_list2 = from_ocr_interface(split_image_bytes, is_table=1, only_rec=1)
+#                     # print('text_list2', text_list2)
+#                     # print('bbox_list2', split_bbox_list)
+#                     if judge_error_code(text_list2):
+#                         text2 = ''
+#                     else:
+#                         if text_list2:
+#                             text2 = text_list2[0]
+#                         else:
+#                             text2 = ''
+#                     split_text_list.append(text2)
+#                 splited_textbox_list.append(textbox)
+#
+#         if split_text_list and split_bbox_list:
+#             split_textbox_list = self.get_text_box_obj(split_text_list, split_bbox_list)
+#             for tb in splited_textbox_list:
+#                 if tb in textbox_list:
+#                     textbox_list.remove(tb)
+#             textbox_list += split_textbox_list
+#
+#         return textbox_list
+#
+#     def __call__(self):
+#         from format_convert.convert_tree import _Table, _Sentence
+#         log("into image_preprocess")
+#         try:
+#             if self.image_np is None:
+#                 log("image_preprocess image_np is None")
+#                 return []
+#             if self.image_np.shape[0] <= 20 or self.image_np.shape[1] <= 20:
+#                 log('image_np.shape[0] <= 20 or image_np.shape[1] <= 20')
+#                 return []
+#
+#             if not self.b_table_from_text:
+#                 # 判断是否需要长图分割
+#                 idc_flag = False
+#                 image_np_list = self.slice_process(self.image_np)
+#                 if len(image_np_list) > 1:
+#                     idc_flag = True
+#
+#                 reverse_flag = 0
+#                 table_textbox_list = []
+#                 for image_np in image_np_list:
+#                     # 整体分辨率限制
+#                     image_np = self.resize_process(image_np)
+#
+#                     # 印章去除
+#                     image_np = self.isr_process(image_np)
+#                     if isinstance(image_np, list):
+#                         return image_np
+#
+#                     # 文字识别
+#                     text_list, box_list = self.ocr_process(image_np)
+#                     if judge_error_code(text_list):
+#                         return text_list
+#
+#                     # 判断ocr识别是否正确
+#                     # print('ocr_cant_read(text_list, box_list)', ocr_cant_read(text_list, box_list), idc_flag, text_list)
+#                     if ocr_cant_read(text_list, box_list) and not idc_flag:
+#                         # 方向分类
+#                         image_np, angle = self.idc_process(image_np, return_angle=True)
+#                         if isinstance(image_np, list):
+#                             return image_np
+#                         # 如果角度不变,旋转180
+#                         if angle in [0, 360]:
+#                             pass
+#                             # log('ocr_cant_read image_rotate 180')
+#                             # image_np = image_rotate(image_np, angle=180)
+#                             # reverse_flag = 1
+#                             # image_pil = Image.fromarray(image_np)
+#                             # image_np = np.array(image_pil.rotate(180, expand=1))
+#                         # cv2.imshow("idc_process", image_np)
+#                         # cv2.waitKey(0)
+#
+#                         # 文字识别
+#                         text_list1, box_list_1 = self.ocr_process(image_np)
+#                         if judge_error_code(text_list1):
+#                             return text_list1
+#
+#                         if len(text_list1) > 0 and ocr_cant_read(text_list1, box_list_1) and self.is_from_pdf:
+#                             return [-16]
+#
+#                         # 比较字数
+#                         # print("ocr process", len("".join(text_list)), len("".join(text_list1)))
+#                         if len("".join(text_list)) < len("".join(text_list1)):
+#                             text_list = text_list1
+#                             box_list = box_list_1
+#
+#                     # 表格识别
+#                     line_list = self.otr_process(image_np)
+#                     if judge_error_code(line_list):
+#                         return line_list
+#
+#                     # 生成TextBox对象
+#                     text_box_list = self.get_text_box_obj(text_list, box_list)
+#                     # for t in text_box_list:
+#                     #     print('text_box0', t.get_text())
+#
+#                     # 表格生成
+#                     text_box_list, table_list, obj_in_table_list = self.table_process(line_list, text_box_list, image_np)
+#                     # for t in text_box_list:
+#                     #     print('text_box1', t.get_text())
+#                     # print('table_list', table_list)
+#                     # for t in obj_in_table_list:
+#                     #     print('obj_text_box2', t.get_text())
+#                     if judge_error_code(table_list):
+#                         return table_list
+#
+#                     # 无边框表格识别
+#                     start_time = time.time()
+#                     text_box_list, b_table_list, b_obj_in_table_list \
+#                         = self.botr_process(image_np, table_list, text_list, box_list,
+#                                             text_box_list, obj_in_table_list, self.b_table_from_text,
+#                                             self.pdf_obj_list, self.pdf_layout_size,
+#                                             )
+#                     log('botr process cost: ' + str(time.time()-start_time))
+#
+#                     # 合并非表格的同一行TextBox
+#                     text_box_list = self.merge_textbox(text_box_list, obj_in_table_list)
+#
+#                     table_textbox_list.append([table_list, b_table_list, obj_in_table_list, text_box_list])
+#
+#                 if reverse_flag:
+#                     table_textbox_list.reverse()
+#
+#                     for i in range(len(image_np_list)):
+#                         image_np_list[i] = image_rotate(image_np_list[i], angle=180)
+#                     image_np_list.reverse()
+#
+#                 # index = 0
+#                 # for image_np in image_np_list:
+#                 #     cv2.imshow(str(index) + '.jpg', image_np)
+#                 #     cv2.waitKey(0)
+#                 #     index += 1
+#
+#                 # 对象生成
+#                 all_obj_list = []
+#                 _add_y = 0
+#                 for table_list, b_table_list, obj_in_table_list, text_box_list in table_textbox_list:
+#                     obj_list = []
+#                     for table in table_list:
+#                         _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
+#                         _table = _Table(table["table"], _table_bbox)
+#                         obj_list.append(_table)
+#                     for table in b_table_list:
+#                         _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
+#                         _table = _Table(table["table"], _table_bbox)
+#                         obj_list.append(_table)
+#                     for text_box in text_box_list:
+#                         if text_box not in obj_in_table_list:
+#                             text_box.bbox[1] += _add_y
+#                             obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
+#
+#                     # 多图修正y
+#                     if len(image_np_list) > 1:
+#                         list_y = []
+#                         for obj in obj_list:
+#                             obj.y += _add_y
+#                             list_y.append(obj.y)
+#                         if len(list_y) > 0:
+#                             _add_y += max(list_y)
+#
+#                     # 合并
+#                     all_obj_list += obj_list
+#
+#             # 无边框表格图片
+#             else:
+#                 all_obj_list = []
+#                 table_list = []
+#                 text_list = []
+#                 box_list = []
+#                 text_box_list = []
+#                 obj_in_table_list = set()
+#
+#                 # 表格识别
+#                 line_list = self.otr_process(self.image_np)
+#                 if judge_error_code(line_list):
+#                     return line_list
+#
+#                 # 生成TextBox对象
+#                 text_box_list = self.get_text_box_obj(text_list, box_list)
+#
+#                 # 表格生成
+#                 text_box_list, table_list, obj_in_table_list = self.table_process(line_list, text_box_list, self.image_np)
+#                 if judge_error_code(table_list):
+#                     return table_list
+#
+#                 # 无边框表格识别
+#                 start_time = time.time()
+#                 text_box_list, table_list, obj_in_table_list \
+#                     = self.botr_process(self.image_np, table_list,
+#                                         text_list, box_list,
+#                                                                             text_box_list,
+#                                                                             obj_in_table_list,
+#                                         self.b_table_from_text,
+#                                         self.pdf_obj_list,
+#                                         self.pdf_layout_size,
+#                                                                             )
+#                 log('botr process cost: ' + str(time.time()-start_time))
+#
+#                 # 合并非表格的同一行TextBox
+#                 text_box_list = self.merge_textbox(text_box_list, obj_in_table_list)
+#
+#                 # 对象生成
+#                 obj_list = []
+#                 # print('table_list', table_list)
+#                 for table in table_list:
+#                     _table = _Table(table["table"], table["bbox"])
+#                     obj_list.append(_table)
+#                 for text_box in text_box_list:
+#                     if text_box not in obj_in_table_list:
+#                         obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
+#
+#                 # 合并
+#                 all_obj_list += obj_list
+#
+#             return all_obj_list
+#
+#         except Exception as e:
+#             log("image_preprocess error")
+#             traceback.print_exc()
+#             return [-1]
+
+
 @memory_decorator
 @memory_decorator
 def picture2text(path, html=False):
 def picture2text(path, html=False):
     log("into picture2text")
     log("into picture2text")
@@ -786,6 +1554,21 @@ def get_best_predict_size2(image_np, threshold=3000):
     return h, w
     return h, w
 
 
 
 
def get_best_predict_size_by_area(image_np, threshold=1280):
    """Shrink (height, width) so the image area fits within threshold*threshold.

    Aspect ratio is preserved; images already small enough are returned
    unchanged.

    :param image_np: numpy image array of shape (H, W[, C])
    :param threshold: side length defining the maximum allowed area
    :return: (new_height, new_width) as ints
    """
    h, w = image_np.shape[:2]
    area_limit = threshold * threshold
    current_area = h * w

    if current_area <= area_limit:
        return h, w

    # uniform scale factor mapping current_area onto area_limit
    ratio = (area_limit / current_area) ** 0.5
    return int(h * ratio), int(w * ratio)
+
 def image_slice(image_np):
 def image_slice(image_np):
     """
     """
     slice the image if the height is to large
     slice the image if the height is to large
@@ -1269,6 +2052,17 @@ def image_process_old(image_np, image_path, is_from_pdf=False, is_from_docx=Fals
 
 
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
-    img111 = cv2.imread("C:/Users/Administrator/Downloads/1724146601927.png")
-    cv2.imshow('111', img111)
-    cv2.waitKey(0)
+    # _pp = r'D:\Project\format_conversion_maxcompute\save_b_table' \
+    #       r'\211-6591070e1cc8ea6904ba00a0a3d6c32f.png'
+    _pp = r'C:\Users\Administrator\Desktop\test_b_table\error7.png'
+    save_pp = r'D:\Project\format_conversion_maxcompute\format_convert\temp\test_convert_image.jpg'
+    # img111 = cv2.imread(_pp)
+    # img111 = pil_resize(img111, 1024, 768)
+    # cv2.imwrite(save_pp, img111)
+    # image_process(img111, '')
+    # cv2.imshow('111', img111)
+    # cv2.waitKey(0)
+
+    _html = ImageConvert(_pp, r"D:\Project\format_conversion_maxcompute\format_convert\temp").get_html()
+    with open('../result.html', 'w', encoding='utf-8') as f:
+        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + _html[0])

+ 26 - 9
format_convert/convert_need_interface.py

@@ -144,6 +144,7 @@ def from_office_interface_240606(src_path, dest_path, target_format, retry_times
 
 
 
 
 def from_office_interface(src_path, dest_path, target_format, retry_times=1, from_remote=FROM_REMOTE):
 def from_office_interface(src_path, dest_path, target_format, retry_times=1, from_remote=FROM_REMOTE):
+    start_time = time.time()
     try:
     try:
         if from_remote:
         if from_remote:
             # 重试
             # 重试
@@ -200,6 +201,8 @@ def from_office_interface(src_path, dest_path, target_format, retry_times=1, fro
         log("from_office_interface error!")
         log("from_office_interface error!")
         traceback.print_exc()
         traceback.print_exc()
         return [-1]
         return [-1]
+    finally:
+        log("from_office_interface cost time " + str(time.time()-start_time))
 
 
 
 
 def from_tika_interface(src_path, from_remote=FROM_REMOTE):
 def from_tika_interface(src_path, from_remote=FROM_REMOTE):
@@ -239,17 +242,21 @@ def from_tika_interface(src_path, from_remote=FROM_REMOTE):
             return [-2]
             return [-2]
 
 
         _dict = r
         _dict = r
-        html = _dict.get("html")
-        log("from_tika_interface cost time " + str(time.time()-start_time))
-        return html
+        data = _dict.get("data")
+
+        return data
     except Exception as e:
     except Exception as e:
         log("from_tika_interface error!")
         log("from_tika_interface error!")
         traceback.print_exc()
         traceback.print_exc()
         return [-11]
         return [-11]
+    finally:
+        log("from_tika_interface cost time " + str(time.time()-start_time))
 
 
 
 
 def from_ocr_interface(image_stream, is_table=0, only_rec=0, from_remote=FROM_REMOTE):
 def from_ocr_interface(image_stream, is_table=0, only_rec=0, from_remote=FROM_REMOTE):
     log("into from_ocr_interface")
     log("into from_ocr_interface")
+    # print('FROM_REMOTE', FROM_REMOTE)
+    start_time = time.time()
     try:
     try:
         base64_stream = base64.b64encode(image_stream)
         base64_stream = base64.b64encode(image_stream)
 
 
@@ -281,7 +288,10 @@ def from_ocr_interface(image_stream, is_table=0, only_rec=0, from_remote=FROM_RE
                             log("retry post ocr_interface... left times " + str(retry_times_1))
                             log("retry post ocr_interface... left times " + str(retry_times_1))
                             continue
                             continue
                     if judge_error_code(r):
                     if judge_error_code(r):
-                        return r
+                        if is_table:
+                            return r, r
+                        else:
+                            return r
                     break
                     break
             else:
             else:
                 if globals().get("global_ocr_model") is None:
                 if globals().get("global_ocr_model") is None:
@@ -326,6 +336,8 @@ def from_ocr_interface(image_stream, is_table=0, only_rec=0, from_remote=FROM_RE
             return [-1], [-1]
             return [-1], [-1]
         else:
         else:
             return [-1]
             return [-1]
+    finally:
+        log("from_ocr_interface cost time " + str(time.time()-start_time))
 
 
 
 
 def from_gpu_interface_redis(_dict, model_type, predictor_type):
 def from_gpu_interface_redis(_dict, model_type, predictor_type):
@@ -366,6 +378,7 @@ def from_gpu_interface_redis(_dict, model_type, predictor_type):
 
 
 def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE):
 def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE):
     log("into from_otr_interface")
     log("into from_otr_interface")
+    start_time = time.time()
     try:
     try:
         base64_stream = base64.b64encode(image_stream)
         base64_stream = base64.b64encode(image_stream)
 
 
@@ -424,6 +437,8 @@ def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE)
         log("from_otr_interface error!")
         log("from_otr_interface error!")
         print("from_otr_interface", traceback.print_exc())
         print("from_otr_interface", traceback.print_exc())
         return [-1]
         return [-1]
+    finally:
+        log("from_otr_interface cost time " + str(time.time()-start_time))
 
 
 
 
 def from_isr_interface(image_stream, from_remote=FROM_REMOTE):
 def from_isr_interface(image_stream, from_remote=FROM_REMOTE):
@@ -487,7 +502,6 @@ def from_isr_interface(image_stream, from_remote=FROM_REMOTE):
             image_np = cv2.imdecode(buffer, 1)
             image_np = cv2.imdecode(buffer, 1)
         else:
         else:
             image_np = _dict.get("image")
             image_np = _dict.get("image")
-        log("from_isr_interface cost time " + str(time.time()-start_time))
         return image_np
         return image_np
     except Exception as e:
     except Exception as e:
         log("from_isr_interface error!")
         log("from_isr_interface error!")
@@ -495,7 +509,7 @@ def from_isr_interface(image_stream, from_remote=FROM_REMOTE):
         return [-11]
         return [-11]
     finally:
     finally:
         # os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
         # os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
-        pass
+        log("from_isr_interface cost time " + str(time.time()-start_time))
 
 
 
 
 def from_idc_interface(image_stream, from_remote=FROM_REMOTE):
 def from_idc_interface(image_stream, from_remote=FROM_REMOTE):
@@ -543,12 +557,13 @@ def from_idc_interface(image_stream, from_remote=FROM_REMOTE):
 
 
         _dict = r
         _dict = r
         angle = _dict.get("angle")
         angle = _dict.get("angle")
-        log("from_idc_interface cost time " + str(time.time()-start_time))
         return angle
         return angle
     except Exception as e:
     except Exception as e:
         log("from_idc_interface error!")
         log("from_idc_interface error!")
         traceback.print_exc()
         traceback.print_exc()
         return [-11]
         return [-11]
+    finally:
+        log("from_idc_interface cost time " + str(time.time()-start_time))
 
 
 
 
 def from_atc_interface(text, from_remote=FROM_REMOTE):
 def from_atc_interface(text, from_remote=FROM_REMOTE):
@@ -594,12 +609,13 @@ def from_atc_interface(text, from_remote=FROM_REMOTE):
 
 
         _dict = r
         _dict = r
         classification = _dict.get("classification")
         classification = _dict.get("classification")
-        log("from_atc_interface cost time " + str(time.time()-start_time))
         return classification
         return classification
     except Exception as e:
     except Exception as e:
         log("from_atc_interface error!")
         log("from_atc_interface error!")
         traceback.print_exc()
         traceback.print_exc()
         return [-11]
         return [-11]
+    finally:
+        log("from_atc_interface cost time " + str(time.time()-start_time))
 
 
 
 
 def from_yolo_interface(image_stream, from_remote=FROM_REMOTE):
 def from_yolo_interface(image_stream, from_remote=FROM_REMOTE):
@@ -652,12 +668,13 @@ def from_yolo_interface(image_stream, from_remote=FROM_REMOTE):
 
 
         _dict = r
         _dict = r
         b_table_list = _dict.get("b_table_list")
         b_table_list = _dict.get("b_table_list")
-        log("from_yolo_interface cost time " + str(time.time()-start_time))
         return b_table_list
         return b_table_list
     except Exception as e:
     except Exception as e:
         log("from_yolo_interface error!")
         log("from_yolo_interface error!")
         traceback.print_exc()
         traceback.print_exc()
         return [-11]
         return [-11]
+    finally:
+        log("from_yolo_interface cost time " + str(time.time()-start_time))
 
 
 
 
 def interface_pool_gunicorn(interface_type):
 def interface_pool_gunicorn(interface_type):

+ 75 - 0
format_convert/convert_ofd.py

@@ -0,0 +1,75 @@
+import base64
+import os
+import re
+import sys
+import time
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../")
+from format_convert.easyofd.easyofd.ofd import OFD
+from format_convert.convert_tree import _Document, _Sentence, _Page
+import logging
+import traceback
+from format_convert.convert_pdf import PDFConvert
+from format_convert.utils import judge_error_code, get_logger, log
+
+
class OfdConvert:
    """Convert an OFD file to html by first rendering it to PDF.

    The OFD is parsed with easyofd, written out as a PDF next to the
    original file, then delegated to PDFConvert for the actual extraction.
    """

    def __init__(self, path, unique_type_dir):
        self._doc = _Document(path)
        self.path = path
        # assumed to end with a path separator (callers pass e.g. ".../temp/")
        self.unique_type_dir = unique_type_dir
        self.ofd = OFD()  # OFD parsing / pdf-rendering helper

    def convert(self):
        """Render the OFD to a PDF file and prepare a PDFConvert for it.

        Sets ``self._pdf``; raises on any parsing/rendering failure (the
        caller, ``get_html``, translates that into an error code).
        """
        start_time = time.time()
        file_prefix = os.path.splitext(os.path.split(self.path)[1])[0]

        with open(self.path, "rb") as f:
            ofd_b64 = str(base64.b64encode(f.read()), "utf-8")

        # parse the base64-encoded OFD; xml dumping is disabled
        self.ofd.read(ofd_b64, save_xml=False, xml_name=f"{file_prefix}_xml",
                      save_dir=self.unique_type_dir)
        # pages easyofd could not draw natively are returned so the pdf
        # converter can fall back to image processing for them
        pdf_bytes, page_need_to_image_dict = self.ofd.to_pdf(return_need_convert_as_image=True)
        log('ofd to pdf cost: ' + str(time.time() - start_time))

        self.ofd.del_data()

        # derive the output pdf path from the source file name; the raw-string
        # regex handles both / and \ separators, and splitext is safer than
        # slicing a fixed number of extension characters
        file_name = re.split(r'[/\\]', self.path)[-1]
        new_path = self.unique_type_dir + os.path.splitext(file_name)[0] + '.pdf'

        with open(new_path, "wb") as f:
            f.write(pdf_bytes)
        log('ofd to pdf path ' + new_path + ' cost: ' + str(time.time() - start_time))

        # delegate the actual text/table extraction to the pdf pipeline
        self._pdf = PDFConvert(new_path, self.unique_type_dir, need_page_no=None,
                               page_need_to_image_dict=page_need_to_image_dict)

    def get_html(self):
        """Run the conversion and return the html produced by the pdf stage.

        Returns the error code list (e.g. ``[-1]``) on failure.
        """
        try:
            self.convert()
        except Exception:
            traceback.print_exc()
            self._doc.error_code = [-1]

        # hand back the pdf pipeline's html, or the error code directly
        if self._doc.error_code is not None:
            return self._doc.error_code
        else:
            return self._pdf.get_html()
+
+
if __name__ == '__main__':
    # Manual smoke test: convert a single ofd file and write the html out.
    src_path = "C:/Users/Administrator/Downloads/0c71fe77-f052-414d-8189-3e8cb4f2a607.ofd"
    alt_path = '../1750060386706.ofd'  # alternative sample, currently unused
    # src_path = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp\2" + '/'
    converter = OfdConvert(src_path, save_dir)
    html_result = converter.get_html()
    with open('../result.html', 'w', encoding='utf-8') as out_file:
        out_file.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + html_result[0])
+

+ 75 - 0
format_convert/convert_ofd_test.py

@@ -0,0 +1,75 @@
+import base64
+import os
+import re
+import sys
+import time
+os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../")
+
+from format_convert.utils import judge_error_code, get_logger, log, register_all_fonts
+# register_all_fonts("/usr/share/fonts/")
+
+from format_convert.easyofd.easyofd.ofd import OFD
+from format_convert.convert_tree import _Document, _Sentence, _Page
+import logging
+import traceback
+from format_convert.convert_pdf import PDFConvert
+
+
class OfdConvert:
    """Standalone test variant of the OFD converter: OFD -> PDF -> html.

    Mirrors format_convert/convert_ofd.py but without the
    page_need_to_image_dict fallback wiring.
    """

    def __init__(self, path, unique_type_dir):
        self._doc = _Document(path)
        self.path = path
        # assumed to end with a path separator
        self.unique_type_dir = unique_type_dir
        self.ofd = OFD()  # OFD parsing / pdf-rendering helper

    def convert(self):
        """Render the OFD file to a PDF and prepare a PDFConvert for it."""
        start_time = time.time()
        file_prefix = os.path.splitext(os.path.split(self.path)[1])[0]

        with open(self.path, "rb") as f:
            ofd_b64 = str(base64.b64encode(f.read()), "utf-8")

        # parse the base64-encoded OFD; xml dumping is disabled
        self.ofd.read(ofd_b64, save_xml=False, xml_name=f"{file_prefix}_xml",
                      save_dir=self.unique_type_dir)
        pdf_bytes = self.ofd.to_pdf()  # render the whole document to pdf bytes

        self.ofd.del_data()

        # derive the output pdf path from the source file name; the raw-string
        # regex handles both / and \ separators, and splitext is safer than
        # slicing a fixed number of extension characters
        file_name = re.split(r'[/\\]', self.path)[-1]
        new_path = self.unique_type_dir + os.path.splitext(file_name)[0] + '.pdf'

        with open(new_path, "wb") as f:
            f.write(pdf_bytes)
        log('ofd to pdf path ' + new_path + ' cost: ' + str(time.time() - start_time))

        # delegate the actual extraction to the pdf pipeline
        self._pdf = PDFConvert(new_path, self.unique_type_dir, need_page_no=None)

    def get_html(self):
        """Run the conversion and return the html from the pdf stage.

        Returns the error code list (e.g. ``[-1]``) on failure.
        """
        try:
            self.convert()
        except Exception:
            traceback.print_exc()
            self._doc.error_code = [-1]

        # hand back the pdf pipeline's html, or the error code directly
        if self._doc.error_code is not None:
            return self._doc.error_code
        else:
            return self._pdf.get_html()
+
+
if __name__ == '__main__':
    # Manual smoke test: convert one ofd sample and print the html result.
    _p = "C:/Users/Administrator/Downloads/0c71fe77-f052-414d-8189-3e8cb4f2a607.ofd"
    _p = '../1750381792388.ofd'  # overrides the path above
    # _p = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
    save_dir = "/data/fangjiasheng/format_conversion_maxcompute/format_convert/temp" + '/'
    converter = OfdConvert(_p, save_dir)
    html_result = converter.get_html()
    print(html_result)
+
+

+ 352 - 51
format_convert/convert_pdf.py

@@ -1,3 +1,6 @@
+import shutil
+import zlib
+from glob import glob
 import copy
 import copy
 import io
 import io
 import os
 import os
@@ -23,10 +26,12 @@ from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
 from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
     LTTextBoxVertical, LTLine, LTTextContainer, LTTextLine
     LTTextBoxVertical, LTLine, LTTextContainer, LTTextLine
 from format_convert.utils import judge_error_code, get_platform, LineTable, log, \
 from format_convert.utils import judge_error_code, get_platform, LineTable, log, \
-    memory_decorator, get_garble_code, get_md5_from_bytes, bytes2np, bbox_iou, get_garble_code2, get_traditional_chinese
+    memory_decorator, get_garble_code, get_md5_from_bytes, bytes2np, bbox_iou, get_garble_code2, \
+    get_traditional_chinese, ascii85_decode
 import fitz
 import fitz
 from format_convert.wrapt_timeout_decorator import timeout
 from format_convert.wrapt_timeout_decorator import timeout
 from otr.table_line_pdf import table_line_pdf
 from otr.table_line_pdf import table_line_pdf
+from botr.extract_table import get_b_table_by_blank_colon
 
 
 
 
 @memory_decorator
 @memory_decorator
@@ -38,6 +43,7 @@ def pdf2text(path, unique_type_dir):
 def pdf_analyze(interpreter, page, device, page_no):
 def pdf_analyze(interpreter, page, device, page_no):
     pdf_time = time.time()
     pdf_time = time.time()
     interpreter.process_page(page)
     interpreter.process_page(page)
+    # print('interpreter.process_page time', time.time()-pdf_time)
     layout = device.get_result()
     layout = device.get_result()
     log("page_no: " + str(page_no) + " pdf_analyze cost: " + str(time.time() - pdf_time))
     log("page_no: " + str(page_no) + " pdf_analyze cost: " + str(time.time() - pdf_time))
     return layout
     return layout
@@ -76,7 +82,7 @@ def read_pdfplumber(path, laparams):
 
 
 
 
 class PDFConvert:
 class PDFConvert:
-    def __init__(self, path, unique_type_dir, need_page_no):
+    def __init__(self, path, unique_type_dir, need_page_no, page_need_to_image_dict=None):
         self._doc = _Document(path)
         self._doc = _Document(path)
         self.path = path
         self.path = path
         self.unique_type_dir = unique_type_dir
         self.unique_type_dir = unique_type_dir
@@ -89,7 +95,7 @@ class PDFConvert:
         self.end_page_no = None
         self.end_page_no = None
         # 默认使用limit_page_cnt控制,前10页后10页
         # 默认使用limit_page_cnt控制,前10页后10页
         if self.need_page_no is None:
         if self.need_page_no is None:
-            self.limit_page_cnt = 20
+            self.limit_page_cnt = 50
         else:
         else:
             # 使用start_page_no,end_page_no范围控制,例如2,5
             # 使用start_page_no,end_page_no范围控制,例如2,5
             ss = self.need_page_no.split(',')
             ss = self.need_page_no.split(',')
@@ -120,6 +126,12 @@ class PDFConvert:
         # 初始化_page
         # 初始化_page
         self._page = _Page(None, 0)
         self._page = _Page(None, 0)
 
 
+        # 需要直接转成image来识别的页面
+        if type(page_need_to_image_dict) is not dict:
+            self.page_need_to_image_dict = {}
+        else:
+            self.page_need_to_image_dict = page_need_to_image_dict
+
     @memory_decorator
     @memory_decorator
     def init_package(self, package_name):
     def init_package(self, package_name):
         # 各个包初始化
         # 各个包初始化
@@ -128,7 +140,9 @@ class PDFConvert:
                                 char_margin=0.3,
                                 char_margin=0.3,
                                 line_margin=0.01,
                                 line_margin=0.01,
                                 word_margin=0.01,
                                 word_margin=0.01,
-                                boxes_flow=0.1, )
+                                # boxes_flow=0.1,
+                                boxes_flow=None,
+                                )
             if package_name == self.packages[0]:
             if package_name == self.packages[0]:
                 self.doc_pdfminer, self.device, self.interpreter = read_pdfminer(self.path, laparams)
                 self.doc_pdfminer, self.device, self.interpreter = read_pdfminer(self.path, laparams)
                 self.has_init_pdf[0] = 1
                 self.has_init_pdf[0] = 1
@@ -153,7 +167,7 @@ class PDFConvert:
             self._doc.error_code = [-3]
             self._doc.error_code = [-3]
 
 
     @memory_decorator
     @memory_decorator
-    def convert(self, limit_page_cnt=20):
+    def convert(self, limit_page_cnt=50):
         if self.has_init_pdf[0] == 0:
         if self.has_init_pdf[0] == 0:
             self.init_package("pdfminer")
             self.init_package("pdfminer")
         if self._doc.error_code is not None:
         if self._doc.error_code is not None:
@@ -201,8 +215,11 @@ class PDFConvert:
                     continue
                     continue
             # 限制pdf页数,只取前后各10页
             # 限制pdf页数,只取前后各10页
             else:
             else:
-                if page_count > limit_page_cnt and int(limit_page_cnt / 2) <= page_no < page_count - int(
-                        limit_page_cnt / 2):
+                # if page_count > limit_page_cnt and int(limit_page_cnt / 2) <= page_no < page_count - int(
+                #         limit_page_cnt / 2):
+                #     page_no += 1
+                #     continue
+                if page_count > limit_page_cnt and page_no >= limit_page_cnt:
                     page_no += 1
                     page_no += 1
                     continue
                     continue
 
 
@@ -222,6 +239,8 @@ class PDFConvert:
         delete_water_mark_list = []
         delete_water_mark_list = []
 
 
         for layout, layout_obj_list, max_y, page_no in layout_list:
         for layout, layout_obj_list, max_y, page_no in layout_list:
+            # for obj in layout_obj_list:
+            #     print('obj', obj)
             # 解析单页
             # 解析单页
             start_time = time.time()
             start_time = time.time()
             self._page = _Page(None, page_no)
             self._page = _Page(None, page_no)
@@ -251,7 +270,10 @@ class PDFConvert:
                 find_flag = 0
                 find_flag = 0
                 add_page_list = []
                 add_page_list = []
                 for page in pages:
                 for page in pages:
-                    if not int(limit_page_cnt / 2) <= page_no < page_count - int(limit_page_cnt / 2):
+                    # if not int(limit_page_cnt / 2) <= page_no < page_count - int(limit_page_cnt / 2):
+                    #     page_no += 1
+                    #     continue
+                    if not (page_no >= limit_page_cnt):
                         page_no += 1
                         page_no += 1
                         continue
                         continue
 
 
@@ -297,9 +319,11 @@ class PDFConvert:
                     page_no += 1
                     page_no += 1
 
 
                 if add_page_list:
                 if add_page_list:
-                    self._doc.children = self._doc.children[
-                                         :int(limit_page_cnt / 2)] + add_page_list + self._doc.children[
-                                                                                     int(limit_page_cnt / 2):]
+                    # self._doc.children = self._doc.children[:int(limit_page_cnt / 2)] \
+                    #                      + add_page_list \
+                    #                      + self._doc.children[int(limit_page_cnt / 2):]
+                    self._doc.children = self._doc.children[:limit_page_cnt] \
+                                         + add_page_list
 
 
         self.delete_same_image()
         self.delete_same_image()
         # self.delete_bold_text_duplicate()
         # self.delete_bold_text_duplicate()
@@ -375,10 +399,14 @@ class PDFConvert:
 
 
         return pages, delete_footer_header_list
         return pages, delete_footer_header_list
 
 
+    @memory_decorator
     def delete_bold_text_duplicate(self, lt_text_box_list):
     def delete_bold_text_duplicate(self, lt_text_box_list):
         # 拿出所有LTChar
         # 拿出所有LTChar
         lt_char_list = []
         lt_char_list = []
         for lt_text_box in lt_text_box_list:
         for lt_text_box in lt_text_box_list:
+            if '.......' in lt_text_box.get_text():
+                # print('....... lt_text_box continue')
+                continue
             for lt_text_line in lt_text_box:
             for lt_text_line in lt_text_box:
                 for lt_char in lt_text_line:
                 for lt_char in lt_text_line:
                     if isinstance(lt_char, LTChar):
                     if isinstance(lt_char, LTChar):
@@ -447,14 +475,16 @@ class PDFConvert:
     def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
     def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
         list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list,
         list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list,
                                                                                     from_pdf=True, is_reverse=False)
                                                                                     from_pdf=True, is_reverse=False)
-        self._page.in_table_objs = filter_objs
+        # self._page.in_table_objs = filter_objs
 
 
         # print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
         # print("=======text_len:%d:filter_len:%d"%(len(lt_text_list),len(filter_objs)))
 
 
+        table_list = []
         for table in list_tables:
         for table in list_tables:
             _table = _Table(table["table"], table["bbox"])
             _table = _Table(table["table"], table["bbox"])
             # self._page.children.append(_table)
             # self._page.children.append(_table)
             self._page.add_child(_table)
             self._page.add_child(_table)
+            table_list.append(_table)
 
 
         list_sentences = ParseUtils.recognize_sentences(lt_text_list, filter_objs,
         list_sentences = ParseUtils.recognize_sentences(lt_text_list, filter_objs,
                                                         layout.bbox, page_no)
                                                         layout.bbox, page_no)
@@ -466,7 +496,7 @@ class PDFConvert:
         # pdf对象需反向排序
         # pdf对象需反向排序
         # self._page.is_reverse = True
         # self._page.is_reverse = True
 
 
-        return list_tables
+        return table_list
 
 
     def is_text_legal(self, lt_text_list, page_no):
     def is_text_legal(self, lt_text_list, page_no):
         # 无法识别pdf字符编码,整页用ocr
         # 无法识别pdf字符编码,整页用ocr
@@ -498,10 +528,11 @@ class PDFConvert:
 
 
         return True
         return True
 
 
+    @memory_decorator
     def judge_b_table(self, lt_text_list, table_list, page_no):
     def judge_b_table(self, lt_text_list, table_list, page_no):
         table_h_list = []
         table_h_list = []
         for table in table_list:
         for table in table_list:
-            table_h_list.append([table.get('bbox')[1], table.get('bbox')[3]])
+            table_h_list.append([table.bbox[1], table.bbox[3]])
 
 
         # 先分行
         # 先分行
         lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
         lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
@@ -528,6 +559,8 @@ class PDFConvert:
         row_cnt = 0
         row_cnt = 0
         b_table_row_list = []
         b_table_row_list = []
         all_b_table = []
         all_b_table = []
+        row_col_list = []
+        all_row_col_list = []
         for row in lt_text_row_list:
         for row in lt_text_row_list:
             # 水印行跳过
             # 水印行跳过
             if len(row) == 1 and len(row[0].get_text()[:-1]) == 1:
             if len(row) == 1 and len(row[0].get_text()[:-1]) == 1:
@@ -537,6 +570,7 @@ class PDFConvert:
             for r in row:
             for r in row:
                 if re.search('[.·]{7,}', r.get_text()):
                 if re.search('[.·]{7,}', r.get_text()):
                     continue_flag = True
                     continue_flag = True
+                    all_row_col_list = []
                     break
                     break
             if continue_flag:
             if continue_flag:
                 continue
                 continue
@@ -550,6 +584,7 @@ class PDFConvert:
                     row_cnt += 1
                     row_cnt += 1
                     t_cnt = 0
                     t_cnt = 0
                     b_table_row_list += row
                     b_table_row_list += row
+                    row_col_list += [row]
                 else:
                 else:
                     # 容忍
                     # 容忍
                     if t_cnt < tolerate_cnt:
                     if t_cnt < tolerate_cnt:
@@ -557,15 +592,36 @@ class PDFConvert:
                         continue
                         continue
                     if b_table_row_list and row_cnt >= is_b_table_cnt:
                     if b_table_row_list and row_cnt >= is_b_table_cnt:
                         all_b_table.append(b_table_row_list)
                         all_b_table.append(b_table_row_list)
+                        all_row_col_list.append(row_col_list)
                     row_cnt = 0
                     row_cnt = 0
                     b_table_row_list = []
                     b_table_row_list = []
+                    row_col_list = []
             else:
             else:
                 row_cnt += 1
                 row_cnt += 1
                 t_cnt = 0
                 t_cnt = 0
                 b_table_row_list += row
                 b_table_row_list += row
+                row_col_list += [row]
 
 
         if b_table_row_list and row_cnt >= is_b_table_cnt:
         if b_table_row_list and row_cnt >= is_b_table_cnt:
             all_b_table.append(b_table_row_list)
             all_b_table.append(b_table_row_list)
+            all_row_col_list.append(row_col_list)
+            # print('b_table_row_list', b_table_row_list)
+
+        # 排除大部分是两列的,因为前面已经新增了两列无边框的单独识别
+        # print('len(all_row_col_list)', len(all_row_col_list))
+        row_cnt = 0
+        col_2_cnt = 0
+        for row_col_list in all_row_col_list:
+            for col_list in row_col_list:
+                row_cnt += 1
+                if len(col_list) == 2:
+                    col_2_cnt += 1
+                # print('col_list', col_list)
+
+        # print('row_cnt, col_2_cnt', row_cnt, col_2_cnt)
+        if row_cnt == 0 or col_2_cnt / row_cnt >= 0.5:
+            log("page_no: " + str(page_no) + ' is_b_table_flag False')
+            return False
 
 
         # 对每个可能的b_table判断是否与table相交
         # 对每个可能的b_table判断是否与table相交
         is_b_table_flag = False
         is_b_table_flag = False
@@ -587,8 +643,35 @@ class PDFConvert:
                 # print('table_h_list', table_h_list)
                 # print('table_h_list', table_h_list)
                 break
                 break
         log("page_no: " + str(page_no) + ' is_b_table_flag ' + str(is_b_table_flag))
         log("page_no: " + str(page_no) + ' is_b_table_flag ' + str(is_b_table_flag))
+        # 保存判断为True的pdf
+        # if is_b_table_flag:
+        #     self.save_b_table_pdf(page_no)
         return is_b_table_flag
         return is_b_table_flag
 
 
+    def save_b_table_pdf(self, page_no):
+        """Debug helper: copy the current PDF into a fixed sample directory
+        so pages judged as borderless tables can be inspected later.
+
+        Silently returns when the directory is missing or already holds more
+        than max_index samples. Files are named '<index>-<page_no>.pdf'.
+        """
+        # save_dir = r"D:\Project\format_conversion_maxcompute\save_b_table_pdf"
+        save_dir = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table_pdf'
+        max_index = 200
+        if os.path.exists(save_dir):
+            file_list = glob(save_dir + '/*')
+            if file_list:
+                # Recover the numeric <index> component from existing names
+                # and continue numbering after the largest one found.
+                file_index_list = [int(re.split('[/.\\\\-]', x)[-3]) for x in file_list]
+                file_index_list.sort(key=lambda x: x)
+                index = file_index_list[-1] + 1
+            else:
+                index = 0
+            if index > max_index:
+                # Sample quota reached; stop collecting.
+                return
+        else:
+            # Collection directory absent on this host — do nothing.
+            return
+
+        save_path = f'{save_dir}/{index}-{page_no}.pdf'
+        try:
+            shutil.copy(self.path, save_path)
+            print("文件复制成功!")
+        except Exception as e:
+            print(f"文件复制失败:{e}")
     def char_to_text_box(self, char_list):
     def char_to_text_box(self, char_list):
         lt_text_box_list = []
         lt_text_box_list = []
 
 
@@ -646,6 +729,7 @@ class PDFConvert:
 
 
         return lt_text_box_list, text_box_char_dict
         return lt_text_box_list, text_box_char_dict
 
 
+    @memory_decorator
     def get_need_objs(self, obj_list, max_y):
     def get_need_objs(self, obj_list, max_y):
         # 文字
         # 文字
         lt_char_list = []
         lt_char_list = []
@@ -695,6 +779,14 @@ class PDFConvert:
             elif isinstance(x, (LTTextContainer, LTRect, LTLine, LTCurve)):
             elif isinstance(x, (LTTextContainer, LTRect, LTLine, LTCurve)):
                 lt_line_list.append(x)
                 lt_line_list.append(x)
 
 
+        # print('len(obj_list)', len(obj_list))
+        # print('len(lt_char_list)', len(lt_char_list))
+        # print('len(lt_text_box_list)', len(lt_text_box_list))
+        # if len(lt_text_box_list) >= 200:
+        #     for lt_text in lt_text_box_list:
+        #         print('>= 200 lt_text', lt_text.get_text())
+        # print('len(lt_image_list)', len(lt_image_list))
+
         if lt_figure_list:
         if lt_figure_list:
             temp_figure_list = []
             temp_figure_list = []
             for sub_figure in lt_figure_list:
             for sub_figure in lt_figure_list:
@@ -719,8 +811,21 @@ class PDFConvert:
 
 
         text_box_char_dict = {**text_box_char_dict, **add_text_box_char_dict}
         text_box_char_dict = {**text_box_char_dict, **add_text_box_char_dict}
 
 
+        lt_text_box_list = self.delete_water_mark_by_location(lt_text_box_list)
+
+        # 分行后过滤
+        temp_list = []
+        for lt_text_box in lt_text_box_list:
+            if lt_text_box.get_text() in ['', ' ', '\t', '\n', '\r']:
+                continue
+            temp_list.append(lt_text_box)
+        if len(lt_text_box_list) != len(temp_list):
+            log('filter lt_text_box_list ' + str(len(lt_text_box_list)) + ' -> ' + str(len(temp_list)))
+        lt_text_box_list = temp_list
+
         return lt_char_list, lt_text_box_list, lt_image_list, lt_figure_list, lt_line_list, text_box_char_dict
         return lt_char_list, lt_text_box_list, lt_image_list, lt_figure_list, lt_line_list, text_box_char_dict
 
 
+    @memory_decorator
     def read_layout(self, page, page_no):
     def read_layout(self, page, page_no):
         layout = self.get_layout(page, page_no)
         layout = self.get_layout(page, page_no)
         if self._doc.error_code is not None:
         if self._doc.error_code is not None:
@@ -834,6 +939,7 @@ class PDFConvert:
 
 
         return lt_text_box_list
         return lt_text_box_list
 
 
+    @memory_decorator
     def split_text_box_by_lines2(self, lt_line_list, lt_text_box_list, text_box_char_dict):
     def split_text_box_by_lines2(self, lt_line_list, lt_text_box_list, text_box_char_dict):
         """
         """
         有单个字符位置信息,再根据表格线截断位置,分割text
         有单个字符位置信息,再根据表格线截断位置,分割text
@@ -932,12 +1038,23 @@ class PDFConvert:
         return lt_text_box_list
         return lt_text_box_list
 
 
     @memory_decorator
     @memory_decorator
-    # def convert_page(self, page, page_no, skip_image=0):
     def convert_page(self, layout, layout_obj_list, max_y, page_no, delete_water_mark_list, skip_image=0):
     def convert_page(self, layout, layout_obj_list, max_y, page_no, delete_water_mark_list, skip_image=0):
         # 若Page中一个obj都无,后面ocr整页识别 20240820
         # 若Page中一个obj都无,后面ocr整页识别 20240820
         if max_y == 0 and len(layout_obj_list) > 0:
         if max_y == 0 and len(layout_obj_list) > 0:
             return
             return
 
 
+        # 若该页在page_need_to_image_dict中为True,则直接ocr整页识别
+        if self.page_need_to_image_dict.get(page_no) is True:
+            page_image = self.get_page_image(page_no)
+            if judge_error_code(page_image):
+                self._page.error_code = page_image
+            else:
+                _image = _Image(page_image[1], page_image[0])
+                _image.is_from_pdf = True
+                _image.is_reverse = False
+                self._page.add_child(_image)
+            return
+
         lt_char_list, lt_text_box_list, lt_image_list, lt_figure_list, \
         lt_char_list, lt_text_box_list, lt_image_list, lt_figure_list, \
             lt_line_list, text_box_char_dict = layout_obj_list
             lt_line_list, text_box_char_dict = layout_obj_list
 
 
@@ -999,45 +1116,56 @@ class PDFConvert:
         # 正常读取该页对象
         # 正常读取该页对象
         else:
         else:
             # 图表对象
             # 图表对象
-            for image in lt_image_list:
-                try:
-                    # print("pdf2text LTImage size", page_no, image.width, image.height)
-                    image_stream = image.stream.get_data()
-                    # 小的图忽略
-                    if image.width <= 300 and image.height <= 300:
-                        continue
-                    # 查看提取的图片高宽,太大则用pdf输出图进行ocr识别
-                    img_test = Image.open(io.BytesIO(image_stream))
-                    if image.height >= 1000 and image.width >= 1000:
-                        page_image = self.get_page_image(page_no)
-                        if judge_error_code(page_image):
-                            self._page.error_code = page_image
-                        else:
-                            _image = _Image(page_image[1], page_image[0])
-                            _image.is_from_pdf = True
-                            _image.is_reverse = False
-                            self._page.add_child(_image)
-                            image_md5 = get_md5_from_bytes(page_image[1])
-                            self.md5_image_obj_list.append([image_md5, _image])
-                        return
-                    # 比较小的图则直接保存用ocr识别
-                    else:
-                        temp_path = self.unique_type_dir + 'page' + str(page_no) \
-                                    + '_lt' + str(lt_image_list.index(image)) + '.jpg'
-                        img_test.save(temp_path)
-                        with open(temp_path, "rb") as ff:
-                            image_stream = ff.read()
-                        _image = _Image(image_stream, temp_path, image.bbox)
-                        self._page.add_child(_image)
-                        image_md5 = get_md5_from_bytes(image_stream)
-                        self.md5_image_obj_list.append([image_md5, _image])
-                except Exception:
-                    log("page_no: " + str(page_no) + " pdfminer read image fail! use pymupdf read image...")
-                    traceback.print_exc()
+            # for image in lt_image_list:
+            #     try:
+            #         # print("pdf2text LTImage size", page_no, image.width, image.height)
+            #         # image_stream = image.stream.get_data()
+            #         print('image.stream.get_filters()', image.stream.get_filters())
+            #         image_stream = image.stream.get_data()
+            #         # 小的图忽略
+            #         if image.width <= 300 and image.height <= 300:
+            #             continue
+            #         # 查看提取的图片高宽,太大则用pdf输出图进行ocr识别
+            #         img_test = Image.open(io.BytesIO(image_stream))
+            #         # img_test = self.pdfminer_stream_to_image(image)
+            #         if image.height >= 1000 and image.width >= 1000:
+            #             page_image = self.get_page_image(page_no)
+            #             if judge_error_code(page_image):
+            #                 self._page.error_code = page_image
+            #             else:
+            #                 _image = _Image(page_image[1], page_image[0])
+            #                 _image.is_from_pdf = True
+            #                 _image.is_reverse = False
+            #                 self._page.add_child(_image)
+            #                 image_md5 = get_md5_from_bytes(page_image[1])
+            #                 self.md5_image_obj_list.append([image_md5, _image])
+            #             return
+            #         # 比较小的图则直接保存用ocr识别
+            #         else:
+            #             temp_path = self.unique_type_dir + 'page' + str(page_no) \
+            #                         + '_lt' + str(lt_image_list.index(image)) + '.jpg'
+            #             img_test.save(temp_path)
+            #             with open(temp_path, "rb") as ff:
+            #                 image_stream = ff.read()
+            #             _image = _Image(image_stream, temp_path, image.bbox)
+            #             self._page.add_child(_image)
+            #             image_md5 = get_md5_from_bytes(image_stream)
+            #             self.md5_image_obj_list.append([image_md5, _image])
+            #     except Exception:
+            #         log("page_no: " + str(page_no) + " pdfminer read image fail! use pymupdf read image...")
+            #         traceback.print_exc()
 
 
             # pdf对象需反向排序
             # pdf对象需反向排序
             # self._page.is_reverse = True
             # self._page.is_reverse = True
 
 
+            status = self.pdfminer_read_page_images(lt_image_list, page_no)
+            if not status:
+                log('pymupdf 提取页面中图片 page_no: ' + str(page_no))
+                status = self.pymupdf_read_page_images(page_no)
+            if not status:
+                log('pymupdf 整页转化为图片 page_no: ' + str(page_no))
+                status = self.pymupdf_get_whole_page_image(page_no)
+
             if self.has_init_pdf[3] == 0:
             if self.has_init_pdf[3] == 0:
                 self.init_package("pdfplumber")
                 self.init_package("pdfplumber")
 
 
@@ -1059,7 +1187,24 @@ class PDFConvert:
             table_list = self.recognize_text(layout, page_no, lt_text_box_list, lt_line_list)
             table_list = self.recognize_text(layout, page_no, lt_text_box_list, lt_line_list)
 
 
             # 根据text规律,判断该页是否可能有无边框表格
             # 根据text规律,判断该页是否可能有无边框表格
+            try:
+                b_table_list, _ = get_b_table_by_blank_colon(lt_text_box_list, table_list, layout.bbox, None)
+            except:
+                traceback.print_exc()
+                b_table_list = []
+                self._page.error_code = [-23]
+
+            if b_table_list:
+                for table in b_table_list:
+                    _table = _Table(table[0], table[1])
+                    table_list += [_table]
+                    self._page.add_child(_table)
+
+            for t in table_list:
+                self._page.table_bbox_list.append(t.bbox)
+
             if self.judge_b_table(lt_text_box_list, table_list, page_no):
             if self.judge_b_table(lt_text_box_list, table_list, page_no):
+                # log('judge_b_table match! ' + str(page_no))
                 page_image = self.get_page_image(page_no)
                 page_image = self.get_page_image(page_no)
                 if judge_error_code(page_image):
                 if judge_error_code(page_image):
                     self._page.error_code = page_image
                     self._page.error_code = page_image
@@ -1073,6 +1218,7 @@ class PDFConvert:
                     _image.b_table_layout_size = (layout.width, layout.height)
                     _image.b_table_layout_size = (layout.width, layout.height)
                     self._page.add_child(_image)
                     self._page.add_child(_image)
 
 
+    @memory_decorator
     def get_layout(self, page, page_no):
     def get_layout(self, page, page_no):
         if self.has_init_pdf[0] == 0:
         if self.has_init_pdf[0] == 0:
             self.init_package("pdfminer")
             self.init_package("pdfminer")
@@ -1096,6 +1242,7 @@ class PDFConvert:
         log("page_no: " + str(page_no) + " get_layout cost: " + str(time.time() - start_time))
         log("page_no: " + str(page_no) + " get_layout cost: " + str(time.time() - start_time))
         return layout
         return layout
 
 
+    @memory_decorator
     def get_page_image(self, page_no):
     def get_page_image(self, page_no):
         start_time = time.time()
         start_time = time.time()
         try:
         try:
@@ -1503,6 +1650,7 @@ class PDFConvert:
             return [-12]
             return [-12]
         return html
         return html
 
 
+    @memory_decorator
     def delete_water_mark(self, lt_text_list, page_bbox, times=5):
     def delete_water_mark(self, lt_text_list, page_bbox, times=5):
         # 删除过多重复字句,为水印
         # 删除过多重复字句,为水印
         duplicate_dict = {}
         duplicate_dict = {}
@@ -1540,6 +1688,32 @@ class PDFConvert:
                 temp_text_list.append(_obj)
                 temp_text_list.append(_obj)
         return temp_text_list, delete_text
         return temp_text_list, delete_text
 
 
+    @memory_decorator
+    def delete_water_mark_by_location(self, lt_text_box_list):
+        """Remove suspected watermark text boxes by position.
+
+        A watermark here is a single-character text box whose exact x-extent
+        and character repeat at 3 or more places on the page. Mutates
+        lt_text_box_list in place and returns it.
+        """
+        x_text_box_dict = {}
+        # Watermarks: same x coordinates and a text of length 1.
+        for lt_text_box in lt_text_box_list:
+            x1, y1, x2, y2 = lt_text_box.bbox
+            text = lt_text_box.get_text()
+            if len(text) != 1:
+                continue
+            # Group by exact x-range plus the character itself.
+            key = f'{x1}-{x2}-{text}'
+            if key in x_text_box_dict:
+                x_text_box_dict[key] += [lt_text_box]
+            else:
+                x_text_box_dict[key] = [lt_text_box]
+
+        len1 = len(lt_text_box_list)
+        for key, box_list in x_text_box_dict.items():
+            # 3+ identical hits at one x-range => treat the group as watermark.
+            if len(box_list) >= 3:
+                for box in box_list:
+                    if box in lt_text_box_list:
+                        lt_text_box_list.remove(box)
+        len2 = len(lt_text_box_list)
+        if len1 != len2:
+            log('delete_water_mark_by_location box num ' + str(len1) + ' -> ' + str(len2))
+        return lt_text_box_list
+
     def delete_water_mark_by_color(self, lt_text_list):
     def delete_water_mark_by_color(self, lt_text_list):
         # 删除浅色字体,大概率为水印
         # 删除浅色字体,大概率为水印
         # 1. 单个char颜色透明度0.8以上
         # 1. 单个char颜色透明度0.8以上
@@ -1587,6 +1761,9 @@ class PDFConvert:
         water_mark_text_box_list = []
         water_mark_text_box_list = []
         sin_range = [0.3, 0.94]
         sin_range = [0.3, 0.94]
         for lt_text_box in lt_text_list:
         for lt_text_box in lt_text_list:
+            if '.......' in lt_text_box.get_text():
+                # print('....... lt_text_box continue')
+                continue
             for lt_text_line in lt_text_box:
             for lt_text_line in lt_text_box:
                 for lt_char in lt_text_line:
                 for lt_char in lt_text_line:
                     matrix = lt_char.matrix
                     matrix = lt_char.matrix
@@ -1634,6 +1811,126 @@ class PDFConvert:
             log("page_no: " + str(page_no) + " get_single_pdf error!")
             log("page_no: " + str(page_no) + " get_single_pdf error!")
             return [-3]
             return [-3]
 
 
+    def pymupdf_read_page_images(self, page_no):
+        try:
+            self.init_package("PyMuPDF")
+            # 获取指定页面
+            page = self.doc_pymupdf.load_page(page_no)
+            # 获取页面中所有图片的信息
+            image_list = page.get_images(full=True)
+
+            # 存储提取的图片信息
+            extracted_images = []
+
+            # 遍历图片列表
+            for img_index, img_info in enumerate(image_list):
+                xref = img_info[0]  # 图片xref编号
+                base_image = self.doc_pymupdf.extract_image(xref)
+                image_bytes = base_image["image"]  # 图片字节数据
+                image_ext = base_image["ext"]  # 图片扩展名
+
+                # 获取图片在页面中的位置和大小
+                bbox = img_info[0:4]  # x0, y0, x1, y1
+                # print('img_info', img_info)
+                width = img_info[2] - img_info[0]  # 计算宽度
+                height = img_info[3] - img_info[1]  # 计算高度
+
+                # 构建图片信息字典
+                img_data = {
+                    "xref": xref,
+                    "width": width,
+                    "height": height,
+                    "image": image_bytes,
+                    "ext": image_ext,
+                    "bbox": bbox
+                }
+                extracted_images.append(img_data)
+
+            image_obj_list = []
+            for index, d in enumerate(extracted_images):
+                temp_path = self.unique_type_dir + 'page' + str(page_no) \
+                            + '_lt2_' + str(index) + '.jpg'
+                image_bytes = d.get("image")
+                bbox = d.get('bbox')
+                with open(temp_path, 'wb') as f:
+                    f.write(image_bytes)
+
+                _image = _Image(image_bytes, temp_path, bbox)
+                image_md5 = get_md5_from_bytes(image_bytes)
+                image_obj_list.append([_image, image_md5])
+        except:
+            traceback.print_exc()
+            return False
+
+        for _image, image_md5 in image_obj_list:
+            self._page.add_child(_image)
+            self.md5_image_obj_list.append([image_md5, _image])
+        return True
+
+    def pymupdf_get_whole_page_image(self, page_no):
+        image_obj_list = []
+        page_image = self.get_page_image(page_no)
+        if judge_error_code(page_image):
+            self._page.error_code = page_image
+            return False
+        else:
+            _image = _Image(page_image[1], page_image[0])
+            _image.is_from_pdf = True
+            _image.is_reverse = False
+            image_md5 = get_md5_from_bytes(page_image[1])
+            image_obj_list.append([_image, image_md5])
+
+        for _image, image_md5 in image_obj_list:
+            self._page.add_child(_image)
+            self.md5_image_obj_list.append([image_md5, _image])
+        return True
+
+    def pdfminer_read_page_images(self, lt_image_list, page_no):
+        # 图表对象
+        image_obj_list = []
+        for image in lt_image_list:
+            try:
+                # print("pdf2text LTImage size", page_no, image.width, image.height)
+                # image_stream = image.stream.get_data()
+                # print('image.stream.get_filters()', image.stream.get_filters())
+                image_stream = image.stream.get_data()
+                # 小的图忽略
+                if image.width <= 300 and image.height <= 300:
+                    continue
+                # 查看提取的图片高宽,太大则用pdf输出图进行ocr识别
+                img_test = Image.open(io.BytesIO(image_stream))
+                # img_test = self.pdfminer_stream_to_image(image)
+                # if image.height >= 1000 and image.width >= 1000:
+                #     page_image = self.get_page_image(page_no)
+                #     if judge_error_code(page_image):
+                #         self._page.error_code = page_image
+                #     else:
+                #         _image = _Image(page_image[1], page_image[0])
+                #         _image.is_from_pdf = True
+                #         _image.is_reverse = False
+                #         image_md5 = get_md5_from_bytes(page_image[1])
+                #         image_obj_list.append([_image, image_md5])
+                # # 比较小的图则直接保存用ocr识别
+                # else:
+                temp_path = self.unique_type_dir + 'page' + str(page_no) \
+                            + '_lt_' + str(lt_image_list.index(image)) + '.jpg'
+                img_test.save(temp_path)
+                with open(temp_path, "rb") as ff:
+                    image_stream = ff.read()
+                _image = _Image(image_stream, temp_path, image.bbox)
+                self._page.add_child(_image)
+                image_md5 = get_md5_from_bytes(image_stream)
+                self.md5_image_obj_list.append([image_md5, _image])
+            except Exception:
+                log("page_no: " + str(page_no) + " pdfminer read image fail!")
+                traceback.print_exc()
+                return False
+
+        for _image, image_md5 in image_obj_list:
+            self._page.add_child(_image)
+            self.md5_image_obj_list.append([image_md5, _image])
+        return True
+
 
 
 def get_text_font():
 def get_text_font():
     def flags_decomposer(flags):
     def flags_decomposer(flags):
@@ -1999,4 +2296,8 @@ class ParseUtils:
 
 
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    PDFConvert(r"C:/Users/Administrator/Downloads/1651896704621.pdf", "C:/Users/Administrator/Downloads/1").get_html()
+    _pp = r'D:\Project\format_conversion_maxcompute\save_b_table_pdf/e-116.pdf'
+    # _pp = r'C:\Users\Administrator\Downloads\1746582280828.pdf'
+    _html = PDFConvert(_pp, r"D:\Project\format_conversion_maxcompute\format_convert\temp", None).get_html()
+    with open('../result.html', 'w', encoding='utf-8') as f:
+        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + _html[0])

+ 30 - 36
format_convert/convert_test.py

@@ -11,15 +11,6 @@ from glob import glob
 import requests
 import requests
 
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
-from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LAParams, LTLine
-from pdfminer.pdfdocument import PDFDocument
-from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
-from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfparser import PDFParser
-from pdfplumber import PDF
-
-from otr.table_line_pdf import _plot
 
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 from format_convert.utils import get_platform, request_post, get_md5_from_bytes
 from format_convert.utils import get_platform, request_post, get_md5_from_bytes
@@ -44,7 +35,7 @@ def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=Fal
     data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range,
     data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range,
             'timeout': timeout, 'save_middle': save_middle}
             'timeout': timeout, 'save_middle': save_middle}
 
 
-    # _url = 'http://121.46.18.113:15010/convert'
+    # _url = 'http://dianxin.bidizhaobiao.com:15010/convert'
     # _url = 'http://192.168.2.103:15010/convert'
     # _url = 'http://192.168.2.103:15010/convert'
     # _url = 'http://192.168.2.102:15010/convert'
     # _url = 'http://192.168.2.102:15010/convert'
     # _url = 'http://172.16.160.65:15010/convert'
     # _url = 'http://172.16.160.65:15010/convert'
@@ -53,7 +44,7 @@ def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=Fal
     text_str = ""
     text_str = ""
     try:
     try:
         result = json.loads(request_post(_url, data, time_out=timeout+20))
         result = json.loads(request_post(_url, data, time_out=timeout+20))
-
+        print('result', result)
         for t in result.get("result_html"):
         for t in result.get("result_html"):
             text_str += t
             text_str += t
         to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
         to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
@@ -67,7 +58,7 @@ def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=Fal
                 to_html(new_path, text_str)
                 to_html(new_path, text_str)
 
 
         print(_md5)
         print(_md5)
-        print('第', page_no_range.split(',')[0], '页到第', page_no_range.split(',')[-1], '页')
+        # print('第', page_no_range.split(',')[0], '页到第', page_no_range.split(',')[-1], '页')
         print("result_text", result.get("result_text")[0][:20])
         print("result_text", result.get("result_text")[0][:20])
         print("is_success", result.get("is_success"))
         print("is_success", result.get("is_success"))
     except:
     except:
@@ -80,7 +71,6 @@ def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=Fal
     return p, 1
     return p, 1
 
 
 
 
-
 def test_path():
 def test_path():
     # _url = 'http://121.46.18.113:15010/convert'
     # _url = 'http://121.46.18.113:15010/convert'
     _url = 'http://192.168.0.115:15010/convert'
     _url = 'http://192.168.0.115:15010/convert'
@@ -186,21 +176,25 @@ def test_kimi():
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
     if get_platform() == "Windows":
     if get_platform() == "Windows":
-        # file_path = "C:/Users/Administrator/Downloads/1672314827836.pdf"
+        # file_path = "C:/Users/Administrator/Downloads/1750737587843.ofd"
+        # file_path = r'D:\Project\format_conversion_maxcompute\save_b_table_pdf/e-1.pdf'
         # file_path = "D:/BIDI_DOC/比地_文档/1677829036789.pdf"
         # file_path = "D:/BIDI_DOC/比地_文档/1677829036789.pdf"
 
 
-        # file_path = "C:/Users/Administrator/Desktop/test_xls/error7.xls"
-        # file_path = "C:/Users/Administrator/Desktop/test_doc/error15.doc"
-        # file_path = "C:/Users/Administrator/Desktop/test_swf/error1.swf"
+        # file_path = "C:/Users/Administrator/Desktop/test_xls/error4.xlsx"
+        # file_path = "C:/Users/Administrator/Desktop/test_doc/error17.docx"
+        # file_path = "C:/Users/Administrator/Desktop/test_swf/error2.swf"
         # file_path = "C:/Users/Administrator/Desktop/test_rar/error1.rar"
         # file_path = "C:/Users/Administrator/Desktop/test_rar/error1.rar"
-        file_path = "C:/Users/Administrator/Desktop/test_image/error7.png"
-        # file_path = "C:/Users/Administrator/Desktop/test_b_table/error13.pdf"
-        # file_path = "C:/Users/Administrator/Desktop/test_pdf/表格连接error/error6.pdf"
+        # file_path = "C:/Users/Administrator/Desktop/test_image/error18.png"
+        # file_path = "C:/Users/Administrator/Desktop/test_b_table/error29.png"
+        # file_path = "C:/Users/Administrator/Desktop/test_pdf/普通error/error6.pdf"
         # file_path = "C:/Users/Administrator/Desktop/test_table_head/error2.pdf"
         # file_path = "C:/Users/Administrator/Desktop/test_table_head/error2.pdf"
+        # file_path = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
+        file_path = "C:/Users/Administrator/Desktop/test_ofd/1750381792388.ofd"
     else:
     else:
         file_path = "1660296734009.pdf"
         file_path = "1660296734009.pdf"
 
 
-    test_one(file_path, page_no_range='1,-1', timeout=1000, save_middle=None)
+    # test_one(file_path, page_no_range="1,-1", timeout=1000, save_middle=None)
+    test_one(file_path, page_no_range=None, timeout=1000, save_middle=None)
 
 
     # run_files()
     # run_files()
 
 
@@ -212,21 +206,21 @@ if __name__ == '__main__':
     # file_path = r"C:\Users\Administrator\Desktop\test_pdf\直接读表格线error/"
     # file_path = r"C:\Users\Administrator\Desktop\test_pdf\直接读表格线error/"
     # file_path = r"C:\Users\Administrator\Desktop\test_pdf\表格连接error/"
     # file_path = r"C:\Users\Administrator\Desktop\test_pdf\表格连接error/"
     # file_path = r"C:\Users\Administrator\Desktop\test_b_table/"
     # file_path = r"C:\Users\Administrator\Desktop\test_b_table/"
-    file_path = r"C:\Users\Administrator\Desktop\test_pdf\普通error/"
-    test_pdf_list = [['6df7f2bd5e8cac99a15a6c012e0d82a8.pdf', '34,52'],
-                     ['ca6a86753400d6dd6a1b324c5678b7fb.pdf', '18,69'],
-                     ['a8380bf795c71caf8185fb11395df138.pdf', '27,38'],
-                     ['7fd2ce6b08d086c98158b6f2fa0293b0.pdf', '32,48'],
-                     ['dd1adb4dc2014c7abcf403ef15a01eb5.pdf', '2,12'],
-                     ['error50.pdf', '1,-1'],
-                     ['error59.pdf', '1,-1'],
-                     ['error60.pdf', '1,-1'],
-                     ['error61.pdf', '1,-1'],
-                     ['error7.pdf', '39,57'],
-                     ['error8.pdf', '7,12'],
-                     ['error23.pdf', '1,-1']
-                     ]
-    index = 11
+    # file_path = r"C:\Users\Administrator\Desktop\test_pdf\普通error/"
+    # test_pdf_list = [['6df7f2bd5e8cac99a15a6c012e0d82a8.pdf', '34,52'],
+    #                  ['ca6a86753400d6dd6a1b324c5678b7fb.pdf', '18,69'],
+    #                  ['a8380bf795c71caf8185fb11395df138.pdf', '27,38'],
+    #                  ['7fd2ce6b08d086c98158b6f2fa0293b0.pdf', '32,48'],
+    #                  ['dd1adb4dc2014c7abcf403ef15a01eb5.pdf', '2,12'],
+    #                  ['error50.pdf', '1,-1'],
+    #                  ['error59.pdf', '1,-1'],
+    #                  ['error60.pdf', '1,-1'],
+    #                  ['error61.pdf', '1,-1'],
+    #                  ['error7.pdf', '39,57'],
+    #                  ['error8.pdf', '7,12'],
+    #                  ['error23.pdf', '1,-1']
+    #                  ]
+    # index = 11
     # test_one(file_path+test_pdf_list[index][0], page_no_range=test_pdf_list[index][1], from_remote=True)
     # test_one(file_path+test_pdf_list[index][0], page_no_range=test_pdf_list[index][1], from_remote=True)
 
 
 
 

+ 91 - 4
format_convert/convert_tree.py

@@ -61,6 +61,8 @@ class _Page:
         self.in_table_objs = set()
         self.in_table_objs = set()
         # 是否pdf
         # 是否pdf
         self.is_pdf = 0
         self.is_pdf = 0
+        # 所有表格范围
+        self.table_bbox_list = []
 
 
     def add_child(self, child):
     def add_child(self, child):
         if child.error_code is None:
         if child.error_code is None:
@@ -74,12 +76,66 @@ class _Page:
 
 
         self.children = sort_object(self.children, self.is_reverse)
         self.children = sort_object(self.children, self.is_reverse)
 
 
+        # 有图片类型,需返回图片中所有对象,并重新设置图片中的bbox,以及图片后的对象的bbox
+        image_add_y = 0
+        add_childern = []
+        for child in self.children:
+            if type(child) == _Image:
+                image_children = child.get_html(return_children=True)
+                if judge_error_code(image_children) and not self.is_pdf:
+                    self.error_code = image_children
+                    return self.error_code
+                if len(image_children) == 0:
+                    continue
+                image_children = sort_object(image_children, False)
+
+                # 单张图可能无bbox,但文档中的图有bbox
+                if child.bbox != (0, 0, 0, 0):
+                    for i_child in image_children:
+                        i_child.bbox = [i_child.bbox[0], i_child.bbox[1] + child.bbox[3] + image_add_y,
+                                        i_child.bbox[2], i_child.bbox[3] + child.bbox[3] + image_add_y
+                                        ]
+
+                image_add_y += image_children[-1].bbox[3]
+                add_childern += image_children
+                continue
+
+            # 图片对象后面的对象,bbox重新设置
+            child.bbox = [child.bbox[0], child.bbox[1] + image_add_y,
+                          child.bbox[2], child.bbox[3] + image_add_y
+                          ]
+            # self.children += child.get_html(return_children=True)
+
+        self.children += add_childern
+        self.children = sort_object(self.children, self.is_reverse)
+
+        # 获取所有table,计算bbox,排除在table中的sentence
+        for child in self.children:
+            if type(child) == _Table:
+                # table_bbox = get_table_bbox(child.content)
+                # print('table.content ', child.content)
+                # print('child.bbox', child.bbox)
+                self.table_bbox_list += [child.bbox]
+
         html_text = ""
         html_text = ""
         image_html = ""
         image_html = ""
         text_html = ""
         text_html = ""
         for child in self.children:
         for child in self.children:
+            if type(child) == _Image:
+                continue
+            if type(child) == _Sentence:
+                continue_flag = 0
+                for table_bbox in self.table_bbox_list:
+                    # print('table_bbox', table_bbox)
+                    if table_bbox[1] - 3 <= child.bbox[1] <= child.bbox[3] <= table_bbox[3] + 3:
+                        continue_flag = 1
+                        break
+                if continue_flag:
+                    continue
+
             # 先调用get_html才能更新error_code
             # 先调用get_html才能更新error_code
             child_html_text = child.get_html()
             child_html_text = child.get_html()
+            # print('sort child_html_text', child_html_text)
             if child.error_code is not None:
             if child.error_code is not None:
                 self.error_code = child.error_code
                 self.error_code = child.error_code
                 return ""
                 return ""
@@ -158,14 +214,16 @@ class _Image:
         else:
         else:
             self.error_code = child.error_code
             self.error_code = child.error_code
 
 
-    def get_html(self):
+    def get_html(self, return_children=False):
         # 将Image转为Sentence,table
         # 将Image转为Sentence,table
         self.convert()
         self.convert()
         # if self.error_code == [-16]:
         # if self.error_code == [-16]:
         #     self.error_code = None
         #     self.error_code = None
         #     return "<div>#idc error#<div>"
         #     return "<div>#idc error#<div>"
         if self.error_code is not None:
         if self.error_code is not None:
-            return ""
+            return self.error_code
+        if return_children:
+            return self.children
 
 
         html_text = ""
         html_text = ""
         self.children = sort_object(self.children)
         self.children = sort_object(self.children)
@@ -192,7 +250,9 @@ class _Image:
                                  self.b_table_layout_size, self.is_reverse)
                                  self.b_table_layout_size, self.is_reverse)
         if judge_error_code(obj_list):
         if judge_error_code(obj_list):
             # 20241101 注释 图片识别报错返回空
             # 20241101 注释 图片识别报错返回空
-            # self.error_code = obj_list
+            # 20250604 不是来源pdf的,返回错误码
+            if not self.is_from_pdf:
+                self.error_code = obj_list
             return
             return
 
 
         if self.b_table_from_text:
         if self.b_table_from_text:
@@ -213,9 +273,19 @@ class _Table:
         self.bbox = bbox
         self.bbox = bbox
         self.x = bbox[0]
         self.x = bbox[0]
         self.y = bbox[1]
         self.y = bbox[1]
-        self.shape = (len(content), len(content[0]))
+        if len(content) and len(content[0]):
+            self.shape = (len(content), len(content[0]))
+        else:
+            self.shape = (0, 0)
         self.error_code = None
         self.error_code = None
 
 
+    def get_table_bbox(self, table):
+        x1 = min([y.bbox[0] for x in table for y in x])
+        y1 = min([y.bbox[1] for x in table for y in x])
+        x2 = max([y.bbox[2] for x in table for y in x])
+        y2 = max([y.bbox[3] for x in table for y in x])
+        return [x1, y1, x2, y2]
+
     def get_html(self):
     def get_html(self):
         if self.error_code is not None:
         if self.error_code is not None:
             return ""
             return ""
@@ -227,6 +297,9 @@ class _Table:
             html_text = get_table_html(self.content)
             html_text = get_table_html(self.content)
             return html_text
             return html_text
 
 
+    def __repr__(self):
+        return '(%s@#@%s)' % (str('table'), '@'.join([str(x) for x in self.bbox]))
+
 
 
 class _Sentence:
 class _Sentence:
     def __init__(self, content, bbox, is_html=False):
     def __init__(self, content, bbox, is_html=False):
@@ -249,6 +322,9 @@ class _Sentence:
         else:
         else:
             return add_div(self.content)
             return add_div(self.content)
 
 
+    def __repr__(self):
+        return '(%s@#@%s)' % (str(self.content), '@'.join([str(x) for x in self.bbox]))
+
 
 
 class TextBox:
 class TextBox:
     def __init__(self, bbox, text):
     def __init__(self, bbox, text):
@@ -261,6 +337,17 @@ class TextBox:
     def __str__(self):
     def __str__(self):
         return '(%s@#@%s)' % (str(self.text), '@'.join([str(x) for x in self.bbox]))
         return '(%s@#@%s)' % (str(self.text), '@'.join([str(x) for x in self.bbox]))
 
 
+    def __repr__(self):
+        return '(%s@#@%s)' % (str(self.text), '@'.join([str(x) for x in self.bbox]))
+
+    def __hash__(self):
+        return hash(self.__str__())
+
+    def __eq__(self, other):
+        if isinstance(other, TextBox):
+            return self.__str__() == other.__str__()
+        return False
+
 
 
 class TableLine:
 class TableLine:
     def __init__(self, bbox):
     def __init__(self, bbox):

+ 61 - 0
format_convert/convert_wps.py

@@ -0,0 +1,61 @@
+import os
+import re
+import sys
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../")
+from format_convert.convert_tree import _Document, _Sentence, _Page
+import logging
+import traceback
+from format_convert.convert_doc import DocConvert
+from format_convert.utils import judge_error_code, get_logger, log
+
+
class WpsConvert:
    """Convert a .wps file by copying it under a .doc suffix and delegating
    the actual conversion to DocConvert.

    WPS files are handled by the same pipeline as legacy MS Word .doc files,
    so no format-specific parsing is done here.
    """

    def __init__(self, path, unique_type_dir):
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir

    def convert(self):
        """Copy the wps bytes to a .doc file and run the doc converter."""
        # 改后缀,调用doc处理
        file_name = re.split('[/\\\]', self.path)[-1]
        with open(self.path, 'rb') as file:
            content = file.read()

        # splitext is robust to names whose extension is not exactly
        # 4 characters long (the original sliced file_name[:-4]).
        new_file_name = os.path.splitext(file_name)[0] + '.doc'
        new_file_path = self.unique_type_dir + new_file_name
        with open(new_file_path, 'wb') as file:
            file.write(content)

        # Use the project logger instead of debug print() calls.
        log('wps file ' + file_name + ' -> ' + new_file_name)

        self._doc_convert = DocConvert(new_file_path, self.unique_type_dir)
        self._doc_convert.convert()
        self._doc = self._doc_convert._doc

    def get_html(self):
        """Run the conversion; return the doc html list or an error code list."""
        try:
            self.convert()
        except Exception:
            traceback.print_exc()
            self._doc.error_code = [-1]

        # 直接返回doc处理的html
        if self._doc.error_code is not None:
            return self._doc.error_code
        else:
            return self._doc.get_html()
+
+
if __name__ == '__main__':
    # Ad-hoc manual test: convert one wps file and dump the html result.
    wps_path = "C:/Users/Administrator/Downloads/1723004790329.wps"
    # wps_path = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
    output_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp" + '/'
    converted = WpsConvert(wps_path, output_dir).get_html()
    with open('../result.html', 'w', encoding='utf-8') as f:
        f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + converted[0])
+
+

+ 6 - 0
format_convert/easyofd/easyofd/__init__.py

@@ -0,0 +1,6 @@
+from .ofd import OFD
+__version__ = "0.5.1"
+__author__ = "renoyuan"
+__email__ = "renoyuan@foxmail.com"
+__description__ = "一个用于OFD文档处理的Python库"
+__all__ = ["OFD"]

+ 474 - 0
format_convert/easyofd/easyofd/chinese_characters.txt

@@ -0,0 +1,474 @@
+豈
+更
+車
+賈
+滑
+串
+句
+龜
+龜
+契
+金
+喇
+奈
+懶
+癩
+羅
+蘿
+螺
+裸
+邏
+樂
+洛
+烙
+珞
+落
+酪
+駱
+亂
+卵
+欄
+爛
+蘭
+鸞
+嵐
+濫
+藍
+襤
+拉
+臘
+蠟
+廊
+朗
+浪
+狼
+郎
+來
+冷
+勞
+擄
+櫓
+爐
+盧
+老
+蘆
+虜
+路
+露
+魯
+鷺
+碌
+祿
+綠
+菉
+錄
+鹿
+論
+壟
+弄
+籠
+聾
+牢
+磊
+賂
+雷
+壘
+屢
+樓
+淚
+漏
+累
+縷
+陋
+勒
+肋
+凜
+凌
+稜
+綾
+菱
+陵
+讀
+拏
+樂
+諾
+丹
+寧
+怒
+率
+異
+北
+磻
+便
+復
+不
+泌
+數
+索
+參
+塞
+省
+葉
+說
+殺
+辰
+沈
+拾
+若
+掠
+略
+亮
+兩
+凉
+梁
+糧
+良
+諒
+量
+勵
+呂
+女
+廬
+旅
+濾
+礪
+閭
+驪
+麗
+黎
+力
+曆
+歷
+轢
+年
+憐
+戀
+撚
+漣
+煉
+璉
+秊
+練
+聯
+輦
+蓮
+連
+鍊
+列
+劣
+咽
+烈
+裂
+說
+廉
+念
+捻
+殮
+簾
+獵
+令
+囹
+寧
+嶺
+怜
+玲
+瑩
+羚
+聆
+鈴
+零
+靈
+領
+例
+禮
+醴
+隸
+惡
+了
+僚
+寮
+尿
+料
+樂
+燎
+療
+蓼
+遼
+龍
+暈
+阮
+劉
+杻
+柳
+流
+溜
+琉
+留
+硫
+紐
+類
+六
+戮
+陸
+倫
+崙
+淪
+輪
+律
+慄
+栗
+率
+隆
+利
+吏
+履
+易
+李
+梨
+泥
+理
+痢
+罹
+裏
+裡
+里
+離
+匿
+溺
+吝
+燐
+璘
+藺
+隣
+鱗
+麟
+林
+淋
+臨
+立
+笠
+粒
+狀
+炙
+識
+什
+茶
+刺
+切
+度
+拓
+糖
+宅
+洞
+暴
+輻
+行
+降
+見
+廓
+兀
+嗀
+﨎
+﨏
+塚
+﨑
+晴
+﨓
+﨔
+凞
+猪
+益
+礼
+神
+祥
+福
+靖
+精
+羽
+﨟
+蘒
+﨡
+諸
+﨣
+﨤
+逸
+都
+﨧
+﨨
+﨩
+飯
+飼
+館
+鶴
+郞
+隷
+侮
+僧
+免
+勉
+勤
+卑
+喝
+嘆
+器
+塀
+墨
+層
+屮
+悔
+慨
+憎
+懲
+敏
+既
+暑
+梅
+海
+渚
+漢
+煮
+爫
+琢
+碑
+社
+祉
+祈
+祐
+祖
+祝
+禍
+禎
+穀
+突
+節
+練
+縉
+繁
+署
+者
+臭
+艹
+艹
+著
+褐
+視
+謁
+謹
+賓
+贈
+辶
+逸
+難
+響
+頻
+恵
+𤋮
+舘
+﩮
+﩯
+並
+况
+全
+侀
+充
+冀
+勇
+勺
+喝
+啕
+喙
+嗢
+塚
+墳
+奄
+奔
+婢
+嬨
+廒
+廙
+彩
+徭
+惘
+慎
+愈
+憎
+慠
+懲
+戴
+揄
+搜
+摒
+敖
+晴
+朗
+望
+杖
+歹
+殺
+流
+滛
+滋
+漢
+瀞
+煮
+瞧
+爵
+犯
+猪
+瑱
+甆
+画
+瘝
+瘟
+益
+盛
+直
+睊
+着
+磌
+窱
+節
+类
+絛
+練
+缾
+者
+荒
+華
+蝹
+襁
+覆
+視
+調
+諸
+請
+謁
+諾
+諭
+謹
+變
+贈
+輸
+遲
+醙
+鉶
+陼
+難
+靖
+韛
+響
+頋
+頻
+鬒
+龜
+𢡊
+𢡄
+𣏕
+㮝
+䀘
+䀹
+𥉉
+𥳐
+𧻓
+齃
+龎

+ 23 - 0
format_convert/easyofd/easyofd/draw/__init__.py

@@ -0,0 +1,23 @@
+import os
+import sys
+
+from reportlab.pdfbase import pdfmetrics
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../../../../")
+from format_convert.easyofd.easyofd.parser_ofd import *
+
+FONTS = ['宋体',"SWPMEH+SimSun",'SimSun','KaiTi','楷体',"STKAITI","SWLCQE+KaiTi",
+         'Courier New','STSong-Light',"CourierNew","SWANVV+CourierNewPSMT",
+         "CourierNewPSMT","BWSimKai","hei","黑体","SimHei","SWDKON+SimSun",
+         "SWCRMF+CourierNewPSMT","SWHGME+KaiTi"]
+
+from .font_tools import FontTool
+from .draw_pdf import DrawPDF
+from .draw_ofd import OFDWrite
+
+
+
+
+
+
+    

+ 290 - 0
format_convert/easyofd/easyofd/draw/draw_ofd.py

@@ -0,0 +1,290 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: F:\code\easyofd\easyofd\draw
+# CREATE_TIME: 2023-10-26
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# note:  写入 xml 目录并打包成ofd 文件
+from datetime import datetime
+from io import BytesIO
+from typing import Optional
+
+from PIL import Image
+from loguru import logger
+
+from .ofdtemplate import CurId, OFDTemplate, DocumentTemplate, DocumentResTemplate, PublicResTemplate, ContentTemplate, \
+    OFDStructure
+from .pdf_parse import DPFParser
+
+
+class OFDWrite(object):
+    """
+    写入ofd 工具类
+    """
+
    def __init__(self, ):
        # Scale factor used to divide pdf coordinates into OFD page units:
        # 200 / 25.4 looks like pixels-per-millimetre at 200 DPI
        # (OFD physical boxes are presumably in mm) — TODO confirm.
        self.OP = 200 / 25.4
        # self.OP = 1
+
+    def build_ofd_entrance(self, id_obj: Optional[CurId] = None):
+        """
+        build_ofd_entrance
+        """
+        CreationDate = str(datetime.now())
+        ofd_entrance = OFDTemplate(CreationDate=CreationDate, id_obj=id_obj)
+        return ofd_entrance
+
+    def build_document(self, img_len, id_obj: Optional[CurId] = None, PhysicalBox: Optional[str] = "0 0 140 90"):
+        """
+        build_document
+        """
+        pages = []
+
+        for idx in range(img_len):
+            pages.append(
+                {
+                    "@ID": f"{idx + 1}",
+                    "@BaseLoc": f"Pages/Page_{idx}/Content.xml"
+                }
+            )
+        document = DocumentTemplate(Page=pages, id_obj=id_obj, PhysicalBox=PhysicalBox)
+        return document
+
+    def build_document_res(self, img_len: int = 0, id_obj: Optional[CurId] = None,
+                           pfd_res_uuid_map: Optional[dict] = None):
+        """
+        build_document_res
+        """
+        MultiMedia = []
+        DrawParams = []  # todo DrawParams 参数后面有空增加
+        pfd_img = None
+        if pfd_res_uuid_map:
+            pfd_img = pfd_res_uuid_map.get("img")
+
+        if img_len and not pfd_res_uuid_map:
+            for num in range(img_len):
+                MultiMedia.append({
+                    "@ID": 0,
+                    "@Type": "Image",
+                    "ofd:MediaFile": f"Image_{num}.jpg",
+                    "res_uuid": f"{num}",
+                })
+        elif pfd_res_uuid_map and pfd_img:
+            for res_uuid in pfd_img.keys():
+                name = f"Image_{res_uuid}.jpg"
+                MultiMedia.append({
+                    "@ID": 0,
+                    "@Type": "Image",
+                    "ofd:MediaFile": name,
+                    "res_uuid": res_uuid,
+
+                })
+
+        document_res = DocumentResTemplate(MultiMedia=MultiMedia, id_obj=id_obj)
+        return document_res
+
+    def build_public_res(self, id_obj: CurId = None, pfd_res_uuid_map: dict = None):
+        """
+        build_public_res
+        """
+        fonts = []
+
+        pfd_font = None
+        if pfd_res_uuid_map:
+            pfd_font = pfd_res_uuid_map.get("font")
+
+        if pfd_res_uuid_map and pfd_font:
+            for res_uuid, font in pfd_font.items():
+                fonts.append({
+                    "@ID": 0,
+                    "@FontName": font,
+                    "@FamilyName": font,  # 匹配替代字型
+                    "res_uuid": res_uuid,
+                    "@FixedWidth": "false",
+                    "@Serif": "false",
+                    "@Bold": "false",
+                    "@Charset": "prc"
+                })
+        else:
+            pass
+
+        public_res = PublicResTemplate(Font=fonts, id_obj=id_obj)
+        return public_res
+
    def build_content_res(self, pil_img_list=None, pdf_info_list=None, id_obj: CurId = None,
                          pfd_res_uuid_map: dict = None):
        """Build one ContentTemplate (Content.xml) per page.

        pil_img_list: one image per page -> each page just displays an image.
        pdf_info_list: parsed pdf blocks -> pages rebuilt from text/image blocks.
        Coordinates from the pdf are divided by self.OP to convert into OFD
        page units.
        """
        PhysicalBox = None
        content_res_list = []
        if pil_img_list:
            # Image-only pages: stretch the image over the whole physical box.
            for idx, pil_img in enumerate(pil_img_list):
                # print(pil_img)
                # print(idx, pil_img[1], pil_img[2])
                # pil_img is (image, width, height) — widths/heights feed both
                # the page box and the image CTM.
                PhysicalBox = f"0 0 {pil_img[1]} {pil_img[2]}"
                ImageObject = [{
                    "@ID": 0,
                    "@CTM": f"{pil_img[1]} 0 0 {pil_img[2]} 0 0",
                    "@Boundary": f"0 0 {pil_img[1]} {pil_img[2]}",
                    "res_uuid": f"{idx}",  # resource identifier
                    "@ResourceID": f""
                }]

                conten = ContentTemplate(PhysicalBox=PhysicalBox, ImageObject=ImageObject,

                                         CGTransform=[], PathObject=[], TextObject=[], id_obj=id_obj)
                # print(conten)
                content_res_list.append(conten)
        elif pdf_info_list:  # write parsed pdf results  # todo: image ids must be pre-defined or aligned back some other way

            for idx, content in enumerate(pdf_info_list):
                ImageObject = []
                TextObject = []
                PhysicalBox = pfd_res_uuid_map["other"]["page_size"][idx]
                PhysicalBox = f"0 0 {PhysicalBox[0]} {PhysicalBox[1]}"  # if page_size is missing, the one in document is used
                for block in content:
                    # print(block)

                    # Convert pdf points to OFD units: origin plus width/height.
                    bbox = block['bbox']
                    x0, y0, length, height = bbox[0] / self.OP, bbox[1] / self.OP, (bbox[2] - bbox[0]) / self.OP, (
                            bbox[3] - bbox[1]) / self.OP
                    if block["type"] == "text":

                        count = len(block.get("text"))

                        # DeltaX spreads the glyphs evenly over the boundary width.
                        TextObject.append({
                            "@ID": 0,
                            "res_uuid": block.get("res_uuid"),  # resource identifier
                            "@Font": "",
                            "ofd:FillColor": {"Value": "156 82 35"},

                            "ofd:TextCode": {
                                "#text": block.get("text"),
                                "@X": "0",
                                "@Y": f"{block.get('size') / self.OP}",
                                "@DeltaX": f"g {count - 1} {length / count}"
                            },

                            "@size": block.get("size") / self.OP,
                            "@Boundary": f"{x0} {y0} {length} {height}",

                        })
                    elif block["type"] == "img":
                        ImageObject.append({
                            "@ID": 0,
                            "res_uuid": block.get("res_uuid"),  # resource identifier

                            "@Boundary": f"{x0} {y0} {length} {height}",
                            "@ResourceID": f""  # must be linked to the matching public res entry

                        })

                # for i in content:
                #     if i["type"] == "img":
                #         ImageObject.append(i)
                #     elif i["type"] == "text":
                #         TextObject.append(i)

                conten = ContentTemplate(PhysicalBox=PhysicalBox, ImageObject=ImageObject,

                                         CGTransform=[], PathObject=[], TextObject=TextObject, id_obj=id_obj)
                # print(conten)
                content_res_list.append(conten)
        else:
            pass
        return content_res_list
+
+    def pil_2_bytes(self, image):
+        """"""
+        # 创建一个 BytesIO 对象
+        img_bytesio = BytesIO()
+
+        # 将图像保存到 BytesIO 对象
+        image.save(img_bytesio, format='PNG')  # 你可以根据需要选择其他图像格式
+
+        # 获取 BytesIO 对象中的字节
+        img_bytes = img_bytesio.getvalue()
+
+        # 关闭 BytesIO 对象
+        img_bytesio.close()
+        return img_bytes
+
    def __call__(self, pdf_bytes=None, pil_img_list=None, optional_text=False):
        """Convert a PDF (bytes) or a list of PIL images into OFD bytes.

        Input is either pdf_bytes or pil_img_list; when pdf_bytes is given,
        optional_text selects between an editable OFD (text extracted from
        the PDF) and a page-image OFD.

        Steps:
        0. parse the pdf file
        1. build the required ofd templates
        2. assemble them into an ofd byte stream

        NOTE(review): if both inputs are omitted, to_img(None) is reached —
        callers presumably always supply one input; confirm.
        """
        pdf_obj = DPFParser()
        page_pil_img_list = None

        # build the per-page image list for an image-based OFD
        if pil_img_list:  # images supplied directly
            page_pil_img_list = [(self.pil_2_bytes(_img), _img.size[0] / self.OP, _img.size[1] / self.OP) for _img in
                                 pil_img_list]
        else:  # read the pdf, optionally rendering pages to images
            if optional_text:  # produce an editable ofd
                pdf_info_list, pfd_res_uuid_map = pdf_obj.extract_text_with_details(pdf_bytes)  # parse the pdf
            else:
                img_list = pdf_obj.to_img(pdf_bytes)
                page_pil_img_list = [(self.pil_2_bytes(Image.frombytes("RGB", [_img.width, _img.height],
                                                                       _img.samples)), _img.width / self.OP,
                                      _img.height / self.OP) for _img in img_list]

        id_obj = CurId()

        if page_pil_img_list:  # image content -> ofd
            res_static = {}  # image resources
            pfd_res_uuid_map = {"img": {}}
            # every page shares the first page's physical box here
            PhysicalBox = f"0 0 {page_pil_img_list[0][1]} {page_pil_img_list[0][2]}"
            for idx, pil_img_tuple in enumerate(page_pil_img_list):
                pfd_res_uuid_map["img"][f"{idx}"] = pil_img_tuple[0]
                res_static[f"Image_{idx}.jpg"] = pil_img_tuple[0]
            ofd_entrance = self.build_ofd_entrance(id_obj=id_obj)
            document = self.build_document(len(page_pil_img_list), id_obj=id_obj, PhysicalBox=PhysicalBox)
            public_res = self.build_public_res(id_obj=id_obj)
            document_res = self.build_document_res(len(page_pil_img_list), id_obj=id_obj,
                                                   pfd_res_uuid_map=pfd_res_uuid_map)

            content_res_list = self.build_content_res(page_pil_img_list, id_obj=id_obj,
                                                      pfd_res_uuid_map=pfd_res_uuid_map)


        else:
            # every generated document-structure object shares one id generator
            ofd_entrance = self.build_ofd_entrance(id_obj=id_obj)
            document = self.build_document(len(pdf_info_list), id_obj=id_obj)
            public_res = self.build_public_res(id_obj=id_obj, pfd_res_uuid_map=pfd_res_uuid_map)
            document_res = self.build_document_res(len(pdf_info_list), id_obj=id_obj, pfd_res_uuid_map=pfd_res_uuid_map)
            content_res_list = self.build_content_res(pdf_info_list=pdf_info_list, id_obj=id_obj,
                                                      pfd_res_uuid_map=pfd_res_uuid_map)

            res_static = {}  # image resources

            print("pfd_res_uuid_map", pfd_res_uuid_map)
            img_dict = pfd_res_uuid_map.get("img")
            if img_dict:
                for key, v_io in img_dict.items():
                    res_static[f"Image_{key}.jpg"] = v_io.getvalue()

        # assemble the ofd file
        ofd_byte = OFDStructure("123", ofd=ofd_entrance, document=document, public_res=public_res,
                                document_res=document_res, content_res=content_res_list, res_static=res_static)(
            test=True)
        return ofd_byte
+
+
if __name__ == "__main__":
    # Ad-hoc smoke test: read a local file and convert it to OFD bytes.
    source_path = r"D:\renodoc\技术栈\GBT_33190-2016_电子文件存储与交换格式版式文档.pdf"
    source_path = r"F:\code\easyofd\test"
    with open(source_path, "rb") as src:
        raw_bytes = src.read()

    converted = OFDWrite()(raw_bytes)

    with open("ofd.ofd", "wb") as dst:
        dst.write(converted)

+ 1178 - 0
format_convert/easyofd/easyofd/draw/draw_pdf.py

@@ -0,0 +1,1178 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: E:\code\easyofd\easyofd\draw
+# CREATE_TIME: 2023-08-10
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE:  绘制pdf
+import base64
+import math
+import os
+import re
+import sys
+import traceback
+from io import BytesIO
+
+from PIL import Image as PILImage, Image, ImageFont, ImageDraw
+from fontTools.ttLib import TTFont
+from loguru import logger
+from reportlab.lib.pagesizes import A4
+from reportlab.lib.utils import ImageReader
+from reportlab.pdfgen import canvas
+
+from format_convert.utils import special_font_to_normal, image_resize_by_ratio
+
+sys.path.append(os.path.dirname(__file__) + "/../../../../")
+from format_convert.easyofd.easyofd.draw.font_tools import FontTool
+from .find_seal_img import SealExtract
+
+
+# print(reportlab_fonts)
+class DrawPDF():
+    """
+    ofd 解析结果 绘制pdf
+    OP ofd 单位转换
+    """
+
    def __init__(self, data, *args, **kwargs):
        """Prepare the drawing state from an OFD parse result.

        data: list of per-document parse dicts; the first entry must carry
        "pdf_name", which names the output PDF.
        """
        assert data, "未输入ofd解析结果"
        self.data = data
        self.author = "renoyuan"
        # OFD lengths are in millimetres; 200/25.4 converts mm to pixels at
        # 200 DPI — presumably; confirm against how OP is applied below.
        self.OP = 200 / 25.4
        self.pdf_uuid_name = self.data[0]["pdf_name"]
        self.pdf_io = BytesIO()  # the rendered PDF accumulates here
        self.SupportImgType = ("JPG", "JPEG", "PNG")
        self.init_font = "宋体"  # fallback font when none is resolvable
        self.font_tool = FontTool()
        self.page_need_to_image_dict = {}
+
+    def draw_lines(my_canvas):
+        """
+        draw_line
+        """
+        my_canvas.setLineWidth(.3)
+
+        start_y = 710
+        my_canvas.line(30, start_y, 580, start_y)
+
+        for x in range(10):
+            start_y -= 10
+            my_canvas.line(30, start_y, 580, start_y)
+
+    def gen_empty_pdf(self):
+        """
+        """
+        c = canvas.Canvas(self.pdf_io)
+        c.setPageSize(A4)
+        c.setFont(self.init_font, 20)
+        c.drawString(0, 210, "ofd 格式错误,不支持解析", mode=1)
+        c.save()
+
+    # 单个字符偏移量计算
+    def cmp_offset(self, pos, offset, DeltaRule, text, CTM_info, dire="X") -> list:
+        """
+        pos 文本框x|y 坐标 
+        offset 第一个字符的X|Y 
+        DeltaRule 偏移量规则
+        resize 字符坐标缩放
+        返回 x|y  字符位置list 
+        """
+        if CTM_info and dire == "X":
+            resize = CTM_info.get("resizeX")
+            rotate = CTM_info.get("rotateX")
+            move = CTM_info.get("moveX")
+        elif CTM_info and dire == "Y":
+            resize = CTM_info.get("resizeY")
+            rotate = CTM_info.get("rotateY")
+            move = CTM_info.get("moveY")
+        else:
+            resize = 1
+            rotate = 0
+            move = 0
+
+        # print(f"resize is {resize}")
+        char_pos = float(pos if pos else 0) + (float(offset if offset else 0) + move) * resize
+        pos_list = []
+        pos_list.append(char_pos)  # 放入第一个字符
+        offsets = [i for i in DeltaRule.split(" ")]
+
+        if "g" in DeltaRule:  # g 代表多个元素
+            g_no = None
+            for _no, offset_i in enumerate(offsets):
+
+                if offset_i == "g":
+                    g_no = _no
+                    for j in range(int(offsets[(g_no + 1)])):
+                        char_pos += float(offsets[(g_no + 2)])
+                        pos_list.append(char_pos)
+
+                elif offset_i and offset_i != "g":
+                    if g_no == None:
+                        char_pos += float(offset_i) * resize
+                        pos_list.append(char_pos)
+                    elif (int(_no) > int(g_no + 2)) and g_no != None:
+                        # print(f"offset_i is {offset_i}")
+                        char_pos += float(offset_i) * resize
+                        pos_list.append(char_pos)
+
+        elif not DeltaRule:  # 没有字符偏移量 一般单字符
+            pos_list = []
+            for i in range(len(text)):
+                pos_list.append(char_pos)
+        else:  # 有字符偏移量
+            for i in offsets:
+                if not i:
+                    char_pos += 0
+                else:
+                    char_pos += float(i) * resize
+                pos_list.append(char_pos)
+
+        return pos_list
+
    def draw_chars_old(self, canvas, text_list, fonts, page_size):
        """Write text runs onto the canvas (legacy implementation).

        NOTE(review): superseded by draw_chars. Contains early ``break``
        statements that stop after the first character / first run, and a
        dead ``if False:`` branch referencing undefined names
        (draw_Glyph, FontFilePath) — do not re-enable as-is.
        """
        c = canvas
        for line_dict in text_list:
            # TODO serialize the whole body content before writing, to make
            # the final input values inspectable
            text = line_dict.get("text")
            # TODO decide whether a known shared font applies, otherwise
            # match the closest available font; for now always the default
            font_name = self.init_font

            font = self.font_tool.normalize_font_name(font_name)

            try:
                c.setFont(font, line_dict["size"] * self.OP)
            except KeyError as key_error:
                # reportlab raises KeyError for unregistered font names
                logger.error(f"{key_error}")
                font = self.font_tool.FONTS[0]
                c.setFont(font, line_dict["size"] * self.OP)
            # the PDF origin is the bottom-left corner of the page
            color = line_dict.get("color", [0, 0, 0])
            if len(color) < 3:
                color = [0, 0, 0]

            c.setFillColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)
            c.setStrokeColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)

            DeltaX = line_dict.get("DeltaX", "")
            DeltaY = line_dict.get("DeltaY", "")
            X = line_dict.get("X", "")
            Y = line_dict.get("Y", "")
            CTM = line_dict.get("CTM", "")  # OFD adds this per-run scaling
            resizeX = 1
            resizeY = 1
            CTMS = None
            if CTM:
                CTMS = CTM.split(" ")

            if CTM and CTMS and len(CTMS) == 6:
                CTM_info = {
                    "resizeX": float(CTMS[0]),
                    "rotateX": float(CTMS[1]),
                    "rotateY": float(CTMS[2]),
                    "resizeY": float(CTMS[3]),
                    "moveX": float(CTMS[4]),
                    "moveY": float(CTMS[5]),

                }

            else:
                CTM_info = {}
            x_list = self.cmp_offset(line_dict.get("pos")[0], X, DeltaX, text, CTM_info, dire="X")
            y_list = self.cmp_offset(line_dict.get("pos")[1], Y, DeltaY, text, CTM_info, dire="Y")

            # writing glyph shapes for custom fonts via drawPath performed
            # too poorly, so this branch is permanently disabled (dead code)
            if False:
                Glyphs = [int(i) for i in line_dict.get("Glyphs_d").get("Glyphs").split(" ")]
                for idx, Glyph_id in enumerate(Glyphs):
                    _cahr_x = float(x_list[idx]) * self.OP
                    _cahr_y = (float(page_size[3]) - (float(y_list[idx]))) * self.OP
                    imageFile = draw_Glyph(FontFilePath.get(line_dict["font"]), Glyph_id, text[idx])

                    c.drawImage(imageFile, _cahr_x, _cahr_y, -line_dict["size"] * self.OP * 2,
                                line_dict["size"] * self.OP * 2)
            else:
                if len(text) > len(x_list) or len(text) > len(y_list):
                    # more chars than positions: keep only CJK characters
                    text = re.sub("[^\u4e00-\u9fa5]", "", text)
                try:
                    # line mode: the last character's computed y exceeds the
                    # page height, or its x exceeds the page width
                    if y_list[-1] * self.OP > page_size[3] * self.OP or x_list[-1] * self.OP > page_size[2] * self.OP or \
                            x_list[-1] < 0 or y_list[-1] < 0:
                        x_p = abs(float(X)) * self.OP
                        y_p = abs(float(page_size[3]) - (float(Y))) * self.OP
                        print('text, x_p, y_p', text, x_p, y_p)
                        c.drawString(x_p, y_p, text, mode=0)  # mode=3 invisible text, 0 visible

                    # character mode: write character by character
                    else:
                        for char_id, _char in enumerate(text):
                            if len(x_list) > char_id:
                                font_size = line_dict["size"] * self.OP * resizeX
                                c.setFont(font, line_dict["size"] * self.OP * resizeX)
                                _char_x = float(x_list[char_id]) * self.OP
                                _char_y = (float(page_size[3]) - (float(y_list[char_id]))) * self.OP
                                print('_cahr_, _char_x, _char_y', _char, _char_x, _char_y, font_size)
                                c.drawString(_char_x, _char_y, _char, mode=0)  # mode=3 invisible text, 0 visible
                                # NOTE(review): stops after the first char —
                                # only one character is ever drawn here
                                break
                            else:
                                pass
                        # NOTE(review): stops after the first run taking this
                        # path — remaining text_list entries are skipped
                        break
                except Exception as e:
                    logger.error(f"{e}")
                    traceback.print_exc()
+
    def draw_chars(self, canvas, text_list, fonts, page_size, pdf_page_size):
        """Write text runs onto the canvas.

        canvas: reportlab canvas.
        text_list: parsed OFD TextObject dicts (keys: text, size, CTM,
            color, pos, DeltaX/DeltaY, X/Y).
        fonts: font table from the parse result (currently unused — the
            default font is always taken).
        page_size: OFD page box; pdf_page_size: the same box in PDF units.

        NOTE(review): the validation branches ``return`` from the whole
        method, skipping every remaining run — ``continue`` may have been
        intended; confirm.
        """
        for line_dict in text_list:
            # TODO serialize the whole body content before writing, to make
            # the final input values inspectable
            text = line_dict.get("text")
            text_size = line_dict.get("size")
            if not text_size:
                print('draw_chars not text_size', text)
                return

            # transform matrix
            ctm = line_dict.get("CTM", '')
            ctm = self.get_ctm(ctm)
            a, b, c, d, e, f = ctm
            # average of the horizontal and vertical scale factors
            font_scale = (a + d) / 2

            color = line_dict.get("color", [0, 0, 0])
            if len(color) < 3:
                color = [0, 0, 0]
            canvas.setFillColorRGB(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255)

            # text box bounds
            boundary = line_dict.get("pos")
            if len(boundary) != 4:
                print('draw_chars not boundary', text, boundary)
                return
            left, top, width, height = boundary

            # a doubled "g" group in DeltaX signals duplicated text
            delta_x = line_dict.get("DeltaX", "")
            delta_y = line_dict.get("DeltaY", "")
            g_cnt = re.findall('g', delta_x)
            if len(g_cnt) >= 2:
                split_index = len(text) / 2
                if text[:int(split_index)] == text[int(split_index):]:
                    text2 = text[:int(split_index)]
                    print('len(g_cnt) >= 2', g_cnt, text, '->', text2)
                    text = text2

            # offset of the text relative to the boundary
            x = line_dict.get("X", "")
            y = line_dict.get("Y", "")
            if "" in [x, y]:
                print('draw_chars not x or not y', text, x, y)
                return
            x, y = float(x) * font_scale, float(y) * font_scale

            font_name = self.init_font
            font = self.font_tool.normalize_font_name(font_name)

            # actual coordinates from boundary + offsets
            # NOTE(review): actual_right/actual_bottom are never used, and
            # actual_bottom adds y rather than height — confirm intent.
            actual_left = left + x
            actual_right = actual_left + width
            actual_top = top + y
            actual_bottom = actual_top + y

            # font size from ctm and text_size
            actual_size = text_size * font_scale

            canvas.setFont(font, actual_size * self.OP)

            # OFD origin is top-left, PDF origin is bottom-left
            try:
                # write the whole run as one line
                canvas.drawString(actual_left * self.OP,
                                  pdf_page_size[3] - actual_top * self.OP,
                                  text, mode=0)

            except Exception as e:
                logger.error(f"{e}")
                traceback.print_exc()
+
    def draw_odf_char_on_image(self, line_dict, img, pos, ofd_page_size):
        """Draw one OFD text run directly onto a PIL image.

        Used when annotation text is rasterized into the page image rather
        than drawn as PDF text.

        line_dict: parsed TextObject dict; img: target PIL image;
        pos: the image's boundary in OFD units; ofd_page_size: the
        annotation page box (currently only used in commented-out code).

        Returns the mutated image, or None when the run is unusable —
        NOTE(review): callers that reassign img from this return value must
        handle the None case.
        """
        text = line_dict.get("text")
        text_size = line_dict.get("size")
        if not text_size:
            print('get_odf_char_info not text_size', text)
            return

        # transform matrix
        ctm = line_dict.get("CTM", '')
        ctm = self.get_ctm(ctm)
        a, b, c, d, e, f = ctm
        # average of the horizontal and vertical scale factors
        font_scale = (a + d) / 2

        color = line_dict.get("color", [0, 0, 0])
        if len(color) < 3:
            color = (0, 0, 0)
        else:
            color = tuple([int(x) for x in color])

        # text box bounds
        boundary = line_dict.get("pos")
        if len(boundary) != 4:
            print('get_odf_char_info not boundary', text, boundary)
            return
        left, top, width, height = boundary

        # offset of the text relative to the boundary; a y smaller than the
        # font size clips the glyphs
        x = line_dict.get("X", "")
        y = line_dict.get("Y", "")
        if "" in [x, y]:
            print('get_odf_char_info not x or not y', text, x, y)
            return
        x, y = float(x) * a, float(y) * d

        # actual coordinates from boundary + offsets
        # NOTE(review): actual_right/actual_bottom are computed but unused
        actual_left = left
        actual_right = actual_left + x
        actual_top = top
        actual_bottom = actual_top + y

        # font size from ctm and text_size, scaled into image pixels
        actual_size = text_size * font_scale
        actual_size = int(actual_size * img.size[0] / pos[2])

        # map OFD units to image pixels via the image/boundary ratio
        left_top_point = [actual_left * img.size[0] / pos[2], actual_top * img.size[1] / pos[3]]
        left_top_point = [int(x) for x in left_top_point]
        draw = ImageDraw.Draw(img)
        font = ImageFont.truetype(os.path.dirname(__file__) + '/simsun.ttc', actual_size)

        draw.text(left_top_point, text, font=font, fill=color)
        return img
+
+    def compute_ctm(self, CTM, x1, y1, img_width, img_height):
+        """待定方法"""
+        a, b, c, d, e, f = CTM.split(" ")
+        a, b, c, d, e, f = float(a), float(b), float(c), float(d), float(e), float(f)
+        # 定义变换矩阵的元素
+
+        # 计算原始矩形的宽和高
+        x2 = x1 + img_width
+        y2 = y1 + img_height
+        print(f"ori x1 {x1} y1 {y1} x2 {x2} y2 {y2} img_width {img_width} img_height {img_height}")
+        a = a / 10
+        d = d / 10
+        # 对左上角和右下角点进行变换
+        x1_new = a * x1 + c * y1 + (e)
+        y1_new = b * x1 + d * y1 + (f)
+        x2_new = a * x2 + c * y2 + (e)
+        y2_new = b * x2 + d * y2 + (f)
+        print(f"x1_new {x1_new} y1_new {y1_new} x2_new {x2_new} y2_new {y2_new}")
+        # 计算变换后矩形的宽和高
+        w_new = x2_new - x1_new
+        h_new = y2_new - y1_new
+
+        print(f"原始矩形宽度: {img_width}, 高度: {img_height}")
+        print(f"变换后矩形宽度: {w_new}, 高度: {h_new}")
+        return x1_new, y1_new, w_new, h_new
+
+    def get_ctm(self, ctm):
+        default_ctm = (1, 0, 0, 1, 0, 0)
+        if not ctm:
+            # print('get_ctm no ctm!', ctm)
+            return default_ctm
+        ctm = ctm.split(" ")
+        if len(ctm) != 6:
+            print('get_ctm len(ctm) != 6', ctm)
+            return default_ctm
+        ctm = [float(x) for x in ctm]
+        # a, b, c, d, e, f = ctm
+        return ctm
+
    def draw_img_old(self, canvas, img_list, images, page_size):
        """Write images onto the canvas (legacy implementation).

        NOTE(review): superseded by draw_img. ``CTM = None`` below forcibly
        disables the CTM branch, and the final fallback overwrites its own
        computed placement, drawing the image at (0, 0) at its native pixel
        size — kept verbatim for reference.
        """
        c = canvas
        for img_d in img_list:
            image = images.get(img_d["ResourceID"])

            # skip missing resources and unsupported formats
            if not image or image.get("suffix").upper() not in self.SupportImgType:
                continue

            imgbyte = base64.b64decode(image.get('imgb64'))
            if not imgbyte:
                logger.error(f"{image['fileName']} is null")
                continue

            img = PILImage.open(BytesIO(imgbyte))
            img_width, img_height = img.size
            info = img.info
            imgReade = ImageReader(img)
            CTM = img_d.get('CTM')

            wrap_pos = image.get("wrap_pos")
            pos = img_d.get('pos')
            CTM = None  # CTM handling disabled — see note in docstring
            if CTM and not wrap_pos and page_size == pos:
                x1_new, y1_new, w_new, h_new = self.compute_ctm(CTM, 0, 0, img_width, img_height)
                pdf_pos = [pos[0] * self.OP, pos[1] * self.OP, pos[2] * self.OP, pos[3] * self.OP]
                print(f"pos: {pos} pdf_pos: {pdf_pos}")

                x1_new = (pos[0] + x1_new) * self.OP
                y1_new = (page_size[3] - y1_new) * self.OP
                # clamp the transformed size to the boundary
                if w_new > pdf_pos[2]:
                    w_new = pdf_pos[2]
                if h_new > pdf_pos[3]:
                    h_new = pdf_pos[3]
                print(f"写入 {x1_new} {y1_new} {w_new} {-h_new}")
                c.drawImage(imgReade, x1_new, y1_new, w_new, -h_new, 'auto')
            else:
                x_offset = 0
                y_offset = 0

                x = (pos[0] + x_offset) * self.OP
                y = (page_size[3] - (pos[1] + y_offset)) * self.OP
                if wrap_pos:
                    # inline/wrapped image: shift by the wrap position
                    x = x + (wrap_pos[0] * self.OP)
                    y = y - (wrap_pos[1] * self.OP)
                    w = img_d.get('pos')[2] * self.OP
                    h = -img_d.get('pos')[3] * self.OP

                    c.drawImage(imgReade, x, y, w, h, 'auto')
                elif pos:
                    x = pos[0] * self.OP
                    y = (page_size[3] - pos[1]) * self.OP
                    w = pos[2] * self.OP
                    h = -pos[3] * self.OP

                    # NOTE(review): the values above are immediately
                    # overwritten — origin placement at native size
                    x, y = 0, 0
                    w, h = img.size

                    print('x, y, w, h', x, y, w, h)

                    c.drawImage(imgReade, x, y, w, h, 'auto')
+
    def draw_img(self, canvas, img_list, images, ofd_page_size, pdf_page_size, ofd_to_pdf_ratio):
        """Write images onto the canvas.

        Each image is rotated per its CTM, resized into its PDF boundary,
        clamped to the page, and drawn anchored at the bottom-left.

        NOTE(review): the CTM is unpacked as ``a, b, d, e, f, g`` — the
        name ``d`` actually holds the matrix's third (shear/c) element, so
        ``b == 0 and d == 0`` tests both shear terms; also a missing CTM
        attribute (None) would raise on ``ctm.split`` — confirm upstream
        always supplies one.
        """
        c = canvas
        for img_d in img_list:
            image = images.get(img_d["ResourceID"])
            # skip missing resources and unsupported formats
            if not image or image.get("suffix").upper() not in self.SupportImgType:
                print('img_d["ResourceID"]', img_d["ResourceID"])
                logger.error(f"not image")
                continue

            imgbyte = base64.b64decode(image.get('imgb64'))
            if not imgbyte:
                logger.error(f"{image['fileName']} is null")
                continue

            img = PILImage.open(BytesIO(imgbyte))
            info = img.info
            ctm = img_d.get('CTM')
            pos = img_d.get('pos')
            # boundary converted into PDF units
            pdf_pos = [x * ofd_to_pdf_ratio for x in pos]
            if pos:
                # degenerate boundary: nothing to draw
                if pos[2] <= 0.1 or pos[3] <= 0.1:
                    print('pos[2] <= 0.1 or pos[3] <= 0.1')
                    continue
                # flip the y axis: OFD is top-left, PDF bottom-left origin
                x, y = pdf_pos[0], pdf_page_size[3] - pdf_pos[1] - pdf_pos[3]
                w, h = img.size
                ctm = ctm.split(' ')
                ctm = [float(x) for x in ctm]
                a, b, d, e, f, g = ctm
                if b == 0 and d == 0:
                    angle_deg = 0
                else:
                    # rotation angle; mirror flips would need extra handling
                    angle_rad = math.atan2(b, a)
                    angle_deg = math.degrees(angle_rad)
                    # normalize into the 0..360 degree range
                    angle_deg = angle_deg % 360
                img = img.rotate(-angle_deg, expand=1)
                img = img.resize((int(pdf_pos[2]), int(pdf_pos[3])), Image.BICUBIC)
                # clamp the image so it cannot exceed the page box
                img = image_resize_by_ratio(img, int(pdf_page_size[2]), int(pdf_page_size[3]))
                w, h = img.size
                if img.mode == 'P':
                    # palette images need converting before ImageReader
                    img = img.convert('RGBA')
                imgReade = ImageReader(img)
                c.drawImage(imgReade, x, y, w, h, 'auto')
+
    def draw_img_with_annot(self, canvas, img_list, images, annot_page_size, pdf_page_size, ofd_to_pdf_ratio, annot_page_info):
        """Write images onto the canvas, burning annotation text into each
        image first.

        Same pipeline as draw_img, except every text dict in
        annot_page_info is rasterized onto the image (via
        draw_odf_char_on_image) before the image is resized and drawn.
        """
        c = canvas
        for img_d in img_list:
            image = images.get(img_d["ResourceID"])
            # skip missing resources and unsupported formats
            if not image or image.get("suffix").upper() not in self.SupportImgType:
                print('img_d["ResourceID"]', img_d["ResourceID"])
                logger.error(f"not image")
                continue

            imgbyte = base64.b64decode(image.get('imgb64'))
            if not imgbyte:
                logger.error(f"{image['fileName']} is null")
                continue

            img = PILImage.open(BytesIO(imgbyte))
            ctm = img_d.get('CTM')
            pos = img_d.get('pos')
            # boundary converted into PDF units
            pdf_pos = [x * ofd_to_pdf_ratio for x in pos]
            if pos:
                # degenerate boundary: nothing to draw
                if pos[2] <= 0.1 or pos[3] <= 0.1:
                    print('pos[2] <= 0.1 or pos[3] <= 0.1')
                    continue
                # flip the y axis: OFD is top-left, PDF bottom-left origin
                x, y = pdf_pos[0], pdf_page_size[3] - pdf_pos[1] - pdf_pos[3]
                w, h = img.size
                ctm = ctm.split(' ')
                ctm = [float(x) for x in ctm]
                # NOTE(review): third element is bound to `d` (see draw_img)
                a, b, d, e, f, g = ctm
                if b == 0 and d == 0:
                    angle_deg = 0
                else:
                    # rotation angle; mirror flips would need extra handling
                    angle_rad = math.atan2(b, a)
                    angle_deg = math.degrees(angle_rad)
                    # normalize into the 0..360 degree range
                    angle_deg = angle_deg % 360

                img = img.rotate(-angle_deg, expand=1)
                print('angle_deg', angle_deg)

                # burn the annotation text runs into the image
                # NOTE(review): draw_odf_char_on_image returns None for bad
                # runs, which would clobber img here — confirm inputs are
                # pre-validated.
                for text_d in annot_page_info:
                    print('img pos', pos)
                    img = self.draw_odf_char_on_image(text_d, img, pos, annot_page_size)

                img = img.resize((int(pdf_pos[2]), int(pdf_pos[3])), Image.BICUBIC)
                # clamp the image so it cannot exceed the page box
                img = image_resize_by_ratio(img, int(pdf_page_size[2]), int(pdf_page_size[3]))

                w, h = img.size
                if img.mode == 'P':
                    # palette images need converting before ImageReader
                    img = img.convert('RGBA')
                imgReade = ImageReader(img)
                c.drawImage(imgReade, x, y, w, h, 'auto')
+
    def draw_signature(self, canvas, signatures_page_list, page_size):
        """Write seal/signature images onto the canvas.

        Each entry in signatures_page_list looks like:
            {
            "sing_page_no": sing_page_no,
            "PageRef": PageRef,
            "Boundary": Boundary,
            "SignedValue": self.file_tree(SignedValue),
                            }
        The seal bitmap is extracted from the SignedValue blob; any failure
        is swallowed deliberately — a missing or broken seal must not abort
        rendering of the page.
        """
        c = canvas
        try:
            if signatures_page_list:
                for signature_info in signatures_page_list:
                    image = SealExtract()(b64=signature_info.get("SignedValue"))
                    if not image:
                        # no seal image could be extracted from this entry
                        continue
                    else:
                        image_pil = image[0]

                    # "x y w h" boundary in OFD units
                    pos = [float(i) for i in signature_info.get("Boundary").split(" ")]

                    imgReade = ImageReader(image_pil)

                    # flip y: OFD top-left origin -> PDF bottom-left origin;
                    # negative height draws the image downwards
                    x = pos[0] * self.OP
                    y = (page_size[3] - pos[1]) * self.OP

                    w = pos[2] * self.OP
                    h = -pos[3] * self.OP
                    c.drawImage(imgReade, x, y, w, h, 'auto')
            else:
                # no signatures on this page
                pass
        except Exception as e:
            # best-effort: print the traceback but keep rendering
            traceback.print_exc()
+
+    def draw_line_old(self, canvas, line_list, page_size):
+        """Draw line/path elements (legacy implementation, superseded by draw_line)."""
+
+        # print("drawing", line_list)
+
+        def match_mode(Abbr: list):
+            """
+            Parse AbbreviatedData tokens into drawing operators.
+            Operators:
+            S  define start point x, y
+            M  move to x, y
+            L  line from current point to x, y
+            Q  x1 y1 x2 y2 quadratic Bezier curve
+            B  x1 y1 x2 y2 x3 y3 cubic Bezier curve
+            A  arc to x,y; rx = major axis, ry = minor axis, angle = rotation,
+               large=1 means the arc spans more than 180 degrees (0 = less),
+               sweep=1 means clockwise, 0 means counter-clockwise
+            C  close the current SubPath automatically
+            """
+            relu_list = []
+            mode = ""  # NOTE(review): assigned but never read afterwards
+            modes = ["S", "M", "L", "Q", "B", "A", "C"]
+            mode_dict = {}
+            for idx, i in enumerate(Abbr):
+                if i in modes:
+                    mode = i
+                    # Flush the previous operator before starting a new one.
+                    if mode_dict:
+                        relu_list.append(mode_dict)
+                    mode_dict = {"mode": i, "points": []}
+
+                else:
+                    # Non-operator token: an operand for the current operator.
+                    mode_dict["points"].append(i)
+
+                # Last token: flush the trailing operator.
+                if idx + 1 == len(Abbr):
+                    relu_list.append(mode_dict)
+            return relu_list
+
+        def assemble(relu_list: list):
+            # Pair each drawing operator with the most recent M (move) as its start.
+            start_point = {}
+            acticon = []
+            for i in relu_list:
+                if i.get("mode") == "M":
+                    start_point = i
+                elif i.get("mode") in ['B', "Q", 'L']:
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+            return acticon
+
+        def convert_coord(p_list, direction, page_size, pos):
+            """Convert OFD coordinates to PDF coordinates (y axis flipped)."""
+            new_p_l = []
+            for p in p_list:
+                if direction == "x":
+
+                    new_p = (float(pos[0]) + float(p)) * self.OP
+                else:
+                    new_p = (float(page_size[3]) - float(pos[1]) - float(p)) * self.OP
+                new_p_l.append(new_p)
+            return new_p_l
+
+        for line in line_list:
+            Abbr = line.get("AbbreviatedData").split(" ")  # AbbreviatedData tokens
+            color = line.get("FillColor", [0, 0, 0])
+
+            relu_list = match_mode(Abbr)
+            # TODO combine relu_list: 1) M L straight line 2) M B*n cubic Bezier 3) M Q*n quadratic Bezier
+
+            # print(relu_list)
+
+            acticons = assemble(relu_list)
+            pos = line.get("pos")
+            # print(color)
+            if len(color) < 3:
+                color = [0, 0, 0]  # fall back to black when the color triple is malformed
+            canvas.setStrokeColorRGB(*(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255))  # stroke color
+
+            # Set line width (falls back to 0.25 OFD units on any parse error).
+            try:
+                LineWidth = (float(line.get("LineWidth", "0.25").replace(" ", "")) if \
+                                 line.get("LineWidth", "0.25").replace(" ", "") else 0.25) * self.OP
+            except Exception as e:
+                # logger.error(f"{e}")
+                LineWidth = 0.25 * self.OP
+
+            canvas.setLineWidth(LineWidth)  # unit is points; 2 means 2 points
+
+            for acticon in acticons:
+                if acticon.get("end_point").get("mode") == 'L':  # straight line
+                    x1, y1, x2, y2 = *acticon.get("start_point").get("points"), *acticon.get("end_point").get("points")
+                    x1, x2 = convert_coord([x1, x2], "x", page_size, pos)
+                    y1, y2 = convert_coord([y1, y2], "y", page_size, pos)
+                    # Draw one line segment x1 y1 -> x2 y2.
+                    canvas.line(x1, y1, x2, y2)
+
+                elif acticon.get("end_point").get("mode") == 'B':  # cubic Bezier
+                    # NOTE(review): `continue` makes everything below in this branch
+                    # unreachable — cubic Beziers are deliberately skipped here.
+                    continue
+                    x1, y1, x2, y2, x3, y3, x4, y4 = *acticon.get("start_point").get("points"), *acticon.get(
+                        "end_point").get("points")
+                    x1, x2, x3, x4 = convert_coord([x1, x2, x3, x4], "x", page_size, pos)
+                    y1, y2, y3, y4 = convert_coord([y1, y2, y3, y4], "y", page_size, pos)
+                    # print(x1, y1, x2, y2, x3, y3, x4, y4)
+
+                    # Draw a cubic Bezier curve.
+                    canvas.bezier(x1, y1, x2, y2, x3, y3, x4, y4)
+
+                elif acticon.get("end_point").get("mode") == 'Q':  # quadratic Bezier (not implemented)
+                    pass
+                else:
+                    continue
+
+    def draw_line_old_250619(self, canvas, line_list, page_size):
+        """Legacy path-drawing variant (2025-06-19); builds one canvas path per line element."""
+        def match_mode(Abbr: list):
+            """
+            Parse AbbreviatedData tokens into drawing operators.
+            Operators:
+            S  define the start point x, y
+            M  move to x, y
+            L  line from the current point to x, y
+            Q  x1 y1 x2 y2: quadratic Bezier from the current point to (x2,y2),
+               with (x1,y1) as the control point; current point becomes (x2,y2).
+            B  x1 y1 x2 y2 x3 y3: cubic Bezier from the current point to (x3,y3),
+               with (x1,y1) and (x2,y2) as control points; current point becomes (x3,y3).
+            A  operands rx ry angle large sweep x y: arc from the current point to (x,y),
+               current point becomes (x,y). rx is the ellipse major axis, ry the minor
+               axis, angle the ellipse rotation in the current coordinate system
+               (positive = clockwise, negative = counter-clockwise); large=1 means the
+               arc spans more than 180 degrees (0 = less than 180); sweep=1 means the
+               arc runs clockwise from start to end, 0 means counter-clockwise.
+            C  no operands: close the SubPath by joining the current point to its
+               start point with a straight segment.
+            """
+            relu_list = []
+            mode = ""  # NOTE(review): assigned but never read afterwards
+            modes = ["S", "M", "L", "Q", "B", "A", "C"]
+            mode_dict = {}
+            for idx, i in enumerate(Abbr):
+                if i in modes:
+                    mode = i
+                    # Flush the previous operator before starting a new one.
+                    if mode_dict:
+                        relu_list.append(mode_dict)
+                    mode_dict = {"mode": i, "points": []}
+
+                else:
+                    mode_dict["points"].append(i)
+
+                if idx + 1 == len(Abbr):
+                    relu_list.append(mode_dict)
+            return relu_list
+
+        def assemble(relu_list: list):
+            # Pair each operator with a start point; unlike draw_line_old this also
+            # keeps M/C/A operators as drawable actions.
+            start_point = {}
+            acticon = []
+
+            for i in relu_list:
+                if i.get("mode") == "M":
+                    # Only the first M becomes the persistent start point.
+                    if not start_point:
+                        start_point = i
+                    acticon.append({
+                        "start_point": start_point, "end_point": i})
+
+                elif i.get("mode") in ['B', "Q", 'L']:
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+                elif i.get("mode") == "C":
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+                elif i.get("mode") == "A":
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+                elif i.get("mode") == "S":
+                    start_point = i
+
+            return acticon
+
+        def convert_coord(p_list, direction, page_size, pos):
+            """Convert OFD coordinates to PDF coordinates (y axis flipped)."""
+            new_p_l = []
+            # print("p_list", p_list)
+            for p in p_list:
+                if direction == "x":
+                    new_p = (float(pos[0]) + float(p)) * self.OP
+                else:
+                    new_p = (float(page_size[3]) - float(pos[1]) - float(p)) * self.OP
+                new_p_l.append(new_p)
+            # print("new_p_l", new_p_l)
+            return new_p_l
+
+        for line in line_list:
+            print('one line', "="*20)
+            path = canvas.beginPath()
+            Abbr = line.get("AbbreviatedData").split(" ")  # AbbreviatedData tokens
+            color = line.get("FillColor", [0, 0, 0])
+
+            relu_list = match_mode(Abbr)
+            # TODO combine relu_list: 1) M L straight line 2) M B*n cubic Bezier 3) M Q*n quadratic Bezier
+
+            # print(relu_list)
+
+            acticons = assemble(relu_list)
+            pos = line.get("pos")
+            # print(color)
+            if len(color) < 3:
+                color = [0, 0, 0]  # fall back to black when the color triple is malformed
+            canvas.setStrokeColorRGB(*(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255))  # stroke color
+
+            # Set line width (falls back to 0.25 OFD units on any parse error).
+            try:
+                LineWidth = (float(line.get("LineWidth", "0.25").replace(" ", "")) if \
+                                 line.get("LineWidth", "0.25").replace(" ", "") else 0.25) * self.OP
+            except Exception as e:
+                logger.error(f"{e}")
+                LineWidth = 0.25 * self.OP
+
+            canvas.setLineWidth(LineWidth)  # unit is points; 2 means 2 points
+            cur_point = []  # last pen position, used by the arc branch
+            for acticon in acticons:
+                if acticon.get("end_point").get("mode") == 'M':
+                    x, y = acticon.get("end_point").get("points")
+                    x = convert_coord([x], "x", page_size, pos)[0]
+                    y = convert_coord([y], "y", page_size, pos)[0]
+                    cur_point = [x, y]
+                    path.moveTo(x, y)
+
+                elif acticon.get("end_point").get("mode") == 'L':  # straight line
+                    x, y = acticon.get("end_point").get("points")
+                    print('path L x, y', x, y)
+                    x = convert_coord([x], "x", page_size, pos)[0]
+                    y = convert_coord([y], "y", page_size, pos)[0]
+                    print('path L x, y2', x, y)
+                    path.lineTo(x, y)
+
+
+                elif acticon.get("end_point").get("mode") == 'B':  # cubic Bezier
+                    x1, y1, x2, y2, x3, y3 = acticon.get("end_point").get("points")
+                    # print(x1, y1, x2, y2, x3, y3)
+                    x1, x2, x3 = convert_coord([x1, x2, x3], "x", page_size, pos)
+                    y1, y2, y3 = convert_coord([y1, y2, y3], "y", page_size, pos)
+                    cur_point = [x2, y2]
+                    path.curveTo(x1, y1, x2, y2, x3, y3)
+                    path.moveTo(x3, y3)
+
+                elif acticon.get("end_point").get("mode") == 'Q':  # quadratic Bezier
+                    x1, y1, x2, y2 = acticon.get("end_point").get("points")
+                    x1, x2 = convert_coord([x1, x2], "x", page_size, pos)
+                    y1, y2 = convert_coord([y1, y2], "y", page_size, pos)
+                    cur_point = [x2, y2]
+                    # Emulated as a degenerate cubic (end point doubled as control point).
+                    path.curveTo(x1, y1, x2, y2, x2, y2)
+                    path.moveTo(x2, y2)
+                elif acticon.get("end_point").get("mode") == 'A':  # arc
+                    x1, y1 = acticon.get("start_point").get("points")
+                    rx, ry, startAng, large_arc_flag, sweep_flag, x2, y2 = acticon.get("end_point").get("points")
+                    rx_o = rx
+                    ry_o = ry
+
+                    x1, x2, rx = convert_coord([x1, x2, rx], "x", page_size, pos)
+                    y1, y2, ry = convert_coord([y1, y2, ry], "y", page_size, pos)
+
+                    cur_x, cur_y = cur_point
+
+                    # Arc drawing — known to be buggy (author's own note kept below).
+                    if rx_o == ry_o:
+                        # path.circle(cur_x,cur_y, 20) # circle
+                        path.circle(rx, ry, 20)  # circle  # FIXME: produces an arbitrary circle
+                    else:
+                        print(rx, ry, x2, y2, startAng, large_arc_flag, sweep_flag)
+                        path.ellipse(rx, ry, 20, 20, )  # ellipse
+                    # path.arc(rx, ry, x2, y2, startAng=int(startAng), extent=int(sweep_flag))
+                    # path.ellipse(rx, ry,x2, y2, ) # ellipse
+                    # path.curveTo(rx, ry ,x2, y2, startAng=int(startAng), extent=int(sweep_flag))
+                    path.moveTo(x2, y2)
+                    cur_point = [x2, y2]
+
+                elif acticon.get("end_point").get("mode") == 'C':
+                    # canvas.drawPath(path)
+                    path.close()
+            canvas.drawPath(path)
+
+    def draw_line(self, canvas, line_list, page_size, pdf_page_size):
+        """
+        Draw OFD path objects onto the PDF canvas (current implementation).
+
+        Differences from the legacy variants: coordinates go through the CTM
+        transform via get_actural_p, and only M/L/C operators are rendered
+        (B/Q/A operators are parsed but silently ignored here).
+        """
+        def match_mode(Abbr: list):
+            """
+            Parse AbbreviatedData tokens into drawing operators.
+            Operators:
+            S  define the start point x, y
+            M  move to x, y
+            L  line from the current point to x, y
+            Q  x1 y1 x2 y2: quadratic Bezier from the current point to (x2,y2),
+               with (x1,y1) as the control point; current point becomes (x2,y2).
+            B  x1 y1 x2 y2 x3 y3: cubic Bezier from the current point to (x3,y3),
+               with (x1,y1) and (x2,y2) as control points; current point becomes (x3,y3).
+            A  operands rx ry angle large sweep x y: arc from the current point to (x,y),
+               current point becomes (x,y). rx is the ellipse major axis, ry the minor
+               axis, angle the ellipse rotation in the current coordinate system
+               (positive = clockwise, negative = counter-clockwise); large=1 means the
+               arc spans more than 180 degrees (0 = less than 180); sweep=1 means the
+               arc runs clockwise from start to end, 0 means counter-clockwise.
+            C  no operands: close the SubPath by joining the current point to its
+               start point with a straight segment.
+            """
+            relu_list = []
+            mode = ""  # NOTE(review): assigned but never read afterwards
+            modes = ["S", "M", "L", "Q", "B", "A", "C"]
+            mode_dict = {}
+            for idx, i in enumerate(Abbr):
+                if i in modes:
+                    mode = i
+                    # Flush the previous operator before starting a new one.
+                    if mode_dict:
+                        relu_list.append(mode_dict)
+                    mode_dict = {"mode": i, "points": []}
+
+                else:
+                    mode_dict["points"].append(i)
+
+                if idx + 1 == len(Abbr):
+                    relu_list.append(mode_dict)
+            return relu_list
+
+        def assemble(relu_list: list):
+            # Pair each operator with a start point; M/C/A are kept as actions too.
+            start_point = {}
+            acticon = []
+
+            for i in relu_list:
+                if i.get("mode") == "M":
+                    # Only the first M becomes the persistent start point.
+                    if not start_point:
+                        start_point = i
+                    acticon.append({
+                        "start_point": start_point, "end_point": i})
+
+                elif i.get("mode") in ['B', "Q", 'L']:
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+                elif i.get("mode") == "C":
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+                elif i.get("mode") == "A":
+                    acticon.append({"start_point": start_point,
+                                    "end_point": i
+                                    })
+                elif i.get("mode") == "S":
+                    start_point = i
+
+            return acticon
+
+        for line in line_list:
+            # print('one line', "="*20)
+            path = canvas.beginPath()
+            abbr = line.get("AbbreviatedData").split(" ")
+            color = line.get("FillColor", [0, 0, 0])
+
+            # Parse path operators.
+            relu_list = match_mode(abbr)
+            actions = assemble(relu_list)
+
+            # Transformation matrix (CTM) applied to all path coordinates.
+            ctm = line.get("CTM", '')
+            ctm = self.get_ctm(ctm)
+
+            # Bounding box of the path object; must be "left bottom width height".
+            boundary = line.get("pos")
+            if len(boundary) != 4:
+                print('draw_line not boundary', boundary)
+                # NOTE(review): returns from the whole method, skipping all
+                # remaining lines — presumably a `continue` was intended; confirm.
+                return
+
+            # Set stroke color (fall back to black when the triple is malformed).
+            if len(color) < 3:
+                color = [0, 0, 0]
+            canvas.setStrokeColorRGB(*(int(color[0]) / 255, int(color[1]) / 255, int(color[2]) / 255))  # stroke color
+
+            # Fixed line width (0.20 OFD units converted to points).
+            line_w = 0.20 * self.OP
+            canvas.setLineWidth(line_w)
+
+            for action in actions:
+                if action.get("end_point").get("mode") == 'M':
+                    x, y = action.get("end_point").get("points")
+                    # print('path M x, y', x, y)
+                    x, y = self.get_actural_p(x, y, ctm, boundary)
+                    x = x * self.OP
+                    y = pdf_page_size[3] - y * self.OP  # flip y: OFD top-left -> PDF bottom-left
+                    # print('path M x, y2', x, y)
+                    path.moveTo(x, y)
+
+                elif action.get("end_point").get("mode") == 'L':  # straight line
+                    x, y = action.get("end_point").get("points")
+                    # print('path L x, y', x, y)
+                    x, y = self.get_actural_p(x, y, ctm, boundary)
+                    # print('path L x, y1', x, y)
+                    x = x * self.OP
+                    y = pdf_page_size[3] - y * self.OP
+                    # print('path L x, y2', x, y)
+                    path.lineTo(x, y)
+
+                elif action.get("end_point").get("mode") == 'C':
+                    path.close()
+            canvas.drawPath(path)
+
+    def get_actural_p(self, x, y, ctm, boundary):
+        """
+        Map a path-local (x, y) point to page coordinates using the CTM scale
+        components and the object's bounding box offset.
+
+        ctm is a 6-tuple (a, b, c, d, e, f); only the scale terms a and d are
+        applied — the shear (b, c) and translation (e, f) components are
+        ignored here. NOTE(review): presumably intentional for axis-aligned
+        paths, but confirm against OFD CTM semantics.
+        boundary is (left, bottom, width, height).
+        """
+        x, y = float(x), float(y)
+        a, b, c, d, e, f = ctm
+        left, bottom, width, height = boundary
+        # print('left, x, a', left, x, a, type(left), type(x), type(a))
+        x = left + x * a
+        y2 = bottom + y * d
+        y1 = y2 + height  # NOTE(review): computed but unused — leftover from an earlier y-flip?
+        return x, y2
+
+    def draw_pdf(self):
+        """
+        Render every parsed OFD document in self.data into the in-memory PDF
+        (self.pdf_io) via reportlab.
+
+        Returns a dict {page_index: bool} marking pages that have both images
+        and annotation text — those pages presumably need rasterization by the
+        caller (TODO confirm against caller usage).
+        """
+        c = canvas.Canvas(self.pdf_io)
+        c.setAuthor(self.author)
+        page_need_to_image_dict = {}
+        for doc_id, doc in enumerate(self.data, start=0):
+            # print(1)
+            fonts = doc.get("fonts")
+            images = doc.get("images")
+            default_page_size = doc.get("default_page_size")
+            page_size_details = doc.get("page_size")
+            # print("page_size_details", page_size_details)
+            signatures_page_id = doc.get("signatures_page_id")  # signature info (currently unused below)
+            # annot_page_info = doc.get("annot_page_info")
+
+            # Font registration (disabled).
+            # for font_id, font_v in fonts.items():
+            #     file_name = font_v.get("FontFile")
+            #     font_b64 = font_v.get("font_b64")
+            #     if font_b64:
+            #         self.font_tool.register_font(os.path.split(file_name)[1], font_v.get("@FontName"), font_b64)
+
+            # Skip documents whose page count does not match the page-size list.
+            if len(doc.get("page_info")) != len(page_size_details):
+                print('len(doc.get("page_info")) != len(page_size_details)')
+                continue
+
+            # Sort page ids numerically; skip the document if ids are not integers.
+            page_id_list = list(doc.get("page_info").keys())
+            try:
+                page_id_list.sort(key=lambda x: int(x))
+            except:
+                traceback.print_exc()
+                print('sort page_id_list error!', page_id_list)
+                continue
+
+            # text_img_idwrite = []
+            # print("doc.get(page_info)", len(doc.get("page_info")))
+            for pi, page_id in enumerate(page_id_list):
+                page = doc.get("page_info").get(page_id)
+                annot_text_list = doc.get("page_info").get(page_id).get('annot_text_list')
+                # print('page111', page)
+                # print(f"page_id: {page_id} page_size_details: {page_size_details}")
+                # if len(page_size_details) > page_id and page_size_details[page_id]:
+                #     page_size = page_size_details[page_id]
+                # else:
+                #     page_size = default_page_size
+                page_size = page_size_details[pi]
+                # logger.info(f"page_id {page_id} page_size {page_size}")
+                text_list = page.get("text_list")
+                img_list = page.get("img_list")
+                line_list = page.get("line_list")
+                # print("img_list",img_list)
+                # print('page_size222', page_size)
+                # page_size is [x, y, w, h] in OFD units; convert to PDF points.
+                c.setPageSize((page_size[2] * self.OP, page_size[3] * self.OP))
+                pdf_page_size = [x * self.OP for x in page_size]
+
+                # print('len(img_list), len(images), len(text_list), len(line_list)', len(img_list), len(images), len(text_list), len(line_list))
+
+                # Draw images.
+                # print('annot_text_list', annot_text_list)
+                # if img_list and annot_text_list:
+                #     annot_page_size = doc.get("page_info").get(page_id).get('annot_page_size')
+                #     print('annot_page_size111', annot_page_size)
+                #     self.draw_img_with_annot(c, img_list, images, annot_page_size, pdf_page_size, self.OP, annot_text_list)
+
+                # Pages that have both images and annotation text are flagged.
+                if img_list and annot_text_list:
+                    page_need_to_image_dict[pi] = True
+                else:
+                    page_need_to_image_dict[pi] = False
+                if img_list:
+                    self.draw_img(c, img_list, images, page_size, pdf_page_size, self.OP)
+
+                # Draw text.
+                if text_list:
+                    # Normalize special glyphs (e.g. Kangxi radicals) to basic Chinese characters.
+                    for line_dict in text_list:
+                        text = line_dict.get("text")
+                        line_dict['text'] = special_font_to_normal(text)
+                        # print('draw_chars, text', text, line_dict.get('pos'))
+                    self.draw_chars(c, text_list, fonts, page_size, pdf_page_size)
+
+                # Draw line paths.
+                if line_list:
+                    # for line in line_list:
+                    #     print('line', line)
+                    self.draw_line(c, line_list, page_size, pdf_page_size)
+
+                # Draw signatures (disabled).
+                # if signatures_page_id:
+                #     self.draw_signature(c, signatures_page_id.get(page_id), page_size)
+
+                # print("about to write")
+                # print(doc_id,len(self.data))
+                # Old page-break guard logic (disabled):
+                # if page_id != len(doc.get("page_info")) - 1 and doc_id != len(self.data):
+                #     # print("writing")
+                #     c.showPage()
+                    # json.dump(text_write,open("text_write.json","w",encoding="utf-8"),ensure_ascii=False)
+                c.showPage()
+        c.save()
+        return page_need_to_image_dict
+
+    def __call__(self):
+        """
+        Render the OFD data to PDF and return the PDF bytes.
+
+        On any failure the error is logged, an empty PDF is generated instead
+        (gen_empty_pdf), and page_need_to_image_dict is reset to {} — the
+        caller always gets valid PDF bytes, never an exception.
+        """
+        try:
+            page_need_to_image_dict = self.draw_pdf()
+            self.page_need_to_image_dict = page_need_to_image_dict
+            pdfbytes = self.pdf_io.getvalue()
+        except Exception as e:
+            logger.error(f"{e}")
+            logger.error(f"ofd解析失败")
+            traceback.print_exc()
+            # Fall back to an empty PDF so downstream code still gets bytes.
+            self.gen_empty_pdf()
+            self.page_need_to_image_dict = {}
+            pdfbytes = self.pdf_io.getvalue()
+        return pdfbytes
+
+
+

+ 113 - 0
format_convert/easyofd/easyofd/draw/find_seal_img.py

@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: easyofd read_seal_img
+# CREATE_TIME: 2024/5/28 14:13
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: renoyuan
+# note: 根据 ASN.1 解析签章 拿到 签章图片
+import io
+import base64
+
+from PIL import Image, UnidentifiedImageError
+from loguru import logger
+from pyasn1.codec.der.decoder import decode
+from pyasn1.type import univ
+from pyasn1.error import PyAsn1Error
+
+
+
+class SealExtract(object):
+    """Extract embedded seal/stamp images from an OFD SignedValue blob via ASN.1 parsing."""
+    def __init__(self,):
+        pass
+    def read_signed_value(self, path="", b64=""):
+        """
+        Load the signed-value bytes (from base64 or a file path) and decode
+        them as a generic ASN.1 structure. Returns the decoded structure, or
+        None when decoding fails or no input was given.
+        """
+        # Read the raw binary data.
+        if b64:
+            binary_data = base64.b64decode(b64)
+        elif path:
+            # print("seal_path",path)
+            with open(path, 'rb') as file:
+                binary_data = file.read()
+        else:
+            return
+        # Try to decode as a generic ASN.1 structure.
+        try:
+            decoded_data, _ = decode(binary_data)
+        except (PyAsn1Error,) as e:
+            logger.warning(f"Decoding failed: {e}")
+            decoded_data = None
+        except (AttributeError,) as e:
+            logger.warning(f"AttributeError failed: {e}")
+            decoded_data = None
+        finally:
+           # NOTE(review): `return` inside `finally` suppresses any exception
+           # not caught above — confirm this best-effort behavior is intended.
+           return  decoded_data
+
+
+    def find_octet_strings(self, asn1_data,octet_strings:list):
+        """Recursively collect every OctetString instance into octet_strings (mutated in place)."""
+
+        # Recursively search for all OctetString instances.
+
+        if isinstance(asn1_data, univ.OctetString):
+
+            octet_strings.append(asn1_data)
+        elif isinstance(asn1_data, univ.Sequence) or isinstance(asn1_data, univ.Set):
+            for component in asn1_data:
+                self.find_octet_strings(asn1_data[f"{component}"], octet_strings)
+        elif isinstance(asn1_data, univ.Choice):
+            self.find_octet_strings(asn1_data.getComponent(), octet_strings)
+        elif isinstance(asn1_data, univ.Any):
+            # Any wraps opaque bytes: try decoding them as nested ASN.1.
+            try:
+                sub_data, _ = decode(asn1_data.asOctets())
+                self.find_octet_strings(sub_data, octet_strings)
+            except PyAsn1Error:
+                pass
+
+
+    def hex_to_image(self, hex_data, image_format='PNG',inx=0):
+        """
+        Convert hex-encoded image data to a PIL Image.
+
+        :param hex_data: hex string of the image bytes
+        :param image_format: image format, default 'PNG' (only used by the disabled save call)
+        :return: PIL Image, or None when the bytes are not a recognizable image
+        """
+        # Hex string -> raw bytes.
+
+        binary_data = bytes.fromhex(hex_data)
+
+        # Wrap in BytesIO so Pillow can read it.
+        image_stream = io.BytesIO(binary_data)
+
+        # Open with Pillow; None is returned implicitly if it is not an image.
+        try:
+            image = Image.open(image_stream)
+            # image.save(f'{inx}_image.{image_format}', format=image_format)
+            # print(f"image saved as 'image.{image_format}'")
+            return image
+        except UnidentifiedImageError:
+            # logger.info("not img ")
+            pass
+
+    def __call__(self, path="", b64=""):
+        """Decode the signed value and return a list of extracted PIL images (possibly empty)."""
+
+        decoded_data = self.read_signed_value(path=path, b64=b64)
+        octet_strings = []
+        img_list = []  # currently only one image is expected; multi-image association is future work
+        if decoded_data:
+            self.find_octet_strings(decoded_data, octet_strings)
+
+            for i, octet_string in enumerate(octet_strings):
+                # logger.info(f"octet_string{octet_string}")
+                # prettyPrint renders binary OctetStrings as "0x..." hex dumps.
+                if str(octet_string.prettyPrint()).startswith("0x"):
+
+                    img = self.hex_to_image(str(octet_string.prettyPrint())[2:],inx= i)
+                    if img:
+                        # logger.info("ASN.1 data found.")
+                        img_list.append(img)
+        else:
+            pass
+            # logger.info("No valid ASN.1 data found.")
+
+        return  img_list
+
+# Manual smoke test against a local SignedValue.dat sample (developer machine path).
+if __name__=="__main__":
+    print(SealExtract()(r"F:\code\easyofd\test\1111_xml\Doc_0\Signs\Sign_0\SignedValue.dat" ))
+

+ 216 - 0
format_convert/easyofd/easyofd/draw/font_tools.py

@@ -0,0 +1,216 @@
+#!/usr/bin/env python
+#-*- coding: utf-8 -*-
+#PROJECT_NAME: D:\code\easyofd\easyofd
+#CREATE_TIME: 2023-07-27 
+#E_MAIL: renoyuan@foxmail.com
+#AUTHOR: reno 
+#NOTE: 字体处理
+import sys
+import time
+import re
+import json
+import base64
+import zipfile
+import os
+import shutil
+import logging
+from io import BytesIO, StringIO
+import string
+from uuid import uuid1
+import random
+import traceback
+import logging
+
+
+import tempfile
+import xmltodict
+from fontTools.ttLib import TTFont as ttLib_TTFont
+from fontTools.pens.basePen import BasePen
+from reportlab.graphics.shapes import Path
+from reportlab.lib import colors
+from reportlab.graphics import renderPM
+from reportlab.graphics.shapes import Group, Drawing, scale
+from reportlab import platypus
+from reportlab.lib.pagesizes import letter, A4
+from reportlab.lib.units import mm,inch
+from reportlab.platypus import SimpleDocTemplate, Image
+from reportlab.lib.utils import ImageReader
+from reportlab.pdfgen import canvas
+from reportlab.pdfbase import pdfmetrics
+from reportlab.pdfbase.cidfonts import UnicodeCIDFont
+from reportlab.pdfbase.ttfonts import TTFont
+from concurrent.futures import ThreadPoolExecutor
+import threading
+import multiprocessing
+import PIL
+
+
+from reportlab.lib.fonts import _tt2ps_map 
+from reportlab.lib.fonts import _family_alias
+
+
+sys.path.append(os.path.dirname(__file__) + "/../../../../")
+
+from format_convert.easyofd.easyofd.draw import FONTS
+
+from loguru import logger
+
+
+
+class FontTool(object):
+    """Discover system-installed fonts and register OFD-embedded fonts with reportlab."""
+    FONTS = FONTS  # class-level default list, replaced per-instance in __init__
+    def __init__(self):
+        # Initially supported fonts: probe the system font directories once
+        # and cache the discovered names on this instance.
+        # logger.debug("FontTool init ,read system default Font ... ")
+        self.FONTS = self.get_installed_fonts()
+        # logger.debug(f"system default Font is \n{self.FONTS} \n{'-'*50}")
+
+
+    def get_system_font_dirs(self,):
+        """Return the font directories for the current operating system."""
+        system = os.name
+        if system == 'nt':  # Windows
+            return [os.path.join(os.environ['WINDIR'], 'Fonts')]
+        elif system == 'posix':  # Linux/macOS
+            return [
+                '/usr/share/fonts',
+                '/usr/local/share/fonts',
+                os.path.expanduser('~/.fonts'),
+                os.path.expanduser('~/.local/share/fonts'),
+                '/Library/Fonts',  # macOS
+                '/System/Library/Fonts'  # macOS
+            ]
+        else:
+            return []
+
+    def normalize_font_name(self, font_name):
+        """Normalize a font name, e.g. 'Times New Roman Bold' -> 'TimesNewRoman-Bold'."""
+        # Strip spaces, then join style suffixes (Bold/Italic/...) with a hyphen.
+        normalized = font_name.replace(' ', '')
+        # Handle common style suffixes.
+        for style in ['Bold', 'Italic', 'Regular', 'Light', 'Medium', ]:
+            if style in normalized:
+                normalized = normalized.replace(style, f'-{style}')
+
+        # TODO: special-case font name normalization; extend as more cases appear.
+        if normalized ==  "TimesNewRoman" :
+            normalized = normalized.replace("TimesNewRoman","Times-Roman")
+        return normalized
+
+    def _process_ttc_font(self, ttc_font):
+        """Collect usable font names from a TTC font's 'name' table."""
+        def judge_name(name):
+            # Filter out URL-like or overly long records that are not real names.
+            if 'http://' in name or 'https://' in name or len(name) > 50:
+                return False
+            else:
+                return True
+        font_names = set()
+        try:
+            # All available name records.
+            name_records = ttc_font['name'].names
+
+            for idx, record in enumerate(name_records):
+                try:
+                    # Prefer the Simplified Chinese name (Windows langID 2052).
+                    if record.platformID == 3 and record.langID == 2052:
+                        cn_name = record.toUnicode()
+                        if judge_name(cn_name):
+                            font_names.add(cn_name)
+
+
+
+                    # Fall back to the English name (langID 1033).
+                    elif record.platformID == 3 and record.langID == 1033:
+                        name = record.toUnicode()
+                        if judge_name(name):
+                            font_names.add(name)
+                except:
+                    continue
+        except KeyError:
+            # No 'name' table: skip.
+            pass
+        return font_names
+    def get_installed_fonts(self, ):
+        """Return names and families of all installed fonts, with '宋体' (SimSun) promoted to the front if present."""
+        font_dirs = self.get_system_font_dirs()
+        installed_fonts = set()
+        for font_dir in font_dirs:
+            if not os.path.isdir(font_dir):
+                continue
+            for root, _, files in os.walk(font_dir):
+                for file in files:
+                    if file.lower().endswith(('.ttf', '.otf','.ttc')):
+                        font_path = os.path.join(root, file)
+
+                        try:
+                            if file.lower().endswith('.ttc'):
+                                # TTC collections: read the first font's name table.
+                                ttc_font = ttLib_TTFont(font_path, fontNumber=0)  # read the first font
+                                installed_fonts.update(self._process_ttc_font(ttc_font))
+                            else:
+                                with ttLib_TTFont(font_path) as font:
+                                    # Extract the full name and family name,
+                                    # in both Simplified Chinese (2052) and English (1033).
+                                    name_cn = font['name'].getName(4, 3, 1, 2052)
+                                    if name_cn:
+                                        installed_fonts.add(name_cn.toUnicode())
+                                    # 4=Full Name, 3=Windows, 1=Unicode
+                                    name = font['name'].getName(4, 3, 1, 1033)
+                                    if name:
+                                        installed_fonts.add(name.toUnicode())
+                                    family_cn = font['name'].getName(1, 3, 1, 2052)
+                                    if family_cn:
+                                        installed_fonts.add(family_cn.toUnicode())
+                                    family = font['name'].getName(1, 3, 1, 1033)
+                                    if family:  # 1=Family Name
+                                        installed_fonts.add(family.toUnicode())
+                        except Exception as e:
+                            print(f"解析字体 {font_path} 失败: {e}")
+        installed_fonts = list(installed_fonts)
+        # Move SimSun to the front so it is preferred as a fallback.
+        if "宋体" in installed_fonts:
+            installed_fonts.remove("宋体")
+            installed_fonts.insert(0, "宋体")
+        return installed_fonts
+
+    def is_font_available(self, target_font):
+        """Check whether target_font is installed (rescans the font directories each call)."""
+        installed_fonts = self.get_installed_fonts()
+        return target_font in installed_fonts
+
+    
+    def font_check(self):
+        # Placeholder: reportlab registration check is currently disabled.
+        pass
+        # logger.info("f{_tt2ps_map}")
+        # logger.info("f{_family_alias}")
+        
+        # for font in self.FONTS:
+        #     if font in _tt2ps_map.values():
+        #         logger.info(f"registered {font}")
+        #     else:
+        #         logger.warning(f"-{font}- not registered; writing may fail")
+                 
+        
+        
+    def register_font(self,file_name,FontName,font_b64):
+        """
+        Register a base64-encoded embedded font with reportlab under FontName.
+
+        The font bytes are written to a temp file in the current directory,
+        registered via pdfmetrics, then the file is removed. Registration
+        errors are logged, never raised.
+        """
+        if font_b64:
+            
+            file_name = os.path.split(file_name)
+            # logger.error(f"file_name:{file_name}")
+            # logger.info(f"file_name:{file_name}")
+            if isinstance(file_name, (tuple, list)):
+                    file_name = file_name[1]
+            if not FontName:
+                # Fall back to the file's base name without extension.
+                FontName = file_name.split(".")[0]
+
+            try:
+                with open(file_name, "wb") as f:
+                    f.write(base64.b64decode(font_b64))
+                # print("FontName", FontName, "file_name", file_name)
+                pdfmetrics.registerFont(TTFont(FontName, file_name))
+                self.FONTS.append(FontName)
+            except Exception as e:
+                logger.error(f"register_font_error:\n{e} \n 包含不支持解析字体格式")
+            finally:
+                # Always clean up the temp font file.
+                if os.path.exists(file_name):
+                    os.remove(file_name)

+ 666 - 0
format_convert/easyofd/easyofd/draw/ofdtemplate.py

@@ -0,0 +1,666 @@
+#!/usr/bin/env python
+#-*- coding: utf-8 -*-
+#PROJECT_NAME: F:\code\easyofd\easyofd\draw
+#CREATE_TIME: 2023-10-30 
+#E_MAIL: renoyuan@foxmail.com
+#AUTHOR: reno 
+#note:  ofd 基础结构模板
+import tempfile
+import os
+import abc
+import copy
+
+from loguru import logger
+import xmltodict
+import zipfile
+
+__all__ = ["CurId", "OFDTemplate", "DocumentTemplate", "DocumentResTemplate",
+           "PublicResTemplate", "ContentTemplate", "OFDStructure"]
+"""
+OFD目录结构
+    │  OFD.xml
+    │  
+    └─Doc_0
+        │  Document.xml
+        │  DocumentRes.xml
+        │  PublicRes.xml
+        │  
+        ├─Annots
+        │  │  Annotations.xml
+        │  │  
+        │  └─Page_0
+        │          Annotation.xml
+        │          
+        ├─Attachs
+        │      Attachments.xml
+        │      original_invoice.xml
+        │      
+        ├─Pages
+        │  └─Page_0
+        │          Content.xml
+        │          
+        ├─Res
+        │      image_80.jb2
+        │      
+        ├─Signs
+        │  │  Signatures.xml
+        │  │  
+        │  └─Sign_0
+        │          Signature.xml
+        │          SignedValue.dat
+        │          
+        ├─Tags
+        │      CustomTag.xml
+        │      CustomTags.xml
+        │      
+        └─Tpls
+            └─Tpl_0
+                    Content.xml
+"""
class CurId(object):
    """Document-wide @ID allocator shared by all templates of one OFD doc."""

    def __init__(self):
        self.id = 1
        self.used = False
        # res_uuid -> generated @ID; filled while building resource files so
        # page construction can resolve ResourceID references afterwards.
        self.uuid_map = {}

    def add_uuid_map(self, k, v):
        """Record the @ID that was generated for resource uuid *k*."""
        self.uuid_map[k] = v

    def add(self):
        """Advance the counter by one."""
        self.id += 1

    def get_id(self):
        """Hand out the next free ID; the very first call returns 1."""
        if not self.used:
            self.used = True
            return self.id
        self.add()
        return self.id

    def get_max_id(self):
        """Return MaxUnitID: one past the highest ID handed out so far."""
        return self.id + 1
+
class TemplateBase(object):
    """Common machinery shared by the OFD xml templates.

    Subclasses supply:
      * ``ofdjson``       - the template dict (xmltodict shape)
      * ``key_map``       - constructor kwarg name -> xml tag to fill
      * ``id_keys``       - tags whose nodes receive a generated ``@ID``
      * ``template_name`` - label used in progress output
    """
    key_map = {}
    id_keys = []
    template_name = ""

    def __init__(self, *args, **kwargs):
        # CurId instance shared across the whole document
        self.id_obj: CurId = kwargs.get("id_obj")
        self.assemble(*args, **kwargs)

    def assemble(self, *args, **kwargs):
        """Deep-copy the template, fill caller values, then stamp IDs."""
        self.final_json = copy.deepcopy(self.ofdjson)

        # pour the caller-supplied values into the mapped xml tags
        for name, value in kwargs.items():
            if name in self.key_map:
                self.modify(self.final_json, self.key_map[name], value)

        # stamp @ID attributes on every node listed in id_keys
        for id_key in self.id_keys:
            print(f"开始gen_id >> {self.template_name}>>{id_key}")
            self.gen_id(self.final_json, id_key)

    def gen_id(self, ofdjson, id_key):
        """Recursively assign ``@ID`` to every node stored under *id_key*."""
        for node_key, node_val in ofdjson.items():
            if node_key == id_key:
                if isinstance(node_val, dict):
                    targets = [node_val]
                elif isinstance(node_val, list):
                    targets = node_val
                else:
                    targets = []
                for target in targets:
                    target["@ID"] = f"{self.id_obj.get_id()}"
            elif isinstance(node_val, dict):
                self.gen_id(node_val, id_key)
            elif isinstance(node_val, list):
                for entry in node_val:
                    if isinstance(entry, dict):
                        self.gen_id(entry, id_key)

    def modify(self, ofdjson, key, value):
        """Set every occurrence of *key* anywhere in the tree to *value*."""
        for node_key, node_val in ofdjson.items():
            if node_key == key:
                ofdjson[node_key] = value
            elif isinstance(node_val, dict):
                self.modify(node_val, key, value)
            elif isinstance(node_val, list):
                for entry in node_val:
                    if isinstance(entry, dict):
                        self.modify(entry, key, value)

    def save(self, path):
        """Serialize ``final_json`` to pretty-printed XML at *path*."""
        with open(path, "w", encoding="utf-8") as fh:
            fh.write(xmltodict.unparse(self.final_json, pretty=True))
+
class OFDTemplate(TemplateBase):
    """Root node, globally unique: OFD.xml.

    Carries document metadata (creator, date, DocID) and points the
    reader at Doc_0/Document.xml.
    """
    template_name = "OFD"
    # constructor kwarg -> xml tag receiving the value
    key_map = {"Author": "ofd:Author", "DocID": "ofd:DocID"  ,"CreationDate": "ofd:CreationDate"
    }

    ofdjson = {

        "ofd:OFD": {
            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
            "@Version": "1.1",
            "@DocType": "OFD",
            "ofd:DocBody": [{
                "ofd:DocInfo": {
                    "ofd:DocID": "0C1D4F7159954EEEDE517F7285E84DC4",
                    "ofd:Creator": "easyofd",
                    "ofd:author": "renoyuan",
                    "ofd:authoremail": "renoyuan@foxmail.com",
                    "ofd:CreatorVersion": "1.0",
                    "ofd:CreationDate": "2023-10-27"
                },
                "ofd:DocRoot": "Doc_0/Document.xml"
            }]
        }
    }
+
class DocumentTemplate(TemplateBase):
    """Unique per Doc: the document's internal structure (Document.xml).

    Lists the pages and shared resource files and records MaxUnitID.
    """
    template_name = "Document"
    key_map = {"Page": "ofd:Page","PhysicalBox":"ofd:PhysicalBox"}
    id_keys = ["ofd:Page"]
    ofdjson ={
    "ofd:Document": {
        "@xmlns:ofd": "http://blog.yuanhaiying.cn",
        "ofd:CommonData": {
            "ofd:MaxUnitID": 0,
            "ofd:PageArea": {
                "ofd:PhysicalBox": "0 0 140 90"
            },
            "ofd:PublicRes": "PublicRes.xml",
            "ofd:DocumentRes": "DocumentRes.xml"
        },
        "ofd:Pages":
            {
            "ofd:Page": [{
                "@ID": 0,
                "@BaseLoc": "Pages/Page_0/Content.xml"
            }]
        }
    }
}

    def update_max_unit_id(self, final_json=None):
        # Recursively locate ofd:MaxUnitID and set it to one past the
        # allocator's last ID; must run after all IDs were handed out.
        if not final_json:
            final_json = self.final_json

        for k, v in final_json.items():
            if k == "ofd:MaxUnitID":
                final_json["ofd:MaxUnitID"]=self.id_obj.get_max_id()
                return

            elif isinstance(v, dict):
                self.update_max_unit_id(v)
            elif isinstance(v, list):
                for v_cell in v:
                    if isinstance(v_cell, dict):
                        self.update_max_unit_id(v_cell)

    def update_page(self,page_num):
        # Placeholder: updating the page list is not implemented yet.
        pass
+
class DocumentResTemplate(TemplateBase):
    """Unique per Doc: MultiMedia resources such as images (DocumentRes.xml)."""
    template_name = "DocumentRes"
    key_map = {"MultiMedia": "ofd:MultiMedia"}
    id_keys = ["ofd:DrawParam", "ofd:MultiMedia"]
    ofdjson = {
        "ofd:Res": {
            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
            "@BaseLoc": "Res",
            "ofd:MultiMedias": {
                "ofd:MultiMedia": [
                    {
                        "@ID": 0,
                        "@Type": "Image",
                        "ofd:MediaFile": "Image_2.jpg"
                    }
                ]
            }
        }
    }
    def gen_id(self,ofdjson, id_key):
        """Assign @IDs like the base class, additionally recording each
        node's ``res_uuid`` -> @ID pair in ``id_obj`` so pages can resolve
        their ResourceID references later.

        NOTE(review): duplicated verbatim in PublicResTemplate — candidate
        for hoisting into TemplateBase.
        """
        for k, v in ofdjson.items():
            if k == id_key:
                # stamp the id, remembering the res_uuid mapping when present
                if isinstance(ofdjson[k], dict):
                    ofdjson[k]["@ID"] = f"{self.id_obj.get_id()}"

                    res_uuid = ofdjson[k].get("res_uuid")
                    if res_uuid:
                        self.id_obj.add_uuid_map(res_uuid, ofdjson[k]["@ID"])
                elif isinstance(ofdjson[k], list):
                    for i in ofdjson[k]:

                        i["@ID"] = f"{self.id_obj.get_id()}"
                        res_uuid = i.get("res_uuid")
                        if res_uuid:
                            self.id_obj.add_uuid_map(res_uuid, i["@ID"])

            elif isinstance(v, dict):
                self.gen_id(v, id_key)


            elif isinstance(v, list):
                for v_cell in v:
                    if isinstance(v_cell, dict):
                        self.gen_id(v_cell, id_key)
+
class PublicResTemplate(TemplateBase):
    """Unique per Doc: shared config resources such as Font and ColorSpace (PublicRes.xml)."""
    template_name = "PulicRes"  # (sic) typo kept; only used in progress output
    key_map = {"Font": "ofd:Font"}
    id_keys = ["ofd:ColorSpace", "ofd:Font"]
    ofdjson = {
        "ofd:Res": {
            "@xmlns:ofd": "http://blog.yuanhaiying.cn",
            "@BaseLoc": "Res",
            "ofd:ColorSpaces": {
                "ofd:ColorSpace": {
                    "@ID": 0,
                    "@Type": "RGB",
                    "@BitsPerComponent": "8",
                    "#text":""
                }
            },
            "ofd:Fonts": {
                "ofd:Font": [
                {
                    "@ID": 0,
                    "@FontName": "宋体",
                    "@FamilyName": "宋体",

                }
            ]
            }
        }
    }
    def gen_id(self,ofdjson, id_key):
        """Assign @IDs like the base class, additionally recording each
        node's ``res_uuid`` -> @ID pair in ``id_obj``.

        NOTE(review): duplicated verbatim in DocumentResTemplate.
        """
        for k, v in ofdjson.items():
            if k == id_key:
                # stamp the id, remembering the res_uuid mapping when present
                if isinstance(ofdjson[k], dict):
                    ofdjson[k]["@ID"] = f"{self.id_obj.get_id()}"
                    res_uuid = ofdjson[k].get("res_uuid")
                    if res_uuid:
                        self.id_obj.add_uuid_map(res_uuid, ofdjson[k]["@ID"])
                elif isinstance(ofdjson[k], list):
                    for i in ofdjson[k]:

                        i["@ID"] = f"{self.id_obj.get_id()}"
                        res_uuid = i.get("res_uuid")
                        if res_uuid:
                            self.id_obj.add_uuid_map(res_uuid, i["@ID"])

            elif isinstance(v, dict):
                self.gen_id(v, id_key)


            elif isinstance(v, list):
                for v_cell in v:
                    if isinstance(v_cell, dict):
                        self.gen_id(v_cell, id_key)
+
+'''
+    "ofd:Font": [
+
+    {
+        "@ID": 0,
+        "@FontName": "STSong",
+        "@FamilyName": "SimSun",
+        "@Serif": "true",
+        "@FixedWidth": "true",
+        "@Charset": "prc"
+    }
+            "ofd:Area": {
+            "ofd:PhysicalBox": "0 0 210 140"
+        },
+'''
+
+
class ContentTemplate(TemplateBase):
    """Page body template: Content.xml.

    After base-class assembly, __init__ resolves the temporary
    ``res_uuid`` markers into the real @ID values recorded in
    ``id_obj.uuid_map`` so that TextObject/@Font and
    ImageObject/@ResourceID point at the right resources.
    """
    #"@Type": "Body",
    template_name = "Content"
    key_map = {"ImageObject": "ofd:ImageObject",
               "PathObject": "ofd:PathObject",
               "TextObject": "ofd:TextObject",
               "CGTransform": "ofd:CGTransform",
               "PhysicalBox": "ofd:PhysicalBox",
               }
    id_keys = ["ofd:Layer", "ofd:TextObject", "ofd:PathObject", "ofd:Clips", "ofd:ImageObject"]
    # object tag -> attribute that must carry a resolved resource @ID
    correlate_map = {"ofd:TextObject": "@Font",
                     "ofd:ImageObject": "@ResourceID"

                     }

    ofdjson = {
    "ofd:Page": {
        "@xmlns:ofd": "http://blog.yuanhaiying.cn",

        "ofd:Content": {
            "ofd:PageArea": {
                "ofd:PhysicalBox": "0 0 210 140"
            },
            "ofd:Layer":  {
                "@ID": 0,
                "@Type": "Foreground",


                "ofd:TextObject": [{
                        "@ID": 0,
                        "@CTM": "7.054 0 0 7.054 0 134.026",
                        "@Boundary": "69 7 72 7.6749",
                        "@Font": "69",
                        "@Size": "6.7028",
                        "ofd:FillColor": {
                            "@ColorSpace": "4",
                            "@Value": "156 82 35"
                        },
                        "ofd:CGTransform": {
                            "@CodePosition": "0",
                            "@CodeCount": "10",
                            "@GlyphCount": "10",
                            "ofd:Glyphs": "18 10 11 42 60 53 24 11 42 61"
                        },
                        "ofd:TextCode": {
                            "@X": "13.925",
                            "@Y": "10",
                            "@DeltaX": "7 7 7 7 7 7 7 7 7",
                            "#text": "电⼦发票(普通发票)"
                        }
                    }],
                "ofd:ImageObject": []
                }
        }}}

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # rewrite res_uuid markers into the resolved resource ids
        for key, targe_key in self.correlate_map.items():
            self.correlate_res_uuid(self.final_json, key, targe_key)

    def correlate_res_uuid(self, ofdjson, key, targe_key):
        """Replace ``res_uuid`` markers under *key* with the mapped @ID.

        BUG FIX: the original dict branch read ``v_cell`` before it was
        assigned (NameError, or a stale value leaked from an earlier loop
        iteration); it must pop the marker from ``v`` itself.
        """
        print("========uuid_map", self.id_obj.uuid_map)
        for k, v in ofdjson.items():
            if k == key:
                if isinstance(v, dict):
                    res_uuid = v.pop("res_uuid", None)
                    if res_uuid:
                        v[targe_key] = self.id_obj.uuid_map[res_uuid]
                elif isinstance(v, list):
                    for v_cell in v:
                        if isinstance(v_cell, dict):
                            res_uuid = v_cell.pop("res_uuid", None)
                            if res_uuid:
                                v_cell[targe_key] = self.id_obj.uuid_map[res_uuid]
            elif isinstance(v, dict):
                self.correlate_res_uuid(v, key, targe_key)
            elif isinstance(v, list):
                for v_cell in v:
                    if isinstance(v_cell, dict):
                        self.correlate_res_uuid(v_cell, key, targe_key)
+
+
+'''
+                "ofd:PathObject": [{
+                        "@ID": 0,
+                        "@CTM": "0.3527 0 0 -0.3527 0.35 141.43001",
+                        "@Boundary": "-0.35 -0.35 212.33 141.78999",
+                        "@LineWidth": "1",
+                        "@MiterLimit": "10",
+                        "@Stroke": "false",
+                        "@Fill": "true",
+                        "ofd:FillColor": {
+                            "@ColorSpace": "4",
+                            "@Value": "255 255 255"
+                        },
+                        "ofd:StrokeColor": {
+                            "@ColorSpace": "4",
+                            "@Value": "0 0 0"
+                        },
+                        "ofd:Clips": {
+                            "ofd:Clip": {
+                                "ofd:Area": {
+                                    "ofd:Path": {
+                                        "@ID": 0,
+                                        "@Boundary": "0.00766 -0.00763 600 400.00003",
+                                        "@Stroke": "false",
+                                        "@Fill": "true",
+                                        "ofd:AbbreviatedData": "M 0 0 L 600 0 L 600 400.00003 L 0 400.00003 C"
+                                    }
+                                }
+                            }
+                        },
+                        "ofd:AbbreviatedData": "M -1 401 L 601 401 L 601 -1 L -1 -1 C"
+                    },],
+                
+"ofd:ImageObject": [{
+                        "@ID": 0,
+                        "@CTM": "19.7512 0 0 19.7512 0 0",
+                        "@Boundary": "7.23035 7.40671 19.7512 19.7512",
+                        "@ResourceID": "104"
+                    }],
+'''
+
+class OFDStructure(object):
+    """OFD structure"""
+    def __init__(self, name, ofd=None, document=None,
+                 document_res=None, public_res=None,
+                  content_res:list=[], res_static: dict={}):
+        # 初始化的时候会先自动初始化 默认参数值
+        id_obj = CurId()
+        self.name = name
+        self.ofd = ofd if ofd else OFDTemplate(id_obj=id_obj)
+        self.document = document if document else DocumentTemplate(id_obj=id_obj)
+        self.document_res = document_res if document_res else  DocumentResTemplate(id_obj=id_obj)
+        self.public_res = public_res if public_res else PublicResTemplate(id_obj=id_obj)
+        self.content_res = content_res if content_res else [ContentTemplate(id_obj=id_obj)]
+        self.res_static = res_static
+       
+    def __call__(self, test=False):
+        """写入文件生成ofd"""
+        with tempfile.TemporaryDirectory() as t_dir:
+            if test:
+                temp_dir = r"./test"
+                os.mkdir(temp_dir)
+            else:
+                temp_dir = t_dir
+            # 创建过程目录
+            temp_dir_doc_0 = os.path.join(temp_dir, 'Doc_0')
+            temp_dir_pages = os.path.join(temp_dir, 'Doc_0', "Pages")
+            temp_dir_res = os.path.join(temp_dir, 'Doc_0', "Res")  # 静态资源路径
+            for i in [temp_dir_doc_0, temp_dir_pages, temp_dir_res]:
+                # print(i)
+                os.mkdir(i)
+
+            # 写入 OFD
+            self.ofd.save(os.path.join(temp_dir, 'OFD.xml'))
+
+            # 更新 max_unit_id & 写入 Document
+            self.document.update_max_unit_id()
+            self.document.save(os.path.join(temp_dir_doc_0, 'Document.xml'))
+
+            # 写入 DocumentRes
+            self.document_res.save(os.path.join(temp_dir_doc_0, 'DocumentRes.xml'))
+
+            # 写入 PublicRes
+            self.public_res.save(os.path.join(temp_dir_doc_0, 'PublicRes.xml'))
+
+            # 写入 content_res
+            for idx, page in enumerate(self.content_res):
+                temp_dir_pages_idx = os.path.join(temp_dir_pages, f"Page_{idx}")
+                os.mkdir(temp_dir_pages_idx)
+                # os.mkdir(i)
+                page.save(os.path.join(temp_dir_pages_idx, 'Content.xml'))
+
+            # 写入静态资源
+            for k, v in self.res_static.items():
+                  with open(os.path.join(temp_dir_res, k), "wb") as f:
+                      f.write(v)
+
+            # 打包成ofd
+            zip = zipfile.ZipFile("test.ofd", "w", zipfile.ZIP_DEFLATED)
+            for path, dirnames, filenames in os.walk(temp_dir):
+                # 去掉目标跟路径,只对目标文件夹下边的文件及文件夹进行压缩
+                fpath = path.replace(temp_dir, '')
+
+                for filename in filenames:
+                    zip.write(os.path.join(path, filename), os.path.join(fpath, filename))
+            zip.close()
+            with open("test.ofd", "rb") as f:
+                content = f.read()
+            if os.path.exists("test.ofd"):
+               os.remove("test.ofd")
+            return content
+
if  __name__ == "__main__":
    # Demo: build a minimal one-page OFD containing a text object.
    print("---------")
    # static resource payload (image bytes); left empty for the demo
    img_path = r"F:\code\easyofd\test\test_img0.jpg"
    # with open(img_path, "rb") as f:
    #     content = f.read()
    content = b""
    res_static = {"Image_0.jpg": content}

    # build the element data passed into the templates
    font = [
            {

                "@FontName": "宋体",
                "@FamilyName": "宋体",

            }
            ]

    MultiMedia = [
                {

                    "@Type": "Image",
                    "ofd:MediaFile": "Image_0.jpg"
                }
            ]

    ImageObject = [{

                        "@CTM": "200 0 0 140 0 0",
                        "@Boundary": "0 0 200 140",
                        "@ResourceID": "55"
                    }]
    TextObject = [
        {


        "@Boundary": "50 5 100 20",
        "@Font": "2",
        "@Size": "5",
        "ofd:FillColor": {

            "@Value": "156 82 35",
            "@ColorSpace" : "1"
        },

        "ofd:TextCode": {
            "@X": "5",
            "@Y": "5",
            "@DeltaX": "7 7 7 7 7 7 7 7 7",
            "#text": "电⼦发票(普通发票)"
        }
    }, {


        "@Boundary": "0 0 100 100",
        "@Font": "2",
        "@Size": "10",
        "ofd:FillColor": {

            "@Value": "156 82 35"
        },

        "ofd:TextCode": {
            "@X": "0",
            "@Y": "0",
            "@DeltaX": "0",
            "#text": "电"
        }
    }
    ]

    # instantiate the templates, all sharing one id allocator
    id_obj = CurId()
    print("id_obj实例化", id_obj)

    ofd = OFDTemplate(id_obj=id_obj)
    document = DocumentTemplate(id_obj=id_obj)
    public_res = PublicResTemplate(Font=font, id_obj=id_obj)
    document_res = DocumentResTemplate(MultiMedia=MultiMedia, id_obj=id_obj)
    # ImageObject=ImageObject
    content_res = ContentTemplate(CGTransform=[], PathObject=[], TextObject=TextObject, ImageObject=[], id_obj=id_obj)



    ofd_byte = OFDStructure("123",ofd=ofd, document=document,public_res=public_res,
                            document_res=document_res, content_res=[content_res], res_static=res_static)(test=True)

    with open("test.ofd", "wb") as f:
        content = f.write(ofd_byte)

+ 966 - 0
format_convert/easyofd/easyofd/draw/pdf_parse.py

@@ -0,0 +1,966 @@
+import os
+import re
+import io
+
+import json
+import time
+import copy
+import string
+import random
+from uuid import uuid1
+from decimal import Decimal
+from collections import OrderedDict
+
+# 第三方包
+import fitz
+from PIL import Image
+# import pdfplumber
+
# Canonical export list: Python's star-import honours __all__, not the
# inert __ALL__ spelling the original defined.
__all__ = ['pdf_ocr', "DPFParser"]
__ALL__ = __all__  # legacy alias kept for backward compatibility
+
class MyEncoder(json.JSONEncoder):
    """JSON encoder that also serializes bytes (via str) and Decimal (via float)."""

    def default(self, obj):
        if isinstance(obj, Decimal):
            return float(obj)
        if isinstance(obj, bytes):
            return str(obj)
        return super().default(obj)
+
class DPFParser(object):
    """Extract text spans, images and page sizes from a PDF via PyMuPDF (fitz)."""

    def __init__(self, ):
        # stateless; all methods take the pdf bytes explicitly
        pass

    def extract_text_with_details(self, pdf_bytes):
        """
        Extract every page's text with position and font information.

        :param pdf_bytes: raw PDF bytes
        :return: (details_list, res_uuid_map) where details_list is a list
            (one entry per page) of dicts describing text spans
            (type == "text") and images (type == "img"), and res_uuid_map
            registers fonts, image byte streams and page sizes under
            generated uuids.
        """
        details_list = []
        pdf_stream = io.BytesIO(pdf_bytes)

        # open the BytesIO object directly with fitz.open

        with fitz.open(stream=pdf_stream, filetype="pdf") as doc:
            res_uuid_map = {
                "img": {},
                "font": {},
                "other": {}
            } # global resource registry: uuid -> font name / image stream
            for page_num in range(len(doc)):


                page_details_list = []  # per-page collected info
                page = doc.load_page(page_num)
                rect = page.rect
                width = rect.width
                height = rect.height
                # record the page size under "other" -> "page_size"
                if res_uuid_map["other"].get("page_size"):
                    res_uuid_map["other"]["page_size"][page_num] = [width,height]
                else :
                    res_uuid_map["other"]["page_size"] = {page_num: [width, height]}
                blocks = page.get_text("dict").get("blocks")  # text block info
                image_list = page.get_images(full=True)  # all image records on the page
                # collect the page's text spans
                for block in blocks:
                    block_text = block.get("text", "")
                    block_rect = block["bbox"]  # block bounding box [x0, y0, x1, y1]

                    # walk every line of the block
                    for line in block.get("lines", []):
                        line_text = line.get("spans", [{}])[0].get("text", "")  # first span's text
                        line_rect = line["bbox"]  # line bounding box

                        # walk every span of the line to get font info
                        for span in line.get("spans", []):
                            span_text = span.get("text", "")
                            font_size = span.get("size")  # font size
                            font_name = span.get("font")  # font name
                            res_uuid = None
                            # reuse the uuid of an already-seen font name
                            if font_name not in res_uuid_map["font"].values():
                                res_uuid = str(uuid1())
                                res_uuid_map["font"][res_uuid] = font_name
                            else:
                                keys = list(res_uuid_map["font"].keys())
                                vs = list(res_uuid_map["font"].values())
                                idx = vs.index(font_name)
                                res_uuid =keys[idx]
                            font_color = span.get("color")  # font color (may be absent)
                            # NOTE(review): the whole line's bbox is used for
                            # each span; per-span boxes would need more work
                            span_rect = (
                            line_rect[0], line_rect[1], line_rect[2], line_rect[3])

                            # print or store the info
                            print(
                                f"Page: {page_num }, Text: '{span_text}', Font: {font_name}, Size: {font_size}, "
                                f"Color: {font_color}, Rect: {span_rect} ,res_uuid {res_uuid}")

                            # store in the result list (adjust the shape as needed)
                            page_details_list.append({
                                "page": page_num,
                                "text": span_text,
                                "font": font_name,
                                "res_uuid": res_uuid,
                                "size": font_size,
                                "color": font_color,
                                "bbox": list(span_rect),
                                "type": "text"
                            })

                for image_index, img_info in enumerate(image_list):
                    # resolve the image record
                    xref = img_info[0]
                    base_image = doc.extract_image(xref)

                    image_data = base_image["image"]  # raw image bytes
                    res_uuid = str(uuid1())

                    img_io = io.BytesIO(image_data)
                    res_uuid_map["img"][res_uuid] = img_io
                    image_type = base_image["ext"]  # image file extension
                    smask = base_image["smask"]  # soft-mask xref
                    xres = base_image["xres"]  # horizontal resolution
                    yres = base_image["yres"]  # vertical resolution
                    width = base_image["width"]  # image width
                    height = base_image["height"]  # image height



                    # NOTE(review): xres/yres are resolutions (dpi), not page
                    # coordinates — using them as the origin looks wrong;
                    # page.get_image_rects(xref) would give real placement.
                    x0, y0, x1, y1 = xres, yres,xres+width,yres+height
                    print(
                        f"Page: {page_num}, image_type: '{image_type}',x0{x0}, y0{y0}, x1{x1}, y1{y1}  ")
                    page_details_list.append({
                        "page": page_num,
                        "index": image_index,
                        "x0": x0,
                        "y0": y0,
                        "x1": x1,
                        "y1": y1,
                        "bbox": [x0,y0,width,height],
                        "width": width,
                        "height": height,
                        "res_uuid": res_uuid,
                        "image_type": image_type,
                        "type": "img"
                    })

                details_list.append(page_details_list)
        return details_list, res_uuid_map
    def to_img(self, buffer_pdf):
        """Render every page of the pdf bytes to a fitz Pixmap (pdf2img)."""
        pix_list = []
        pdfDoc = fitz.open(stream=buffer_pdf)
        for pg in range(pdfDoc.page_count):
            page = pdfDoc[pg]
            rotate = int(0)
            # Zoom factor per axis; without it the default render is
            # 792x612 at dpi=96 (1.33333333 -> 1056x816, 2 -> 1584x1224).
            zoom_x = 1.33333333 #(1.33333333-->1056x816)   (2-->1584x1224)
            zoom_y = 1.33333333
            # zoom_x,zoom_y = (1,1)
            mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
            pix = page.get_pixmap(matrix=mat, alpha=False)


            pix_list.append(pix)
        return pix_list
           
            
            
    def get_size(self):
        # Placeholder: not implemented.
        pass
+    
def coast_time(func):
    '''
    Decorator: print how long each call to *func* takes (perf_counter seconds).

    :param func: the callable to time
    :return: a wrapper that forwards all args and returns func's result
    '''
    import functools  # local import; the module header does not import it

    # FIX: without functools.wraps the wrapper clobbered the wrapped
    # function's __name__/__doc__, breaking introspection and debugging.
    @functools.wraps(func)
    def fun(*agrs, **kwargs):
        t = time.perf_counter()
        result = func(*agrs, **kwargs)
        print(f'function {func.__name__} coast time: {time.perf_counter() - t:.8f} s')
        return result
    return fun
+
+
class BaseInit:
    '''
    Basic information needed to parse one pdf: file names, output folders,
    separators and thresholds shared by the parsing pipeline.
    '''

    def __init__(self, pdf_path, output_path):
        """
        :param pdf_path: path of the pdf to parse
        :param output_path: root folder for every generated artifact
        """
        self.file_path = pdf_path
        self.output_path = output_path
        # file name, e.g. "doc.pdf"
        self.file_name = os.path.basename(self.file_path)
        # extension including the dot, e.g. ".pdf" ("" when absent)
        self.fileType = os.path.splitext(self.file_path)[-1]
        # BUG FIX: the original used file_name[:-len(fileType)], which
        # evaluates to "" for extension-less files (slice [:-0]);
        # splitext handles that edge case correctly.
        self.file_no_suffix = os.path.splitext(self.file_name)[0]
        # 62-symbol alphabet (52 letters + digits 0-9) used by genShortId
        self.uuidChars = tuple(list(string.ascii_letters) + list(range(10)))
        # table placeholder / separator characters
        self.divide = ':'
        self.solid = ''
        # iou ratio threshold
        self.iou_rate = 0.001
        # create the intermediate directories the pipeline needs
        self.init_file()

    def init_file(self):
        """
        Create the folders produced while parsing (page images + json results).
        """
        self.image_folder_path = os.path.join(self.output_path, 'pdf_img_save')
        self.json_folder_path = os.path.join(self.output_path, 'json')
        self.ocr_result_path = os.path.join(self.json_folder_path, self.file_no_suffix + '.json')
        # more (txt, ...) may follow; the current flow needs these
        for path in [self.image_folder_path, self.json_folder_path]:
            if not os.path.exists(path):
                os.makedirs(path)

    def genShortId(self, length=12):
        """
        Generate a short random id.

        :param length: total length of the generated id (default 12)
        """
        uuid = str(uuid1()).replace('-', '')
        result = ''
        for i in range(0, 8):
            sub = uuid[i * 4: i * 4 + 4]
            x = int(sub, 16)
            result += str(self.uuidChars[x % 0x3E])
        return result + ''.join(random.sample(uuid, length - 8))
+
+
class PageInfo(BaseInit):
    '''
    Registry of the images and tables discovered on each page.

    NOTE: __page_image/__page_table are class attributes, so the recorded
    state is shared by every instance in the process.
    '''
    __page_image = {}
    __page_table = {}

    @classmethod
    def add_image(cls, page_num, image):
        """Record *image* under *page_num*."""
        cls.__page_image.setdefault(page_num, []).append(image)

    @classmethod
    def add_table(cls, page_num, table):
        """Record *table* under *page_num*."""
        cls.__page_table.setdefault(page_num, []).append(table)

    @classmethod
    def get_image(cls, page_num):
        """Images recorded for *page_num* (empty list when none)."""
        return cls.__page_image.get(page_num, [])

    @classmethod
    def get_table(cls, page_num):
        """Tables recorded for *page_num* (empty list when none)."""
        return cls.__page_table.get(page_num, [])

    @classmethod
    def save_image(cls, output_path, file):
        '''
        Dump every recorded image to <output_path>/page_img_save/.

        :param output_path: destination root folder
        :param file: source file name; its stem prefixes each image name
        '''
        file = file.split('.')[0]
        for images in cls.__page_image.values():
            for image in images:
                payload = image['objContent']
                name = image['name']
                img_dir = os.path.join(output_path, 'page_img_save')
                img_path = os.path.join(img_dir, file + '_' + name + '.jpg')
                if not os.path.exists(img_dir):
                    os.mkdir(img_dir)
                with open(img_path, 'wb') as fp:
                    fp.write(payload)
+
+
+class ParseFile(PageInfo):
+
+    def __init__(self, pdf_path, output_path, table_type='v2', is_save=True):
+        """
+        :param pdf_path: path of the pdf file to parse
+        :param output_path: directory for json/image output (see BaseInit)
+        :param table_type: 'v2' merges table rows into the line list;
+            anything else keeps lines and tables separate
+        :param is_save: whether to dump the final result to disk
+        """
+        super().__init__(pdf_path, output_path)
+        print('初始化 pdf 对象:{}'.format(self.file_path))
+        self.is_save = is_save
+        self.table_type = table_type
+        # v1 result list: text lines and tables kept separate.
+        self.page_result_list = []
+        # v2 result list: table rows merged into the line list.
+        self.combine_page_result_list = []
+
+    @coast_time
+    def get_result(self):
+        """Load the pdf, parse every page and return the per-page results."""
+        self.load_pdf()
+        result = self.parse_pdf()
+        # Cached so callers can re-read it without reparsing.
+        self.ocr_result = result
+        print(f'解析完成:共 {len(result)} 页  表格类型: {self.table_type}')
+        return result
+
+    def load_pdf(self):
+        self.fitz_doc = fitz.open(self.file_path, filetype='pdf')
+        # self.pdfplum_doc_pages = pdfplumber.open(self.file_path).pages
+        # assert len(self.fitz_doc) == len(self.pdfplum_doc_pages)
+
+    def parse_pdf(self):
+        for page_no, fitz_doc in enumerate(self.fitz_doc):
+            # 测试
+            # if page_no != 25:
+            #     continue
+            self.height = fitz_doc.get_text('dict')['height']
+            self.width = fitz_doc.get_text('dict')['width']
+            # 聚合fitz页面解析的字符, 行, 块信息
+            line_list = self.group_block(page_no, fitz_doc)
+            # 获取页面表格信息
+            table_list = self.extract_table(page_no, self.pdfplum_doc_pages[page_no])
+            # 计算表格行列合并信息
+            table_list = list(CalcTableRL(table_list).run())
+            # 获取页面图片信息
+            image_list = self.get_image(page_no)
+            # 构造每页最终返回结果,
+            page_result = self.construct_final_result(line_list, page_no, image_list, table_list)
+
+            if self.table_type == 'v2':
+                # 合并成ocr所需格式:表格合并至行列表
+                combine_page_result_list = self.combine_table_v2(page_result)
+                page_result = self.construct_final_result(combine_page_result_list, page_no, image_list, table_list)
+
+            self.page_result_list.append(page_result)
+            if page_no and  page_no % 10 == 0:
+                print(f'解析前 {page_no} 页完成')
+        final_result_list = copy.deepcopy(self.page_result_list)
+        # 转换为符合ocr解析格式
+        if self.table_type == 'v2':
+            final_result_list = self.reform_ocr_result(final_result_list)
+        # 2023/09/26 保存之前加入 contIndex 给后续 抽取模型使用
+        for page_num, page in enumerate(final_result_list):
+            if not page.get('lineList'):
+                break
+            contIndex = {}
+            for line in page['lineList']:
+                line_bak = dict(copy.copy(line))
+                line_bak["objType_postpreprocess"] = f"{line_bak.get('objType','textLine')}_postpreprocess"
+                contIndex[line_bak["lineId"]] = line_bak
+            
+            page["contIndex"] = contIndex
+            for line in page['lineList']:
+                print(page_num, line['objType'], line['objContent'])
+        # 保存至本地
+        if self.is_save:
+            self.save_result(final_result_list)
+        for page_num, page in enumerate(final_result_list):
+            for line in page['lineList']:
+                print(page_num, line['objType'], line['objContent'])
+        return final_result_list
+
+    def combine_table_v2(self, page_result):
+        lineList = page_result['lineList']
+        table_list = page_result['table_list']
+        # 先进行表格行、非表格行划分 减少后续操作的时间杂度
+        __notable_lines, __all_table_lines = self.filter_table_line(lineList, table_list)
+        notable_lines, all_table_lines = copy.deepcopy(__notable_lines), copy.deepcopy(__all_table_lines)
+        del __notable_lines, __all_table_lines, lineList
+        # 整合
+        combine_page_result_list = self.combine_table_with_line(notable_lines, all_table_lines, table_list)
+        return combine_page_result_list
+
+    def filter_table_line(self, lineList, table_list):
+        '''
+        Split ``lineList`` into table and non-table lines.
+
+        A literal 'table' marker string is inserted into the non-table
+        list at each table's position so that combine_table_with_line can
+        splice the merged table rows back in later.
+
+        NOTE: ``lineList`` is consumed (popped) in page order; both it and
+        ``table_list`` are assumed sorted top-to-bottom.
+
+        :return: (__notable_lines, __all_table_lines) - the second item
+            holds one sub-list of matching lines per table
+        '''
+        __notable_lines = []
+        __all_table_lines = []
+        for table_info in table_list:
+            table_bbox = table_info['objPos']
+            # Lines belonging to the current table.
+            __sub_table_lines = []
+            is_iter_table = False
+            while lineList:
+                line = lineList.pop(0)
+                line_bbox = line['objPos']
+                # Guard against empty-table false positives: once the
+                # line's top Y passes the table's bottom Y, stop scanning.
+                table_y, line_y = table_bbox[3], line_bbox[1]
+                if line_y >= table_y:
+                    lineList.insert(0, line)
+                    break
+                iou = self.count_iou(table_bbox, line_bbox)
+                # Line overlaps the table region.
+                if iou > 0:
+                    __sub_table_lines.append(line)
+                    # First table line: drop a placeholder marker.
+                    if not is_iter_table:
+                        is_iter_table = True
+                        __notable_lines.append('table')
+                elif iou <= 0 and not is_iter_table:
+                    __notable_lines.append(line)
+                # Left the table region after having entered it.
+                elif iou <= 0 and is_iter_table:
+                    lineList.insert(0, line)
+                    line_index, flag = self.more_judge(table_bbox, lineList)
+                    if flag:
+                        # A later line still belongs to this table (e.g.
+                        # multi-column page or split table): skip ahead
+                        # and keep scanning from there.
+                        __notable_lines.extend(lineList[:line_index])
+                        lineList = lineList[line_index:]
+                    else:
+                        break
+            __all_table_lines.append(__sub_table_lines)
+        # Whatever remains after the last table is plain text.
+        if lineList:
+            __notable_lines.extend(lineList)
+        return __notable_lines, __all_table_lines
+
+    def more_judge(self, table_bbox, lineList, max_judge=6):
+        '''
+        判断后续行列表是否还存在属于当前表格的行
+        对于表格、行界限不明显的额外判断 如: 页面分栏、表格不全
+        :return 是否存在 True | False
+        '''
+        # 往后多判断 max_judge 行
+        if len(lineList) < max_judge:
+            judge_lines = lineList
+        else:
+            judge_lines = lineList[:max_judge]
+        for index, line in enumerate(judge_lines):
+            line_bbox = line['objPos']
+            iou = self.count_iou(table_bbox, line_bbox)
+            if iou > 0:
+                return index, True
+        return index, False
+
+
+    def combine_table_with_line(self, notable_lines, all_table_lines, table_list):
+        '''
+        Splice merged table rows back into the line list.
+
+        For each table, its rows replace the 'table' placeholder that
+        filter_table_line inserted into ``notable_lines``; the first text
+        line matching a table row donates its metadata while its content,
+        position and type are overwritten with the table row's.
+
+        :param notable_lines: non-table lines with 'table' placeholders
+        :param all_table_lines: per-table lists of matching text lines
+        :param table_list: table info dicts (their cells are mutated)
+        :return: the fully merged line list
+        '''
+        for table_id, table in enumerate(table_list):
+            new_table_lines = []
+            for table_line in table['lineList']:
+                is_iter_table = False
+                table_line_bbox = table_line['objPos']
+                # Match every candidate text line against this table row.
+                for __line in all_table_lines[table_id]:
+                    line = copy.deepcopy(__line)
+                    line_bbox = line['objPos']
+                    iou = self.count_iou(table_line_bbox, line_bbox)
+                    # First match: keep the text line's other fields but
+                    # overwrite content/position/type with the table row's.
+                    if iou > self.iou_rate and not is_iter_table:
+                        is_iter_table = True
+                        line['objContent'] = table_line['objContent']
+                        line['objPos'] = table_line['objPos']
+                        line['objType'] = 'table'
+                        line['tableId'] = table_id
+                        self.combine_cell_with_span(table_line, line)
+                        line['cells'] = table_line['cells']
+                        new_table_lines.append(line)
+                    elif iou > self.iou_rate and is_iter_table:
+                        self.combine_cell_with_span(table_line, line)
+                    else:
+                        pass
+            if 'table' not in notable_lines or not new_table_lines:
+                # FIX ERROR: 'table' is not in list
+                # Handles a small table detected inside a big one.
+                # Possible bug: nested big tables could desync the number
+                # of placeholders vs. the assigned row groups.
+                continue
+            # Replace the 'table' placeholder with the merged rows.
+            table_index = notable_lines.index('table')
+            new_notable_lines = notable_lines[:table_index]
+            new_notable_lines.extend(new_table_lines)
+            notable_lines = new_notable_lines + notable_lines[table_index+1:]
+        return notable_lines
+
+    def combine_cell_with_span(self,table_line , text_line):
+        '''
+        将表格的cell内加上对应span的chars信息:解决表格合并时cell有多行导致chars顺序错乱的问题
+        '''
+        del_list = []
+        for index, cell in enumerate(table_line['cells']):
+            if not cell.get('chars'):
+                cell['chars'] = []
+            cell_bbox = cell['objPos']
+            if cell_bbox is None:
+                del_list.append(index)
+                continue
+            for span in  text_line['span']:
+                span_bbox = span['bbox']
+                iou = self.count_iou(cell_bbox, span_bbox)
+                if iou < self.iou_rate:
+                    continue
+                # 为了解决一些 span 和 cell 长度不一致问题 将循环细分到每个字符chars
+                for char in span['chars']:
+                    char_bbox = char['bbox']
+                    iou = self.count_iou(cell_bbox, char_bbox)
+                    if iou > self.iou_rate:
+                        cell['chars'].append(char)
+                    else:
+                        pass
+        # 清除无效的span
+        if len(del_list):
+            for index, index_del in enumerate(del_list):
+                index_del -= index
+                del table_line['cells'][index_del]
+
+    def group_block(self, page_num, fitz_doc):
+        """
+        Merge fitz's 'dict' (span-level) and 'rawdict' (char-level) block
+        views so every span carries its individual character info.
+
+        Reference: https://pymupdf.readthedocs.io/en/latest/textpage.html#textpagedict
+
+        NOTE(review): the two block lists are zipped positionally after an
+        identical sort - this assumes both get_text() calls return the
+        same blocks in the same structure; confirm for unusual pdfs.
+
+        :param page_num: page number (used for ids / the image registry)
+        :param fitz_doc: fitz page object
+        :return: list of line info dicts (see construct_line_info)
+        """
+        line_count = 0
+        total_line_list = []
+        # char_blocks: finest granularity is the single character.
+        char_blocks = fitz_doc.get_text('rawdict')['blocks']
+        # block_blocks: finest granularity is the span within a line.
+        block_blocks = fitz_doc.get_text('dict')['blocks']
+        # Sort both views into reading order first.
+        char_blocks.sort(key=lambda x: [int(x['bbox'][1]), int(x['bbox'][0])])
+        block_blocks.sort(key=lambda x: [int(x['bbox'][1]), int(x['bbox'][0])])
+        # Pair the two views positionally.
+        group_blocks = zip(block_blocks, char_blocks)
+        for span_blocks, char_block in group_blocks:
+            if span_blocks['type'] == 1:
+                # Image block: register it instead of treating it as text.
+                img_attrs = self.deal_image(page_num, line_count, span_blocks)
+                self.add_image(page_num, img_attrs)
+                continue
+            for line_index, line in enumerate(span_blocks['lines']):
+                line['text'] = ''
+                line['chars'] = []
+                line['span'] = []
+                # Merge each line here (cheaper than a second pass) and
+                # attach per-character info to every span.
+                for span_index, span in enumerate(line['spans']):
+                    span['text'] = span['text'].replace(' ', '').strip()
+                    if not span['text']:
+                        continue
+                    # Copy the rawdict chars onto the dict span.
+                    span_chars = char_block['lines'][line_index]['spans'][span_index]['chars']
+                    span_chars = [char for char in span_chars if char['c'].strip()]
+                    line['text'] += span['text']
+                    line['chars'].extend(span_chars)
+                    line['span'].append({'bbox': span['bbox'], 'chars': span_chars,'text': span['text']})
+                if not line['text']:
+                    continue
+                # Build the per-line result structure.
+                line_info = self.construct_line_info(line['text'], line['bbox'], line['span'], line['chars'],
+                                                     line_count, page_num)
+                total_line_list.append(line_info)
+                line_count += 1
+        return total_line_list
+
+    def extract_table(self, page_no, plum_page):
+        '''
+        提取页面所有表格
+        :param page_no:
+        :param plum_page:
+        :return:
+        '''
+        table_list = []
+        for table in plum_page.find_tables():
+            # 获取当前表格的边界定位
+            table_line_list = self.merge_table_row(table)
+            if not table_line_list:
+                continue
+            table_info = self.deal_table(page_no, table.bbox, table_line_list)
+            table_list.append(table_info)
+            # 将表格信息加入全局变量 | 此处有点有点冗余
+            self.add_table(page_no, table_info)
+        return table_list
+
+    def merge_table_row(self, table):
+        '''
+        表格cell 按行合并
+        :param table:
+        :return: [({line_text}, {line_bbox}), ...]
+        '''
+        table_line_list = []
+        for item, row in zip(table.extract(), table.rows):
+            # 表格每行预处理
+            table_line = self.divide.join([self.clear_text(txt) for txt in item])
+            # 判断当前行是否为空
+            __line = self.clear_text(table_line).replace(' ', '')
+            if not __line:
+                continue
+            table_line_list.append((table_line, row.bbox, zip(item, row.cells)))
+        return table_line_list
+
+    def clear_text(self, txt, retrans=False):
+
+        if retrans:
+            txt = txt.replace(self.solid, '').replace(self.divide, '')
+        else:
+            # 空列替换为占位符
+            txt = txt if txt else self.solid
+        return str(txt).replace('\n', '').replace(' ', '')
+
+    def deal_table(self, page_no, table_bbox, table_line_list):
+        '''
+        对表格做结构转换
+        :param page_no:
+        :param table_bbox:
+        :param table_line_list:
+        :return:
+        '''
+        table_first_line = self.clear_text(table_line_list[0][0], retrans=True)
+        table_id = '{0}_{1}_'.format(page_no, table_first_line) + self.genShortId()
+        lineList = [{
+            'objContent': line[0],
+            'objPos': line[1],
+            'cells': self.deal_table_cell(line[2])
+        } for line in table_line_list]
+        table_info = {
+            'tableId': table_id,
+            'name': table_id,
+            'objPos': table_bbox,
+            'lineList': lineList,
+        }
+        return table_info
+
+    def deal_table_cell(self, cells):
+        return [{"objContent": self.clear_text(text), "objPos": box} for text, box in cells]
+
+    def deal_image(self, page_num, name, img_attrs):
+        '''
+        对image做结构转换
+        :param page_num:
+        :param name:
+        :param img_attrs:
+        :return:
+        '''
+        image_id = '{0}_{1}_'.format(page_num, name) + self.genShortId()
+        img_info = {
+            'imageId': image_id,
+            'name': image_id,  # 暂时以图片所在页面的行数命名
+            'objPos': img_attrs['bbox'],
+            'ext': img_attrs['ext'],
+            'objContent': img_attrs['image'],
+            'size': img_attrs['size']
+        }
+        return img_info
+
+    def deal_chars(self, line_num, lineId, chars):
+        '''
+        对chars做结构转换
+        :param line_num:
+        :param lineId:
+        :param chars:
+        :return:
+        '''
+        num_count = 0
+        char_list = []
+        for char in chars:
+            if not char['c'].strip():
+                continue
+            char_dict = {
+                'lineId': lineId,
+                'charId': 'char_' + str(line_num) + '_' + str(num_count) + '_' + self.genShortId(),
+                'objContent': char['c'],
+                'objPos': char['bbox']
+            }
+            char_list.append(char_dict)
+            num_count += 1
+        return char_list
+
+    def construct_line_info(self, text, rect, span, chars, count, pageNo, objType='textLine'):
+        '''
+        对每行做结构转换
+        # x, y, h, w = rect[0], rect[1], rect[3] - rect[1], rect[2] - rect[0]
+        '''
+        lineId = 'line_' + str(pageNo) + '_' + str(count) + '_' + self.genShortId()
+        chars = self.deal_chars(count, lineId, chars)
+        return OrderedDict({
+            'lineNo': count,
+            'lineId': lineId,
+            'objType': objType,
+            'objContent': re.sub(r'\s', '', text),
+            'chars': chars,
+            'objPos': rect,
+            'span': span
+        })
+
+    @staticmethod
+    def rect_format(bbox):
+        '''
+        数据坐标转换 x1, y1, x2, y2 >> y1, x1 h, w
+        :param rect: [x1, y1, x2, y2]
+        :return: [y, x, h, w]
+        '''
+        y, x, h, w = bbox[1], bbox[0], bbox[3] - bbox[1], bbox[2] - bbox[0]
+        return [y, x, h, w]
+
+    def count_iou(self, RecA, RecB):
+        '''
+        计算边框交并比
+        左上边界坐标为Ax0, Ay0, Bx0, By0
+        右下边界坐标为Ax1, Ay1, Bx1, By1
+        交集面积计算为:
+            M = min(Ax1, Bx1) - max(Ax0, Bx0)
+            H = min(Ay1, By1) - max(Ay0, By0)
+        # 当前表格的边界信息
+        left_x, top_y, right_x, botm_y: table_box_info[0], table_box_info[1], table_box_info[2], table_box_info[3]
+        '''
+        M = min(RecB[2], RecA[2]) - max(RecB[0], RecA[0])
+        H = min(RecB[3], RecA[3]) - max(RecB[1], RecA[1])
+
+        # 计算交集部分面积
+        interArea = max(0, M) * max(0, H)
+
+        # 计算两个边框的面积
+        RecA_Area = (RecA[2] - RecA[0]) * (RecA[3] - RecA[1])
+        RecB_Area = (RecB[2] - RecB[0]) * (RecB[3] - RecB[1])
+        # 计算IOU
+        iou = interArea / float(RecA_Area + RecB_Area - interArea)
+        return iou
+
+    def construct_final_result(self, line_list, pageNo, image_list=[], table_list=[]):
+        '''
+        每页转换为最终数据结构
+        :param line_list: ocr每行结果
+        :param pageNo: 页码
+        :param image_list:
+        :param table_list:
+        :return: type: Dict
+        '''
+        document_id = 'v1' + '_' + self.file_no_suffix + '_' + self.genShortId()
+        return OrderedDict({
+            'pageNo': pageNo,
+            'docID': document_id,
+            'page_info':{'size': [self.width, self.height]},
+            'lineList': line_list,
+            'image_list': image_list if image_list else [],
+            'table_list': table_list if table_list else []
+        })
+
+    def save_result(self, final_result_list):
+        '''
+        保存结果数据至本地
+        '''
+        if self.table_type == 'v2':
+            with open(self.ocr_result_path, 'w', encoding='utf-8') as f:
+                json.dump(final_result_list, f, indent=4, ensure_ascii=False)
+        else:
+            with open(self.ocr_result_path, 'w', encoding='utf-8') as f:
+                json.dump(self.page_result_list, f, cls=MyEncoder, indent=4, ensure_ascii=False)
+
+    def reform_ocr_result(self, final_result_list):
+        """
+        Final post-processing of the v2 result: renumber lines, convert
+        coordinates to [y, x, h, w] and attach per-char offsets.
+
+        :param final_result_list: merged local/ocr parse result; mutated
+            in place and also returned
+        """
+        for result_list in final_result_list:
+            del result_list['image_list']
+            del result_list['table_list']
+            lineList = result_list['lineList']
+            for num, line in enumerate(lineList):
+                # Rewrite the line number and the counter inside lineId.
+                line['lineNo'] = str(num)
+                line_split = line['lineId'].split('_')
+                line_split[-2] = str(num)
+                line['lineId'] = '_'.join(line_split)
+                obj_type = line['objType']
+                # Per-char x/y offsets relative to the line's top-left.
+                offset_x_list, offset_y_list = self.coord_offset(line, obj_type)
+                # Convert to [y, x, h, w]; the x offsets are appended as
+                # a fifth element of objPos (consumed downstream).
+                line['objPos'] = self.rect_format(line['objPos'])
+                line['objPos'].append(offset_x_list)
+                line['chars_offset'] = [offset_x_list, offset_y_list]
+                if line.get('chars'):
+                    del line['chars']
+                if obj_type == 'table' and line.get('span'):
+                    del line['span']
+        return final_result_list
+
+    def coord_offset(self, line, obj_type='textLine'):
+        '''
+        Offsets of each char's top-left corner relative to the line's
+        top-left corner; also converts span/cell/char boxes in place.
+
+        :param line: line dict (objPos still in x1,y1,x2,y2 form here)
+        :param obj_type: 'textLine' reads line['span']; anything else
+            (i.e. 'table') reads line['cells']
+        :return: (offset_x_list, offset_y_list)
+        '''
+        offset_x_list = []
+        offset_y_list = []
+        line_x, line_y = line['objPos'][0], line['objPos'][1]
+        if obj_type == 'textLine':
+            for span in line['span']:
+                self.all_rect_format(span)
+                for char in span['chars']:
+                    char_x, char_y = char['bbox'][0], char['bbox'][1]
+                    offset_x_list.append(char_x - line_x)
+                    offset_y_list.append(char_y - line_y)
+                    self.all_rect_format(char)
+        else:
+            __cells = []
+            for num, _cell in enumerate(line['cells']):
+                # Deep copy so shared cell dicts are not converted twice.
+                cell = copy.deepcopy(_cell)
+                self.all_rect_format(cell)
+                for char in cell['chars']:
+                    char_x, char_y = char['bbox'][0], char['bbox'][1]
+                    offset_x_list.append(char_x - line_x)
+                    offset_y_list.append(char_y - line_y)
+                    self.all_rect_format(char)
+                __cells.append(cell)
+            line['cells'] = __cells
+        return offset_x_list, offset_y_list
+
+    def all_rect_format(self, obj):
+        '''
+        Normalise a span/cell/char dict to the ocr format in place:
+        text/c -> objContent, bbox/objPos -> [y, x, h, w].
+
+        Objects with a 'chars' key are spans or cells; everything else is
+        assumed to be a raw fitz char dict with 'c' and 'bbox'.
+        '''
+        if 'chars' in obj:
+            if obj.get('text'):
+                obj['objContent'] = obj['text']
+                del obj['text']
+            if obj.get('objPos'):
+                obj['objPos'] = self.rect_format(obj['objPos'])
+            elif obj.get('bbox'):
+                obj['objPos'] = self.rect_format(obj['bbox'])
+                del obj['bbox']
+        else:
+            obj['objContent'] = obj['c']
+            obj['objPos'] = self.rect_format(obj['bbox'])
+            del obj['c']
+            del obj['bbox']
+
+class CalcTableRL:
+    '''
+    Reconstruct a table's virtual grid and compute row/column span info.
+
+    Input: table structure(s) containing the bbox of every cell.
+    Output: the same structure(s), with row_start_end / col_start_end
+    added to each positioned cell.
+
+    NOTE(review): grid positions are found with exact list.index() over
+    the collected coordinates - assumes cells that share an edge carry
+    byte-identical float coordinates; confirm this holds for the table
+    source in use.
+    '''
+    def __init__(self, table_info):
+        # Either a single table dict or a list of them.
+        self.table_info = table_info
+
+    def run(self):
+        """Yield every table with span info added (generator)."""
+        if isinstance(self.table_info, list):
+            for table_info in self.table_info:
+                table_info = self.add_table_property(table_info)
+                yield table_info
+        else:
+            table_info = self.add_table_property(self.table_info)
+            yield table_info
+    def add_table_property(self, table_info):
+        '''
+        Add merge info to every cell:
+        cell['col_start_end'] = (col_start, col_end)
+        cell['row_start_end'] = (row_start, row_end)
+        '''
+        # All distinct x / y coordinates of cell edges.
+        set_x, set_y = self.collect_table_coord(table_info)
+        # Sorted, they describe the finest-grained virtual grid.
+        list_x, list_y = sorted(set_x), sorted(set_y)
+        for line in table_info['lineList']:
+            for cell in line['cells']:
+                if cell['objPos'] == None:
+                    continue
+                x1, y1, x2, y2 = cell['objPos']
+                # Locate the cell's corners within the virtual grid.
+                col_start = list_x.index(x1)
+                col_end = list_x.index(x2)
+                row_start = list_y.index(y1)
+                row_end = list_y.index(y2)
+                cell['col_start_end'] = (col_start, col_end)
+                cell['row_start_end'] = (row_start, row_end)
+        return table_info
+
+    def collect_table_coord(self, table_info):
+        '''
+        Collect the deduplicated x1/x2 and y1/y2 edge coordinates of every
+        positioned cell of one table.
+
+        :param table_info: single table info dict
+        :return: (set_x, set_y)
+        '''
+        set_x = set()
+        set_y = set()
+        for line in table_info['lineList']:
+            for cell in line['cells']:
+                if cell['objPos'] == None:
+                    continue
+                x1, y1, x2, y2 = cell['objPos']
+                set_x.add(x1)
+                set_x.add(x2)
+                set_y.add(y1)
+                set_y.add(y2)
+        return set_x, set_y
+
+
+
+def pdf_ocr(pdf_path, output_path, table_type='v2', is_save=True):
+    '''
+    简单封装, 方便调用和多线程
+    '''
+    pdf = ParseFile(pdf_path, output_path, table_type, is_save)
+    pdf.get_result()
+    return pdf
+
+# ---------------------------以下是测试案列-----------------------------------
+
+@coast_time
+def test_dir():
+    for root in os.walk(r'E:\workplace\cjhx_test\创金和信\pdf2json\input\all_test'):
+        dir, files = root[0], root[2]
+        for file in files:
+            if 'test.pdf' not in file:
+                continue
+            file_path = os.path.join(dir, file)
+            output_dir = r'E:\workplace\cjhx_test\创金和信\pdf2json\file_data\all_test'
+            pdf_ocr_result = pdf_ocr(file_path, output_dir)
+
+@coast_time
+def test_single():
+    """Parse a single hard-coded pdf (alternative inputs left commented)."""
+    # file_path = r'E:\workplace\daily_work\pdf2json\input\all_test\测试足够复杂的表格解析.pdf'
+    file_path = r'/home/yhocr/extractor/3f195fba-0916-4d74-b956-bf3bcadc77f2/20220913-浙江省贰号职业年金计划银华资产组合2022年二季度管理费用支付指令.pdf'
+    output_dir = r'/home/yhocr/extractor/3f195fba-0916-4d74-b956-bf3bcadc77f2/电子解析'
+    pdf = pdf_ocr(file_path, output_dir, table_type='v2')
+    # print(pdf.ocr_result)
+
+@coast_time
+def test_thread():
+    """Parse a directory of pdfs in parallel (process pool of 8)."""
+    from concurrent.futures import ProcessPoolExecutor
+    pool = ProcessPoolExecutor(max_workers=8)
+    # Thread-pool alternative:
+    # from concurrent.futures import ThreadPoolExecutor
+    # pool = ThreadPoolExecutor(max_workers=8)
+    for root in os.walk(r'E:\workplace\daily_work\pdf2json\input\签字模板二'):
+        dir, files = root[0], root[2]
+        for file in files:
+            file_path = os.path.join(dir, file)
+            output_dir = r'E:\workplace\daily_work\pdf2json\output\签字模板二'
+            ret = pool.submit(pdf_ocr, file_path, output_dir, table_type='v2')
+            ret.add_done_callback(print_callback)
+    pool.shutdown()
+
+def print_callback(ret):
+    # Done-callback for pool futures; result printing is disabled.
+    # print('ret:', ret.result())
+    pass
+
+if __name__ == '__main__':
+    # test_dir()
+    # test_thread()
+    # test_single()
+    # NOTE(review): DPFParser and Image are not defined/imported anywhere
+    # in this chunk - confirm the imports before running this harness.
+    pdf_obj = DPFParser()
+    with open(r"F:\code\easyofd\test\test.pdf","rb") as f:
+        pdf_bytes = f.read()
+
+    img_list = pdf_obj.to_img(pdf_bytes)
+    pil_img_list = []
+    for _img in img_list:
+        print(_img.width,_img.height)
+        img = Image.frombytes("RGB", [_img.width, _img.height], _img.samples)
+        print(type(img))
+        img.save('output_image.png')
+      
+    

BIN
format_convert/easyofd/easyofd/draw/simsun.ttc


+ 301 - 0
format_convert/easyofd/easyofd/ofd.py

@@ -0,0 +1,301 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: F:\code\easyofd\easyofd
+# CREATE_TIME: 2023-10-07
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# note:  ofd 基础类
+import base64
+import os
+import sys
+from io import BytesIO
+from typing import Union
+
+# sys.path.insert(0, os.getcwd())
+# sys.path.insert(0, "..")
+
+import fitz
+from PIL import Image
+from fontTools.ttLib import TTFont
+from loguru import logger
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../../../")
+
+from format_convert.easyofd.easyofd.parser_ofd import OFDParser
+from format_convert.easyofd.easyofd.draw import DrawPDF, OFDWrite
+
+
+class OFD(object):
+    """ofd对象"""
+
+    def __init__(self, ):
+        self.data = None
+
+    def read(self, ofd_f: Union[str, bytes, BytesIO], fmt="b64", save_xml=False, xml_name="testxml", save_dir=None):
+        """_summary_
+        Args:
+            file (_type_): _description_
+            fomat (str, optional): _description_. Defaults to "path".
+            fomat in ("path","b64","binary")
+        """
+        if fmt == "path":
+            with open(ofd_f, "rb") as f:
+                ofd_f = str(base64.b64encode(f.read()), encoding="utf-8")
+        elif fmt == "b64":
+            pass
+        elif fmt == "binary":
+            ofd_f = str(base64.b64encode(ofd_f), encoding="utf-8")
+        elif fmt == "io":
+            ofd_f = str(base64.b64encode(ofd_f.getvalue()), encoding="utf-8")
+        else:
+            raise "fomat Error: %s" % fmt
+
+        self.data = OFDParser(ofd_f)(save_xml=save_xml, xml_name=xml_name, save_dir=save_dir)
+
+    def save(self, ):
+        """
+        draw ofd xml
+        初始化一个xml 文件
+        self.data > file
+        """
+        assert self.data, f"data is None"
+
+    def pdf2ofd(self, pdfbyte, optional_text=False):
+        """pdf转ofd"""
+        assert pdfbyte, f"pdfbyte is None"
+        # logger.info(f"pdf2ofd")
+        ofd_byte = OFDWrite()(pdfbyte, optional_text=optional_text)
+        return ofd_byte
+
+    def to_pdf(self, return_need_convert_as_image=False):
+        """return ofdbytes"""
+
+        assert self.data, f"data is None"
+        # logger.info(f"to_pdf")
+        obj = DrawPDF(self.data)
+        result = obj()
+        if not return_need_convert_as_image:
+            return result
+        else:
+            return result, obj.page_need_to_image_dict
+
+    def pdf2img(self, pdfbytes):
+
+        image_list = []
+
+        doc = fitz.open(stream=pdfbytes, filetype="pdf")
+
+        for page in doc:
+            rotate = int(0)
+            zoom_x, zoom_y = 1.6, 1.6
+            zoom_x, zoom_y = 2, 2
+            mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
+            pix = page.get_pixmap(matrix=mat, alpha=False)
+            pil_image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            # image = np.ndarray((pix.height, pix.width, 3), dtype=np.uint8, buffer=pix.samples)
+            # print(image.shape)
+            # print(image[2])
+            image_list.append(pil_image)
+        # logger.info(f"pdf2img")
+        return image_list
+
+    def jpg2ofd(self, imglist: list):
+        """
+        imglist: pil image list
+        """
+        ofd_byte = OFDWrite()(pil_img_list=imglist)
+        return ofd_byte
+
+    def jpg2pfd(self, imglist: list):
+        """
+        imglist: PIL image list
+        1 构建data 
+        2 DrawPDF(self.data)()
+        """
+
+        data = OFDParser(None).img2data(imglist)
+        return DrawPDF(data)()
+
+    def to_jpg(self, format="jpg"):
+        """
+        return pil list
+        """
+        assert self.data, f"data is None"
+        image_list = []
+        pdfbytes = self.to_pdf()
+        image_list = self.pdf2img(pdfbytes)
+        return image_list
+
+    def del_data(self, ):
+        """销毁self.data"""
+        self.data = None
+
+    def __del__(self):
+        del self
+
+    def disposal(self, ):
+        """销毁对象"""
+        self.__del__()
+
+
+def find_similar_characters():
+    similar_pairs = []
+    for code in range(0x4E00, 0x9FFF):  # 遍历常见的中文字符范围
+        char = chr(code)
+        try:
+            name = unicodedata.name(char)
+            if name.startswith('CJK COMPATIBILITY IDEOGRAPH'):
+                original_char = unicodedata.lookup(name.split()[-1])
+                similar_pairs.append((original_char, char))
+        except (ValueError, KeyError):
+            continue
+    return similar_pairs
+
+
def save_chinese_characters(output_path):
    """Write the CJK Compatibility Ideographs (U+F900..U+FAD9) to a file.

    One character per line, UTF-8 encoded. Used to build the lookup file
    that maps compatibility glyphs back to common characters.
    (Earlier experiments with the Extension-A / unified / Kangxi ranges were
    left here as commented-out code; removed - see git history if needed.)
    """
    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(chr(code) + '\n' for code in range(0xF900, 0xFAD9 + 1))
+
+
def map_kangxi_to_common_characters(kangxi_start=0x2F00, kangxi_end=0x2FDF, common_start=0x4E00, common_end=0x9FFF, output_path="kangxi_to_common.txt"):
    """Write a Kangxi-radical -> common-CJK-character mapping file.

    The original implementation compared ``chr(kangxi_code) ==
    chr(common_code)`` across two disjoint code-point ranges - distinct code
    points never compare equal, so the O(n*m) double loop wrote nothing.
    Kangxi radicals (U+2F00..U+2FDF) carry a compatibility decomposition to
    their unified ideograph, so NFKC normalization yields the mapping in a
    single O(n) pass. Output format per line is unchanged:
    ``<kangxi> (Kangxi: 0x....) -> <common> (Common: 0x....)``.
    """
    import unicodedata

    with open(output_path, 'w', encoding='utf-8') as file:
        for kangxi_code in range(kangxi_start, kangxi_end + 1):
            kangxi_char = chr(kangxi_code)
            common_char = unicodedata.normalize('NFKC', kangxi_char)
            # skip unassigned/undecomposable points and multi-char expansions
            if len(common_char) != 1 or common_char == kangxi_char:
                continue
            common_code = ord(common_char)
            if not (common_start <= common_code <= common_end):
                continue
            file.write(f"{kangxi_char} (Kangxi: {hex(kangxi_code)}) -> {common_char} (Common: {hex(common_code)})\n")
+
+
if __name__ == "__main__":
    # Development scratch block. The TTF-table dumps (fontTools) and the
    # Unicode normalization / compatibility-character experiments that used
    # to live here as ~100 lines of commented-out code were removed; see git
    # history if they are needed again. The live statements below are
    # unchanged.
    import unicodedata  # noqa: F401 - kept available for ad-hoc checks

    output_path = 'chinese_characters.txt'
    # Regenerate the compatibility-ideograph list on demand:
    # save_chinese_characters(output_path)

+ 37 - 0
format_convert/easyofd/easyofd/parser_ofd/__init__.py

@@ -0,0 +1,37 @@
+import os
+import sys
+
+from loguru import logger
+from reportlab.pdfbase import pdfmetrics
+from reportlab.pdfbase.cidfonts import UnicodeCIDFont
+from reportlab.pdfbase.ttfonts import TTFont
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../../../../")
+
+
+# from ofd_parser import *
+
+
# Font file -> list of alias names seen in real OFD documents. Every alias
# is registered against the same bundled TTF/TTC so the PDF renderer can
# resolve whatever name a producer embedded.
font_map = {"simsun.ttc":["宋体", "SWPMEH+SimSun","SimSun","SWDKON+SimSun"],
            'simkai.ttf':["KaiTi","楷体","SWLCQE+KaiTi","SWHGME+KaiTi","BWSimKai"],
            # 'STKAITI.TTF':["华文楷体 常规","STKAITI","华文楷体"],
            "COURI.TTF":["CourierNewPSMT","CourierNew","SWCRMF+CourierNewPSMT","SWANVV+CourierNewPSMT"],
            "courbd.TTF":["Courier New"],
            "simhei.ttf":["SimHei","hei","黑体"]
            }
pdfmetrics.registerFont(UnicodeCIDFont('STSong-Light'))

# Register the fonts at import time; a missing font file is logged and
# skipped so importing this package never fails.
for font, names in font_map.items():
    for name in names:
        try:
            pdfmetrics.registerFont(TTFont(name, font))
        except Exception:  # was a bare except: don't swallow SystemExit/KeyboardInterrupt
            logger.warning(f"FONT  registerFont failed {font}: {name}")

from format_convert.easyofd.easyofd.parser_ofd.ofd_parser import OFDParser
__all__ = ["OFDParser"]
+                                    
+
+
+

+ 145 - 0
format_convert/easyofd/easyofd/parser_ofd/file_annotation_parser.py

@@ -0,0 +1,145 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_annotation_parser.py
+# CREATE_TIME: 2025/3/28 14:12
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 注释解析
+import re
+
+from loguru import logger
+from .file_parser_base import FileParserBase
+
+
+# class AnnotationsParser(FileParserBase):
+#     """
+#     Parser Annotations
+#     注释信息-总
+#     /xml_dir/Doc_0/Pages/Page_0/Content.xml
+#     """
+#
+#     def __call__(self):
+#         info = {}
+#         annotations_res: list = []
+#         annotations_res_key = "ofd:Page"
+#         self.recursion_ext(self.xml_obj, annotations_res, annotations_res_key)
+#         # logger.debug(f"annotations_res is {annotations_res}")
+#         if annotations_res:
+#             for i in annotations_res:
+#                 page_id = i.get("@PageID")
+#                 if not page_id:
+#                     # logger.debug(f"page_id is null ")
+#                     continue
+#                 file_Loc = i.get("ofd:FileLoc")
+#                 if not file_Loc:
+#                     # logger.debug(f"file_Loc is null ")
+#                     continue
+#                 info[page_id] = {
+#                     "FileLoc": file_Loc,
+#                 }
+#
+#         return info
+#
+#
+# class AnnotationFileParser(FileParserBase):
+#     """
+#     Parser Annotation
+#     注释类 包含 签名注释 水印注释 信息注释
+#     """
+#
+#     AnnoType = {
+#         "Watermark": {
+#             "name": "水印",
+#             "type": "Watermark"
+#         },
+#         "Link": {
+#             "name": "链接",
+#             "type": "Link"
+#         }
+#         ,
+#         "Path": {
+#             "name": "路径",
+#             "type": "Path"
+#         },
+#         "Highlight": {
+#             "name": "高亮",
+#             "type": "Highlight"
+#         },
+#         "Stamp": {
+#             "name": "签章",
+#             "type": "Highlight"
+#         }
+#     }
+#
+#     def normalize_font_name(self, font_name):
+#         """将字体名称规范化,例如 'Times New Roman Bold' -> 'TimesNewRoman-Bold'"""
+#         # 替换空格为无,并将样式(Bold/Italic等)用连字符连接
+#         if not isinstance(font_name, str):
+#             return ""
+#         normalized = font_name.replace(' ', '')
+#         # 处理常见的样式后缀
+#         for style in ['Bold', 'Italic', 'Regular', 'Light', 'Medium', ]:
+#             if style in normalized:
+#                 normalized = normalized.replace(style, f'-{style}')
+#
+#         # todo 特殊字体名规范 后续存在需要完善
+#         if normalized == "TimesNewRoman":
+#             normalized = normalized.replace("TimesNewRoman", "Times-Roman")
+#         return normalized
+#
+#     def __call__(self):
+#         info = {}
+#         public_res: list = []
+#         public_res_key = "ofd:Page"
+#         self.recursion_ext(self.xml_obj, public_res, public_res_key)
+#
+#         if public_res:
+#             for i in public_res:
+#                 info[i.get("@ID")] = {
+#                     "FontName": self.normalize_font_name(i.get("@FontName")),
+#                     "FontNameORI": i.get("@FontName"),
+#                     "FamilyName": self.normalize_font_name(i.get("@FamilyName")),
+#                     "FamilyNameORI": i.get("@FamilyName"),
+#                     "Bold": i.get("@Bold"),
+#                     "Serif": i.get("@Serif"),
+#                     "FixedWidth": i.get("@FixedWidth"),
+#                     "FontFile": i.get("ofd:FontFile"),
+#                 }
+#         return info
+
+
class AnnotationFileParser(FileParserBase):
    """
    Parse Annotations.xml, the per-doc annotation index. It contains:
    1. the annotation file locations per page

    /xml_dir/Doc_0/Annotations.xml
    """

    def loc2page_no(self, loc, idx):
        """Extract a page number from a FileLoc path; fall back to the list index."""
        # ``loc`` may be None when ofd:FileLoc is absent - the old code then
        # crashed inside re.search with a TypeError.
        match = re.search(r"\d+", loc or "")
        return int(match.group()) if match else idx

    def __call__(self):
        annot_info = {}

        # ofd:Page - annotation entries per body page
        page = []
        page_id_map = {}
        self.recursion_ext(self.xml_obj, page, "ofd:Page")
        if page:
            # recursion_ext may also yield plain strings (the list
            # comprehension below already guards for them); the old dict
            # comprehension called .get unconditionally and crashed.
            page_id_map = {
                entry.get("@PageID"): self.loc2page_no(entry.get("ofd:FileLoc"), idx)
                for idx, entry in enumerate(page)
                if isinstance(entry, dict)
            }
            page = [entry.get("ofd:FileLoc") if isinstance(entry, dict) else entry for entry in page]

        annot_info["annot_page"] = page
        annot_info["annot_page_id_map"] = page_id_map
        return annot_info

+ 7 - 0
format_convert/easyofd/easyofd/parser_ofd/file_attachment_parser.py

@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_attachment_parser.py
+# CREATE_TIME: 2025/4/9 18:52
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE:

+ 140 - 0
format_convert/easyofd/easyofd/parser_ofd/file_content_parser.py

@@ -0,0 +1,140 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_content_parser.py
+# CREATE_TIME: 2025/3/28 11:47
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 解析正文
+from loguru import  logger
+from .file_parser_base import FileParserBase
+
+
class ContentFileParser(FileParserBase):
    """
    Parse a page's Content.xml (page bodies and templates):
    /xml_dir/Doc_0/Doc_0/Pages/Page_0/Content.xml

    Returns ``{"text_list": [...], "img_list": [...], "line_list": [...]}``.
    """

    def fetch_cell_info(self, row, TextObject):
        """Build one text-cell dict from an ofd:TextObject node and one of its ofd:TextCode entries."""
        cell_d = {}  # (the old code initialized this dict twice)
        cell_d["ID"] = row['@ID']
        # Glyph/CGTransform info (glyph ids for embedded fonts), when present.
        cg_transform = row.get("ofd:CGTransform")
        if cg_transform:
            cell_d["Glyphs_d"] = {
                "Glyphs": cg_transform.get("ofd:Glyphs"),
                "GlyphCount": cg_transform.get("@GlyphCount"),
                "CodeCount": cg_transform.get("@CodeCount"),
                "CodePosition": cg_transform.get("@CodePosition"),
            }

        cell_d["pos"] = [float(pos_i) for pos_i in row['@Boundary'].split(" ")]  # text bounding box
        clip_path = row.get('ofd:Clips', {}).get('ofd:Clip', {}).get('ofd:Area', {}).get('ofd:Path', {})
        if clip_path:
            try:
                cell_d["clips_pos"] = [float(pos_i) for pos_i in clip_path.get('@Boundary', "").split(" ")]
            except Exception:  # malformed/absent @Boundary: keep the cell without clip info
                pass
        cell_d["text"] = str(TextObject.get('#text'))
        cell_d["font"] = row['@Font']  # font resource id
        cell_d["size"] = float(row['@Size'])  # font size

        color = self.ofd_param("ofd:FillColor", row).get("@Value", "0 0 0")
        cell_d["color"] = tuple(color.split(" "))  # fill color, defaults to black

        cell_d["DeltaY"] = TextObject.get("@DeltaY", "")  # per-glyph y offsets (one way vertical text is encoded)
        cell_d["DeltaX"] = TextObject.get("@DeltaX", "")  # per-glyph x offsets
        cell_d["CTM"] = row.get("@CTM", "")  # transform matrix
        cell_d["X"] = TextObject.get("@X", "")  # text offset inside the text box
        cell_d["Y"] = TextObject.get("@Y", "")
        return cell_d

    def __call__(self) -> dict:
        """
        Extract body coordinates plus text/image/path info.

        Despite the old ``-> list`` annotation this has always returned a
        dict of three lists; the annotation is corrected here. Text cells
        look like::

            {"pos": [...], "text": ..., "font": ..., "size": ...}
        """
        text_list = []
        img_list = []
        line_list = []
        content_d = {
            "text_list": text_list,
            "img_list": img_list,
            "line_list": line_list,
        }

        # ofd:TextObject - body text
        text = []
        self.recursion_ext(self.xml_obj, text, "ofd:TextObject")
        for row in text:
            text_code = row.get('ofd:TextCode', {})
            if isinstance(text_code, list):
                for entry in text_code:
                    if not entry.get('#text'):
                        continue
                    text_list.append(self.fetch_cell_info(row, entry))
            elif isinstance(text_code, dict):
                if not text_code.get('#text'):
                    continue
                text_list.append(self.fetch_cell_info(row, text_code))
            else:
                logger.error(f"'ofd:TextCode' format nonsupport  {row.get('ofd:TextCode', {})}")
                continue

        # ofd:PathObject - lines / drawn paths
        line = []
        self.recursion_ext(self.xml_obj, line, "ofd:PathObject")
        for path_obj in line:
            line_d = {}
            try:
                line_d["ID"] = path_obj.get("@ID", "")
                line_d["pos"] = [float(pos_i) for pos_i in path_obj['@Boundary'].split(" ")]  # bounding box
                line_d["LineWidth"] = path_obj.get("@LineWidth", "")
                line_d["AbbreviatedData"] = path_obj.get("ofd:AbbreviatedData", "")  # path drawing commands
                # NOTE(review): FillColor is split into a list while
                # StrokeColor stays a raw string - consumers may rely on this
                # asymmetry, so it is preserved; confirm before unifying.
                line_d["FillColor"] = self.ofd_param("ofd:FillColor", path_obj).get('@Value', "0 0 0").split(" ")
                line_d["StrokeColor"] = self.ofd_param("ofd:StrokeColor", path_obj).get('@Value', "0 0 0")
                line_d["CTM"] = path_obj.get("@CTM", "")  # transform matrix
            except KeyError as e:
                logger.error(f"{e} \n line is {path_obj} \n")
                continue
            line_list.append(line_d)

        # ofd:ImageObject - images
        img = []
        self.recursion_ext(self.xml_obj, img, "ofd:ImageObject")
        for img_obj in img:
            img_d = {
                "CTM": img_obj.get("@CTM", ""),  # transform matrix
                # was ``img_obj.get("ID", "")`` which always yielded "" -
                # xmltodict exposes XML attributes with an '@' prefix
                "ID": img_obj.get("@ID", ""),
                "ResourceID": img_obj.get("@ResourceID", ""),  # resource id into DocumentRes
                "pos": [float(pos_i) for pos_i in img_obj['@Boundary'].split(" ")],  # bounding box
            }
            img_list.append(img_d)

        return content_d
+

+ 7 - 0
format_convert/easyofd/easyofd/parser_ofd/file_customtag_parser.py

@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_customtag_parser.py
+# CREATE_TIME: 2025/4/9 18:51
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE:

+ 104 - 0
format_convert/easyofd/easyofd/parser_ofd/file_deal.py

@@ -0,0 +1,104 @@
+# coding: utf-8
+#!/usr/bin/env python
+#-*- coding: utf-8 -*-
+#PROJECT_NAME: D:\code\easyofd\easyofd\parser
+#CREATE_TIME: 2023-07-27 
+#E_MAIL: renoyuan@foxmail.com
+#AUTHOR: reno 
+#NOTE:  文件处理
+import os
+import base64
+import shutil
+from typing import Any
+from uuid import uuid1
+
+import xmltodict
+import zipfile
+from loguru import logger
+
+from .path_parser import PathParser
+
+
class FileRead(object):
    """
    Read an OFD (zip) container from base64, unzip it and build a file tree.

    Resulting tree keys:
    'root': unzip directory
    'root_doc': path of OFD.xml (entry point), '' when absent
    'pdf_name': target pdf file name
    xml paths map to parsed xmltodict objects, other paths map to b64 strings.
    """

    def __init__(self, ofdb64: str):
        self.ofdbyte = base64.b64decode(ofdb64)
        pid = os.getpid()
        # pid + uuid keeps concurrent conversions from colliding on disk
        self.name = f"{pid}_{str(uuid1())}.ofd"
        self.pdf_name = self.name.replace(".ofd", ".pdf")
        self.zip_path = f"{os.getcwd()}/{self.name}"
        self.unzip_path = ""
        self.file_tree = {}

    def unzip_file(self, unzip_dir=None):
        """
        Write the OFD bytes to disk and extract the archive.
        :param unzip_dir: target directory; defaults to a sibling of the zip.
        """
        if unzip_dir is None:
            # strip only the ".ofd" suffix - the old ``split('.')[0]`` broke
            # when the working directory itself contained a dot
            self.unzip_path = os.path.splitext(self.zip_path)[0]
            self.zip_path = f"{os.getcwd()}/{self.name}"
        else:
            self.unzip_path = unzip_dir
            # join instead of raw concatenation so a directory without a
            # trailing separator still yields a valid path
            self.zip_path = os.path.join(unzip_dir, self.name)
        print('ofd self.unzip_path', self.unzip_path)
        print('ofd self.zip_path', self.zip_path)

        with open(self.zip_path, "wb") as f:
            f.write(self.ofdbyte)

        with zipfile.ZipFile(self.zip_path, 'r') as f:
            for file in f.namelist():
                # skip attachments - they are not rendered
                if 'Attachs' in file:
                    continue
                f.extract(file, path=self.unzip_path)
        # save_xml/xml_name are normally set by __call__; default to off so a
        # direct unzip_file() call no longer raises AttributeError
        if getattr(self, "save_xml", False):
            print("saving xml {}".format(self.xml_name))
            with zipfile.ZipFile(self.zip_path, 'r') as f:
                for file in f.namelist():
                    f.extract(file, path=self.xml_name)

    def buld_file_tree(self):
        """Walk the unzip dir: xml files -> xmltodict objects, others -> b64 strings."""
        self.file_tree["root"] = self.unzip_path
        self.file_tree["pdf_name"] = self.pdf_name
        for root, dirs, files in os.walk(self.unzip_path):
            for file in files:
                abs_path = os.path.join(root, file)
                # with-blocks close the handles the old one-liners leaked
                if "xml" not in file:
                    with open(abs_path, "rb") as fh:
                        self.file_tree[abs_path] = str(base64.b64encode(fh.read()), "utf-8")
                else:
                    with open(abs_path, "r", encoding="utf-8") as fh:
                        self.file_tree[abs_path] = xmltodict.parse(fh.read())
        self.file_tree["root_doc"] = os.path.join(self.unzip_path, "OFD.xml") if os.path.join(self.unzip_path, "OFD.xml") in self.file_tree else ""

    def __call__(self, *args: Any, **kwds: Any) -> Any:
        """Unzip into ``save_dir`` (kwarg) and return the file tree."""
        self.save_xml = kwds.get("save_xml", False)
        self.xml_name = kwds.get("xml_name")
        self.save_dir = kwds.get('save_dir')

        self.unzip_file(self.save_dir)
        self.buld_file_tree()
        return self.file_tree
+
+
+if __name__ == "__main__":
+    with open(r"D:/code/easyofd/test/增值税电子专票5.ofd","rb") as f:
+        ofdb64 = str(base64.b64encode(f.read()),"utf-8")
+    a = FileRead(ofdb64)()
+    print(list(a.keys()))

+ 99 - 0
format_convert/easyofd/easyofd/parser_ofd/file_doc_parser.py

@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_doc_parser.py
+# CREATE_TIME: 2025/3/28 11:46
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 解析document
+
+import  re
+
+from .file_parser_base import FileParserBase
+
+
+
class DocumentFileParser(FileParserBase):
    """
    Parse Document.xml, the per-doc root node. It contains:
    1. file locations (resources, templates, pages, annotations, ...)
    2. the doc's physical size

    /xml_dir/Doc_0/Document.xml
    """

    def loc2page_no(self, loc, idx):
        """Extract a page number from a BaseLoc path; fall back to the list index."""
        # ``loc`` may be None when @BaseLoc is absent - the old code then
        # crashed inside re.search with a TypeError.
        match = re.search(r"\d+", loc or "")
        return int(match.group()) if match else idx

    def __call__(self):
        document_info = {}

        # physical page size
        physical_box = []
        self.recursion_ext(self.xml_obj, physical_box, "ofd:PhysicalBox")
        document_info["size"] = physical_box[0] if physical_box else ""

        # ofd:PublicRes path(s) - font information
        public_res = []
        self.recursion_ext(self.xml_obj, public_res, "ofd:PublicRes")
        document_info["public_res"] = public_res

        # ofd:DocumentRes path(s) - static image resources
        document_res = []
        self.recursion_ext(self.xml_obj, document_res, "ofd:DocumentRes")
        document_info["document_res"] = document_res

        # template pages
        tpls = []
        self.recursion_ext(self.xml_obj, tpls, "ofd:TemplatePage")
        if tpls:
            tpls = [t.get("@BaseLoc") if isinstance(t, dict) else t for t in tpls]
        document_info["tpls"] = tpls

        # ofd:Page - body pages
        page = []
        page_id_map = {}
        self.recursion_ext(self.xml_obj, page, "ofd:Page")
        if page:
            # recursion_ext may also yield plain strings (the list
            # comprehension below already guards for them); the old dict
            # comprehension called .get unconditionally and crashed.
            page_id_map = {
                entry.get("@ID"): self.loc2page_no(entry.get("@BaseLoc"), idx)
                for idx, entry in enumerate(page)
                if isinstance(entry, dict)
            }
            page = [entry.get("@BaseLoc") if isinstance(entry, dict) else entry for entry in page]
        document_info["page"] = page
        document_info["page_id_map"] = page_id_map

        # ofd:Annotations entry file(s)
        annotations = []
        self.recursion_ext(self.xml_obj, annotations, "ofd:Annotations")
        document_info["Annotations"] = annotations

        # ofd:Attachments entry file(s)
        attachments = []
        self.recursion_ext(self.xml_obj, attachments, "ofd:Attachments")
        document_info["attachments"] = attachments

        # ofd:CustomTags entry file(s)
        custom_tag = []
        self.recursion_ext(self.xml_obj, custom_tag, "ofd:CustomTags")
        document_info["custom_tag"] = custom_tag

        return document_info
+
+
+
+
+
+

+ 36 - 0
format_convert/easyofd/easyofd/parser_ofd/file_docres_parser.py

@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_docres_parser.py
+# CREATE_TIME: 2025/3/28 11:48
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 解析 DocumentRes
+
+import os
+
+from .file_parser_base import FileParserBase
+
class DocumentResFileParser(FileParserBase):
    """
    Parse DocumentRes/PublicRes: extract the multimedia (image) resources.
    /xml_dir/Doc_0/DocumentRes.xml
    /xml_dir/Doc_0/PublicRes.xml
    """

    def __call__(self):
        media_nodes = []
        self.recursion_ext(self.xml_obj, media_nodes, "ofd:MultiMedia")
        info = {}
        for node in media_nodes:
            file_name = node.get("ofd:MediaFile", "")
            info[node.get("@ID")] = {
                "format": node.get("@Format", ""),
                "wrap_pos": node.get("@wrap_pos", ""),
                "type": node.get("@Type", ""),
                # file extension without the leading dot
                "suffix": os.path.splitext(file_name)[-1].replace(".", ""),
                "fileName": file_name,
            }
        return info

+ 41 - 0
format_convert/easyofd/easyofd/parser_ofd/file_ofd_parser.py

@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_ofd_parser.py
+# CREATE_TIME: 2025/3/28 11:45
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 解析OFD
+from .file_parser_base import FileParserBase
+
class OFDFileParser(FileParserBase):
    """
    Parse the container entry point OFD.xml.
    /xml_dir/OFD.xml
    """

    def __call__(self):
        # (output key, xml tag) pairs - all extracted the same generic way
        wanted = (
            ("doc_root", "ofd:DocRoot"),
            ("signatures", "ofd:Signatures"),
            ("creator", "ofd:Creator"),
            ("creationDate", "ofd:CreationDate"),
        )
        info = {}
        for out_key, tag in wanted:
            values = []
            self.recursion_ext(self.xml_obj, values, tag)
            info[out_key] = values
        return info

+ 58 - 0
format_convert/easyofd/easyofd/parser_ofd/file_parser.py

@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: D:\code\easyofd\easyofd\parser
+# CREATE_TIME: 2023-07-27
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 每种类型的文件定义一个解析器
+
+import sys
+
+sys.path.insert(0, "..")
+import logging
+import os
+import traceback
+import base64
+import re
+from typing import Any
+from .parameter_parser import ParameterParser
+logger = logging.getLogger("root")
+
+
class FileParserBase(object):
    """XML parser base: holds the xmltodict tree and a generic tag extractor.

    NOTE(review): this module duplicates file_parser_base.py - confirm which
    of the two is canonical before extending either.
    """

    def __init__(self, xml_obj):
        # reject empty/None trees early; the parsers are useless without one
        assert xml_obj
        self.ofd_param = ParameterParser()
        self.xml_obj = xml_obj
        # print(xml_obj)

    def recursion_ext(self, need_ext_obj, ext_list, key):
        """
        Recursively collect every value stored under ``key``.
        need_ext_obj : xmltodict tree (dict)
        ext_list: output container, mutated in place
        key: tag name to collect
        """
        if isinstance(need_ext_obj, dict):
            for k, v in need_ext_obj.items():
                if k == key:
                    # dict/str hits are appended; list hits are flattened in;
                    # any other type is silently dropped
                    if isinstance(v, (dict, str)):
                        ext_list.append(v)
                    elif isinstance(v, list):
                        ext_list.extend(v)
                else:
                    if isinstance(v, dict):
                        self.recursion_ext(v, ext_list, key)
                    elif isinstance(v, list):
                        for cell in v:
                            self.recursion_ext(cell, ext_list, key)
                    else:
                        pass
        else:
            print(type(need_ext_obj))
+
+
+if __name__ == "__main__":
+    FileParserBase("")()

+ 63 - 0
format_convert/easyofd/easyofd/parser_ofd/file_parser_base.py

@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_parser_base.py
+# CREATE_TIME: 2025/3/28 11:43
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: base 解析器
+
+import sys
+
+sys.path.insert(0, "..")
+import logging
+import os
+import traceback
+import base64
+import re
+from typing import Any
+from .parameter_parser import ParameterParser
+logger = logging.getLogger("root")
+
+
class FileParserBase(object):
    """XML parser base: holds the xmltodict tree and a generic tag extractor."""

    def __init__(self, xml_obj):
        # an empty/None tree is useless for every subclass - fail fast
        assert xml_obj
        self.ofd_param = ParameterParser()
        self.xml_obj = xml_obj

    def recursion_ext(self, need_ext_obj, ext_list, key):
        """
        Recursively collect every value stored under ``key``.
        need_ext_obj : xmltodict tree (dict)
        ext_list: output container, mutated in place
        key: tag name to collect
        """
        if not isinstance(need_ext_obj, dict):
            print(type(need_ext_obj))
            return
        for child_key, child in need_ext_obj.items():
            if child_key == key:
                # list hits are flattened in; dict/str hits are appended;
                # any other type is silently dropped
                if isinstance(child, list):
                    ext_list.extend(child)
                elif isinstance(child, (dict, str)):
                    ext_list.append(child)
            elif isinstance(child, dict):
                self.recursion_ext(child, ext_list, key)
            elif isinstance(child, list):
                for element in child:
                    self.recursion_ext(element, ext_list, key)
+

+ 52 - 0
format_convert/easyofd/easyofd/parser_ofd/file_publicres_parser.py

@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_publicres_parser.py
+# CREATE_TIME: 2025/3/28 11:49
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: PublicResFileParser
+
+from .file_parser_base import FileParserBase
+
+
class PublicResFileParser(FileParserBase):
    """
    Parse PublicRes.xml: collect the shared font table.
    /xml_dir/Doc_0/PublicRes.xml
    """

    def normalize_font_name(self, font_name):
        """Normalize a font name, e.g. 'Times New Roman Bold' -> 'TimesNewRoman-Bold'."""
        if not isinstance(font_name, str):
            return ""
        # drop spaces, then hyphenate the common style suffixes
        normalized = font_name.replace(' ', '')
        for style in ('Bold', 'Italic', 'Regular', 'Light', 'Medium'):
            if style in normalized:
                normalized = normalized.replace(style, f'-{style}')
        # TODO: special-case table for odd font names; extend as needed
        if normalized == "TimesNewRoman":
            normalized = "Times-Roman"
        return normalized

    def __call__(self):
        fonts = []
        self.recursion_ext(self.xml_obj, fonts, "ofd:Font")
        info = {}
        for font in fonts:
            info[font.get("@ID")] = {
                "FontName": self.normalize_font_name(font.get("@FontName")),
                "FontNameORI": font.get("@FontName"),
                "FamilyName": self.normalize_font_name(font.get("@FamilyName")),
                "FamilyNameORI": font.get("@FamilyName"),
                "Bold": font.get("@Bold"),
                "Serif": font.get("@Serif"),
                "FixedWidth": font.get("@FixedWidth"),
                "FontFile": font.get("ofd:FontFile"),
            }
        return info

+ 63 - 0
format_convert/easyofd/easyofd/parser_ofd/file_signature_parser.py

@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  file_signature_parser.py
+# CREATE_TIME: 2025/3/28 14:13
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: 签章解析
+
+from .file_parser_base import FileParserBase
+
class SignaturesFileParser(FileParserBase):
    """
    Parse Signatures.xml — the document-level signature index.

    Typical location: /xml_dir/Doc_0/Signs/Signatures.xml
    Returns a dict keyed by signature @ID with its BaseLoc/Type/ID.
    """

    def __call__(self):
        nodes: list = []
        self.recursion_ext(self.xml_obj, nodes, "ofd:Signature")
        return {
            node.get("@ID"): {
                "BaseLoc": node.get("@BaseLoc"),
                "Type": node.get("@Type"),
                "ID": node.get("@ID"),
            }
            for node in nodes
        }
+
+
class SignatureFileParser(FileParserBase):
    """
    Parse a single Signature.xml — stamp-annotation placement for one seal.
    """

    def __call__(self, prefix=""):
        stamp_nodes: list = []
        self.recursion_ext(self.xml_obj, stamp_nodes, "ofd:StampAnnot")

        signed_values: list = []
        self.recursion_ext(self.xml_obj, signed_values, "ofd:SignedValue")

        # Path to the binary signed value; falls back to the conventional name.
        if signed_values:
            signed_value_path = f"{prefix}/{signed_values[0]}"
        else:
            signed_value_path = f"{prefix}/SignedValue.dat"

        info = {}
        # NOTE(review): when several StampAnnot nodes exist only the last one
        # survives — this matches the original behaviour; confirm it's intended.
        for node in stamp_nodes:
            info = {
                "PageRef": node.get("@PageRef"),  # id of the page the stamp is on
                "Boundary": node.get("@Boundary"),
                "ID": node.get("@ID"),
                "SignedValue": signed_value_path,
            }

        return info

+ 100 - 0
format_convert/easyofd/easyofd/parser_ofd/find_seal_img.py

@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: easyofd read_seal_img
+# CREATE_TIME: 2024/5/28 14:13
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: renoyuan
+# note: 根据 ASN.1 解析签章 拿到 签章图片
+import io
+
+from PIL import Image, UnidentifiedImageError
+from loguru import logger
+from pyasn1.codec.der.decoder import decode
+from pyasn1.type import univ
+from pyasn1.error import PyAsn1Error
+
+
+
class SealExtract(object):
    """
    Extract embedded seal/stamp images from an OFD SignedValue.dat by DER
    decoding its ASN.1 structure and collecting every OctetString whose
    payload is a recognisable image.
    """

    def __init__(self):
        pass

    def read_signed_value(self, path):
        """
        Read *path* and DER-decode it.

        :return: the decoded ASN.1 root object, or None when the bytes are
            not valid DER.

        Fix: the original returned from a ``finally`` block, which silently
        swallowed every in-flight exception (not just PyAsn1Error) and could
        itself raise NameError on an unbound local.
        """
        with open(path, 'rb') as file:
            binary_data = file.read()
        try:
            decoded_data, _ = decode(binary_data)
        except PyAsn1Error:
            # Not valid DER — callers treat None as "nothing to extract".
            decoded_data = None
        return decoded_data

    def find_octet_strings(self, asn1_data, octet_strings: list):
        """Recursively collect every univ.OctetString under *asn1_data* into *octet_strings*."""
        if isinstance(asn1_data, univ.OctetString):
            octet_strings.append(asn1_data)
        elif isinstance(asn1_data, (univ.Sequence, univ.Set)):
            # Iterating a Sequence/Set yields component names; index by name.
            for component in asn1_data:
                self.find_octet_strings(asn1_data[f"{component}"], octet_strings)
        elif isinstance(asn1_data, univ.Choice):
            self.find_octet_strings(asn1_data.getComponent(), octet_strings)
        elif isinstance(asn1_data, univ.Any):
            # An ANY wrapper may itself contain DER — try to decode and recurse.
            try:
                sub_data, _ = decode(asn1_data.asOctets())
                self.find_octet_strings(sub_data, octet_strings)
            except PyAsn1Error:
                pass

    def hex_to_image(self, hex_data, image_format='PNG', inx=0):
        """
        Decode a hex string into a PIL Image.

        :param hex_data: image payload as a hex string (without '0x' prefix)
        :param image_format: kept for interface compatibility (unused)
        :param inx: kept for interface compatibility (debug index)
        :return: a PIL Image, or None when the bytes are not an image
        """
        binary_data = bytes.fromhex(hex_data)
        image_stream = io.BytesIO(binary_data)
        try:
            return Image.open(image_stream)
        except UnidentifiedImageError:
            # Many OctetStrings hold certificates/signatures, not images.
            return None

    def __call__(self, path):
        """Return the list of PIL Images embedded in the file at *path*."""
        decoded_data = self.read_signed_value(path)
        octet_strings = []
        img_list = []  # usually a single seal image; multiples kept just in case
        if decoded_data:
            self.find_octet_strings(decoded_data, octet_strings)
            for i, octet_string in enumerate(octet_strings):
                pretty = str(octet_string.prettyPrint())
                # Binary OctetStrings pretty-print as '0x...' hex dumps.
                if pretty.startswith("0x"):
                    img = self.hex_to_image(pretty[2:], inx=i)
                    if img:
                        img_list.append(img)
        return img_list
+
if __name__=="__main__":
    # Ad-hoc manual check: print the seal images found in a local sample file.
    print(SealExtract()(r"F:\code\easyofd\test\1111_xml\Doc_0\Signs\Sign_0\SignedValue.dat" ))
+

+ 35 - 0
format_convert/easyofd/easyofd/parser_ofd/img_deal.py

@@ -0,0 +1,35 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: easyofd img_deal
+# CREATE_TIME: 2024/7/18 11:20
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: renoyuan
+# note: img 操作
+from io import BytesIO
class DealImg(object):
    """Small helpers for converting PIL images to bytes / byte streams."""

    def __init__(self):
        pass

    def resize(self):
        """Placeholder for a future resize helper (currently a no-op)."""
        pass

    def pil2bytes(self, image):
        """Encode *image* as PNG and return the raw bytes."""
        buffer = BytesIO()
        image.save(buffer, format='PNG')
        try:
            return buffer.getvalue()
        finally:
            buffer.close()

    def pil2bytes_io(self, image):
        """Encode *image* as PNG and return the (still open) BytesIO buffer."""
        buffer = BytesIO()
        image.save(buffer, format='PNG')
        return buffer
+
+
+

+ 607 - 0
format_convert/easyofd/easyofd/parser_ofd/ofd_parser.py

@@ -0,0 +1,607 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: D:\code\easyofd\easyofd\parser
+# CREATE_TIME: 2023-07-27
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE: ofd解析主流程
+
+import os
+import sys
+sys.path.append(os.path.dirname(__file__) + "/../../../../")
+from format_convert.easyofd.easyofd.parser_ofd.file_ofd_parser import OFDFileParser
+from jbig2_parser import jbig2_parser
+import traceback
+import base64
+import re
+import io
+# import jbigkit
+from typing import Any, List
+from PIL import Image
+from PIL.Image import Image as ImageClass
+from loguru import logger
+
+from format_convert.easyofd.easyofd.parser_ofd.img_deal import DealImg
+from format_convert.easyofd.easyofd.parser_ofd.file_deal import FileRead
+from format_convert.easyofd.easyofd.parser_ofd.file_ofd_parser import OFDFileParser
+from format_convert.easyofd.easyofd.parser_ofd.file_doc_parser import DocumentFileParser
+from format_convert.easyofd.easyofd.parser_ofd.file_docres_parser import DocumentResFileParser
+from format_convert.easyofd.easyofd.parser_ofd.file_content_parser import ContentFileParser
+from format_convert.easyofd.easyofd.parser_ofd.file_annotation_parser import AnnotationFileParser
+from format_convert.easyofd.easyofd.parser_ofd.file_publicres_parser import PublicResFileParser
+from format_convert.easyofd.easyofd.parser_ofd.file_signature_parser import SignaturesFileParser,SignatureFileParser
+from format_convert.easyofd.easyofd.parser_ofd.path_parser import PathParser
+# todo 解析流程需要大改
+
+
class OFDParser(object):
    """
    OFD parsing pipeline:
    1. unpack the archive, build the file-mapping tree, extract files
    2. walk the XMLs level by level collecting structured text and resources
    3. register embedded fonts

    Layer order: tpl > content > annotation
    """

    def __init__(self, ofdb64):
        """
        :param ofdb64: the whole OFD file as a base64 string
        """
        self.img_deal = DealImg()
        self.ofdb64 = ofdb64
        self.file_tree = None
        # Path to the external jbig2dec binary used by jb22png_old().
        # Fix: the old hard-coded, machine-specific default can now be
        # overridden via the JBIG2DEC_PATH environment variable.
        self.jbig2dec_path = os.environ.get(
            "JBIG2DEC_PATH",
            r'D:\Anaconda3\pkgs\jbig2dec-0.18-ha9979f8_0\Library\bin\jbig2dec.exe')
+
+    def img2data(self, imglist: List[ImageClass]):
+        """
+        imglist to ofd data
+        
+        """
+        OP = 200 / 25.4
+        doc_list = []
+        img_info = {}
+        page_size = []
+        font_info = {}
+        page_info_d = {}
+
+        for idx, img_pil in enumerate(imglist):
+            w, h = img_pil.size
+            img_bytes = self.img_deal.pil2bytes(img_pil)
+            imgb64 = str(base64.b64encode(img_bytes), encoding="utf-8")
+            img_info[str(idx)] = {
+                "format": "jpg",
+                "wrap_pos": "",
+                "type": "IMG",
+                "suffix": "jpg",
+                "fileName": f"{idx}.jpg",
+                "imgb64": imgb64,
+
+            }
+            text_list = []
+            img_list = []
+            img_d = {}
+            img_d["CTM"] = ""  # 平移矩阵换 平移 缩放 旋转
+            img_d["ID"] = str(idx)  # 图片id
+            img_d["ResourceID"] = str(idx)  # 图片id
+            img_d["pos"] = [0, 0, w / OP, h / OP]  # 平移矩阵换
+            page_size = [0, 0, w / OP, h / OP]
+            # print(page_size)
+            img_list.append(img_d)
+
+            content_d = {
+                "text_list": text_list,
+                "img_list": img_list,
+            }
+            page_info_d[idx] = content_d
+        doc_list.append({
+            "pdf_name": "demo.pdf",
+            "doc_no": "0",
+            "images": img_info,
+            "page_size": page_size,
+            "fonts": font_info,
+            "page_info": page_info_d
+        })
+
+        return doc_list
+
+    # 获得xml 对象
+    def get_xml_obj(self, label):
+        assert label
+        # print(self.file_tree.keys())
+        label =label.lstrip('./')
+        for abs_p in self.file_tree:
+            # 统一符号,避免win linux 路径冲突
+
+            abs_p_compare = abs_p.replace("\\\\", "-").replace("//", "-").replace("\\", "-").replace("/", "-")
+            label_compare = label.replace("\\\\", "-").replace("//", "-").replace("\\", "-").replace("/", "-")
+            if label_compare in abs_p_compare:
+                # logger.info(f"{label} {abs_p}")
+                return self.file_tree[abs_p]
+        # logger.info(f"{label} ofd file path is not")
+        return ""
+
    def jb22png_old(self, img_d: dict):
        """
        Convert a JBIG2 image entry to PNG in place by shelling out to an
        external jbig2dec binary; a no-op (with a warning) when the binary
        configured in self.jbig2dec_path is absent.

        Mutates *img_d* on success: fileName/suffix/format/imgb64.
        """
        if not os.path.exists(self.jbig2dec_path):
            logger.warning(f"未安装jbig2dec,无法处理jb2文件")
            return

        # todo ib2 转png C:/msys64/mingw64/bin/jbig2dec.exe -o F:\code\easyofd\test\image_80.png F:\code\easyofd\test\image_80.jb2
        fileName = img_d["fileName"]
        print('jb2 file_name', fileName)
        new_fileName = img_d['fileName'].replace(".jb2", ".png")
        # Dump the base64 payload back to disk so jbig2dec can read it.
        with open(fileName, "wb") as f:
            f.write(base64.b64decode(img_d["imgb64"]))
        # NOTE(review): unquoted paths — a fileName containing spaces breaks
        # this command line; confirm inputs are always space-free.
        command = "{} -o {} {}"
        res = os.system(command.format(self.jbig2dec_path, new_fileName, fileName))
        if res != 0:
            # Failure is deliberately best-effort: fall through and check
            # whether the output file appeared anyway.
            pass
            # logger.warning(f"jbig2dec处理失败")
        # if os.path.exists(fileName):
        #     os.remove(fileName)
        if os.path.exists(new_fileName):
            # logger.info(f"jbig2dec处理成功{fileName}>>{new_fileName}")
            img_d["fileName"] = new_fileName
            img_d["suffix"] = "png"
            img_d["format"] = "png"
            with open(new_fileName, "rb") as f:
                data = f.read()
                img_d["imgb64"] = str(base64.b64encode(data), encoding="utf-8")

            # os.remove(new_fileName)
+
+    def jb22png(self, img_d: dict):
+        """
+        jb22png
+        没有安装 jbig2dec 无法操作
+        """
+
+        file_name = img_d["fileName"]
+        # print('jb2 file_name', file_name)
+        new_file_name = img_d['fileName'].replace(".jb2", ".png")
+        with open(file_name, "rb") as f:
+            data = f.read()
+        png_data = jbig2_parser.parse_jbig2(data)
+        png_bytes = bytes(png_data)
+        # print('png_data', png_data)
+
+        # # 将字节缓冲区转换为图像对象
+        # image = Image.open(io.BytesIO(png_data))
+        #
+        # # 保存图像为 PNG 文件
+        # image.save(new_file_name, 'PNG')
+
+        with open(new_file_name, 'wb') as f:
+            f.write(png_bytes)
+
+        if os.path.exists(new_file_name):
+            # logger.info(f"jbig2dec处理成功{fileName}>>{new_fileName}")
+            img_d["fileName"] = new_file_name
+            img_d["suffix"] = "png"
+            img_d["format"] = "png"
+            with open(new_file_name, "rb") as f:
+                data = f.read()
+                img_d["imgb64"] = str(base64.b64encode(data), encoding="utf-8")
+
+        # decoder = jbigkit.JbgDecoder()
+        # with open(file_name, "rb") as f:
+        #     data = f.read()
+        # status, processed_len = decoder.decode_in(data)
+        # if status != jbigkit.JbgErrno.EOK or processed_len != len(data):
+        #     print('jb2 file error!')
+        #     return
+        # assert status == jbigkit.JbgErrno.EOK
+        # assert processed_len == len(data)
+        #
+        # w, h = decoder.get_width(), decoder.get_height()
+        #
+        # ith_plane = decoder.get_plane(0)  # 获取第一个平面
+        # img = Image.frombytes('1', (w, h), bytes(ith_plane), 'raw', '1;I')
+        # img.save(new_file_name)
+
+        # os.remove(new_fileName)
+
+    def bmp2jpg(self, img_d: dict):
+
+        fileName = img_d["fileName"]
+        new_fileName = img_d['fileName'].replace(".bmp", ".jpg")
+        b64_nmp = self.get_xml_obj(fileName)
+        image_data = base64.b64decode(b64_nmp)
+        image = Image.open(io.BytesIO(image_data))
+        rgb_image = image.convert("RGB")
+        output_buffer = io.BytesIO()
+        rgb_image.save(output_buffer, format="JPEG")
+        image.close()
+        jpeg_bytes = output_buffer.getvalue()
+        b64_jpeg = base64.b64encode(jpeg_bytes).decode('utf-8')
+        output_buffer.close()
+
+        if b64_jpeg:
+            logger.info(f"bmp2jpg处理成功{fileName}>>{new_fileName}")
+            img_d["fileName"] = new_fileName
+            img_d["suffix"] = "jpg"
+            img_d["format"] = "jpg"
+            img_d["imgb64"] = b64_jpeg
+
+    def tif2jpg(self, img_d: dict):
+        fileName = img_d["fileName"]
+        new_fileName = img_d['fileName'].replace(".tif", ".jpg")
+        tif_nmp = self.get_xml_obj(fileName)
+        image_data = base64.b64decode(tif_nmp)
+        image = Image.open(io.BytesIO(image_data))
+        if image.mode in ("RGBA", "LA") or (image.mode == "P" and "transparency" in image.info):
+            image = image.convert("RGB")
+
+            # 创建一个字节流来保存处理后的图像
+        output_buffer = io.BytesIO()
+
+        # 保存图像为 JPEG 格式到字节流中
+        image.save(output_buffer, format="JPEG", quality=95)
+
+        # 获取字节流中的内容并编码为 Base64 字符串
+        jpeg_bytes = output_buffer.getvalue()
+        b64_jpeg = base64.b64encode(jpeg_bytes).decode('utf-8')
+
+        # 关闭图像对象和字节流
+        image.close()
+        output_buffer.close()
+
+        if b64_jpeg:
+            logger.info(f"tif2jpg处理成功{fileName}>>{new_fileName}")
+            img_d["fileName"] = new_fileName
+            img_d["suffix"] = "jpg"
+            img_d["format"] = "jpg"
+            img_d["imgb64"] = b64_jpeg
+
+    def gif2jpg(self, img_d: dict):
+        fileName = img_d["fileName"]
+        new_fileName = img_d['fileName'].replace(".bmp", ".jpg")
+        b64_gif = self.get_xml_obj(fileName)
+        image_data = base64.b64decode(b64_gif)
+        image = Image.open(io.BytesIO(image_data))
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        output_buffer = io.BytesIO()
+        image.save(output_buffer, format="JPEG", quality=95)
+        image.close()
+        jpeg_bytes = output_buffer.getvalue()
+        b64_jpeg = base64.b64encode(jpeg_bytes).decode('utf-8')
+        output_buffer.close()
+
+        if b64_jpeg:
+            logger.info(f"gif2jpg处理成功{fileName}>>{new_fileName}")
+            img_d["fileName"] = new_fileName
+            img_d["suffix"] = "jpg"
+            img_d["format"] = "jpg"
+            img_d["imgb64"] = b64_jpeg
+
    def parser(self, save_dir):
        """
        Main parse pipeline (Doc_0 is assumed to be the only document level):
        OFD > Document.xml > [DocumentRes.xml, PublicRes.xml, Signatures.xml,
        Annotations.xml] > [...]

        :param save_dir: directory the archive was unpacked into; used to
            build on-disk resource paths (assumed to end with a path
            separator — TODO confirm callers always pass it that way).
        :return: one-element list holding the parsed document payload.
        """

        page_size_details = []   # per-page [x0, y0, w, h] boxes
        default_page_size = []   # document-level fallback box
        doc_list = []
        ofd_xml_obj = self.get_xml_obj(self.file_tree["root_doc"])  # parsed OFD.xml object

        if ofd_xml_obj:
            ofd_obj_res = OFDFileParser(ofd_xml_obj)()
            doc_root_name = ofd_obj_res.get("doc_root")
            signatures = ofd_obj_res.get("signatures")
        else:
            # Root OFD.xml missing — fall back to the conventional layout.
            doc_root_name = ["Doc_0/Document.xml"]
            signatures = ["Doc_0/Signs/Signatures.xml"]

        doc_root_xml_obj = self.get_xml_obj(doc_root_name[0])
        doc_root_info = DocumentFileParser(doc_root_xml_obj)()
        doc_page_size = self.get_page_size(doc_root_xml_obj)
        # print('doc_page_size', doc_page_size)

        # Annotation metadata: merged into doc_root_info when present.
        annotations_root_name = doc_root_info.get("Annotations")
        if annotations_root_name:
            annotations_root_name = annotations_root_name[0]
            annot_root_xml_obj = self.get_xml_obj(annotations_root_name)
            # print('annot_root_xml_obj', annot_root_xml_obj)
            annot_root_info = AnnotationFileParser(annot_root_xml_obj)()
            # print('annot_root_info', annot_root_info)
            doc_root_info.update(annot_root_info)
        doc_size = doc_root_info.get("size")

        if doc_size:
            try:
                # Keep only tokens starting with a digit or a dot.
                default_page_size = [float(pos_i) for pos_i in doc_size.split(" ") if re.match("[\d\.]", pos_i)]
            except:
                traceback.print_exc()

        # Font information; embedded font files are attached as base64.
        font_info = {}
        public_res_name: list = doc_root_info.get("public_res")
        if public_res_name:
            public_xml_obj = self.get_xml_obj(public_res_name[0])
            font_info = PublicResFileParser(public_xml_obj)()

            # Register fonts: attach the embedded font payload per font id.
            for font_id, font_v in font_info.items():
                file_name = font_v.get("FontFile")
                if file_name:
                    font_b64 = self.get_xml_obj(file_name)
                    if font_b64:
                        font_v["font_b64"] = font_b64

        # Image resources referenced from DocumentRes.xml.
        img_info: dict = dict()
        document_res_name: list = doc_root_info.get("document_res")
        # print('doc_root_info', doc_root_info)
        if document_res_name:
            document_res_xml_obj = self.get_xml_obj(document_res_name[0])
            # print('document_res_xml_obj', document_res_xml_obj)
            img_info = DocumentResFileParser(document_res_xml_obj)()
            # Resolve each image's base64 payload and convert exotic formats.
            for img_id, img_v in img_info.items():
                img_v["imgb64"] = self.get_xml_obj(img_v.get("fileName"))
                # NOTE(review): hard-coded Windows separators and assumes
                # save_dir ends with a separator — confirm on posix.
                img_v['fileName'] = f"{save_dir}Doc_0\Res\{img_v['fileName']}"
                # todo ib2 转png C:/msys64/mingw64/bin/jbig2dec.exe -o F:\code\easyofd\test\image_80.png F:\code\easyofd\test\image_80.jb2
                if img_v["suffix"] == 'jb2':
                    self.jb22png(img_v)
                elif img_v["suffix"] == 'bmp':
                    self.bmp2jpg(img_v)
                elif img_v["suffix"] == 'tif':
                    self.tif2jpg(img_v)
                elif img_v["suffix"] == 'gif':
                    self.gif2jpg(img_v)

        # Image resources may also live in PublicRes.xml — same treatment.
        img_info2: dict = dict()
        public_res_name: list = doc_root_info.get("public_res")
        # print('doc_root_info', doc_root_info)
        if public_res_name:
            public_res_xml_obj = self.get_xml_obj(public_res_name[0])
            # print('public_res_xml_obj', public_res_xml_obj)
            img_info2 = DocumentResFileParser(public_res_xml_obj)()
            # Resolve each image's base64 payload and convert exotic formats.
            for img_id, img_v in img_info2.items():
                img_v["imgb64"] = self.get_xml_obj(img_v.get("fileName"))
                # print('img_id, img_v[filename]', img_id, img_v.get('fileName'))
                img_v['fileName'] = f"{save_dir}Doc_0\Res\{img_v['fileName']}"

                # todo ib2 转png C:/msys64/mingw64/bin/jbig2dec.exe -o F:\code\easyofd\test\image_80.png F:\code\easyofd\test\image_80.jb2
                if img_v["suffix"] == 'jb2':
                    self.jb22png(img_v)
                elif img_v["suffix"] == 'bmp':
                    self.bmp2jpg(img_v)
                elif img_v["suffix"] == 'tif':
                    self.tif2jpg(img_v)
                elif img_v["suffix"] == 'gif':
                    self.gif2jpg(img_v)
            img_info.update(img_info2)

        page_id_map: list = doc_root_info.get("page_id_map")
        # print('doc_root_info', doc_root_info)

        signatures_page_id = {}
        # Signature (seal) info — parsing currently disabled, kept for reference.
        signatures_xml_obj = None
        # if signatures:
        #     signatures_xml_obj = self.get_xml_obj(signatures[0])
        # if signatures and signatures_xml_obj:
        # # if signatures and (signatures_xml_obj := self.get_xml_obj(signatures[0])):
        # #     logger.debug(f"signatures_xml_obj is {signatures_xml_obj } signatures is {signatures} ")
        #     signatures_info = SignaturesFileParser(signatures_xml_obj)()
        #     if signatures_info:  # 获取签章具体信息
        #         for _, signatures_cell in signatures_info.items():
        #             # print(signatures_info)
        #             BaseLoc = signatures_cell.get("BaseLoc")
        #             signature_xml_obj = self.get_xml_obj(BaseLoc)
        #             # print(BaseLoc)
        #             prefix = BaseLoc.split("/")[0]
        #             signatures_info = SignatureFileParser(signature_xml_obj)(prefix=prefix)
        #             # print(signatures_info)
        #             # logger.debug(f"signatures_info {signatures_info}")
        #             PageRef = signatures_info.get("PageRef")
        #             Boundary = signatures_info.get("Boundary")
        #             SignedValue = signatures_info.get("SignedValue")
        #             sing_page_no = page_id_map.get(PageRef)
        #             # print("self.file_tree",self.file_tree.keys)
        #             # print(page_id_map,PageRef)
        #             # print(SignedValue, self.get_xml_obj(SignedValue))
        #             # with open("b64.txt","w") as f:
        #             #     f.write(self.get_xml_obj(SignedValue))
        #             if signatures_page_id.get(sing_page_no):
        #                 signatures_page_id[sing_page_no].append(
        #                     {
        #                         "sing_page_no": sing_page_no,
        #                         "PageRef": PageRef,
        #                         "Boundary": Boundary,
        #                         "SignedValue": self.get_xml_obj(SignedValue),
        #                     }
        #                 )
        #             else:
        #                 signatures_page_id[sing_page_no] = [
        #                     {
        #                         "sing_page_no": sing_page_no,
        #                         "PageRef": PageRef,
        #                         "Boundary": Boundary,
        #                         "SignedValue": self.get_xml_obj(SignedValue),
        #                     }
        #                 ]

        # Annotation info — parsing currently disabled, kept for reference.
        # print('doc_root_info', doc_root_info)
        # annotation_name: list = doc_root_info.get("Annotations")
        # annotation_xml_obj = None
        # if annotation_name:
        #     annotation_xml_obj = self.get_xml_obj(annotation_name[0])
        # if annotation_name and annotation_xml_obj:
        # # if annotation_name and (annotation_xml_obj:= self.get_xml_obj(annotation_name[0])):
        #     # todo 注释解析
        #
        #     # annotation_info = AnnotationFileParser(annotation_xml_obj)()
        #     annotation_info = AnnotationFileParser(annotation_xml_obj)()
        #     # logger.debug(f"annotation_info is {annotation_info}")


        # Page content — there may be several pages.
        page_name: list = doc_root_info.get("page")
        page_info_d = {}
        if page_name:
            for index, _page in enumerate(page_name):
                page_xml_obj = self.get_xml_obj(_page)
                # Re-read the page size from the page's own PhysicalBox.
                try:
                    page_size = [float(pos_i) for pos_i in
                                     page_xml_obj.get('ofd:Page', {}).get("ofd:Area", {}).get("ofd:PhysicalBox",
                                                                                              "").split(" ")
                                     if re.match("[\d\.]", pos_i)]
                    if page_size and len(page_size) >= 2:
                        page_size_details.append(page_size)
                    else:
                        if doc_page_size:
                            page_size_details.append(doc_page_size)
                        else:
                            page_size_details.append([])
                except Exception as e:
                    traceback.print_exc()
                    # NOTE(review): likely meant page_size_details.append([]);
                    # 'page_size' may still be unbound here (NameError) when
                    # the comprehension itself raised — confirm and fix.
                    page_size.append([])
                page_info = ContentFileParser(page_xml_obj)()
                # Page number from the file name, falling back to list order.
                pg_no = re.search(r"\d+", _page)
                if pg_no:
                    pg_no = int(pg_no.group())
                else:
                    pg_no = index
                page_info_d[pg_no] = page_info
                # 只跑一页
                # print('odf_parser parser() 只跑一页')
                # break

        # Annotation pages extracted as regular content.
        annot_page_info_d = {}
        annot_page_name: list = doc_root_info.get("annot_page")
        if annot_page_name:
            for index, _page in enumerate(annot_page_name):
                annot_page_xml_obj = self.get_xml_obj(_page)
                annot_page_info = ContentFileParser(annot_page_xml_obj)()
                pg_no = re.search(r"\d+", _page)
                if pg_no:
                    pg_no = int(pg_no.group())
                else:
                    pg_no = index

                # Re-read the annotation page size (result currently unused).
                # try:
                #     page_size = [float(pos_i) for pos_i in
                #                  annot_page_xml_obj.get('ofd:Page', {}).get("ofd:Area", {}).get("ofd:PhysicalBox",
                #                                                                           "").split(" ")
                #                  if re.match("[\d\.]", pos_i)]
                #     if page_size and len(page_size) >= 2:
                #         # page_size_details.append(page_size)
                #         pass
                #     else:
                #         page_size = []
                # except Exception as e:
                #     traceback.print_exc()
                #     page_size.append([])
                page_size = self.get_page_size(annot_page_xml_obj)
                # if not page_size:
                #     page_size = doc_page_size

                # annot_page_info['annot_page_size'] = page_size
                annot_page_info_d[pg_no] = annot_page_info
                # 只跑一页
                # print('odf_parser parser() 只跑一页')
                # break
        # Merge annotation text into the matching content page.
        for page_id, page_d in page_info_d.items():
            if page_id not in annot_page_info_d.keys():
                continue
            annot_page_d = annot_page_info_d.get(page_id)
            # print("annot_page_d.get('text_list')", annot_page_d.get('text_list'))
            page_d['text_list'] += annot_page_d.get('text_list')
            page_d['annot_text_list'] = annot_page_d.get('text_list')
            # page_d['annot_page_size'] = annot_page_d.get('annot_page_size')
        # print('page_info_d', page_info_d)
        # print('annot_page_info_d', annot_page_info_d)

        # Template (tpl) info — merging currently disabled, kept for reference.
        tpls_name: list = doc_root_info.get("tpls")
        # if tpls_name:
        #     for index, _tpl in enumerate(tpls_name):
        #         tpl_xml_obj = self.get_xml_obj(_tpl)
        #         tpl_info = ContentFileParser(tpl_xml_obj)()
        #         tpl_no = re.search(r"\d+", _tpl)
        #
        #         if tpl_no:
        #             tpl_no = int(tpl_no.group())
        #         else:
        #             tpl_no = index
        #
        #         if tpl_no in page_info_d:
        #             page_info_d[pg_no]["text_list"].extend(tpl_info["text_list"])
        #             page_info_d[pg_no]["text_list"].sort(
        #                 key=lambda pos_text: (float(pos_text.get("pos")[1]), float(pos_text.get("pos")[0])))
        #             page_info_d[pg_no]["img_list"].extend(tpl_info["img_list"])
        #             page_info_d[pg_no]["img_list"].sort(
        #                 key=lambda pos_text: (float(pos_text.get("pos")[1]), float(pos_text.get("pos")[0])))
        #             page_info_d[pg_no]["line_list"].extend(tpl_info["line_list"])
        #             page_info_d[pg_no]["line_list"].sort(
        #                 key=lambda pos_text: (float(pos_text.get("pos")[1]), float(pos_text.get("pos")[0])))
        #         else:
        #             page_info_d[tpl_no] = tpl_info
        #             page_info_d[tpl_no].sort(
        #                 key=lambda pos_text: (float(pos_text.get("pos")[1]), float(pos_text.get("pos")[0])))

        # todo read annotation info
        page_ID = 0  # a multi-doc OFD has not been encountered yet
        # print("page_info",len(page_info))
        # NOTE: page_info / page_tpl_info / page_content_info all reference
        # the same page_info_d object.
        doc_list.append({
            "default_page_size": default_page_size,
            "page_size": page_size_details,
            "pdf_name": self.file_tree["pdf_name"],
            "doc_no": page_ID,
            "images": img_info,
            "signatures_page_id": signatures_page_id,
            "page_id_map": page_id_map,
            "fonts": font_info,
            "page_info": page_info_d,
            "page_tpl_info": page_info_d,
            "page_content_info": page_info_d,
            # "annot_page_info": annot_page_info_d,
        })
        return doc_list
+
+    def get_page_size(self, page_xml_obj):
+        try:
+            page_size = [float(pos_i) for pos_i in page_xml_obj.get('ofd:Page', {}).get("ofd:Area", {}).get("ofd:PhysicalBox", "").split(" ")if re.match("[\d\.]", pos_i)]
+            if not (page_size and len(page_size) >= 2):
+                page_size = [float(pos_i) for pos_i in page_xml_obj.get('ofd:Document', {}).get('ofd:CommonData', {}).get("ofd:PageArea", {}).get("ofd:PhysicalBox", "").split(" ")if re.match("[\d\.]", pos_i)]
+                if not (page_size and len(page_size) >= 2):
+                    page_size = []
+        except Exception as e:
+            traceback.print_exc()
+            page_size = []
+        return page_size
+
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        """
+        输出ofd解析结果
+        """
+        save_xml = kwargs.get("save_xml", False)
+        xml_name = kwargs.get("xml_name")
+        save_dir = kwargs.get("save_dir")
+        self.file_tree = FileRead(self.ofdb64)(save_xml=save_xml, xml_name=xml_name, save_dir=save_dir)
+        # logger.info(self.file_tree)
+        return self.parser(save_dir)
+
+
if __name__ == "__main__":
    # Ad-hoc manual check: parse a local OFD file and dump each document dict.
    p = "C:/Users/Administrator/Downloads/1750060386706.ofd"
    with open(p, "rb") as f:
        ofdb64 = str(base64.b64encode(f.read()), "utf-8")
    obj_list = OFDParser(ofdb64)()
    for obj in obj_list:
        print('obj', obj)

+ 31 - 0
format_convert/easyofd/easyofd/parser_ofd/parameter_parser.py

@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME: easyofd
+# CREATE_TIME: 
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: renoyuan
+# note:参数解析器
+from loguru import logger
+from typing import List, Dict, Any, Union, Tuple, Optional
+
+
class ParameterParser(object):
    """Validate parsed OFD parameter values against expected types.

    ``parameter`` maps a parameter key to a pair
    ``(accepted_types, default_factory)``: a looked-up value is returned
    as-is when it is an instance of ``accepted_types``; otherwise a fresh
    default (``default_factory()``) is returned instead.
    """

    parameter = {
        "ofd:FillColor": (dict, dict),
        "ofd:StrokeColor": (dict, dict),
        "ofd:Test": ((str, int), str),
        "ofd:Font": (str, str),
        "@Value": (str, str)
    }

    def __call__(self, key, container):
        """Fetch ``container[key]`` coerced to the declared type, or None for unknown keys."""
        spec = ParameterParser.parameter.get(key)
        if spec is None:
            # Unknown key: report it and yield nothing.
            logger.warning(f"{key} not in ParameterParser")
            return None
        accepted, default_factory = spec
        value = container.get(key, None)
        return value if isinstance(value, accepted) else default_factory()

+ 61 - 0
format_convert/easyofd/easyofd/parser_ofd/path_parser.py

@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  path_parser.py
+# CREATE_TIME: 2025/4/9 16:31
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE:
+from enum import Enum
+import os
+
class PathType(Enum):
    absolutely = 1
    relative = 2


class PathParser:
    """Resolve OFD-internal resource locations to concrete file paths.

    Accepts locations written in any of these forms and joins them
    against the current file's location:
    "/ROOT/a.xml", "./ROOT/a.xml", "../ROOT/a.xml", "ROOT/a.xml".
    """

    def __init__(self, root_path: str):
        # Remember which separator convention this platform uses.
        self.os = "nt" if os.name == 'nt' else "posix"
        self.root_path = self.format_path(root_path)

    def format_path(self, path: str):
        """Normalize *path* and force the platform's separator style."""
        clean = os.path.normpath(path)
        sep_from, sep_to = ("/", "\\") if self.os == "nt" else ("\\", "/")
        return clean.replace(sep_from, sep_to)

    def get_path_type(self, path: str):
        """Classify *path* as absolute or relative."""
        return PathType.absolutely if os.path.isabs(path) else PathType.relative

    def __call__(self, cur_path: str, loc_path: str):
        """Resolve *loc_path* (posix style) against *cur_path*.

        NOTE(review): a bare relative path ("ROOT/a.xml") is joined to the
        parent directory of *cur_path*, while "./ROOT/a.xml" is joined to
        *cur_path* itself — confirm this asymmetry is intended.
        """
        if self.get_path_type(loc_path) is PathType.absolutely:
            return self.format_path(loc_path)
        if loc_path.startswith("./"):
            return os.path.join(cur_path, self.format_path(loc_path[2:]))
        if loc_path.startswith("../"):
            return os.path.join(os.path.dirname(cur_path), self.format_path(loc_path[3:]))
        return os.path.join(os.path.dirname(cur_path), self.format_path(loc_path))

+ 7 - 0
format_convert/easyofd/easyofd/template_ofd/__init__.py

@@ -0,0 +1,7 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# PROJECT_NAME:  __init__.py.py
+# CREATE_TIME: 2025/3/28 15:43
+# E_MAIL: renoyuan@foxmail.com
+# AUTHOR: reno
+# NOTE:

+ 53 - 0
format_convert/font_map/extend_to_normal_dict.txt

@@ -0,0 +1,53 @@
+{
+ "⺁":"厂",
+ "⺇":"几",
+ "⺌":"小",
+ "⺎":"兀",
+ "⺏":"尣",
+ "⺐":"尢",
+ "⺑":"𡯂",
+ "⺒":"巳",
+ "⺓":"幺",
+ "⺛":"旡",
+ "⺝":"月",
+ "⺟":"母",
+ "⺠":"民",
+ "⺱":"冈",
+ "⺸":"芈",
+ "⻁":"虎",
+ "⻄":"西",
+ "⻅":"见",
+ "⻆":"角",
+ "⻇":"𧢲",
+ "⻉":"贝",
+ "⻋":"车",
+ "⻒":"镸",
+ "⻓":"长",
+ "⻔":"门",
+ "⻗":"雨",
+ "⻘":"青",
+ "⻙":"韦",
+ "⻚":"页",
+ "⻛":"风",
+ "⻜":"飞",
+ "⻝":"食",
+ "⻡":"𩠐",
+ "⻢":"马",
+ "⻣":"骨",
+ "⻤":"鬼",
+ "⻥":"鱼",
+ "⻦":"鸟",
+ "⻧":"卤",
+ "⻨":"麦",
+ "⻩":"黄",
+ "⻬":"齐",
+ "⻮":"齿",
+ "⻯":"竜",
+ "⻰":"龙",
+ "⻳":"龟",
+ "⾅":"臼",
+ "⼝":"口",
+ "⼾":"户",
+ "⼉":"儿",
+ "⼱":"巾"
+}

+ 214 - 0
format_convert/font_map/kangxi_to_normal

@@ -0,0 +1,214 @@
+⼀ 2F00 一 4E00
+⼁ 2F01 丨 4E28
+⼂ 2F02 丶 4E36
+⼃ 2F03 丿 4E3F
+⼄ 2F04 乙 4E59
+⼅ 2F05 亅 4E85
+⼆ 2F06 二 4E8C
+⼇ 2F07 亠 4EA0
+⼈ 2F08 人 4EBA
+⼉ 2F09 儿 513F
+⼊ 2F0A 入 5165
+⼋ 2F0B 八 516B
+⼌ 2F0C 冂 5182
+⼍ 2F0D 冖 5196
+⼎ 2F0E 冫 51AB 
+⼏ 2F0F 几 51E0
+⼐ 2F10 凵 51F5
+⼑ 2F11 刀 5200
+⼒ 2F12 力 529B
+⼓ 2F13 勹 52F9
+⼔ 2F14 匕 5315 
+⼕ 2F15 匚 531A 
+⼖ 2F16 匸 5338 
+⼗ 2F17 十 5341
+⼘ 2F18 卜 535C
+⼙ 2F19 卩 5369
+⼚ 2F1A 厂 5382
+⼛ 2F1B 厶 53B6
+⼜ 2F1C 又 53C8
+⼝ 2F1D 口 53E3
+⼞ 2F1E 囗 56D7
+⼟ 2F1F 土 571F
+⼠ 2F20 士 58EB
+⼡ 2F21 夂 5902
+⼢ 2F22 夊 590A
+⼣ 2F23 夕 5915
+⼤ 2F24 大 5927
+⼥ 2F25 女 5973
+⼦ 2F26 子 5B50
+⼧ 2F27 宀 5B80
+⼨ 2F28 寸 5BF8
+⼩ 2F29 小 5C0F
+⼪ 2F2A 尢 5C22
+⼫ 2F2B 尸 5C38
+⼬ 2F2C 屮 5C6E
+⼭ 2F2D 山 5C71
+⼮ 2F2E 巛 5DDB
+⼯ 2F2F 工 5DE5
+⼰ 2F30 己 5DF1
+⼱ 2F31 巾 5DFE
+⼲ 2F32 干 5E72
+⼳ 2F33 幺 5E7A
+⼴ 2F34 广 5E7F
+⼵ 2F35 廴 5EF4
+⼶ 2F36 廾 5EFE
+⼷ 2F37 弋 5F0B
+⼸ 2F38 弓 5F13
+⼹ 2F39 彐 5F50
+⼺ 2F3A 彡 5F61
+⼻ 2F3B 彳 5F73
+⼼ 2F3C 心 5FC3
+⼽ 2F3D 戈 6208
+⼾ 2F3E 戶 6236
+⼿ 2F3F 手 624B
+⽀ 2F40 支 652F
+⽁ 2F41 攴 6534
+⽂ 2F42 文 6587
+⽃ 2F43 斗 6597
+⽄ 2F44 斤 65A4
+⽅ 2F45 方 65B9
+⽆ 2F46 无 65E0
+⽇ 2F47 日 65E5
+⽈ 2F48 曰 66F0
+⽉ 2F49 月 6708
+⽊ 2F4A 木 6728
+⽋ 2F4B 欠 6B20
+⽌ 2F4C 止 6B62
+⽍ 2F4D 歹 6B79
+⽎ 2F4E 殳 6BB3
+⽏ 2F4F 毋 6BCB
+⽐ 2F50 比 6BD4
+⽑ 2F51 毛 6BDB
+⽒ 2F52 氏 6C0F
+⽓ 2F53 气 6C14
+⽔ 2F54 水 6C34
+⽕ 2F55 火 706B
+⽖ 2F56 爪 722A
+⽗ 2F57 父 7236
+⽘ 2F58 爻 723B
+⽙ 2F59 爿 723F
+⽚ 2F5A 片 7247
+⽛ 2F5B 牙 7259
+⽜ 2F5C 牛 725B
+⽝ 2F5D 犬 72AC
+⽞ 2F5E 玄 7384
+⽟ 2F5F 玉 7389
+⽠ 2F60 瓜 74DC
+⽡ 2F61 瓦 74E6
+⽢ 2F62 甘 7518
+⽣ 2F63 生 751F
+⽤ 2F64 用 7528
+⽥ 2F65 田 7530
+⽦ 2F66 疋 758B
+⽧ 2F67 疒 7592
+⽨ 2F68 癶 7676
+⽩ 2F69 白 767D
+⽪ 2F6A 皮 76AE
+⽫ 2F6B 皿 76BF
+⽬ 2F6C 目 76EE
+⽭ 2F6D 矛 77DB
+⽮ 2F6E 矢 77E2
+⽯ 2F6F 石 77F3
+⽰ 2F70 示 793A
+⽱ 2F71 禸 79B8
+⽲ 2F72 禾 79BE
+⽳ 2F73 穴 7A74
+⽴ 2F74 立 7ACB
+⽵ 2F75 竹 7AF9
+⽶ 2F76 米 7C73
+⽷ 2F77 糸 7CF8
+⽸ 2F78 缶 7F36
+⽹ 2F79 网 7F51
+⽺ 2F7A 羊 7F8A
+⽻ 2F7B 羽 7FBD
+⽼ 2F7C 老 8001
+⽽ 2F7D 而 800C
+⽾ 2F7E 耒 8012
+⽿ 2F7F 耳 8033
+⾀ 2F80 聿 807F
+⾁ 2F81 肉 8089
+⾂ 2F82 臣 81E3
+⾃ 2F83 自 81EA
+⾄ 2F84 至 81F3
+⾅ 2F85 臼 81FC
+⾆ 2F86 舌 820C
+⾇ 2F87 舛 821B
+⾈ 2F88 舟 821F
+⾉ 2F89 艮 826E
+⾊ 2F8A 色 8272
+⾋ 2F8B 艸 8278
+⾌ 2F8C 虍 864D
+⾍ 2F8D 虫 866B
+⾎ 2F8E 血 8840
+⾏ 2F8F 行 884C
+⾐ 2F90 衣 8863
+⾑ 2F91 襾 897E
+⾒ 2F92 見 898B
+⾓ 2F93 角 89D2
+⾔ 2F94 言 8A00
+⾕ 2F95 谷 8C37
+⾖ 2F96 豆 8C46
+⾗ 2F97 豕 8C55
+⾘ 2F98 豸 8C78
+⾙ 2F99 貝 8C9D
+⾚ 2F9A 赤 8D64
+⾛ 2F9B 走 8D70
+⾜ 2F9C 足 8DB3
+⾝ 2F9D 身 8EAB
+⾞ 2F9E 車 8ECA
+⾟ 2F9F 辛 8F9B
+⾠ 2FA0 辰 8FB0
+⾡ 2FA1 辵 8FB5
+⾢ 2FA2 邑 9091
+⾣ 2FA3 酉 9149
+⾤ 2FA4 采 91C7
+⾥ 2FA5 里 91CC
+⾦ 2FA6 金 91D1
+⾧ 2FA7 長 9577
+⾨ 2FA8 門 9580
+⾩ 2FA9 阜 961C
+⾪ 2FAA 隶 96B6
+⾫ 2FAB 隹 96B9
+⾬ 2FAC 雨 96E8
+⾭ 2FAD 青 9752
+⾮ 2FAE 非 975E
+⾯ 2FAF 面 9762
+⾰ 2FB0 革 9769
+⾱ 2FB1 韋 97CB
+⾲ 2FB2 韭 97ED
+⾳ 2FB3 音 97F3
+⾴ 2FB4 頁 9801
+⾵ 2FB5 風 98A8
+⾶ 2FB6 飛 98DB
+⾷ 2FB7 食 98DF
+⾸ 2FB8 首 9996
+⾹ 2FB9 香 9999
+⾺ 2FBA 馬 99AC
+⾻ 2FBB 骨 9AA8
+⾼ 2FBC 高 9AD8
+⾽ 2FBD 髟 9ADF
+⾾ 2FBE 鬥 9B25
+⾿ 2FBF 鬯 9B2F
+⿀ 2FC0 鬲 9B32
+⿁ 2FC1 鬼 9B3C
+⿂ 2FC2 魚 9B5A
+⿃ 2FC3 鳥 9CE5
+⿄ 2FC4 鹵 9E75
+⿅ 2FC5 鹿 9E7F
+⿆ 2FC6 麥 9EA5
+⿇ 2FC7 麻 9EBB
+⿈ 2FC8 黃 9EC3
+⿉ 2FC9 黍 9ECD
+⿊ 2FCA 黑 9ED1
+⿋ 2FCB 黹 9EF9
+⿌ 2FCC 黽 9EFD
+⿍ 2FCD 鼎 9F0E
+⿎ 2FCE 鼓 9F13
+⿏ 2FCF 鼠 9F20
+⿐ 2FD0 鼻 9F3B
+⿑ 2FD1 齊 9F4A
+⿒ 2FD2 齒 9F52
+⿓ 2FD3 龍 9F8D
+⿔ 2FD4 龜 9F9C
+⿕ 2FD5 龠 9FA0

+ 154 - 0
format_convert/font_map/kangxi_to_normal_dict.txt

@@ -0,0 +1,154 @@
+{
+    "⼀": "一",
+    "⼄": "乙",
+    "⼆": "二",
+    "⼈": "人",
+    "⼉": "儿",
+    "⼊": "入",
+    "⼋": "八",
+    "⼏": "几",
+    "⼑": "刀",
+    "⼒": "力",
+    "⼔": "匕",
+    "⼗": "十",
+    "⼘": "卜",
+    "⼚": "厂",
+    "⼜": "又",
+    "⼝": "口",
+    "⼞": "口",
+    "⼟": "土",
+    "⼠": "士",
+    "⼤": "大",
+    "⼥": "女",
+    "⼦": "子",
+    "⼨": "寸",
+    "⼩": "小",
+    "⼫": "尸",
+    "⼭": "山",
+    "⼯": "工",
+    "⼰": "己",
+    "⼲": "干",
+    "⼴": "广",
+    "⼸": "弓",
+    "⼼": "心",
+    "⼽": "戈",
+    "⼿": "手",
+    "⽀": "支",
+    "⽂": "文",
+    "⽃": "斗",
+    "⽄": "斤",
+    "⽅": "方",
+    "⽆": "无",
+    "⽇": "日",
+    "⽈": "曰",
+    "⽉": "月",
+    "⽊": "木",
+    "⽋": "欠",
+    "⽌": "止",
+    "⽍": "歹",
+    "⽏": "毋",
+    "⽐": "比",
+    "⽑": "毛",
+    "⽒": "氏",
+    "⽓": "气",
+    "⽔": "水",
+    "⽕": "火",
+    "⽖": "爪",
+    "⽗": "父",
+    "⽚": "片",
+    "⽛": "牙",
+    "⽜": "牛",
+    "⽝": "犬",
+    "⽞": "玄",
+    "⽟": "玉",
+    "⽠": "瓜",
+    "⽡": "瓦",
+    "⽢": "甘",
+    "⽣": "生",
+    "⽤": "用",
+    "⽥": "田",
+    "⽩": "白",
+    "⽪": "皮",
+    "⽫": "皿",
+    "⽬": "目",
+    "⽭": "矛",
+    "⽮": "矢",
+    "⽯": "石",
+    "⽰": "示",
+    "⽲": "禾",
+    "⽳": "穴",
+    "⽴": "立",
+    "⽵": "竹",
+    "⽶": "米",
+    "⽸": "缶",
+    "⽹": "网",
+    "⽺": "羊",
+    "⽻": "羽",
+    "⽼": "老",
+    "⽽": "而",
+    "⽿": "耳",
+    "⾁": "肉",
+    "⾂": "臣",
+    "⾃": "自",
+    "⾄": "至",
+    "⾆": "舌",
+    "⾈": "舟",
+    "⾉": "艮",
+    "⾊": "色",
+    "⾍": "虫",
+    "⾎": "血",
+    "⾏": "行",
+    "⾐": "衣",
+    "⾒": "见",
+    "⾓": "角",
+    "⾔": "言",
+    "⾕": "谷",
+    "⾖": "豆",
+    "⾚": "赤",
+    "⾛": "走",
+    "⾜": "足",
+    "⾝": "身",
+    "⾞": "车",
+    "⾟": "辛",
+    "⾠": "辰",
+    "⾢": "邑",
+    "⾣": "酉",
+    "⾤": "采",
+    "⾥": "里",
+    "⾦": "金",
+    "⾧": "长",
+    "⾨": "门",
+    "⾩": "阜",
+    "⾪": "隶",
+    "⾬": "雨",
+    "⾭": "青",
+    "⾮": "非",
+    "⾯": "面",
+    "⾰": "革",
+    "⾲": "韭",
+    "⾳": "音",
+    "⾴": "页",
+    "⾵": "风",
+    "⾶": "飞",
+    "⾷": "食",
+    "⾸": "首",
+    "⾹": "香",
+    "⾺": "马",
+    "⾻": "骨",
+    "⾼": "高",
+    "⿁": "鬼",
+    "⿂": "鱼",
+    "⿃": "鸟",
+    "⿄": "卤",
+    "⿅": "鹿",
+    "⿇": "麻",
+    "⿉": "黍",
+    "⿊": "黑",
+    "⿍": "鼎",
+    "⿎": "鼓",
+    "⿏": "鼠",
+    "⿐": "鼻",
+    "⿒": "齿",
+    "⿓": "龙",
+    "⼣": "夕"
+}

+ 327 - 0
format_convert/ofd/ofd_parser.py

@@ -0,0 +1,327 @@
+import os
+import zipfile
+import xml.etree.ElementTree as ET
+from typing import Dict, List, Any, Optional
+from pathlib import Path
+
+
class OFDParser:
    """OFD文件解析器 — parse an OFD (Open Fixed-layout Document) container.

    An OFD file is a Zip archive of XML parts. ``parse()`` unzips it to a
    temporary directory, reads ``OFD.xml`` for container-level info, then
    every ``Document.xml`` for metadata, fonts and page content, and
    returns the whole structure as plain dicts/lists.
    """

    def __init__(self, ofd_path: str):
        """Validate *ofd_path* and prepare parser state.

        Raises:
            FileNotFoundError: the path does not exist.
            ValueError: the file is not a Zip archive (OFD is Zip-based).
        """
        self.ofd_path = ofd_path
        # NOTE(review): fixed relative temp dir — concurrent parses in the
        # same CWD would collide; confirm single-process usage.
        self.temp_dir = Path("./ofd_temp")
        self.ofd_info = {}
        self.documents = []

        if not os.path.exists(ofd_path):
            raise FileNotFoundError(f"OFD文件不存在: {ofd_path}")

        if not zipfile.is_zipfile(ofd_path):
            raise ValueError(f"文件不是有效的OFD文件(Zip格式): {ofd_path}")

    @staticmethod
    def _parse_box(box_attr: Optional[str]) -> Dict[str, float]:
        """Parse an 'x y width height' attribute string into a dict.

        Returns zeroed values when the attribute is missing or malformed.
        The previous inline ``attr.split()`` calls raised AttributeError
        when the XML attribute was absent and re-split the string four
        times per object.
        """
        parts = box_attr.split() if box_attr else []
        if len(parts) >= 4:
            try:
                return {
                    'x': float(parts[0]),
                    'y': float(parts[1]),
                    'width': float(parts[2]),
                    'height': float(parts[3]),
                }
            except ValueError:
                pass
        return {'x': 0.0, 'y': 0.0, 'width': 0.0, 'height': 0.0}

    def parse(self) -> Dict[str, Any]:
        """解析OFD文件并返回内容结构 — run the full parsing pipeline.

        Returns:
            dict with ``file_info`` (container info) and ``documents``
            (one entry per Document.xml found in the archive).
        """
        try:
            self._extract_ofd()
            self._parse_ofd_xml()
            self._parse_documents()
            return {
                "file_info": self.ofd_info,
                "documents": self.documents
            }
        finally:
            self._cleanup()

    def _extract_ofd(self) -> None:
        """解压OFD文件到临时目录 — unzip the archive into ``self.temp_dir``."""
        self.temp_dir.mkdir(exist_ok=True)
        with zipfile.ZipFile(self.ofd_path, 'r') as zip_ref:
            zip_ref.extractall(self.temp_dir)

    def _parse_ofd_xml(self) -> None:
        """解析OFD.xml文件获取基本信息 — read container-level info into ``self.ofd_info``."""
        ofd_xml_path = self.temp_dir / "OFD.xml"
        if not ofd_xml_path.exists():
            raise ValueError("OFD.xml文件缺失")

        root = ET.parse(ofd_xml_path).getroot()
        namespace = {'ofd': 'http://www.ofdspec.org/2016'}

        # Container-level document pointers.
        doc_body = root.find('ofd:DocBody', namespace)
        if doc_body is not None:
            # Reference to the Document.xml of this DocBody.
            doc_file = doc_body.find('ofd:DocFile', namespace)
            if doc_file is not None:
                self.ofd_info['doc_file'] = doc_file.text

            # Signature summary, when present.
            signatures = doc_body.find('ofd:Signatures', namespace)
            if signatures is not None:
                self.ofd_info['signatures'] = {
                    'file': signatures.get('FileRef'),
                    'count': int(signatures.get('Count', 0))
                }

    def _parse_documents(self) -> None:
        """解析文档内容 — parse every Document.xml found in the archive."""
        doc_xml_files = list(self.temp_dir.rglob("Document.xml"))
        for doc_xml in doc_xml_files:
            self.documents.append(self._parse_document(doc_xml))

    def _parse_document(self, doc_xml_path: Path) -> Dict[str, Any]:
        """解析单个文档 — parse one Document.xml into fonts/metadata/pages."""
        namespace = {'ofd': 'http://www.ofdspec.org/2016'}
        root = ET.parse(doc_xml_path).getroot()

        document = {
            'path': str(doc_xml_path),
            'pages': [],
            'fonts': self._parse_fonts(root, namespace),
            'metadata': self._parse_metadata(root, namespace)
        }

        # Resolve each page reference and parse the page file if it exists.
        pages_node = root.find('.//ofd:Pages', namespace)
        if pages_node is not None:
            for page_ref in pages_node.findall('ofd:Page', namespace):
                page_id = page_ref.get('ID')
                page_file = page_ref.find('ofd:PageFile', namespace)
                if page_file is not None:
                    page_path = self.temp_dir / page_file.text
                    if page_path.exists():
                        document['pages'].append({
                            'id': page_id,
                            'content': self._parse_page(page_path)
                        })

        return document

    def _parse_fonts(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, str]]:
        """解析文档字体信息 — collect the document's font declarations."""
        fonts = []
        font_list = root.find('.//ofd:Fonts', ns)
        if font_list is not None:
            for font_node in font_list.findall('ofd:Font', ns):
                fonts.append({
                    'id': font_node.get('ID'),
                    'name': font_node.get('FontName'),
                    'family': font_node.get('FamilyName'),
                    'format': font_node.get('FontFormat'),
                    'bold': font_node.get('Bold') == 'true',
                    'italic': font_node.get('Italic') == 'true',
                    'serif': font_node.get('Serif') == 'true',
                    'fixed_width': font_node.get('FixedWidth') == 'true'
                })
        return fonts

    def _parse_metadata(self, root: ET.Element, ns: Dict[str, str]) -> Dict[str, str]:
        """解析文档元数据 — collect DocInfo metadata fields that have text."""
        metadata = {}
        doc_info = root.find('.//ofd:DocInfo', ns)
        if doc_info is not None:
            for attr in ['Title', 'Author', 'Subject', 'Keywords', 'Creator',
                         'CreatorVersion', 'CreationDate', 'ModDate']:
                element = doc_info.find(f'ofd:{attr}', ns)
                if element is not None and element.text:
                    metadata[attr] = element.text
        return metadata

    def _parse_page(self, page_path: Path) -> Dict[str, Any]:
        """解析页面内容 — parse one page XML into size/text/images/graphics/layers."""
        # All prefixes point at the same OFD namespace; the aliases only
        # make the querying code below read clearer.
        namespace = {
            'ofd': 'http://www.ofdspec.org/2016',
            'ofdtext': 'http://www.ofdspec.org/2016',
            'ofdgraph': 'http://www.ofdspec.org/2016',
            'ofdimg': 'http://www.ofdspec.org/2016'
        }
        root = ET.parse(page_path).getroot()

        return {
            'size': self._parse_page_size(root, namespace),
            'text_content': self._extract_text_content(root, namespace),
            'images': self._extract_images(root, namespace),
            'graphics': self._extract_graphics(root, namespace),
            'layers': self._parse_layers(root, namespace)
        }

    def _parse_page_size(self, root: ET.Element, ns: Dict[str, str]) -> Dict[str, float]:
        """解析页面尺寸 — read the PhysicalBox dimensions of a page.

        NOTE(review): reads Width/Height and lowercase x/y attributes;
        OFD files that encode PhysicalBox as a space-separated value
        string would not populate these — confirm against real samples.
        """
        box = root.find('.//ofd:Area/ofd:PhysicalBox', ns)
        if box is not None:
            return {
                'width': float(box.get('Width', 0)),
                'height': float(box.get('Height', 0)),
                'x': float(box.get('x', 0)),
                'y': float(box.get('y', 0))
            }
        return {'width': 0, 'height': 0, 'x': 0, 'y': 0}

    def _extract_text_content(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """提取页面文本内容 — extract text objects with position and style.

        Only objects that contain at least one non-empty TextCode are kept.
        """
        texts = []
        for text_obj in root.findall('.//ofdtext:TextObject', ns):
            text_info = {
                'id': text_obj.get('ID'),
                'bounding_box': self._parse_box(text_obj.get('BoundaryBox')),
                'transform': text_obj.get('CTM'),
                'content': []
            }

            # Optional style block on the text object.
            style = text_obj.find('ofdtext:TextStyle', ns)
            if style is not None:
                text_info['style'] = {
                    'font': style.get('Font'),
                    'size': float(style.get('Size', 0)),
                    'color': style.get('FillColor'),
                    'weight': style.get('Weight'),
                    'italic': style.get('Italic') == 'true',
                    'underline': style.get('Underline') == 'true',
                    'strikeout': style.get('StrikeOut') == 'true'
                }

            # The actual glyph runs with their anchor positions.
            for codec in text_obj.findall('ofdtext:TextCode', ns):
                if codec.text:
                    text_info['content'].append({
                        'text': codec.text.strip(),
                        'position': {
                            'x': float(codec.get('X', 0)),
                            'y': float(codec.get('Y', 0))
                        }
                    })

            if text_info['content']:
                texts.append(text_info)

        return texts

    def _extract_images(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """提取页面中的图像信息 — extract image placements (not pixel data)."""
        images = []
        for img_obj in root.findall('.//ofdimg:ImageObject', ns):
            images.append({
                'id': img_obj.get('ID'),
                'bounding_box': self._parse_box(img_obj.get('BoundaryBox')),
                'resource_id': img_obj.get('ResourceID'),
                'transform': img_obj.get('CTM')
            })
        return images

    def _extract_graphics(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """提取页面中的图形信息 — extract vector path objects."""
        graphics = []
        for graphic_obj in root.findall('.//ofdgraph:PathObject', ns):
            # Look up PathData once instead of twice.
            path_node = graphic_obj.find('ofdgraph:PathData', ns)
            graphics.append({
                'id': graphic_obj.get('ID'),
                'bounding_box': self._parse_box(graphic_obj.get('BoundaryBox')),
                'fill_color': graphic_obj.get('FillColor'),
                'stroke_color': graphic_obj.get('StrokeColor'),
                'line_width': float(graphic_obj.get('LineWidth', 0)),
                'path_data': path_node.text if path_node is not None else ''
            })
        return graphics

    def _parse_layers(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """解析页面图层信息 — count object kinds per layer."""
        layers = []
        for layer in root.findall('.//ofd:Layer', ns):
            layers.append({
                'type': layer.get('Type'),
                'objects': {
                    'text': len(layer.findall('.//ofdtext:TextObject', ns)),
                    'images': len(layer.findall('.//ofdimg:ImageObject', ns)),
                    'graphics': len(layer.findall('.//ofdgraph:PathObject', ns))
                }
            })
        return layers

    def _cleanup(self) -> None:
        """清理临时文件 — temp-dir cleanup hook.

        Deliberately a no-op for now: deletion is disabled so the
        extracted XML can be inspected after parsing. Re-enable the
        lines below once debugging is done.
        """
        # import shutil
        # if self.temp_dir.exists():
        #     shutil.rmtree(self.temp_dir)
+
+
# Usage example: parse a local OFD file and print a content summary.
if __name__ == "__main__":
    try:
        p = "C:/Users/Administrator/Downloads/1750060386706.ofd"
        parser = OFDParser(p)
        result = parser.parse()

        # Print basic container-level info
        print("文档信息:", result["file_info"])

        # Walk every parsed document and summarize its contents
        for doc_idx, document in enumerate(result["documents"], 1):
            print(f"\n文档 {doc_idx}:")
            print(f"  字体数量: {len(document['fonts'])}")
            print(f"  页面数量: {len(document['pages'])}")

            # Print document metadata, when present
            if document['metadata']:
                print("  元数据:")
                for key, value in document['metadata'].items():
                    print(f"    {key}: {value}")

            # Per-page content summary
            for page_idx, page in enumerate(document["pages"], 1):
                print(f"\n  页面 {page_idx}:")
                print(f"    尺寸: {page['content']['size']['width']} x {page['content']['size']['height']}")
                print(f"    文本元素: {len(page['content']['text_content'])}")
                print(f"    图像元素: {len(page['content']['images'])}")
                print(f"    图形元素: {len(page['content']['graphics'])}")
                print(f"    图层数量: {len(page['content']['layers'])}")

                # Show the first 5 text elements, truncated to 50 chars
                if page['content']['text_content']:
                    print("    前5行文本:")
                    for i, text_elem in enumerate(page['content']['text_content'][:5]):
                        text_lines = " ".join([t['text'] for t in text_elem['content']])
                        print(f"      {i + 1}. {text_lines[:50]}{'...' if len(text_lines) > 50 else ''}")

    except Exception as e:
        print(f"解析OFD文件时出错: {e}")

+ 320 - 12
format_convert/utils.py

@@ -9,13 +9,18 @@ import pickle
 import socket
 import socket
 import subprocess
 import subprocess
 import sys
 import sys
+from glob import glob
 from io import BytesIO
 from io import BytesIO
 from subprocess import Popen
 from subprocess import Popen
+import pynvml
+import datetime
+import PyPDF2
 from shapely.geometry import LineString
 from shapely.geometry import LineString
 import cv2
 import cv2
 import requests
 import requests
 from PIL import Image
 from PIL import Image
-
+from reportlab.pdfbase import pdfmetrics
+from reportlab.pdfbase.ttfonts import TTFont
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 import difflib
 import difflib
 import logging
 import logging
@@ -43,6 +48,14 @@ from shapely.geometry import Polygon
 
 
 config_file_path = os.path.dirname(os.path.abspath(__file__)) + "/../config/interface_new.yml"
 config_file_path = os.path.dirname(os.path.abspath(__file__)) + "/../config/interface_new.yml"
 
 
+# 特殊中文转基本中文
+with open(os.path.abspath(os.path.dirname(__file__)) + '/font_map/extend_to_normal_dict.txt', 'r', encoding='utf-8') as f:
+    extend_to_normal_dict = f.read()
+    extend_to_normal_dict = eval(extend_to_normal_dict)
+with open(os.path.abspath(os.path.dirname(__file__)) + '/font_map/kangxi_to_normal_dict.txt', 'r', encoding='utf-8') as f:
+    kangxi_to_normal_dict = f.read()
+    kangxi_to_normal_dict = eval(kangxi_to_normal_dict)
+
 
 
 def has_intersection(poly1, poly2):
 def has_intersection(poly1, poly2):
     """
     """
@@ -62,7 +75,7 @@ def has_intersection(poly1, poly2):
 
 
 
 
 def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13,
 def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13,
-                                  -14, -15, -16, -17, -18, -19, -20, -21, -22]):
+                                  -14, -15, -16, -17, -18, -19, -20, -21, -22, -23]):
     """
     """
     [0] : continue
     [0] : continue
     [-1]: 逻辑处理错误
     [-1]: 逻辑处理错误
@@ -87,6 +100,7 @@ def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -1
     [-20]: requests请求超时
     [-20]: requests请求超时
     [-21]: requests请求返回错误状态码
     [-21]: requests请求返回错误状态码
     [-22]: requests请求拒绝连接
     [-22]: requests请求拒绝连接
+    [-23]: 两列无边框表格提取报错
     """
     """
     for c in code:
     for c in code:
         if isinstance(_list, list) and _list == [c]:
         if isinstance(_list, list) and _list == [c]:
@@ -366,11 +380,45 @@ def slash_replace(_str, reverse=False):
     return _str
     return _str
 
 
 
 
def align_table_lines(line_list, threshold=7):
    """Snap nearly-collinear table rules onto each other (mutates in place).

    Lines are split into horizontal and vertical groups by their dominant
    extent; within each sorted group, a line whose coordinate differs from
    its successor by at most *threshold* (but is not equal) is snapped to
    the successor's coordinate, so cell borders that cross merged cells
    stay aligned and table reconstruction does not misplace them.

    Returns the horizontal lines followed by the vertical ones, or the
    input list untouched when either group is empty.
    """
    horizontal, vertical = [], []
    for ln in line_list:
        x0, y0, x1, y1 = ln.bbox
        (horizontal if abs(x1 - x0) > abs(y1 - y0) else vertical).append(ln)

    if not horizontal or not vertical:
        return line_list

    # Align horizontal rules on their y coordinate.
    horizontal.sort(key=lambda l: (l.bbox[1], l.bbox[0]))
    for prev, cur in zip(horizontal, horizontal[1:]):
        gap = cur.bbox[1] - prev.bbox[1]
        if gap != 0 and abs(gap) <= threshold:
            prev.bbox = (prev.bbox[0], cur.bbox[1], prev.bbox[2], cur.bbox[3])

    # Align vertical rules on their x coordinate.
    vertical.sort(key=lambda l: (l.bbox[0], l.bbox[1]))
    for prev, cur in zip(vertical, vertical[1:]):
        gap = cur.bbox[0] - prev.bbox[0]
        if gap != 0 and abs(gap) <= threshold:
            prev.bbox = (cur.bbox[0], prev.bbox[1], cur.bbox[2], prev.bbox[3])

    return horizontal + vertical
+
+
 class LineTable:
 class LineTable:
     def recognize_table(self, list_textbox, list_line, sourceP_LB=False,
     def recognize_table(self, list_textbox, list_line, sourceP_LB=False,
                         splited=False, from_pdf=False, is_reverse=False, show=0):
                         splited=False, from_pdf=False, is_reverse=False, show=0):
         self.list_line = list_line
         self.list_line = list_line
-        self.list_crosspoints = self.recognize_crosspoints(list_line)
         self.from_pdf = from_pdf
         self.from_pdf = from_pdf
         self.splited = splited
         self.splited = splited
         self.connect_bbox_list = []
         self.connect_bbox_list = []
@@ -381,6 +429,13 @@ class LineTable:
             # 展示原始表格及文字
             # 展示原始表格及文字
             self._plot(list_line, list_textbox, title='list_line,list_textbox')
             self._plot(list_line, list_textbox, title='list_line,list_textbox')
 
 
+        list_line = align_table_lines(list_line)
+        if self.show:
+            self._plot(list_line, list_textbox, title='align_table_lines')
+
+        # 获取交点
+        self.list_crosspoints = self.recognize_crosspoints(list_line)
+
         # 聚类
         # 聚类
         cluster_crosspoints = []
         cluster_crosspoints = []
         for _point in self.list_crosspoints:
         for _point in self.list_crosspoints:
@@ -1189,6 +1244,15 @@ class LineTable:
 
 
     def fix_rect(self, _table, list_x, list_y, sourceP_LB, margin):
     def fix_rect(self, _table, list_x, list_y, sourceP_LB, margin):
         self.fix_span(_table, list_x, list_y, sourceP_LB)
         self.fix_span(_table, list_x, list_y, sourceP_LB)
+        if self.show:
+            # 打印_table
+            temp_list = []
+            for t in _table:
+                print('------ fix_span row ------')
+                for c in t:
+                    print('fix_span col', c)
+                    temp_list.append(c)
+            self._plot([], [], temp_list, title='fix_span table')
 
 
         for _line in _table:
         for _line in _table:
             _line.sort(key=lambda x: x.get('bbox')[0])
             _line.sort(key=lambda x: x.get('bbox')[0])
@@ -1646,7 +1710,7 @@ def sort_object(obj_list, is_reverse=False):
     if len(obj_list) == 0:
     if len(obj_list) == 0:
         return obj_list
         return obj_list
     if isinstance(obj_list[0], (_Table, _Sentence, _Image)):
     if isinstance(obj_list[0], (_Table, _Sentence, _Image)):
-        obj_list.sort(key=lambda x: (x.y, x.x), reverse=is_reverse)
+        obj_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]), reverse=is_reverse)
         return obj_list
         return obj_list
     elif isinstance(obj_list[0], _Page):
     elif isinstance(obj_list[0], _Page):
         obj_list.sort(key=lambda x: x.page_no)
         obj_list.sort(key=lambda x: x.page_no)
@@ -2544,6 +2608,237 @@ def dynamic_get_port(start_port, mode='-1', num=10):
     return None
     return None
 
 
 
 
def text_bbox_to_lt(text_list, bbox_list):
    """Pair OCR texts with their 4-corner bboxes and wrap them as TextBox objects.

    Each bbox is a list of 4 corner points; corner [0] (top-left) and corner [2]
    (bottom-right) supply the [x0, y0, x1, y1] rectangle TextBox expects.

    :param text_list: recognized text strings, parallel to bbox_list
    :param bbox_list: 4-point boxes, one per text
    :return: list of TextBox instances
    """
    from format_convert.convert_tree import TextBox
    lt_text_box_list = []
    for idx, box in enumerate(bbox_list):
        rect = [box[0][0], box[0][1], box[2][0], box[2][1]]
        lt_text_box_list.append(TextBox(rect, text_list[idx]))
    return lt_text_box_list
+
+
def extract_one_page_pdf(input_pdf_path, output_pdf_path, page_no):
    """Extract a single page (0-based page_no) from a PDF into a new file.

    Prints a status message on success, on an out-of-range page number, and on
    any other failure; never raises.

    :param input_pdf_path: source PDF path
    :param output_pdf_path: destination PDF path (overwritten)
    :param page_no: 0-based index of the page to extract
    """
    try:
        with open(input_pdf_path, 'rb') as src:
            reader = PyPDF2.PdfFileReader(src)

            # reject page numbers outside the document
            if not 0 <= page_no < len(reader.pages):
                print("页码超出范围")
                return

            writer = PyPDF2.PdfFileWriter()
            writer.addPage(reader.pages[page_no])

            with open(output_pdf_path, 'wb') as dst:
                writer.write(dst)

        print(f"成功提取第 {page_no + 1} 页并保存为 {output_pdf_path}")
    except Exception as e:
        print(f"提取页面失败:{e}")
+
+
def get_gpu_memory_usage():
    """Print memory usage and running compute processes for every visible GPU.

    Best-effort monitoring helper: every failure is logged and swallowed so
    callers are never interrupted. Fix: NVML is now always shut down in a
    ``finally`` — previously ``nvmlShutdown()`` was only reached on the success
    path, so any exception mid-loop leaked the NVML session.
    """
    initialized = False
    try:
        # 初始化 NVML
        pynvml.nvmlInit()
        initialized = True
        device_count = pynvml.nvmlDeviceGetCount()
        now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        for i in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            gpu_name = pynvml.nvmlDeviceGetName(handle)

            # 显存信息, 转换为 MiB
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            total_memory = mem_info.total / (1024 * 1024)
            used_memory = mem_info.used / (1024 * 1024)
            free_memory = mem_info.free / (1024 * 1024)

            info = f'  时间:{now}\n'
            info += f"  GPU信息 {i}: {gpu_name.decode('utf-8')}\n"
            info += f"    总显存: {total_memory:.2f} MiB\n"
            info += f"    已用显存: {used_memory:.2f} MiB\n"
            info += f"    剩余显存: {free_memory:.2f} MiB\n\n"

            # 进程信息
            processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            if processes:
                info += f"  GPU进程信息: {i}\n"
                for p in processes:
                    pid = p.pid
                    used_memory = p.usedGpuMemory / (1024 * 1024)
                    try:
                        proc = psutil.Process(pid)
                        cmdline = proc.cmdline()
                        # NOTE(review): the [-17:-14] slice appears to pull a short
                        # service tag out of the command line — confirm intent
                        info += f"    {' '.join(cmdline)[-17:-14]} {pid}: {used_memory:.2f} MiB\n"
                    except Exception:
                        traceback.print_exc()
            print(info)
    except Exception:
        traceback.print_exc()
    finally:
        # 关闭 NVML — always, even when an error aborted the loop above
        if initialized:
            try:
                pynvml.nvmlShutdown()
            except Exception:
                pass
+
+
def get_current_process_gpu_id():
    """Return the index of the GPU the current process is running on, or None.

    Scans every NVML device for a compute process whose PID matches
    ``os.getpid()``. Fix: the unconditional ``nvmlShutdown()`` in ``finally``
    raised "Uninitialized" (masking the real error) whenever ``nvmlInit()``
    itself failed — shutdown is now guarded and its own errors suppressed.

    :return: GPU index (int) or None when not found / NVML unavailable
    """
    initialized = False
    try:
        # 初始化 NVML
        pynvml.nvmlInit()
        initialized = True

        current_pid = os.getpid()
        device_count = pynvml.nvmlDeviceGetCount()

        for i in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)

            # 获取运行在该 GPU 上的进程
            try:
                processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
            except pynvml.NVMLError:
                processes = []

            for p in processes:
                if p.pid == current_pid:
                    print(f"Process {current_pid} is running on GPU {i}")
                    return i

        print("Current process not found on any GPU")
        return None
    except Exception:
        traceback.print_exc()
        return None
    finally:
        # 关闭 NVML — only if init succeeded, and never let shutdown mask errors
        if initialized:
            try:
                pynvml.nvmlShutdown()
            except Exception:
                pass
+
+
def register_all_fonts(font_dir):
    """Recursively register every TrueType/OpenType font under font_dir
    with reportlab's pdfmetrics, using the file stem as the font name.

    :param font_dir: directory tree to scan for .ttf / .otf files
    """
    for dir_path, _sub_dirs, file_names in os.walk(font_dir):
        for file_name in file_names:
            # only TrueType / OpenType files are registrable
            if not file_name.endswith((".ttf", ".otf")):
                continue
            font_path = os.path.join(dir_path, file_name)
            font_name = os.path.splitext(file_name)[0]
            try:
                pdfmetrics.registerFont(TTFont(font_name, font_path))
                print(f"Font registered: {font_name}")
            except Exception as e:
                print(f"Failed to register font {font_name}: {e}")
+
+
def ascii85_decode(data):
    """Decode ASCII85-encoded bytes (Adobe variant, no <~ ~> framing).

    Fixes the original implementation, which sliced the input 5 characters at
    a time and therefore decoded any trailing partial group (input not a
    multiple of 4 plaintext bytes) incorrectly. Per the ASCII85 spec, a final
    group of n < 5 characters is padded with 'u' (digit 84) and yields n-1
    output bytes. The 'z' shorthand (four zero bytes) is honored only between
    groups; characters outside '!'..'u' (e.g. whitespace) are skipped.

    :param data: encoded payload as bytes
    :return: decoded bytes
    """
    decoded = bytearray()
    group = []  # digit values (0..84) accumulated for the current 5-char group
    for c in data:
        if c == ord('z') and not group:
            # 'z' abbreviates an all-zero 4-byte group
            decoded += b'\0\0\0\0'
            continue
        if ord('!') <= c <= ord('u'):
            group.append(c - ord('!'))
            if len(group) == 5:
                value = 0
                for digit in group:
                    value = value * 85 + digit
                decoded += value.to_bytes(4, byteorder='big')
                group = []
        # any other character (whitespace, framing) is ignored
    if group:
        # final partial group: pad with 'u' (84), keep only n-1 bytes
        n = len(group)
        group += [84] * (5 - n)
        value = 0
        for digit in group:
            value = value * 85 + digit
        decoded += value.to_bytes(4, byteorder='big')[:n - 1]
    return bytes(decoded)
+
+
def special_font_to_normal(text):
    """Map special Chinese glyphs in *text* to their basic CJK code points.

    Replaces characters found in the module-level ``extend_to_normal_dict``
    (extended forms) or, failing that, ``kangxi_to_normal_dict`` (Kangxi
    radicals) with their normal equivalents; other characters pass through.
    Fix: the original rebuilt two key-sets from the dicts on every call —
    dict membership is already O(1), so the sets were pure overhead.

    :param text: input string possibly containing special glyphs
    :return: string with special glyphs replaced
    """
    text_list = list(text)
    for i, c in enumerate(text_list):
        # extended-form mapping takes priority over the Kangxi mapping,
        # matching the original if/elif order
        if c in extend_to_normal_dict:
            text_list[i] = extend_to_normal_dict[c]
        elif c in kangxi_to_normal_dict:
            text_list[i] = kangxi_to_normal_dict[c]
    return ''.join(text_list)
+
+
def image_resize_by_ratio(img, max_width=1800, max_height=2600):
    """Shrink a PIL image to fit within (max_width, max_height), keeping its
    aspect ratio; images that already fit are returned unchanged.

    :param img: PIL Image-like object exposing .size and .resize
    :param max_width: maximum allowed width in pixels
    :param max_height: maximum allowed height in pixels
    :return: the original image, or a LANCZOS-resampled smaller copy
    """
    width, height = img.size
    aspect_ratio = width / height

    too_wide = width > max_width
    too_tall = height > max_height

    if too_wide and too_tall:
        # Scale along whichever axis overshoots its limit the most, so that
        # after scaling both dimensions fall within bounds.
        if width / max_width > height / max_height:
            target_w = max_width
            target_h = int(target_w / aspect_ratio)
        else:
            target_h = max_height
            target_w = int(target_h * aspect_ratio)
    elif too_wide:
        target_w = max_width
        target_h = int(target_w / aspect_ratio)
    elif too_tall:
        target_h = max_height
        target_w = int(target_h * aspect_ratio)
    else:
        target_w, target_h = width, height

    if (target_w, target_h) != (width, height):
        img = img.resize((target_w, target_h), Image.LANCZOS)
    return img
+
+
 if __name__ == "__main__":
 if __name__ == "__main__":
     # strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
     # strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
     # print(slash_replace(strs))
     # print(slash_replace(strs))
@@ -2572,14 +2867,27 @@ if __name__ == "__main__":
 
 
     # print(parse_yaml())
     # print(parse_yaml())
 
 
-    print(get_ip_port())
+    # print(get_ip_port())
     # set_flask_global()
     # set_flask_global()
-    print(get_all_ip())
-    print(get_args_from_config(get_ip_port(), get_all_ip()[0], "idc"))
-    print(get_args_from_config(get_ip_port(), get_all_ip()[0], "atc"))
-    print(get_args_from_config(get_ip_port(), get_all_ip()[0], "ocr"))
-    print(get_args_from_config(get_ip_port(), get_all_ip()[0], 'convert', 'MASTER'))
+    # print(get_all_ip())
+    # print(get_args_from_config(get_ip_port(), get_all_ip()[0], "idc"))
+    # print(get_args_from_config(get_ip_port(), get_all_ip()[0], "atc"))
+    # print(get_args_from_config(get_ip_port(), get_all_ip()[0], "ocr"))
+    # print(get_args_from_config(get_ip_port(), get_all_ip()[0], 'convert', 'MASTER'))
     # print(get_args_from_config(get_ip_port(), "http://127.0.0.1", "gunicorn_path"))
     # print(get_args_from_config(get_ip_port(), "http://127.0.0.1", "gunicorn_path"))
     # print(get_intranet_ip())
     # print(get_intranet_ip())
-    # _path = "C:/Users/Administrator/Downloads/3.png"
-    # remove_red_seal(cv2.imread(_path))
+
+    # ps = glob(r'D:\Project\format_conversion_maxcompute\save_b_table_pdf\*.pdf')
+    # save_dir = r'D:\Project\format_conversion_maxcompute\save_b_table_pdf'
+    # index = 0
+    # for p in ps:
+    #     save_path = f'{save_dir}/e-{index}.pdf'
+    #     page_no = int(re.split('\.|-', p)[1])
+    #     extract_one_page_pdf(p, save_path, page_no)
+    #     index += 1
+
+    # _ss = 'otr_interface:app'
+    # print(_ss[-17:-14])
+
+    _ss = '仁和坪镇杨柳池村⼈居环境整治项⽬终⽌'
+    print(special_font_to_normal(_ss))

+ 3 - 0
monitor/watch_10_minutes_process.sh

@@ -1,3 +1,6 @@
 #!/bin/bash
 #!/bin/bash
 
 
 sed -n '/2024-05-29 17:30:00/,/2024-05-29 17:40:00/p' /convert.out | grep 'is_success' | wc -l
 sed -n '/2024-05-29 17:30:00/,/2024-05-29 17:40:00/p' /convert.out | grep 'is_success' | wc -l
+
+
+sed -n '/2025-06-11 12:50:00/,/2025-06-11 13:00:00/p' /convert.out | grep 'is_success: ' | awk -F '[\\[\\] ]+' '{file_type=$(NF-2); time=$NF; map[file_type] += time; count[file_type]++} END {for (key in map) print key, "-", map[key], "-", count[key], "-", map[key]/count[key]}'

+ 8 - 26
ocr/ocr_interface.py

@@ -5,7 +5,7 @@ import multiprocessing as mp
 import socket
 import socket
 import sys
 import sys
 import os
 import os
-
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
 from PIL import Image
 from PIL import Image
 
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
@@ -91,7 +91,10 @@ def picture2text(img_data, ocr_model, only_rec=0):
         text_list = []
         text_list = []
         bbox_list = []
         bbox_list = []
         if only_rec:
         if only_rec:
-            text_list = [results[0][0]]
+            if results:
+                text_list = [results[0][0]]
+            else:
+                text_list = []
             bbox_list = []
             bbox_list = []
         else:
         else:
             for line in results:
             for line in results:
@@ -176,27 +179,6 @@ def test_ocr_model(from_remote=True):
 
 
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    test_ocr_model()
-
-#     src = """
-# data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAASwAAAAeCAYAAACWuCNnAAAE3ElEQVR42u2dQWjUQBSGi4gnEURERKQgIiIiggdP4sWDiPTgvSAK4sFDEe8iIohHEW8iUsSLFBERQaSI9CBIERGRgpQiHtpuMkl2Pa/vH2fW2ZhssttkO8v+AyHTzCT53nT23zdvZpOJIAh2TEhqNpu7VaS+THiWWq3WHuFaQV7279bW1rarWP2y5eRnYv8ZI36l1PF2u71N52P11V7Ap+QaKbwPGo3GPudv8jOx/4wLv5xw2+ajKDrhY4cT5Z23xkKJhfOG5KeSJDlEfib2H/L7Y2ysHotRp62xsp8J4/Bc2ljyM7H/kN+LFEbhGxipjU3C86LYiRy7R34m9h/ye2s0DJZtEsaK0aujFO8ZdX72H/Kz/zup3W5vsfmgGRyJ4/hkug7cSBnn3nHrkZ+J/Yf8/8agkZoXRfyOPKYoaxnnRuqHbMti5EH9N1Q4DE91KXQYHnWNJf+IxJDY/uQfFj8qQzn1tCSglLpfm8v4d2y7omJ1E4bLvWa7WKQRBjGW/JsrVmx/8lfOf/HSlbatZPO4gVTeKhUjZxx6FdOR6XPSKa8s77hc8wIM1flYfcD4VrYW6kOpJf9etre97plxzZ78lX4wc/jlG+WZy9/nNYfGX4tYsf3JXwP/f2KCfd4GY+H6bUSwsrb13+t7jaLOWaMxBkZZEAT7scfisrKChelSaaxvmE6F2ouhd8M4vObyV5ny+PHNAH6U28VxPvLX0IHZ/uSvhb8jIlZQ5GbTUvmyzkdqCSeZ/CrGoQBBsAzHAFRWsNz7ZNWT63+Se10P4uCsLbOBO5ev0FjDb66p+c0Yu4sf5Xn8Aw5/uvjtP63v62wSf2XtwPYnf438mQJjhOsJ9nLya5xkXb8sDynPe8oSpqxyd02G68FJfjHPKytovDnwgzvNL2WfcQ/TGAuVjN8z+E1+MUmSXQP884fKX0Psiu1P/lr482NYcpJUPuMMB/VJ5keKjwYZEmYNPdPlcAVdYXLXavQVwzL8thHT/HV9WMHf+W3UBtaabBZ/laLF9id/1fyd9RO9YlcG5LVZYj/Vy9MZ9LgrZJi+xB7rNoqE0F3/Iar8Aq5pl1IbxXf5G0njcI3xmwU7/Zrmz0q+8feb2P7kHyY/Mk/hHloxgAvnBshc7wqPh8BUY1pgsrynokB7nujBjRSmnWUmAYwhmh8zEln82ugUf82ehebv4xvJK/4BvlHZ/uQfGn/noIlbvbRDMkwpOrGkn0Y9Z/MC4Hkxq6LydF3X4ys71HRmJzQ/HlNh+U15h1/KDwzL4+jH/fWFf1D3ne1P/mHw60pZImIj9WYt1LH0MvpeolM09CsTQC8rWJZfjProuKVTll8HBTP4vfmwk5/85C/PLwUPpcItu/zdDAFnigLcvQRlIx5WmetjOhZKnMWvV9wKv88BafKTn/wb4O9aHBqraayhQMCsKLheVmjq8LDMbMcS8piJ0NO1WBjn8Lvupm+J/OQn/wD8GC+aRz9Muk/8w0mIzPuq0pia1UaTn/zkHx9+85D45bxAmGdGzpOf/OQfY34peI7f+kDpfDQWv0XSY1tRYROQe0V+8pN/TPn5miPyk5/8I8PP1xyRn/zkHxl+viaI/OQnP/mrMpavOSI/+ck/SomvOSI/+ck/cqLF1xyRn/zk9yLxNUfkJz/5i9If9M5atZCy5xcAAAAASUVORK5CYII=
-# """
-#
-#     image_data = src.split('data:image/png;base64,')[1]
-#
-#     # 解码 base64 字符串
-#     image_bytes = base64.b64decode(image_data)
-#
-#     # 将字节转换为图像
-#     # image = Image.open(io.BytesIO(image_bytes))
-#
-#     # image.show('img')
-#
-#     # with open(r'C:\Users\Administrator\Desktop\test_image\error16.jpg', 'rb') as f:
-#     #     image_bytes = f.read()
-#
-#     image = bytes2np(image_bytes)
-#
-#     cv2.imshow('img', image)
-#     cv2.imwrite('./1.png', image)
-#     cv2.waitKey(0)
+    # test_ocr_model()
+
+    app.run(host='127.0.0.1', port=17000, debug=False)

+ 6 - 2
ocr/ppocr/data/__init__.py

@@ -25,6 +25,9 @@ import signal
 import random
 import random
 
 
 __dir__ = os.path.dirname(os.path.abspath(__file__))
 __dir__ = os.path.dirname(os.path.abspath(__file__))
+
+from format_convert.utils import get_platform
+
 sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
 sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
 
 
 import copy
 import copy
@@ -49,8 +52,9 @@ def term_mp(sig_num, frame):
     os.killpg(pgid, signal.SIGKILL)
     os.killpg(pgid, signal.SIGKILL)
 
 
 
 
-signal.signal(signal.SIGINT, term_mp)
-signal.signal(signal.SIGTERM, term_mp)
+if get_platform() != 'Windows':
+    signal.signal(signal.SIGINT, term_mp)
+    signal.signal(signal.SIGTERM, term_mp)
 
 
 
 
 def build_dataloader(config, mode, device, logger, seed=None):
 def build_dataloader(config, mode, device, logger, seed=None):

+ 39 - 0
ocr/test_lock.py

@@ -0,0 +1,39 @@
+import multiprocessing
+import os
+import sys
+import time
+sys.path.append(os.path.abspath(os.path.dirname(__file__)) + '/../')
+from format_convert.utils import file_lock
+
+
def run(a):
    """Stress-test file_lock: loop forever acquiring the shared 'ocr' lock,
    hold it briefly, then raise deliberately so the finally/close path
    (i.e. lock release on failure) is exercised.

    Fixes: ``f`` was unbound in ``finally`` when ``file_lock`` itself raised
    (NameError masked the real error), and the original bare ``raise`` had no
    active exception plus unreachable code after it.

    :param a: unused; present so the function is compatible with Pool.map
    """
    while True:
        f = None
        try:
            time2 = time.time()
            lock_file_sub = 'ocr'
            lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
            f = file_lock(lock_file)
            print(os.getpid(), "get file_lock " + lock_file + " time ", time.time() - time2)
            time.sleep(2)
            # fail on purpose while holding the lock to verify release
            raise RuntimeError
        except Exception:
            print('RuntimeError')
        finally:
            # only close if the lock was actually acquired
            if f is not None:
                f.close()
+
+
+if __name__ == '__main__':
+    # 要处理的数据
+    data = [1, 2, 3]
+
+    # 创建进程池,指定进程数为 CPU 核心数
+    with multiprocessing.Pool(processes=3) as pool:
+        # 使用 map 方法分配任务并获取结果
+        results = pool.map(run, data)
+
+    # 输出结果
+    # print(results)

+ 36 - 52
ocr/tools/infer/predict_det_pytorch.py

@@ -19,7 +19,8 @@ import sys
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../../")
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../../")
 import requests
 import requests
 from format_convert import _global
 from format_convert import _global
-from format_convert.utils import judge_error_code, log, namespace_to_dict, get_platform, file_lock
+from format_convert.utils import judge_error_code, log, namespace_to_dict, get_platform, file_lock, \
+    get_gpu_memory_usage, get_current_process_gpu_id
 
 
 os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
 os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
 import cv2
 import cv2
@@ -120,6 +121,11 @@ class TextDetector(object):
         self.predictor.to(self.device)
         self.predictor.to(self.device)
         self.predictor.eval()
         self.predictor.eval()
 
 
+        if str(self.device) != 'cpu':
+            self.gpu_id = get_current_process_gpu_id()
+        else:
+            self.gpu_id = None
+
         # self.predictor, self.input_tensor, self.output_tensors = utility.create_predictor(
         # self.predictor, self.input_tensor, self.output_tensors = utility.create_predictor(
         #     args, 'det', logger)  # paddle.jit.load(args.det_model_dir)
         #     args, 'det', logger)  # paddle.jit.load(args.det_model_dir)
         # self.predictor.eval()
         # self.predictor.eval()
@@ -189,55 +195,44 @@ class TextDetector(object):
         shape_list = np.expand_dims(shape_list, axis=0)
         shape_list = np.expand_dims(shape_list, axis=0)
         img = img.copy()
         img = img.copy()
         starttime = time.time()
         starttime = time.time()
-
+        tensor = torch.from_numpy(img).float()
         # self.input_tensor.copy_from_cpu(img)
         # self.input_tensor.copy_from_cpu(img)
-        img = torch.from_numpy(img).float()
-        img = img.to(self.device)
-        try:
+        # if ori_im.shape[0] > 1024 and ori_im.shape[1] > 1024 and get_platform() != "Windows" and not MAX_COMPUTE:
+        if get_platform() != "Windows" and not MAX_COMPUTE and self.gpu_id is not None:
             # 加锁,防止太多大图片同时预测,爆显存
             # 加锁,防止太多大图片同时预测,爆显存
-            if ori_im.shape[0] > 1024 and ori_im.shape[1] > 1024 and get_platform() != "Windows" and not MAX_COMPUTE:
+            time2 = time.time()
+            lock_file_sub = f'ocr_{self.gpu_id}'
+            lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
+            f = file_lock(lock_file)
+            log("det get file_lock " + lock_file + " time " + str(time.time()-time2))
+
+            try:
                 time2 = time.time()
                 time2 = time.time()
-                lock_file_sub = 'ocr'
-                lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
-                f = file_lock(lock_file)
-                log("get file_lock " + lock_file_sub + " time " + str(time.time()-time2))
+                if str(self.device) != 'cpu':
+                    torch.cuda.empty_cache()
+                tensor = tensor.to(self.device)
                 with torch.no_grad():
                 with torch.no_grad():
-                    out = self.predictor(img)
+                    out = self.predictor(tensor)
+                log("get file_lock run det" + " time " + str(time.time()-time2))
+            except RuntimeError:
+                log("ocr/tools/infer/predict_det.py predict.run error! maybe no gpu memory!")
+                log("det predictor shrink memory! ori_im.shape " + str(ori_im.shape))
+                get_gpu_memory_usage()
+                raise RuntimeError
+            finally:
                 f.close()
                 f.close()
-            else:
-                with torch.no_grad():
-                    out = self.predictor(img)
-        except RuntimeError:
-            log("ocr/tools/infer/predict_det.py predict.run error! maybe no gpu memory!")
-            log("predictor shrink memory!")
-            # self.predictor.clear_intermediate_tensor()
-            # self.predictor.try_shrink_memory()
-            if str(self.device)!='cpu':
-                torch.cuda.empty_cache()
-                gc.collect()
-            raise RuntimeError
-
-        # outputs = []
-        # for output_tensor in self.output_tensors:
-        #     output = output_tensor.copy_to_cpu()
-        #     outputs.append(output)
-        out = out.cpu().numpy()
+                if str(self.device) != 'cpu':
+                    torch.cuda.empty_cache()
+                # gc.collect()
+        else:
+            tensor = tensor.to(self.device)
+            with torch.no_grad():
+                out = self.predictor(tensor)
 
 
+        out = out.cpu().numpy()
         preds = {}
         preds = {}
         preds['maps'] = out
         preds['maps'] = out
 
 
-        # if self.det_algorithm == "EAST":
-        #     preds['f_geo'] = outputs[0]
-        #     preds['f_score'] = outputs[1]
-        # elif self.det_algorithm == 'SAST':
-        #     preds['f_border'] = outputs[0]
-        #     preds['f_score'] = outputs[1]
-        #     preds['f_tco'] = outputs[2]
-        #     preds['f_tvo'] = outputs[3]
-        # elif self.det_algorithm == 'DB':
-        #     preds['maps'] = outputs[0]
-        # else:
-        #     raise NotImplementedError
         post_result = self.postprocess_op(preds, shape_list)
         post_result = self.postprocess_op(preds, shape_list)
         dt_boxes = post_result[0]['points']
         dt_boxes = post_result[0]['points']
         if self.det_algorithm == "SAST" and self.det_sast_polygon:
         if self.det_algorithm == "SAST" and self.det_sast_polygon:
@@ -246,17 +241,6 @@ class TextDetector(object):
             dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
             dt_boxes = self.filter_tag_det_res(dt_boxes, ori_im.shape)
         elapse = time.time() - starttime
         elapse = time.time() - starttime
 
 
-        # 释放内存
-        # print("TextDetector", self.predictor)
-        # if TextDetector.shrink_memory_count % 100 == 0:
-            # print("TextDetector shrink memory")
-        # self.predictor.clear_intermediate_tensor()
-        # self.predictor.try_shrink_memory()
-        # TextDetector.shrink_memory_count += 1
-        if str(self.device) != 'cpu':
-            torch.cuda.empty_cache()
-            # gc.collect()
-
         return dt_boxes, elapse
         return dt_boxes, elapse
 
 
 
 

+ 188 - 173
ocr/tools/infer/predict_rec_pytorch.py

@@ -37,8 +37,9 @@ import ocr.tools.infer.utility as utility
 from ocr.ppocr.postprocess import build_post_process
 from ocr.ppocr.postprocess import build_post_process
 from ocr.ppocr.utils.logging import get_logger
 from ocr.ppocr.utils.logging import get_logger
 from ocr.ppocr.utils.utility import get_image_file_list, check_and_read_gif
 from ocr.ppocr.utils.utility import get_image_file_list, check_and_read_gif
-
-from format_convert.utils import judge_error_code, log, namespace_to_dict,get_platform
+from config.max_compute_config import MAX_COMPUTE
+from format_convert.utils import judge_error_code, log, namespace_to_dict, get_platform, file_lock, \
+    get_gpu_memory_usage, get_current_process_gpu_id
 from format_convert import _global
 from format_convert import _global
 
 
 import torch
 import torch
@@ -56,6 +57,8 @@ class TextRecognizer(object):
         self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
         self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
         self.character_type = args.rec_char_type
         self.character_type = args.rec_char_type
         self.rec_batch_num = args.rec_batch_num
         self.rec_batch_num = args.rec_batch_num
+        self.rec_batch_num = 16
+        print('self.rec_batch_num', self.rec_batch_num)
         self.rec_algorithm = args.rec_algorithm
         self.rec_algorithm = args.rec_algorithm
         postprocess_params = {
         postprocess_params = {
             'name': 'CTCLabelDecode',
             'name': 'CTCLabelDecode',
@@ -64,23 +67,7 @@ class TextRecognizer(object):
             # "use_space_char": args.use_space_char
             # "use_space_char": args.use_space_char
             "use_space_char": False
             "use_space_char": False
         }
         }
-        # if self.rec_algorithm == "SRN":
-        #     postprocess_params = {
-        #         'name': 'SRNLabelDecode',
-        #         "character_type": args.rec_char_type,
-        #         "character_dict_path": args.rec_char_dict_path,
-        #         "use_space_char": args.use_space_char
-        #     }
-        # elif self.rec_algorithm == "RARE":
-        #     postprocess_params = {
-        #         'name': 'AttnLabelDecode',
-        #         "character_type": args.rec_char_type,
-        #         "character_dict_path": args.rec_char_dict_path,
-        #         "use_space_char": args.use_space_char
-        #     }
         self.postprocess_op = build_post_process(postprocess_params)
         self.postprocess_op = build_post_process(postprocess_params)
-        # self.predictor, self.input_tensor, self.output_tensors = \
-        #     utility.create_predictor(args, 'rec', logger)
 
 
         rec_model_path = args.rec_model_dir
         rec_model_path = args.rec_model_dir
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -100,19 +87,22 @@ class TextRecognizer(object):
         self.predictor.to(self.device)
         self.predictor.to(self.device)
         self.predictor.eval()
         self.predictor.eval()
 
 
+        if str(self.device) != 'cpu':
+            self.gpu_id = get_current_process_gpu_id()
+        else:
+            self.gpu_id = None
+
     def resize_norm_img(self, img, max_wh_ratio):
     def resize_norm_img(self, img, max_wh_ratio):
         h, w = img.shape[:2]
         h, w = img.shape[:2]
         imgC, imgH, imgW = self.rec_image_shape
         imgC, imgH, imgW = self.rec_image_shape
         assert imgC == img.shape[2]
         assert imgC == img.shape[2]
         # print('max_wh_ratio', max_wh_ratio)
         # print('max_wh_ratio', max_wh_ratio)
+        # max_wh_ratio h是w的10倍,直接返回
         if max_wh_ratio < 0.1:
         if max_wh_ratio < 0.1:
-            # if h > imgW:
-            #     resized_image = cv2.resize(img, (w, imgW))
-            # else:
-            #     resized_image = img
-
-            # max_wh_ratio h是w的10倍,直接跳过
-            resized_w = None
+            # log('max_wh_ratio < 0.1', )
+            resized_image = img.astype('float32')
+            resized_image = resized_image.transpose((2, 0, 1)) / 255
+            return resized_image
         else:
         else:
             if self.character_type == "ch":
             if self.character_type == "ch":
                 imgW = int((32 * max_wh_ratio))
                 imgW = int((32 * max_wh_ratio))
@@ -138,186 +128,211 @@ class TextRecognizer(object):
             padding_im[:, :, 0:resized_w] = resized_image
             padding_im[:, :, 0:resized_w] = resized_image
         return padding_im
         return padding_im
 
 
-    def resize_norm_img_srn(self, img, image_shape):
-        imgC, imgH, imgW = image_shape
-
-        img_black = np.zeros((imgH, imgW))
-        im_hei = img.shape[0]
-        im_wid = img.shape[1]
-
-        if im_wid <= im_hei * 1:
-            img_new = cv2.resize(img, (imgH * 1, imgH))
-        elif im_wid <= im_hei * 2:
-            img_new = cv2.resize(img, (imgH * 2, imgH))
-        elif im_wid <= im_hei * 3:
-            img_new = cv2.resize(img, (imgH * 3, imgH))
+    def predict(self, norm_img_batch):
+        tensor = torch.from_numpy(norm_img_batch).float()
+        # if norm_img.shape[3] >= 100 and get_platform() != "Windows" and not MAX_COMPUTE:
+        if get_platform() != "Windows" and not MAX_COMPUTE:
+            # 加锁
+            time2 = time.time()
+            lock_file_sub = 'ocr'
+            lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
+            f = file_lock(lock_file)
+            log("rec get file_lock " + lock_file + " time " + str(time.time()-time2))
+            try:
+                time2 = time.time()
+                if str(self.device) != 'cpu':
+                    torch.cuda.empty_cache()
+                tensor = tensor.to(self.device)
+                with torch.no_grad():
+                    out = self.predictor(tensor)
+                log("get file_lock run rec" + " time " + str(time.time()-time2))
+            except RuntimeError:
+                log("ocr/tools/infer/predict_rec.py predict.run error! maybe no gpu memory!")
+                log("rec predictor shrink memory! ori_im.shape " + str(norm_img_batch.shape))
+                get_gpu_memory_usage()
+                raise RuntimeError
+            finally:
+                f.close()
+                if str(self.device) != 'cpu':
+                    torch.cuda.empty_cache()
+                gc.collect()
         else:
         else:
-            img_new = cv2.resize(img, (imgW, imgH))
-
-        img_np = np.asarray(img_new)
-        img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
-        img_black[:, 0:img_np.shape[1]] = img_np
-        img_black = img_black[:, :, np.newaxis]
-
-        row, col, c = img_black.shape
-        c = 1
-
-        return np.reshape(img_black, (c, row, col)).astype(np.float32)
-
-    def srn_other_inputs(self, image_shape, num_heads, max_text_length):
-
-        imgC, imgH, imgW = image_shape
-        feature_dim = int((imgH / 8) * (imgW / 8))
-
-        encoder_word_pos = np.array(range(0, feature_dim)).reshape(
-            (feature_dim, 1)).astype('int64')
-        gsrm_word_pos = np.array(range(0, max_text_length)).reshape(
-            (max_text_length, 1)).astype('int64')
-
-        gsrm_attn_bias_data = np.ones((1, max_text_length, max_text_length))
-        gsrm_slf_attn_bias1 = np.triu(gsrm_attn_bias_data, 1).reshape(
-            [-1, 1, max_text_length, max_text_length])
-        gsrm_slf_attn_bias1 = np.tile(
-            gsrm_slf_attn_bias1,
-            [1, num_heads, 1, 1]).astype('float32') * [-1e9]
-
-        gsrm_slf_attn_bias2 = np.tril(gsrm_attn_bias_data, -1).reshape(
-            [-1, 1, max_text_length, max_text_length])
-        gsrm_slf_attn_bias2 = np.tile(
-            gsrm_slf_attn_bias2,
-            [1, num_heads, 1, 1]).astype('float32') * [-1e9]
-
-        encoder_word_pos = encoder_word_pos[np.newaxis, :]
-        gsrm_word_pos = gsrm_word_pos[np.newaxis, :]
-
-        return [
-            encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
-            gsrm_slf_attn_bias2
-        ]
-
-    def process_image_srn(self, img, image_shape, num_heads, max_text_length):
-        norm_img = self.resize_norm_img_srn(img, image_shape)
-        norm_img = norm_img[np.newaxis, :]
-
-        [encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1, gsrm_slf_attn_bias2] = \
-            self.srn_other_inputs(image_shape, num_heads, max_text_length)
-
-        gsrm_slf_attn_bias1 = gsrm_slf_attn_bias1.astype(np.float32)
-        gsrm_slf_attn_bias2 = gsrm_slf_attn_bias2.astype(np.float32)
-        encoder_word_pos = encoder_word_pos.astype(np.int64)
-        gsrm_word_pos = gsrm_word_pos.astype(np.int64)
-
-        return (norm_img, encoder_word_pos, gsrm_word_pos, gsrm_slf_attn_bias1,
-                gsrm_slf_attn_bias2)
+            tensor = tensor.to(self.device)
+            with torch.no_grad():
+                out = self.predictor(tensor)
+        # logging.info("ocr model predict time - rec" + str(time.time()-start_time))
+        out = out.cpu().numpy()
+        preds = out
+        return preds
+
+    def predict_batch(self, batch_list):
+        batch_out_list = []
+        if get_platform() != "Windows" and not MAX_COMPUTE and self.gpu_id is not None:
+            # 加锁
+            time2 = time.time()
+            lock_file_sub = f'ocr_{self.gpu_id}'
+            lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
+            f = file_lock(lock_file)
+            log("rec get file_lock " + lock_file + " time " + str(time.time()-time2))
+            try:
+                time2 = time.time()
+                if str(self.device) != 'cpu':
+                    torch.cuda.empty_cache()
+                for sub_batch_list in batch_list:
+                    sub_batch_out = []
+                    for tensor in sub_batch_list:
+                        with torch.no_grad():
+                            out = self.predictor(tensor)
+                            out = out.cpu().numpy()
+                        sub_batch_out.append(out)
+                    # sub_batch_out = np.concatenate(sub_batch_out, axis=0)
+                    batch_out_list.append(sub_batch_out)
+                log("get file_lock run rec" + " time " + str(time.time()-time2))
+
+            except RuntimeError:
+                log("ocr/tools/infer/predict_rec.py predict.run error! maybe no gpu memory!")
+                log("rec predictor shrink memory! ori_im.shape " + str(tensor.shape))
+                get_gpu_memory_usage()
+                raise RuntimeError
+            finally:
+                f.close()
+                if str(self.device) != 'cpu':
+                    torch.cuda.empty_cache()
+        else:
+            for sub_batch_list in batch_list:
+                sub_batch_out = []
+                for tensor in sub_batch_list:
+                    # print('tensor.shape', tensor.shape)
+                    with torch.no_grad():
+                        out = self.predictor(tensor)
+                        out = out.cpu().numpy()
+                    # print('out.shape', out.shape)
+                    sub_batch_out.append(out)
+                # sub_batch_out = np.concatenate(sub_batch_out, axis=0)
+                batch_out_list.append(sub_batch_out)
+
+        # 转为numpy
+        for bi, sub_batch_out in enumerate(batch_out_list):
+            batch_out_list[bi] = np.concatenate(sub_batch_out, axis=0)
+        return batch_out_list
 
 
     def __call__(self, img_list):
     def __call__(self, img_list):
+        start_time = time.time()
+        # print('into TextRecognizer __call__')
         img_num = len(img_list)
         img_num = len(img_list)
-        # Calculate the aspect ratio of all text bars
+
+        # 过滤图片比例异常的
+        # print('rec len(img_list)', len(img_list))
+        temp_list = []
+        for img in img_list:
+            if img.shape[0] == 0 or img.shape[1] == 0 \
+                    or img.shape[0] >= 10000 or img.shape[1] >= 10000 \
+                    or img.shape[1] / img.shape[0] <= 0.5 \
+                    or img.shape[1] / img.shape[0] >= 100:
+                # print('rec img.shape[1] / img.shape[0] <= 0.5', img.shape)
+                continue
+            temp_list.append(img)
+        if not temp_list:
+            return None, 0
+        img_list = temp_list
+
+        # 按比例排序
         width_list = []
         width_list = []
         i = 0
         i = 0
         for img in img_list:
         for img in img_list:
-            # cv2.imwrite('D:/myProject/format_conversion_maxcompute/ocr/test/'+str(i)+'.jpg',img)
-            # i+=1
-            # cv2.imshow('img', img)
-            # cv2.waitKey(1000)
             width_list.append(img.shape[1] / float(img.shape[0]))
             width_list.append(img.shape[1] / float(img.shape[0]))
         # Sorting can speed up the recognition process
         # Sorting can speed up the recognition process
         indices = np.argsort(np.array(width_list))
         indices = np.argsort(np.array(width_list))
 
 
+        # 分批预测
         # rec_res = []
         # rec_res = []
         rec_res = [['', 0.0]] * img_num
         rec_res = [['', 0.0]] * img_num
         batch_num = self.rec_batch_num
         batch_num = self.rec_batch_num
         elapse = 0
         elapse = 0
+        batch_list = []
         for beg_img_no in range(0, img_num, batch_num):
         for beg_img_no in range(0, img_num, batch_num):
             end_img_no = min(img_num, beg_img_no + batch_num)
             end_img_no = min(img_num, beg_img_no + batch_num)
             norm_img_batch = []
             norm_img_batch = []
             max_wh_ratio = 0
             max_wh_ratio = 0
+            # 取这个batch中比例最大的
             for ino in range(beg_img_no, end_img_no):
             for ino in range(beg_img_no, end_img_no):
                 # h, w = img_list[ino].shape[0:2]
                 # h, w = img_list[ino].shape[0:2]
                 h, w = img_list[indices[ino]].shape[0:2]
                 h, w = img_list[indices[ino]].shape[0:2]
                 wh_ratio = w * 1.0 / h
                 wh_ratio = w * 1.0 / h
                 max_wh_ratio = max(max_wh_ratio, wh_ratio)
                 max_wh_ratio = max(max_wh_ratio, wh_ratio)
-            # print('max_wh_ratio',max_wh_ratio)
+            # print('max_wh_ratio', max_wh_ratio)
+
+            # resize image
             for ino in range(beg_img_no, end_img_no):
             for ino in range(beg_img_no, end_img_no):
-                if self.rec_algorithm != "SRN":
-                    # print('max_wh_ratio', max_wh_ratio)
-                    norm_img = self.resize_norm_img(img_list[indices[ino]],
-                                                    max_wh_ratio)
-                    # cv2.imshow('img', norm_img.transpose(1,2,0))
-                    # cv2.waitKey(1000)
-                    norm_img = norm_img[np.newaxis, :]
-                    norm_img_batch.append(norm_img)
-                else:
-                    # norm_img = self.process_image_srn(
-                    #     img_list[indices[ino]], self.rec_image_shape, 8, 25)
-                    # encoder_word_pos_list = []
-                    # gsrm_word_pos_list = []
-                    # gsrm_slf_attn_bias1_list = []
-                    # gsrm_slf_attn_bias2_list = []
-                    # encoder_word_pos_list.append(norm_img[1])
-                    # gsrm_word_pos_list.append(norm_img[2])
-                    # gsrm_slf_attn_bias1_list.append(norm_img[3])
-                    # gsrm_slf_attn_bias2_list.append(norm_img[4])
-                    # norm_img_batch.append(norm_img[0])
-                    pass
+                # print('img_list[indices[ino]].shape', img_list[indices[ino]].shape)
+                norm_img = self.resize_norm_img(img_list[indices[ino]],
+                                                max_wh_ratio)
+                # print('norm_img.shape', norm_img.shape)
+                norm_img = norm_img[np.newaxis, :]
+                norm_img_batch.append(norm_img)
+
             norm_img_batch = np.concatenate(norm_img_batch)
             norm_img_batch = np.concatenate(norm_img_batch)
             norm_img_batch = norm_img_batch.copy()
             norm_img_batch = norm_img_batch.copy()
 
 
-            if self.rec_algorithm == "SRN":
-                # starttime = time.time()
-                # encoder_word_pos_list = np.concatenate(encoder_word_pos_list)
-                # gsrm_word_pos_list = np.concatenate(gsrm_word_pos_list)
-                # gsrm_slf_attn_bias1_list = np.concatenate(
-                #     gsrm_slf_attn_bias1_list)
-                # gsrm_slf_attn_bias2_list = np.concatenate(
-                #     gsrm_slf_attn_bias2_list)
-                #
-                # inputs = [
-                #     norm_img_batch,
-                #     encoder_word_pos_list,
-                #     gsrm_word_pos_list,
-                #     gsrm_slf_attn_bias1_list,
-                #     gsrm_slf_attn_bias2_list,
-                # ]
-                # input_names = self.predictor.get_input_names()
-                # for i in range(len(input_names)):
-                #     input_tensor = self.predictor.get_input_handle(input_names[
-                #         i])
-                #     input_tensor.copy_from_cpu(inputs[i])
-                # self.predictor.run()
-                # outputs = []
-                # for output_tensor in self.output_tensors:
-                #     output = output_tensor.copy_to_cpu()
-                #     outputs.append(output)
-                # preds = {"predict": outputs[2]}
-                pass
+            # 预测
+            # starttime = time.time()
+            # # 当图片很长时,降低batch,防止爆内存
+            # # print('norm_img_batch.shape', norm_img_batch.shape)
+            # preds = []
+            # if norm_img_batch.shape[-1] >= 400:
+            #     if norm_img_batch.shape[-1] <= 1000:
+            #         mini_batch_size = 4
+            #     elif norm_img_batch.shape[-1] <= 3000:
+            #         mini_batch_size = 2
+            #     else:
+            #         mini_batch_size = 1
+            #     for bi in range(0, norm_img_batch.shape[0], mini_batch_size):
+            #         sub_batch = norm_img_batch[bi:bi+mini_batch_size]
+            #         sub_preds = self.predict(sub_batch)
+            #         preds.append(sub_preds)
+            #         # print('type(sub_preds), sub_preds.shape', type(sub_preds), sub_preds.shape)
+            #     preds = np.concatenate(preds, axis=0)
+            # else:
+            #     preds = self.predict(norm_img_batch)
+            # # print('type(preds), preds.shape', type(preds), preds.shape)
+            #
+            # # 后处理
+            # rec_result = self.postprocess_op(preds)
+            # for rno in range(len(rec_result)):
+            #     rec_res[indices[beg_img_no + rno]] = rec_result[rno]
+            # elapse += time.time() - starttime
+
+            # 根据长度,动态batch
+            if norm_img_batch.shape[-1] >= 400:
+                if norm_img_batch.shape[-1] <= 1000:
+                    mini_batch_size = 4
+                elif norm_img_batch.shape[-1] <= 3000:
+                    mini_batch_size = 2
+                else:
+                    mini_batch_size = 1
+                sub_batch_list = []
+                for bi in range(0, norm_img_batch.shape[0], mini_batch_size):
+                    sub_batch = norm_img_batch[bi:bi+mini_batch_size]
+                    tensor = torch.from_numpy(sub_batch).float()
+                    tensor = tensor.to(self.device)
+                    sub_batch_list.append(tensor)
             else:
             else:
-                starttime = time.time()
-
                 tensor = torch.from_numpy(norm_img_batch).float()
                 tensor = torch.from_numpy(norm_img_batch).float()
-                start_time = time.time()
                 tensor = tensor.to(self.device)
                 tensor = tensor.to(self.device)
-                with torch.no_grad():
-                    out = self.predictor(tensor)
-                logging.info("ocr model predict time - rec" + str(time.time()-start_time))
-                out = out.cpu().numpy()
-                preds = out
+                sub_batch_list = [tensor]
 
 
-            # print("tools/infer/predict_rec preds", preds)
-            rec_result = self.postprocess_op(preds)
-            for rno in range(len(rec_result)):
-                # print("predict_rec", img_num, batch_num, beg_img_no,
-                #       indices[beg_img_no + rno], len(rec_res))
-                rec_res[indices[beg_img_no + rno]] = rec_result[rno]
-            elapse += time.time() - starttime
-            # 释放内存
-            # self.predictor.clear_intermediate_tensor()
-            # self.predictor.try_shrink_memory()
-
-            # gc.collect()
-            if str(self.device)!='cpu':
-                torch.cuda.empty_cache()
-            #     gc.collect()
+            batch_list.append(sub_batch_list)
+
+        # 预测
+        batch_out_list = self.predict_batch(batch_list)
+
+        # 后处理
+        for bi, out in enumerate(batch_out_list):
+            begin_img_no = bi * batch_num
+            rec_result = self.postprocess_op(out)
+            for ri in range(len(rec_result)):
+                rec_res[indices[begin_img_no + ri]] = rec_result[ri]
+        elapse += time.time() - start_time
         return rec_res, elapse
         return rec_res, elapse
 
 
 
 

+ 106 - 45
ocr/tools/infer/predict_system.py

@@ -26,17 +26,19 @@ import copy
 import numpy as np
 import numpy as np
 import time
 import time
 from PIL import Image
 from PIL import Image
+
 os.environ['FLAGS_eager_delete_tensor_gb'] = '0'
 os.environ['FLAGS_eager_delete_tensor_gb'] = '0'
 import utility as utility
 import utility as utility
 # import ocr.tools.infer.predict_rec as predict_rec
 # import ocr.tools.infer.predict_rec as predict_rec
-import ocr.tools.infer.predict_rec_pytorch as predict_rec # pytorch rec model
+import ocr.tools.infer.predict_rec_pytorch as predict_rec  # pytorch rec model
 # import ocr.tools.infer.predict_det as predict_det
 # import ocr.tools.infer.predict_det as predict_det
-import ocr.tools.infer.predict_det_pytorch as predict_det # pytorch det model
+import ocr.tools.infer.predict_det_pytorch as predict_det  # pytorch det model
 import ocr.tools.infer.predict_cls as predict_cls
 import ocr.tools.infer.predict_cls as predict_cls
 from ocr.ppocr.utils.utility import get_image_file_list, check_and_read_gif
 from ocr.ppocr.utils.utility import get_image_file_list, check_and_read_gif
 from ocr.ppocr.utils.logging import get_logger
 from ocr.ppocr.utils.logging import get_logger
 from ocr.tools.infer.utility import draw_ocr_box_txt
 from ocr.tools.infer.utility import draw_ocr_box_txt
-from format_convert.utils import has_intersection
+from format_convert.utils import has_intersection, log
+from format_convert import _global
 
 
 logger = get_logger()
 logger = get_logger()
 
 
@@ -61,27 +63,36 @@ class TextSystem(object):
         points[:, 0] = points[:, 0] - left
         points[:, 0] = points[:, 0] - left
         points[:, 1] = points[:, 1] - top
         points[:, 1] = points[:, 1] - top
         '''
         '''
-        img_crop_width = int(
-            max(
-                np.linalg.norm(points[0] - points[1]),
-                np.linalg.norm(points[2] - points[3])))
-        img_crop_height = int(
-            max(
-                np.linalg.norm(points[0] - points[3]),
-                np.linalg.norm(points[1] - points[2])))
-        pts_std = np.float32([[0, 0], [img_crop_width, 0],
-                              [img_crop_width, img_crop_height],
-                              [0, img_crop_height]])
-        M = cv2.getPerspectiveTransform(points, pts_std)
-        dst_img = cv2.warpPerspective(
-            img,
-            M, (img_crop_width, img_crop_height),
-            borderMode=cv2.BORDER_REPLICATE,
-            flags=cv2.INTER_CUBIC)
-        dst_img_height, dst_img_width = dst_img.shape[0:2]
-        # if dst_img_height * 1.0 / dst_img_width >= 1.5:
-        if dst_img_height * 1.0 / dst_img_width >= 2.0:
-            dst_img = np.rot90(dst_img)
+        # img_crop_width = int(
+        #     max(
+        #         np.linalg.norm(points[0] - points[1]),
+        #         np.linalg.norm(points[2] - points[3])))
+        # img_crop_height = int(
+        #     max(
+        #         np.linalg.norm(points[0] - points[3]),
+        #         np.linalg.norm(points[1] - points[2])))
+        # pts_std = np.float32([[0, 0], [img_crop_width, 0],
+        #                       [img_crop_width, img_crop_height],
+        #                       [0, img_crop_height]])
+        # M = cv2.getPerspectiveTransform(points, pts_std)
+        # dst_img = cv2.warpPerspective(
+        #     img,
+        #     M, (img_crop_width, img_crop_height),
+        #     borderMode=cv2.BORDER_REPLICATE,
+        #     flags=cv2.INTER_CUBIC)
+        # print('dst_img.shape', dst_img.shape)
+        #
+        # print('points', points)
+        w = abs(points[2][0] - points[0][0])
+        h = abs(points[2][1] - points[0][1])
+        dst_img = img[int(points[0][1]):int(points[0][1] + h), int(points[0][0]):int(points[0][0] + w), :]
+        # print('dst_img.shape2', dst_img.shape)
+        # cv2.imshow('dst_img', dst_img)
+        # cv2.waitKey(0)
+        # dst_img_height, dst_img_width = dst_img.shape[0:2]
+        # # if dst_img_height * 1.0 / dst_img_width >= 1.5:
+        # if dst_img_height * 1.0 / dst_img_width >= 2.0:
+        #     dst_img = np.rot90(dst_img)
         return dst_img
         return dst_img
 
 
     def print_draw_crop_rec_res(self, img_crop_list, rec_res):
     def print_draw_crop_rec_res(self, img_crop_list, rec_res):
@@ -91,6 +102,7 @@ class TextSystem(object):
             logger.info(bno, rec_res[bno])
             logger.info(bno, rec_res[bno])
 
 
     def __call__(self, img):
     def __call__(self, img):
+        # print('into TextSystem __call__')
         # cv2.imshow('img',img)
         # cv2.imshow('img',img)
         # cv2.waitKey(0)
         # cv2.waitKey(0)
         ori_im = img.copy()
         ori_im = img.copy()
@@ -98,15 +110,65 @@ class TextSystem(object):
         logger.info("dt_boxes num : {}, elapse : {}".format(
         logger.info("dt_boxes num : {}, elapse : {}".format(
             len(dt_boxes), elapse))
             len(dt_boxes), elapse))
         if dt_boxes is None:
         if dt_boxes is None:
-            return None, None
-        img_crop_list = []
+            return [], []
 
 
-        dt_boxes = sorted_boxes(dt_boxes)
+        temp_list = []
+        # print('dt_boxes', type(dt_boxes))
+        # print('dt_boxes.shape', dt_boxes.shape)
+        # 过滤一些比例离谱的box
+        for b in dt_boxes:
+            w = b[2][0] - b[0][0]
+            h = b[2][1] - b[0][1]
+            if h == 0 or w == 0 \
+                    or h >= 10000 or w >= 10000 \
+                    or w / h <= 0.5 or w / h >= 100:
+                continue
+            temp_list.append(b)
+
+        if not temp_list:
+            return [], []
+        dt_boxes = np.array(temp_list)
+        # print('dt_boxes.shape2', dt_boxes.shape)
+
+        # show
+        # for b in dt_boxes:
+        #     p1 = [int(x) for x in b[0]]
+        #     p2 = [int(x) for x in b[2]]
+        #     cv2.rectangle(img, p1, p2, (0, 0, 255))
+        # cv2.namedWindow('img', cv2.WINDOW_NORMAL)
+        # cv2.imshow('img', img)
+        # cv2.waitKey(0)
+
+        # # 检测过多单字box,返回None
+        # if len(dt_boxes) >= 150:
+        #     short_box_cnt = 0
+        #     long_box_cnt = 0
+        #     for b in dt_boxes:
+        #         w = b[2][0] - b[0][0]
+        #         h = b[2][1] - b[0][1]
+        #         if w / h < 1.3:
+        #             short_box_cnt += 1
+        #         if w / h >= 3:
+        #             long_box_cnt += 1
+        #         print('dt_boxes', w, h, round(w/h, 3))
+        #     # print('short_box_cnt, len(dt_boxes)', short_box_cnt, len(dt_boxes))
+        #     log('short_box_cnt, long_box_cnt, len(dt_boxes) ' + str([short_box_cnt, long_box_cnt, len(dt_boxes)]))
+        #     if short_box_cnt >= 2/3 * len(dt_boxes) and long_box_cnt < 10:
+        #         # print('short_box_cnt >= 2/3 * len(dt_boxes), return None')
+        #         log('short_box_cnt >= 2/3 * len(dt_boxes), return None. ' + str([short_box_cnt, long_box_cnt, len(dt_boxes)]))
+        #         return [], []
 
 
+        img_crop_list = []
+        dt_boxes = sorted_boxes(dt_boxes)
         for bno in range(len(dt_boxes)):
         for bno in range(len(dt_boxes)):
             tmp_box = copy.deepcopy(dt_boxes[bno])
             tmp_box = copy.deepcopy(dt_boxes[bno])
             img_crop = self.get_rotate_crop_image(ori_im, tmp_box)
             img_crop = self.get_rotate_crop_image(ori_im, tmp_box)
             img_crop_list.append(img_crop)
             img_crop_list.append(img_crop)
+        # print('system len(img_crop_list)', len(img_crop_list))
+        # for img in img_crop_list:
+        #     if img.shape[1] / img.shape[0] <= 0.5:
+        # print('system img.shape[1] / img.shape[0] <= 0.5', img.shape)
+
         if self.use_angle_cls:
         if self.use_angle_cls:
             img_crop_list, angle_list, elapse = self.text_classifier(
             img_crop_list, angle_list, elapse = self.text_classifier(
                 img_crop_list)
                 img_crop_list)
@@ -131,6 +193,7 @@ class TextSystem(object):
                 filter_rec_res.append(rec_reuslt)
                 filter_rec_res.append(rec_reuslt)
         return filter_boxes, filter_rec_res
         return filter_boxes, filter_rec_res
 
 
+
 def boxex_points_fixup(dt_boxes):
 def boxex_points_fixup(dt_boxes):
     # 检查框全部转换为矩形
     # 检查框全部转换为矩形
     # for i in range(len(dt_boxes)):
     # for i in range(len(dt_boxes)):
@@ -143,39 +206,37 @@ def boxex_points_fixup(dt_boxes):
     #     y_min = min(y_list)
     #     y_min = min(y_list)
     #     dt_boxes[i] = np.array([[x_min,y_min],[x_max,y_min],[x_max,y_max],[x_min,y_max]])
     #     dt_boxes[i] = np.array([[x_min,y_min],[x_max,y_min],[x_max,y_max],[x_min,y_max]])
 
 
-
     for i in range(len(dt_boxes)):
     for i in range(len(dt_boxes)):
         box1 = dt_boxes[i]
         box1 = dt_boxes[i]
         box1_point3 = box1[2]
         box1_point3 = box1[2]
-        box1_point4 = box1[3] # 四边形底边的两点坐标
-        bottom_line = (min(box1_point3[0],box1_point4[0]),max(box1_point3[0],box1_point4[0]))
-        bottom_line_len = abs(bottom_line[1]-bottom_line[0])
+        box1_point4 = box1[3]  # 四边形底边的两点坐标
+        bottom_line = (min(box1_point3[0], box1_point4[0]), max(box1_point3[0], box1_point4[0]))
+        bottom_line_len = abs(bottom_line[1] - bottom_line[0])
 
 
-        for j in range(i+1,len(dt_boxes)):
+        for j in range(i + 1, len(dt_boxes)):
             box2 = dt_boxes[j]
             box2 = dt_boxes[j]
             box2_point1 = box2[0]
             box2_point1 = box2[0]
-            box2_point2 = box2[1] # 四边形顶边的两点坐标
+            box2_point2 = box2[1]  # 四边形顶边的两点坐标
             top_line = (min(box2_point1[0], box2_point2[0]), max(box2_point1[0], box2_point2[0]))
             top_line = (min(box2_point1[0], box2_point2[0]), max(box2_point1[0], box2_point2[0]))
-            top_line_len = abs(top_line[1]-top_line[0])
+            top_line_len = abs(top_line[1] - top_line[0])
             if has_intersection(box1, box2):  # 四边形框是否有交集
             if has_intersection(box1, box2):  # 四边形框是否有交集
-                if not (min(top_line)>=max(bottom_line) or min(bottom_line)>=max(top_line)):  # x轴方向上有交集
+                if not (min(top_line) >= max(bottom_line) or min(bottom_line) >= max(top_line)):  # x轴方向上有交集
                     # 求重合部分y中间值
                     # 求重合部分y中间值
                     mid_y = ((box2_point1[1] + box2_point2[1]) / 2 + (box1_point3[1] + box1_point4[1]) / 2) // 2
                     mid_y = ((box2_point1[1] + box2_point2[1]) / 2 + (box1_point3[1] + box1_point4[1]) / 2) // 2
                     if not mid_y:
                     if not mid_y:
                         continue
                         continue
-                    max_line_len = max(bottom_line_len,top_line_len)
+                    max_line_len = max(bottom_line_len, top_line_len)
                     cross_line_len = bottom_line_len + top_line_len - \
                     cross_line_len = bottom_line_len + top_line_len - \
-                                     (max(bottom_line[1],bottom_line[0],top_line[1],top_line[0]) - min(bottom_line[1],bottom_line[0],top_line[1],top_line[0]))
+                                     (max(bottom_line[1], bottom_line[0], top_line[1], top_line[0]) - min(
+                                         bottom_line[1], bottom_line[0], top_line[1], top_line[0]))
                     # print(cross_line_len,max_line_len,cross_line_len/max_line_len)
                     # print(cross_line_len,max_line_len,cross_line_len/max_line_len)
-                    if cross_line_len/max_line_len>=0.55: # 重合比例
-                        box1[2] = [box1_point3[0],mid_y]
-                        box1[3] = [box1_point4[0],mid_y]
-                        box2[0] = [box2_point1[0],mid_y]
-                        box2[1] = [box2_point2[0],mid_y]
+                    if cross_line_len / max_line_len >= 0.55:  # 重合比例
+                        box1[2] = [box1_point3[0], mid_y]
+                        box1[3] = [box1_point4[0], mid_y]
+                        box2[0] = [box2_point1[0], mid_y]
+                        box2[1] = [box2_point2[0], mid_y]
                         break
                         break
 
 
-
-
     return dt_boxes
     return dt_boxes
 
 
 
 
@@ -247,4 +308,4 @@ def main(args):
 if __name__ == "__main__":
 if __name__ == "__main__":
     main(utility.parse_args())
     main(utility.parse_args())
 
 
-    pass
+    pass

+ 1 - 0
start_and_stop/kill_convert.sh

@@ -0,0 +1 @@
+kill -9 $(lsof -i:15010|sed -n '2,$p'|awk '{print $2}'|tr '\n' ' ')

+ 256 - 30
tika_/tika_interface.py

@@ -1,3 +1,5 @@
+import base64
+import io
 import json
 import json
 import os
 import os
 import re
 import re
@@ -7,8 +9,11 @@ import traceback
 from glob import glob
 from glob import glob
 
 
 import psutil
 import psutil
+from PIL import Image
+from bs4 import BeautifulSoup
 
 
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
+from config.max_compute_config import MAX_COMPUTE
 _dir = os.path.abspath(os.path.dirname(__file__))
 _dir = os.path.abspath(os.path.dirname(__file__))
 os.environ["TIKA_SERVER_JAR"] = _dir + "/files/tika-server.jar"
 os.environ["TIKA_SERVER_JAR"] = _dir + "/files/tika-server.jar"
 os.environ["TIKA_LOG_PATH"] = _dir + "/log/"
 os.environ["TIKA_LOG_PATH"] = _dir + "/log/"
@@ -16,12 +21,19 @@ os.environ["TIKA_PATH"] = _dir + "/files/"
 os.environ["TIKA_LOG_FILE"] = "tika.log"
 os.environ["TIKA_LOG_FILE"] = "tika.log"
 
 
 from format_convert import _global
 from format_convert import _global
-from format_convert.utils import log, request_post, dynamic_get_port
+from format_convert.utils import log, request_post, dynamic_get_port, get_platform
 import tika
 import tika
 from tika import parser, config
 from tika import parser, config
 from tika.tika import runCommand
 from tika.tika import runCommand
 from flask import Flask, request
 from flask import Flask, request
 
 
+if get_platform() == "Windows":
+    FROM_REMOTE = False
+else:
+    FROM_REMOTE = True
+
+if MAX_COMPUTE:
+    FROM_REMOTE = False
 
 
 # 接口配置
 # 接口配置
 app = Flask(__name__)
 app = Flask(__name__)
@@ -46,18 +58,18 @@ def _tika():
         _md5 = request.form.get("md5")
         _md5 = request.form.get("md5")
         _global.update({"md5": _md5})
         _global.update({"md5": _md5})
 
 
-        html = tika_interface(data).get('html')
-        return json.dumps({"html": html})
+        html = tika_interface(data).get('data')
+        return json.dumps({"data": html})
     except TimeoutError:
     except TimeoutError:
-        return json.dumps({"html": [-5]})
+        return json.dumps({"data": [-5]})
     except:
     except:
         traceback.print_exc()
         traceback.print_exc()
-        return json.dumps({"html": [-1]})
+        return json.dumps({"data": [-1]})
     finally:
     finally:
         log("tika interface finish time " + str(time.time()-start_time))
         log("tika interface finish time " + str(time.time()-start_time))
 
 
 
 
-def tika_interface(_path, show=1):
+def tika_interface(_path, show=0):
     try:
     try:
         # apache tika服务器 提取
         # apache tika服务器 提取
         # text = runCommand('parse', 'all', _path, '9998', outDir='./files/')
         # text = runCommand('parse', 'all', _path, '9998', outDir='./files/')
@@ -67,7 +79,8 @@ def tika_interface(_path, show=1):
         if globals().get(key):
         if globals().get(key):
             port = globals().get(key)
             port = globals().get(key)
         else:
         else:
-            port = dynamic_get_port(port)
+            if FROM_REMOTE:
+                port = dynamic_get_port(port)
             if port is None:
             if port is None:
                 kill_tika_java_server()
                 kill_tika_java_server()
                 # return {"html": [-19]}
                 # return {"html": [-19]}
@@ -76,31 +89,104 @@ def tika_interface(_path, show=1):
         url = 'http://localhost:' + str(port)
         url = 'http://localhost:' + str(port)
         log('tika ' + key + ' port: ' + str(port))
         log('tika ' + key + ' port: ' + str(port))
         parsed = parser.from_file(_path, xmlContent=True, serverEndpoint=url)
         parsed = parser.from_file(_path, xmlContent=True, serverEndpoint=url)
-        html = parsed.get('content')
-
-        # 处理html
-        html = html.split('\n')
-        temp_list = []
-        for line in html:
-            if '<meta' in line:
-                continue
-            temp_list.append(line)
-        html = temp_list
-        if len(html) <= 4:
-            return {"html": ''}
-
-        html = html[:2] + ['<meta charset="UTF-8">'] + html[2:]
-        html = '\n'.join(html)
-        html = re.sub('<table>', '<table border="1">', html)
-        html = re.sub(' class="正文"', '', html)
+        # print('parsed', parsed)
+        html = parsed.get('content', '')
 
 
+        # 提取html各种元素,其中图片只是一个映射
+        soup = BeautifulSoup(html, 'lxml')
+        tag_list = collect_soup_elements(soup)
         if show:
         if show:
-            with open(_dir + '/doc.html', 'w', encoding='utf-8') as f:
-                f.write(html)
+            print('tag_list0', tag_list)
+
+        if not tag_list:
+            return {"data": tag_list}
+
+        # docx不是二进制,不能直接读二进制图片
+        if _path[-3:] == 'doc':
+            # 直接从二进制提取图片,保存在同一目录下
+            ss = re.split('[/\\\]', _path)
+            save_dir = os.sep.join(ss[:-1])
+            file_name = re.split('\.', ss[-1])[0]
+            if show:
+                print('save_dir', save_dir)
+                print('file_name', file_name)
+            image_path_dict = extract_images_from_doc(_path, save_dir)
+
+            if show:
+                print('image_path_dict', image_path_dict)
+
+            # embedded_images = re.findall(r'embedded:image[^"]+', html)
+            match_flag = 1
+            for tag in tag_list:
+                tag_name, value = tag
+                if tag_name != 'img':
+                    continue
+                # 提取图片文件名
+                image_name = file_name + '_' + re.sub('image', '', value)
+                if show:
+                    print('image_name', image_name)
+                # 保证所有image映射都对得上
+                real_image_path = image_path_dict.get(image_name)
+                if real_image_path is None:
+                    match_flag = 0
+                    break
+                else:
+                    tag[1] = real_image_path
+            if show:
+                print('match_flag', match_flag)
+
+            if match_flag:
+                # 图片数量能对上,则是正确的
+                pass
+            else:
+                # 图片对不上,则删除所有图片类型的tag
+                temp_list = []
+                for tag_name, value in tag_list:
+                    if tag_name == 'img':
+                        continue
+                    temp_list.append([tag_name, value])
+                tag_list = temp_list
+
+        elif _path[-4:] == 'docx':
+            temp_list = []
+            for tag_name, value in tag_list:
+                if tag_name == 'img':
+                    continue
+                temp_list.append([tag_name, value])
+            tag_list = temp_list
+
+
+        # # 处理html
+        # html = html.split('\n')
+        # temp_list = []
+        # for line in html:
+        #     if '<meta' in line:
+        #         continue
+        #     temp_list.append(line)
+        # html = temp_list
+        # if len(html) <= 4:
+        #     return {"html": ''}
+        #
+        # html = html[:2] + ['<meta charset="UTF-8">'] + html[2:]
+        # html = '\n'.join(html)
+        # html = re.sub('<table>', '<table border="1">', html)
+        # html = re.sub(' class="正文"', '', html)
+        #
+        # if show:
+        #     with open(_dir + '/doc.html', 'w', encoding='utf-8') as f:
+        #         f.write(html)
+    # except:
+    #     traceback.print_exc()
+    #     return {"html": [-17]}
+    # return {"html": html}
+
+        if show:
+            print('tag_list final', tag_list)
+
     except:
     except:
         traceback.print_exc()
         traceback.print_exc()
-        return {"html": [-17]}
-    return {"html": html}
+        return {"data": [-17]}
+    return {"data": tag_list}
 
 
 
 
 def kill_tika_java_server():
 def kill_tika_java_server():
@@ -122,6 +208,139 @@ def kill_tika_java_server():
             os.system(comm)
             os.system(comm)
 
 
 
 
def extract_images_from_doc(doc_file_path, output_folder):
    """Extract embedded JPG/PNG images from a binary .doc/.wps file.

    Scans the raw file bytes for JPEG and PNG start/end magic signatures
    and writes every matched span to *output_folder*.

    Args:
        doc_file_path: Path of the binary document to scan.
        output_folder: Directory where extracted images are written.

    Returns:
        dict: Mapping of generated image file name
        (``<doc_name>_<n>.<ext>``) to the saved image path.
    """
    # Magic-byte markers for the start and end of each supported format.
    image_signatures = {
        'jpg': (b'\xFF\xD8', b'\xFF\xD9'),
        'png': (b'\x89PNG', b'\x49\x45\x4E\x44\xAE\x42\x60\x82')
    }

    # Base name without extension. os.path handles names containing extra
    # dots correctly; the old regex split ('[/\\\.]' — also an invalid
    # escape in a character class) picked the wrong segment for "a.b.doc".
    file_name = os.path.splitext(os.path.basename(doc_file_path))[0]

    # Read the whole document as raw bytes.
    with open(doc_file_path, 'rb') as doc_file:
        doc_data = doc_file.read()

    output_file_path_dict = {}
    # Find and extract every image of every supported format.
    for img_format, (start_sig, end_sig) in image_signatures.items():
        start_index = 0
        image_count = 1
        while True:
            # Locate the next image start marker.
            start_index = doc_data.find(start_sig, start_index)
            if start_index == -1:
                break

            # Locate the matching end marker strictly after the start
            # signature (searching from start_index could in principle
            # match inside the start marker itself).
            end_index = doc_data.find(end_sig, start_index + len(start_sig))
            if end_index == -1:
                break

            # Extract the image bytes, including the end marker.
            end_index += len(end_sig)
            image_data = doc_data[start_index:end_index]

            # Save the image next to its siblings, numbered per format.
            image_name = f'{file_name}_{image_count}.{img_format}'
            image_path = os.path.join(output_folder, image_name)
            with open(image_path, 'wb') as img_file:
                img_file.write(image_data)
            print(f'Saved {img_format} image to {image_path}')
            output_file_path_dict[image_name] = image_path

            # Continue scanning after this image.
            start_index = end_index
            image_count += 1
    return output_file_path_dict
+
+
def is_image_valid(image_path):
    """Return True if the image file at *image_path* can be fully decoded.

    Opens the file with Pillow and forces a full pixel load, so truncated
    or corrupt image data is detected — not just a bad header.

    Args:
        image_path: Path of the image file to check.

    Returns:
        bool: True when the image decodes cleanly, False otherwise
        (missing file, unknown format, or corrupt data).
    """
    try:
        with Image.open(image_path) as img:
            # load() decodes all pixel data; open() alone only parses the header.
            img.load()
        return True
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate.
        return False
+
+
def is_image_data_valid(image_data):
    """Return True if *image_data* (raw bytes) decodes as a valid image.

    Wraps the bytes in an in-memory file and forces a full pixel load with
    Pillow, so truncated or corrupt streams are detected — not just bad
    headers.

    Args:
        image_data (bytes): Raw image byte stream to check.

    Returns:
        bool: True when the stream decodes cleanly, False otherwise.
    """
    try:
        # Present the byte stream as a file-like object for Pillow.
        image_file = io.BytesIO(image_data)
        with Image.open(image_file) as img:
            # load() decodes all pixel data; open() alone only parses the header.
            img.load()
        return True
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate.
        return False
+
+
def collect_soup_elements(soup):
    """Flatten a parsed tika XHTML document into an ordered tag list.

    Walks the descendants of <body> in document order and collects:
      * ['text', text]   for each <p> with non-empty stripped text
      * ['img', alt]     for each <img> (alt carries tika's embedded-image
                         name, e.g. "image1.jpg"; may be None — TODO confirm
                         tika always sets alt)
      * ['table', html]  for each <table>, serialized with border="1" and
                         with inner <p> wrappers removed so cells hold bare
                         text

    Args:
        soup: BeautifulSoup document parsed from tika's XHTML output.

    Returns:
        list: [tag_name, value] pairs in document order; empty when the
        document has no <body>.
    """
    # Force visible borders so downstream table handling keeps the grid
    # (replaces the old re.sub('<table>', '<table border="1">') approach).
    for table in soup.find_all('table'):
        table['border'] = "1"

    elements = []
    if soup.body is None:
        # tika produced no <body> (empty or unparseable document); the old
        # code raised AttributeError here and the caller reported [-17].
        return elements

    # Note: tables appear in .descendants before their children, so each
    # table is serialized (and its inner <p> tags unwrapped) before the
    # walk reaches the cells.
    for element in soup.body.descendants:
        if element.name == 'p':
            text = element.get_text(strip=True)
            if text:
                elements.append(['text', text])
        elif element.name == 'img':
            # alt holds the embedded-image mapping name used by the caller.
            elements.append(['img', element.get('alt')])
        elif element.name == 'table':
            # Unwrap <p> inside cells so the serialized table is plain text
            # per cell rather than nested paragraph markup.
            for p_tag in element.find_all('p'):
                p_tag.unwrap()
            elements.append(['table', str(element)])
    return elements
+
+
 def test_interface():
 def test_interface():
     # paths = glob("C:/Users/Administrator/Downloads/1716253106319.doc")
     # paths = glob("C:/Users/Administrator/Downloads/1716253106319.doc")
     paths = ["files/1716253106319.doc"]
     paths = ["files/1716253106319.doc"]
@@ -153,6 +372,13 @@ if __name__ == "__main__":
     #     # _p = "C:/Users/Administrator/Downloads/1716253106319.doc"
     #     # _p = "C:/Users/Administrator/Downloads/1716253106319.doc"
     #     tika_interface(_p)
     #     tika_interface(_p)
 
 
-    # app.run(host='0.0.0.0', port=5000)
+    # app.run(host='0.0.0.0', port=16050)
     # test_interface()
     # test_interface()
-    kill_tika_java_server()
+    # kill_tika_java_server()
+
+    # p = "C:/Users/Administrator/Desktop/test_wps/error1.wps"
+    # extract_images_from_doc(p, '.')
+
+    _p = "C:/Users/Administrator/Desktop/test_wps/error1.wps"
+    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp" + '/'
+    c = tika_interface(_p)