пре 2 година · c6ac7bddb9
--- a/format_convert/convert.py
+++ b/format_convert/convert.py
@@ -59,7 +59,7 @@ def getText(_type, path_or_stream, _page_no=None, time_out=300):
 
				     def get_html_2(_class):
			
 
				         return _class.get_html()
			
 
				 
			
 
				-    log("file type - " + _type + ' time out - ' + str(time_out))
			
 
				+    log("file type - " + _type + ' page - ' + str(_page_no) + ' time out - ' + str(time_out))
			
 
				 
			
 
				     try:
			
 
				         ss = path_or_stream.split(".")
			
@@ -153,7 +153,7 @@ def remove_underline(image_np):
 
				 
			
 
				 # @timeout_decorator.timeout(100, timeout_exception=TimeoutError)
			
 
				 # @timeout(globals().get("time_out"), timeout_exception=TimeoutError, use_signals=False)
			
 
				-def unique_temp_file_process(stream, _type, _md5, _page_no, time_out=300):
			
 
				+def unique_temp_file_process(stream, _type, _md5, _page_no, time_out=300, save_middle=None):
			
 
				     if get_platform() == "Windows":
			
 
				         _global._init()
			
 
				 
			
@@ -210,7 +210,7 @@ def unique_temp_file_process(stream, _type, _md5, _page_no, time_out=300):
 
				     finally:
			
 
				         print("======================================")
			
 
				         try:
			
 
				-            if get_platform() == "Linux":
			
 
				+            if get_platform() == "Linux" and save_middle is None:
			
 
				                 # log("not delete temp file")
			
 
				                 # 删除该唯一空间下所有文件
			
 
				                 if os.path.exists(unique_space_path):
			
@@ -419,6 +419,9 @@ def _convert():
 
				         if _timeout is not None:
			
 
				             globals().update({"time_out": _timeout})
			
 
				 
			
 
				+        # 是否保留中间文件
			
 
				+        save_middle = data.get('save_middle')
			
 
				+
			
 
				         # 最终结果截取的最大字节数
			
 
				         max_bytes = data.get("max_bytes")
			
 
				 
			
@@ -427,7 +430,7 @@ def _convert():
 
				             # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
			
 
				             # text, swf_images = origin_unique_temp_file_process(stream, _type)
			
 
				             try:
			
 
				-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'))
			
 
				+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'), save_middle=save_middle)
			
 
				             except TimeoutError:
			
 
				                 log("convert time out! 300 sec")
			
 
				                 text = [-5]
			
@@ -435,7 +438,7 @@ def _convert():
 
				         else:
			
 
				             # Linux 通过装饰器设置整个转换超时时间
			
 
				             try:
			
 
				-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'))
			
 
				+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'), save_middle=save_middle)
			
 
				             except TimeoutError:
			
 
				                 log("convert time out! 300 sec")
			
 
				                 text = [-5]
			
--- a/format_convert/convert_doc.py
+++ b/format_convert/convert_doc.py
@@ -2,6 +2,8 @@ import inspect
 
				 import os
			
 
				 import re
			
 
				 import sys
			
 
				+
			
 
				+import chardet
			
 
				 from bs4 import BeautifulSoup
			
 
				 sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				 from format_convert.convert_tree import _Document, _Sentence, _Page
			
@@ -40,9 +42,15 @@ class DocConvert:
 
				         # 先判断特殊doc文件，可能是html文本
			
 
				         is_html_doc = False
			
 
				         try:
			
 
				-            with open(self.path, 'r') as f:
			
 
				-                html_str = f.read()
			
 
				-            if re.search('<div|<html|<body|<head|<tr|<br|<table|<td', html_str):
			
 
				+            try:
			
 
				+                with open(self.path, 'r') as f:
			
 
				+                    html_str = f.read()
			
 
				+            except UnicodeDecodeError:
			
 
				+                with open(self.path, 'r', errors='ignore') as f:
			
 
				+                    html_str = f.read()
			
 
				+            # if re.search('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span', html_str):
			
 
				+            if len(re.findall('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span', html_str)) >= 10:
			
 
				+                log('doc as html!')
			
 
				                 soup = BeautifulSoup(html_str, 'lxml')
			
 
				                 text = soup.text
			
 
				                 is_html_doc = True
			
--- a/format_convert/convert_docx.py
+++ b/format_convert/convert_docx.py
@@ -53,16 +53,27 @@ def read_no_start(numbering_xml):
 
				     # 获取虚拟id的开始编号
			
 
				     w_abstract_num_list = numbering_xml.getElementsByTagName("w:abstractNum")
			
 
				     abstract_id_level_dict = {}
			
 
				+    abstract_id_level_text_dict = {}
			
 
				     for w_abstract_num in w_abstract_num_list:
			
 
				         w_abstract_num_id = w_abstract_num.getAttribute("w:abstractNumId")
			
 
				         w_lvl_list = w_abstract_num.getElementsByTagName("w:lvl")
			
 
				         level_start_dict = {}
			
 
				+        level_text_dict = {}
			
 
				         for w_lvl in w_lvl_list:
			
 
				             w_ilvl_value = w_lvl.getAttribute('w:ilvl')
			
 
				             if w_lvl.getElementsByTagName("w:start"):
			
 
				                 w_ilvl_start_num = w_lvl.getElementsByTagName("w:start")[0].getAttribute("w:val")
			
 
				                 level_start_dict[int(w_ilvl_value)] = int(w_ilvl_start_num)
			
 
				+            if w_lvl.getElementsByTagName("w:lvlText") and w_lvl.getElementsByTagName("w:numFmt"):
			
 
				+                w_lvl_text = w_lvl.getElementsByTagName("w:lvlText")[0].getAttribute("w:val")
			
 
				+                w_lvl_format = w_lvl.getElementsByTagName("w:numFmt")[0].getAttribute("w:val")
			
 
				+                if w_lvl_format == 'upperLetter':
			
 
				+                    w_lvl_text = re.sub('%\d', '%A', w_lvl_text)
			
 
				+                elif w_lvl_format == 'lowerLetter':
			
 
				+                    w_lvl_text = re.sub('%\d', '%a', w_lvl_text)
			
 
				+                level_text_dict[int(w_ilvl_value)] = w_lvl_text
			
 
				         abstract_id_level_dict[w_abstract_num_id] = level_start_dict
			
 
				+        abstract_id_level_text_dict[w_abstract_num_id] = level_text_dict
			
 
				 
			
 
				     # 映射回真实id
			
 
				     real_id_level_start_dict = {}
			
@@ -72,7 +83,14 @@ def read_no_start(numbering_xml):
 
				         if level_start_dict:
			
 
				             real_id_level_start_dict[int(real_id)] = level_start_dict
			
 
				 
			
 
				-    return real_id_level_start_dict
			
 
				+    real_id_level_text_dict = {}
			
 
				+    for abstract_id in abstract_real_id_dict.keys():
			
 
				+        real_id = abstract_real_id_dict.get(abstract_id)
			
 
				+        level_text_dict = abstract_id_level_text_dict.get(abstract_id)
			
 
				+        if level_text_dict:
			
 
				+            real_id_level_text_dict[int(real_id)] = level_text_dict
			
 
				+
			
 
				+    return real_id_level_start_dict, real_id_level_text_dict
			
 
				 
			
 
				 
			
 
				 def read_p_text(unique_type_dir, p_node, _last_node_level, _num_pr_dict, numbering_xml, document_xml_rels,
			
@@ -95,8 +113,8 @@ def read_p_text(unique_type_dir, p_node, _last_node_level, _num_pr_dict, numberi
 
				     # 文本的编号（如果有编号的话）
			
 
				     text_no = ''
			
 
				 
			
 
				-    # 获取编号组的起始值
			
 
				-    id_level_start_dict = read_no_start(numbering_xml)
			
 
				+    # 获取编号组的起始值和编号组的展示形式
			
 
				+    id_level_start_dict, id_level_text_dict = read_no_start(numbering_xml)
			
 
				     # print('_num_pr_dict', _num_pr_dict)
			
 
				 
			
 
				     # 提取编号 组-层级-序号
			
@@ -143,8 +161,22 @@ def read_p_text(unique_type_dir, p_node, _last_node_level, _num_pr_dict, numberi
 
				                         if id_level_start_dict.get(group_id) and id_level_start_dict.get(group_id).get(level) and _num_pr_dict.get(group_id).get(level):
			
 
				                             start_no = id_level_start_dict.get(group_id).get(level)
			
 
				                             level_node_cnt += start_no - 1
			
 
				+
			
 
				+                        level_text = None
			
 
				+                        if id_level_text_dict.get(group_id) and id_level_text_dict.get(group_id).get(level) and _num_pr_dict.get(group_id).get(level):
			
 
				+                            level_text = id_level_text_dict.get(group_id).get(level)
			
 
				                         # print('level_node_cnt', level_node_cnt)
			
 
				-                        text_no += str(level_node_cnt) + '.'
			
 
				+                        if level_text:
			
 
				+                            if re.search('a', level_text):
			
 
				+                                level_node_cnt = chr(ord('a') + level_node_cnt - 1)
			
 
				+                                text_no += re.sub('%a', str(level_node_cnt), level_text)
			
 
				+                            elif re.search('A', level_text):
			
 
				+                                level_node_cnt = chr(ord('A') + level_node_cnt - 1)
			
 
				+                                text_no += re.sub('%A', str(level_node_cnt), level_text)
			
 
				+                            else:
			
 
				+                                text_no += re.sub('%\d', str(level_node_cnt), level_text)
			
 
				+                        else:
			
 
				+                            text_no += str(level_node_cnt) + '.'
			
 
				                         # print('text_no', text_no)
			
 
				                     _last_node_level = node_level
			
 
				 
			
--- a/format_convert/convert_image.py
+++ b/format_convert/convert_image.py
@@ -17,7 +17,7 @@ import traceback
 
				 import cv2
			
 
				 from isr.pre_process import count_red_pixel
			
 
				 from format_convert.utils import judge_error_code, add_div, LineTable, get_table_html, get_logger, log, \
			
 
				-    memory_decorator, pil_resize, np2bytes, ocr_cant_read
			
 
				+    memory_decorator, pil_resize, np2bytes, ocr_cant_read, get_garble_code2
			
 
				 from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface, from_gpu_interface_redis, \
			
 
				     from_idc_interface, from_isr_interface
			
 
				 from format_convert.table_correct import get_rotated_image
			
@@ -88,7 +88,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
				                 textbox_list.remove(_obj)
			
 
				         return textbox_list
			
 
				 
			
 
				-    def idc_process(_image_np):
			
 
				+    def idc_process(_image_np, return_angle=False):
			
 
				         # 图片倾斜校正，写入原来的图片路径
			
 
				         # print("image_process", image_path)
			
 
				         # g_r_i = get_rotated_image(_image_np, image_path)
			
@@ -115,17 +115,26 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
				         #     image_bytes = f.read()
			
 
				         image_bytes = np2bytes(image_resize)
			
 
				         angle = from_idc_interface(image_bytes)
			
 
				+        log('idc_process angle ' + str(angle))
			
 
				         if judge_error_code(angle):
			
 
				-            if is_from_docx:
			
 
				-                return []
			
 
				+            if return_angle:
			
 
				+                if is_from_docx:
			
 
				+                    return [], []
			
 
				+                else:
			
 
				+                    return angle, angle
			
 
				             else:
			
 
				-                return angle
			
 
				+                if is_from_docx:
			
 
				+                    return []
			
 
				+                else:
			
 
				+                    return angle
			
 
				         # 根据角度旋转
			
 
				-        image_pil = Image.fromarray(_image_np)
			
 
				-        _image_np = np.array(image_pil.rotate(angle, expand=1))
			
 
				+        _image_pil = Image.fromarray(_image_np)
			
 
				+        _image_np = np.array(_image_pil.rotate(angle, expand=1))
			
 
				         # 写入
			
 
				         # idc_path = image_path.split(".")[0] + "_idc." + image_path.split(".")[-1]
			
 
				         # cv2.imwrite(idc_path, image_np)
			
 
				+        if return_angle:
			
 
				+            return _image_np, angle
			
 
				         return _image_np
			
 
				 
			
 
				     def isr_process(_image_np):
			
@@ -288,6 +297,23 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
				         # 调用现成方法形成表格
			
 
				         try:
			
 
				             if list_line:
			
 
				+
			
 
				+                # 排除掉短且经过文字bbox中间的竖线
			
 
				+                temp_list = []
			
 
				+                for line in list_line:
			
 
				+                    find_cnt = 0
			
 
				+                    if abs(line[0]-line[2]) < abs(line[1]-line[3]) and abs(line[1] - line[3]) <= _image_np.shape[0] / 20:
			
 
				+                        for t_obj in list_text_boxes:
			
 
				+                            if abs(t_obj.bbox[0]-t_obj.bbox[2])/5 + min(t_obj.bbox[0], t_obj.bbox[2]) <= line[0] <= abs(t_obj.bbox[0]-t_obj.bbox[2])/5*4 + min(t_obj.bbox[0], t_obj.bbox[2]) and (t_obj.bbox[0]-t_obj.bbox[2]) <= 60:
			
 
				+                                # print('match', line[0], t_obj.bbox[0], t_obj.bbox[2])
			
 
				+                                find_cnt += 1
			
 
				+                                if find_cnt >= 2:
			
 
				+                                    break
			
 
				+                    if find_cnt >= 2:
			
 
				+                        continue
			
 
				+                    temp_list.append(line)
			
 
				+                list_line = temp_list
			
 
				+
			
 
				                 from format_convert.convert_tree import TableLine
			
 
				                 list_lines = []
			
 
				                 for line in list_line:
			
@@ -486,19 +512,31 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
				                     return text_list
			
 
				 
			
 
				                 # 判断ocr识别是否正确
			
 
				-                if ocr_cant_read(text_list, box_list) and not idc_flag and False:
			
 
				+                if ocr_cant_read(text_list, box_list) and not idc_flag:
			
 
				+                # if True:
			
 
				                     # 方向分类
			
 
				-                    image_np = idc_process(image_np)
			
 
				-                    # cv2.imshow("idc_process", image_np)
			
 
				-                    # cv2.waitKey(0)
			
 
				+                    image_np, angle = idc_process(image_np, return_angle=True)
			
 
				                     if isinstance(image_np, list):
			
 
				                         return image_np
			
 
				+                    # 如果角度不变，旋转180
			
 
				+                    if angle in [0, 360]:
			
 
				+                        image_pil = Image.fromarray(image_np)
			
 
				+                        image_np = np.array(image_pil.rotate(180, expand=1))
			
 
				+                    # cv2.imshow("idc_process", image_np)
			
 
				+                    # cv2.waitKey(0)
			
 
				 
			
 
				                     # 文字识别
			
 
				                     text_list1, box_list_1 = ocr_process(image_np)
			
 
				                     if judge_error_code(text_list1):
			
 
				                         return text_list1
			
 
				 
			
 
				+                    # all_text = ''.join(text_list1)
			
 
				+                    # all_text = re.sub('[\s\d]', '', all_text)
			
 
				+                    # if len(re.findall(get_garble_code2(), all_text)) >= 2:
			
 
				+                    # log('text_list1' + ''.join(text_list1))
			
 
				+                    if len(text_list1) > 0 and ocr_cant_read(text_list1, box_list_1) and is_from_pdf:
			
 
				+                        return [-16]
			
 
				+
			
 
				                     # 比较字数
			
 
				                     # print("ocr process", len("".join(text_list)), len("".join(text_list1)))
			
 
				                     if len("".join(text_list)) < len("".join(text_list1)):
			
--- a/format_convert/convert_layout.py
+++ b/format_convert/convert_layout.py
@@ -0,0 +1,562 @@
 
				+import os
			
 
				+import sys
			
 
				+sys.setrecursionlimit(10000)
			
 
				+sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				+from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
			
 
				+import re
			
 
				+import traceback
			
 
				+from bs4 import BeautifulSoup
			
 
				+from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code
			
 
				+from format_convert.wrapt_timeout_decorator import timeout
			
 
				+
			
 
				+
			
 
				+class TreeNode:
			
 
				+    def __init__(self, data):
			
 
				+        self.data = data
			
 
				+        self.children = []
			
 
				+
			
 
				+    def add_child(self, child_node):
			
 
				+        self.children.append(child_node)
			
 
				+
			
 
				+
			
 
				+def print_tree(node, level=0):
			
 
				+    print("  " * level + str(node.data))
			
 
				+    for child in node.children:
			
 
				+        print_tree(child, level + 1)
			
 
				+
			
 
				+
			
 
				+def print_tree_order(node, div_list, level=0):
			
 
				+    text = "  " * level + div_list[node.data[0]].text
			
 
				+    colors = [(255, 0, 0, 0.7), (0, 255, 0, 0.6), (0, 0, 255, 0.6), (255, 127, 0, 0.2),
			
 
				+              # (123, 104, 238, 0.2),
			
 
				+              (238, 238, 0, 0.2),
			
 
				+              (255, 104, 255, 0.2)
			
 
				+              ]
			
 
				+
			
 
				+    if level < len(colors):
			
 
				+        color = colors[level]
			
 
				+    else:
			
 
				+        color = colors[-1]
			
 
				+
			
 
				+    text = '<div style="background-color: rgba{}";>'.format(str(color)) + text + '</div>'
			
 
				+
			
 
				+    if level == 0:
			
 
				+        text = '<!DOCTYPE HTML><head><meta charset="UTF-8">' + text
			
 
				+
			
 
				+    with open('../layout.html', 'a') as f:
			
 
				+        f.write(text)
			
 
				+
			
 
				+    for child in node.children:
			
 
				+        print('node.child', child.data[:10])
			
 
				+        print_tree_order(child, div_list, level + 1)
			
 
				+
			
 
				+
			
 
				+class LayoutConvert:
			
 
				+    def __init__(self, html):
			
 
				+        self.html = html
			
 
				+
			
 
				+        self.order_type_list = ['[★]?(\d{1,3}[.])+[.\d]?',
			
 
				+                                '[★]?[A-Z][.、]',
			
 
				+                                '[★]?[a-z][.、]',
			
 
				+                                '[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳]',
			
 
				+                                '[ⅠⅡⅢⅣⅤⅥⅦⅧⅩⅪⅫ]',
			
 
				+                                '[ⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹ]',
			
 
				+                                '[❶❷❸❹❻❼❽❾❿]',
			
 
				+                                '第[一二三四五六七八九十]{1,2}[章节篇]',
			
 
				+                                '第\d{1,2}[章节篇]',
			
 
				+                                '[（(]\d{1,3}[)）]',
			
 
				+                                '[★]?\d{1,3}、',
			
 
				+                                '[（(][一二三四五六七八九十]{1,3}[)）]',
			
 
				+                                '[一二三四五六七八九十]{1,3}、',
			
 
				+                                '包[1-9]{1,3}',
			
 
				+                                '标段[1-9]{1,3}',
			
 
				+                                ]
			
 
				+
			
 
				+        self.chinese_arabic_dict = {
			
 
				+            '一': 1,
			
 
				+            '二': 2,
			
 
				+            '三': 3,
			
 
				+            '四': 4,
			
 
				+            '五': 5,
			
 
				+            '六': 6,
			
 
				+            '七': 7,
			
 
				+            '八': 8,
			
 
				+            '九': 9,
			
 
				+            '十': 10,
			
 
				+        }
			
 
				+
			
 
    def get_layout(self):
        """Placeholder with no behaviour yet; always returns ``None``."""
        return None
			
 
				+
			
 
    def recursion_get_tree(self, index_list, div_list, start_index, end_index):
        """Build the nested block tree for the range ``[start_index, end_index]``.

        The returned ``TreeNode`` carries ``[start_index, end_index]`` as its
        data. Its children are built recursively from those pairs of
        ``index_list`` that lie inside the range (strictly after
        ``start_index``, not beyond ``end_index``) and that do not start
        before the end of the previously accepted child.

        :param index_list: sequence of ``(start, end)`` index pairs; its first
            pair is read whenever the range spans more than one element
        :param div_list: sequence of objects exposing ``text`` (only used for
            the debug output printed to stdout)
        :param start_index: first index of the range
        :param end_index: last index of the range
        :return: the ``TreeNode`` for this range
        """
        print([start_index, end_index], div_list[start_index].text[:10], '-'*20)
        root = TreeNode([start_index, end_index])

        # A range of length one is a leaf: there is nothing to nest inside it.
        if end_index - start_index != 1:
            # End of the last accepted child; later candidates starting before
            # it overlap that child and are skipped.
            last_child_end = index_list[0][0]
            for child_start, child_end in index_list:
                inside = start_index < child_start <= child_end <= end_index
                if not inside:
                    if child_start == 0:
                        print('continue not start_index < start_i <= end_i <= end_index', child_start, child_end)
                    continue
                if child_start < last_child_end:
                    print('continue start_i < temp_end_i', child_start, last_child_end, div_list[child_start])
                    continue

                root.add_child(
                    self.recursion_get_tree(index_list, div_list, child_start, child_end)
                )
                last_child_end = child_end

        print([start_index, end_index], div_list[end_index-1].text[:10], '='*20)
        return root
			
 
				+
			
 
				+    def get_order_number_tree(self, product=None):
			
 
				+        def get_order_no(_ti, _div_text):
			
 
				+            _tis = re.split('[.、]', str(_ti))
			
 
				+            temp_tis = []
			
 
				+            for _t in _tis:
			
 
				+                if _t != '':
			
 
				+                    temp_tis.append(_t)
			
 
				+            _tis = temp_tis
			
 
				+
			
 
				+            _ti_order_no = None
			
 
				+            if len(_tis) >= 2:
			
 
				+                re.search('', _div_text)
			
 
				+            else:
			
 
				+                _match = re.search('[1-9]+', _div_text)
			
 
				+                if _match:
			
 
				+                    _ti_order_no = int(_match.group())
			
 
				+                else:
			
 
				+                    _match = re.search('[一二三四五六七八九十]+', _div_text)
			
 
				+                    if _match:
			
 
				+                        _ti_order_no = _match.group()
			
 
				+                        temp_order_no = ''
			
 
				+                        for o in _ti_order_no:
			
 
				+                            temp_order_no += str(self.chinese_arabic_dict.get(o))
			
 
				+                        _ti_order_no = int(temp_order_no)
			
 
				+            return _ti_order_no
			
 
				+
			
 
				+
			
 
				+        soup = BeautifulSoup(self.html, 'lxml')
			
 
				+
			
 
				+        div_list = soup.findAll('div')
			
 
				+
			
 
				+        type_index_list = []
			
 
				+        range_index_list = []
			
 
				+        cut_type_index_dict = {}
			
 
				+        # temp_type_index_list = []
			
 
				+
			
 
				+        # 获取每一行的序号类型
			
 
				+        for div_index, d in enumerate(div_list):
			
 
				+            text = d.text
			
 
				+
			
 
				+            # 判断该行是什么序号类型
			
 
				+            find_type_index = -1
			
 
				+            for type_index, reg in enumerate(self.order_type_list):
			
 
				+                if find_type_index >= 0:
			
 
				+                    continue
			
 
				+
			
 
				+                match = re.finditer(reg, text)
			
 
				+                for m in match:
			
 
				+                    if m.span()[0] != 0:
			
 
				+                        continue
			
 
				+                    order = m.group()
			
 
				+
			
 
				+                    if type_index in [0, 1]:
			
 
				+                        order = re.sub('[★]', '', order)
			
 
				+
			
 
				+                    # 普通情况，单层序号
			
 
				+                    if type_index != 0:
			
 
				+                        find_type_index = type_index
			
 
				+                    # 特殊情况，多层序号
			
 
				+                    else:
			
 
				+                        ss = order.split('.')
			
 
				+                        # if len(re.findall('[.]', m.group())) == 1:
			
 
				+                        if len(ss) - ss.count('') == 1:
			
 
				+                            find_type_index = 0
			
 
				+                            # print('find_type_index1', find_type_index, text[:5])
			
 
				+                        else:
			
 
				+                            # 用小数表示多层序号
			
 
				+                            find_type_index = re.sub('\d+', '0', order)
			
 
				+                            find_type_index = re.sub('[.]', '', find_type_index)
			
 
				+                            find_type_index = find_type_index[0] + '.' + find_type_index[1:-1] + '1'
			
 
				+                            find_type_index = float(find_type_index)
			
 
				+                            # print('find_type_index2', find_type_index, text[:5])
			
 
				+                    break
			
 
				+            type_index_list.append(find_type_index)
			
 
				+
			
 
				+        # 根据每一行的序号类型分块
			
 
				+        for div_index, d in enumerate(div_list):
			
 
				+            find_type_index = type_index_list[div_index]
			
 
				+            sub_type_index_list = type_index_list[:div_index]
			
 
				+            text = d.text
			
 
				+            print(text)
			
 
				+
			
 
				+            # 若无序号类型，跳过
			
 
				+            if find_type_index < 0:
			
 
				+                # type_index_list.append(find_type_index)
			
 
				+                print('continue -1')
			
 
				+                print('-'*40)
			
 
				+                continue
			
 
				+
			
 
				+            print('find_type_index, div_index', find_type_index, div_index)
			
 
				+
			
 
				+            # 已经存在相同的序号类型
			
 
				+            if find_type_index in sub_type_index_list:
			
 
				+            #     # 判断是否开始的序号
			
 
				+            #     if (find_type_index >= 1 or find_type_index == 0) and len(re.findall('[1一]', text[:3])) == 1 \
			
 
				+            #             and len(re.findall('[2-9二三四五六七八九十]', text[:3])) == 0:
			
 
				+            #         # type_index_list.append(find_type_index)
			
 
				+            #         final_index = None
			
 
				+            #         for temp_div_index, temp_type in enumerate(sub_type_index_list):
			
 
				+            #             if find_type_index == temp_type:
			
 
				+            #                 final_index = temp_div_index
			
 
				+            #         final_block_index = div_index
			
 
				+            #         min_block_size = 100000
			
 
				+            #         for block in range_index_list:
			
 
				+            #             if block[0] <= final_index <= block[1] and block[1] - block[0] < min_block_size:
			
 
				+            #                 min_block_size = block[1] - block[0]
			
 
				+            #                 final_block_index = block[1]+1
			
 
				+            #         if final_index is not None and [final_index, final_block_index] not in range_index_list:
			
 
				+            #             range_index_list.append([final_index, final_block_index])
			
 
				+            #             if cut_type_index_dict.get(find_type_index) is not None:
			
 
				+            #                 if div_index > cut_type_index_dict[find_type_index]:
			
 
				+            #                     cut_type_index_dict[find_type_index] = final_block_index
			
 
				+            #             else:
			
 
				+            #                 cut_type_index_dict[find_type_index] = final_block_index
			
 
				+            #         print('continue 1')
			
 
				+            #         print('cut_type_index_dict', cut_type_index_dict)
			
 
				+            #         print('-'*40)
			
 
				+            #         continue
			
 
				+
			
 
				+                # 判断是否开始的序号
			
 
				+                # if 0 < find_type_index < 1 \
			
 
				+                #         and len(re.findall('[1]', text[len(str(find_type_index))-1:len(str(find_type_index))+1])) == 1 \
			
 
				+                #         and len(re.findall('[2-9]', text[len(str(find_type_index))-1:len(str(find_type_index))+1])) == 0:
			
 
				+                #     # type_index_list.append(find_type_index)
			
 
				+                #     final_index = None
			
 
				+                #     for temp_div_index, temp_type in enumerate(sub_type_index_list):
			
 
				+                #         if find_type_index == temp_type:
			
 
				+                #             final_index = temp_div_index
			
 
				+                #     final_block_index = div_index
			
 
				+                #     min_block_size = 100000
			
 
				+                #     for block in range_index_list:
			
 
				+                #         if block[0] <= final_index <= block[1] and block[1] - block[0] < min_block_size:
			
 
				+                #             min_block_size = block[1] - block[0]
			
 
				+                #             final_block_index = block[1]+1
			
 
				+                #     if final_index is not None and [final_index, final_block_index] not in range_index_list:
			
 
				+                #         range_index_list.append([final_index, final_block_index])
			
 
				+                #         if cut_type_index_dict.get(find_type_index) is not None:
			
 
				+                #             if div_index > cut_type_index_dict[find_type_index]:
			
 
				+                #                 cut_type_index_dict[find_type_index] = final_block_index
			
 
				+                #         else:
			
 
				+                #             cut_type_index_dict[find_type_index] = final_block_index
			
 
				+                #     print('continue 2')
			
 
				+                #     print('-'*40)
			
 
				+                #     continue
			
 
				+
			
 
				+                # 找之前相同的序号类型的index，且index不能超过截断的该类型的index
			
 
				+                last_index = len(sub_type_index_list) - 1 - sub_type_index_list[::-1].index(find_type_index)
			
 
				+                print('find_type_index', find_type_index, [last_index, div_index], [sub_type_index_list[0], sub_type_index_list[-1]])
			
 
				+                if last_index < cut_type_index_dict.get(find_type_index, 0):
			
 
				+                    # type_index_list.append(find_type_index)
			
 
				+                    print('continue 3 last_index < cut_type_index_dict ', last_index, cut_type_index_dict.get(find_type_index, 0))
			
 
				+                    print('-'*40)
			
 
				+                    continue
			
 
				+
			
 
				+                # 新增块
			
 
				+                range_index_list.append([last_index, div_index])
			
 
				+                print('find last_index add block', [last_index, div_index])
			
 
				+
			
 
				+                # 更新截断
			
 
				+                if cut_type_index_dict.get(find_type_index) is not None:
			
 
				+                    if div_index > cut_type_index_dict[find_type_index]:
			
 
				+                        cut_type_index_dict[find_type_index] = div_index
			
 
				+                else:
			
 
				+                    cut_type_index_dict[find_type_index] = div_index
			
 
				+
			
 
				+                # 找到块了，那么块内的所有序号类型的截断到该块的最小index
			
 
				+                final_type_index_dict = {}
			
 
				+                for temp_div_index, temp_type in enumerate(sub_type_index_list[last_index+1:div_index]):
			
 
				+                    temp_div_index += last_index + 1
			
 
				+
			
 
				+                    if temp_div_index < cut_type_index_dict.get(temp_type, 0):
			
 
				+                        continue
			
 
				+
			
 
				+                    # 对块内有的类型的最后一个都新增块
			
 
				+                    if temp_div_index <= range_index_list[-1][0]:
			
 
				+                        continue
			
 
				+                    final_type_index_dict[temp_type] = temp_div_index
			
 
				+
			
 
				+                for temp_type in final_type_index_dict.keys():
			
 
				+                    final_index = final_type_index_dict.get(temp_type)
			
 
				+                    if [final_index, div_index] not in range_index_list:
			
 
				+                        print('add block cut_type_index_dict 1', cut_type_index_dict)
			
 
				+                        range_index_list.append([final_index, div_index])
			
 
				+                        print('add block ', [final_index, div_index])
			
 
				+                        if cut_type_index_dict.get(temp_type) is not None:
			
 
				+                            if div_index > cut_type_index_dict[temp_type]:
			
 
				+                                cut_type_index_dict[temp_type] = div_index
			
 
				+                        else:
			
 
				+                            cut_type_index_dict[temp_type] = div_index
			
 
				+
			
 
				+                        print('add block cut_type_index_dict 2', cut_type_index_dict)
			
 
				+
			
 
				+
			
 
				+                # temp_type_index_list = []
			
 
				+            else:
			
 
				+                print('find_type_index not in type_index_list')
			
 
				+
			
 
				+            print(cut_type_index_dict)
			
 
				+            # 存储所有序号类型
			
 
				+            # type_index_list.append(find_type_index)
			
 
				+            # 存储块内的序号类型
			
 
				+            # temp_type_index_list.append(find_type_index)
			
 
				+            print('-'*40)
			
 
				+
			
 
				+        if not range_index_list:
			
 
				+            print('no range_index_list')
			
 
				+            return
			
 
				+
			
 
				+        # 排序
			
 
				+        range_index_list.sort(key=lambda x: (x[0], x[1]))
			
 
				+
			
 
				+        # 生成最后的块
			
 
				+        for temp_type in range(len(self.order_type_list)):
			
 
				+            for div_index, d in enumerate(div_list[::-1]):
			
 
				+                div_index = len(div_list) - 1 - div_index
			
 
				+                if type_index_list[div_index] != temp_type:
			
 
				+                    continue
			
 
				+                if [div_index, div_index+1] not in range_index_list:
			
 
				+                    range_index_list.append([div_index, len(div_list)-1])
			
 
				+                    break
			
 
				+
			
 
				+        # last_block_index = range_index_list[-1][1]
			
 
				+        # for div_index, d in enumerate(div_list[last_block_index:]):
			
 
				+        #     div_index = div_index + last_block_index
			
 
				+        #     if type_index_list[div_index] < 0:
			
 
				+        #         continue
			
 
				+        #     if [div_index, len(div_list)-1] not in range_index_list:
			
 
				+        #         range_index_list.append([div_index, len(div_list)-1])
			
 
				+
			
 
				+        # 排序
			
 
				+        range_index_list.sort(key=lambda x: (x[0], -x[1]))
			
 
				+
			
 
				+        print('type_index_list', type_index_list)
			
 
				+
			
 
				+        block_dict = {}
			
 
				+        index_div_list = []
			
 
				+        for range_index in range_index_list:
			
 
				+            _text = ''
			
 
				+
			
 
				+            for d in div_list[range_index[0]:range_index[1]]:
			
 
				+                _text += d.text
			
 
				+            print(range_index, _text[:20])
			
 
				+
			
 
				+        # 合并重叠的
			
 
				+        delete_range_index_list = []
			
 
				+        # for i, range_index in enumerate(range_index_list):
			
 
				+        #     if range_index in delete_range_index_list:
			
 
				+        #         continue
			
 
				+        #     for j in range(i+1, len(range_index_list)):
			
 
				+        #         range_index2 = range_index_list[j]
			
 
				+        #         if range_index2 in delete_range_index_list:
			
 
				+        #             continue
			
 
				+        #         if range_index[0] == range_index2[0] or range_index[1] == range_index2[1]:
			
 
				+        #             delete_range_index_list.append(range_index2)
			
 
				+
			
 
				+        # 补充中间断开的
			
 
				+        add_range_index_list = []
			
 
				+        if range_index_list[0][0] != 0:
			
 
				+            for j in range(0, range_index_list[0][0]):
			
 
				+                add_range_index_list.append([j, j+1])
			
 
				+        for i in range(1, len(range_index_list)):
			
 
				+            range_index1 = range_index_list[i-1]
			
 
				+            range_index2 = range_index_list[i]
			
 
				+            if range_index1[1] != range_index2[0] or (range_index1[1] - range_index1[0] > 1 and range_index1[0] != range_index2[0]):
			
 
				+                for j in range(range_index1[0], range_index2[0]):
			
 
				+                    add_range_index_list.append([j, j+1])
			
 
				+                # add_range_index_list.append([range_index1[0], range_index2[0]])
			
 
				+
			
 
				+            # if range_index1[1] - range_index1[0] > 1 and range_index1[0] != range_index2[0]:
			
 
				+            #     add_range_index_list.append([range_index1[0]+1, range_index2[0]])
			
 
				+
			
 
				+        print('delete_range_index_list', delete_range_index_list)
			
 
				+        print('add_range_index_list', add_range_index_list)
			
 
				+
			
 
				+        print('len(range_index_list)', len(range_index_list))
			
 
				+        for range_index in delete_range_index_list:
			
 
				+            if range_index in range_index_list:
			
 
				+                range_index_list.remove(range_index)
			
 
				+
			
 
				+        print('len(range_index_list)', len(range_index_list))
			
 
				+
			
 
				+        range_index_list += add_range_index_list
			
 
				+        range_index_list.sort(key=lambda x: (x[0], -x[1]))
			
 
				+
			
 
				+        print('len(range_index_list)', len(range_index_list))
			
 
				+
			
 
				+        tree_root = self.recursion_get_tree(range_index_list, div_list, 0, len(div_list))
			
 
				+        # print_tree(tree_root)
			
 
				+
			
 
				+        with open('../layout.html', 'w') as f:
			
 
				+            f.write('')
			
 
				+
			
 
				+        print_tree_order(tree_root, div_list)
			
 
				+
			
 
				+        with open('../origin.html', 'w') as f:
			
 
				+            f.write(self.html)
			
 
				+
			
 
				+        # 打印某个产品的参数
			
 
				+        if product:
			
 
				+            candidate_div_list = []
			
 
				+            for i, div in enumerate(div_list):
			
 
				+                div = div.text
			
 
				+                if i == 0 or i == len(div_list)-1:
			
 
				+                    continue
			
 
				+                if not re.search(product, div):
			
 
				+                    continue
			
 
				+
			
 
				+                print('find product', div[:20])
			
 
				+
			
 
				+                type_index = type_index_list[i]
			
 
				+
			
 
				+                type_index_after = None
			
 
				+                for ti in type_index_list[i+1:]:
			
 
				+                    if ti != -1:
			
 
				+                        type_index_after = ti
			
 
				+                        break
			
 
				+                type_index_before = None
			
 
				+                for ti in type_index_list[:i][::-1]:
			
 
				+                    if ti != -1:
			
 
				+                        type_index_before = ti
			
 
				+                        break
			
 
				+
			
 
				+                print('type_index, type_index_before, type_index_after1', type_index, type_index_before, type_index_after)
			
 
				+
			
 
				+                # 复用序号样式
			
 
				+                dup_type_index_flag = 0
			
 
				+                if type_index_after == type_index:
			
 
				+                    dup_type_index_flag = 1
			
 
				+
			
 
				+                print('type_index, type_index_before, type_index_after2', type_index, type_index_before, type_index_after)
			
 
				+
			
 
				+                block_type_list = []
			
 
				+                block_div_list = []
			
 
				+                no_order_type_list = []
			
 
				+                sub_type_index_list = type_index_list[i:]
			
 
				+                type_index_pair1 = [type_index_before, type_index]
			
 
				+                type_index_pair2 = [type_index, type_index_after]
			
 
				+                for j, ti in enumerate(sub_type_index_list):
			
 
				+                    real_j = j + i
			
 
				+                    if j == 0 or j == len(sub_type_index_list) - 1:
			
 
				+                        continue
			
 
				+                    ti_previous = sub_type_index_list[j-1]
			
 
				+                    ti_next = sub_type_index_list[j+1]
			
 
				+                    ti_pair1 = [ti_previous, ti_next]
			
 
				+                    ti_pair2 = [ti, ti_next]
			
 
				+                    _div = div_list[real_j].text
			
 
				+
			
 
				+                    # 判断多层还是单层，且是否第一个
			
 
				+                    tis = re.split('[.、]', str(ti))
			
 
				+                    temp_tis = []
			
 
				+                    for _ti in tis:
			
 
				+                        if _ti != '':
			
 
				+                            temp_tis.append(_ti)
			
 
				+                    tis = temp_tis
			
 
				+                    break_flag1 = 0
			
 
				+                    if len(tis) >= 2:
			
 
				+                        if len(re.findall('[1一]{2,}', tis[-1])) >= 1 or len(re.findall('[2-9二三四五六七八九十]', tis[-1])) != 0:
			
 
				+                            break_flag1 = 1
			
 
				+                    else:
			
 
				+                        if len(re.findall('[1一]{2,}', _div[:3])) >= 1 or len(re.findall('[2-9二三四五六七八九十]', _div[:6])) != 0:
			
 
				+                            break_flag1 = 1
			
 
				+
			
 
				+                    # 有复用的，与搜索的type_index相同且连续，但与之前的相同的type_index的数字不连续
			
 
				+                    break_flag2 = 0
			
 
				+                    if dup_type_index_flag and type_index == ti and ti in block_type_list:
			
 
				+                        last_ti_index = block_type_list[::-1].index(ti)
			
 
				+                        last_ti_index = len(block_type_list) - 1 - last_ti_index
			
 
				+                        last_ti_div = block_div_list[last_ti_index]
			
 
				+                        last_ti_order_no = get_order_no(ti, last_ti_div)
			
 
				+                        ti_order_no = get_order_no(ti, _div)
			
 
				+                        type_index_order_no = get_order_no(type_index, div)
			
 
				+                        print('last_ti_order_no, ti_order_no, type_index_order_no', last_ti_order_no, ti_order_no, type_index_order_no)
			
 
				+                        print(last_ti_div[:10], _div[:10], div[:10])
			
 
				+
			
 
				+                        if None not in [type_index_order_no, last_ti_order_no, ti_order_no]:
			
 
				+                            if ti_order_no - type_index_order_no == 1 and ti_order_no - last_ti_order_no != 1:
			
 
				+                                break_flag2 = 1
			
 
				+
			
 
				+                    if break_flag2:
			
 
				+                        break
			
 
				+                    # 碰到很大的序号类型
			
 
				+                    elif ti in [7, 8]:
			
 
				+                        break
			
 
				+                    # 碰到不是从1开始的
			
 
				+                    elif ti == -1:
			
 
				+                        no_order_type_list.append(ti)
			
 
				+                        block_type_list.append(ti)
			
 
				+                        block_div_list.append(_div)
			
 
				+                    elif ti not in block_type_list and break_flag1:
			
 
				+                        print('not 1 start break', _div[:6], len(re.findall('[1一]', _div[:3])), len(re.findall('[2-9二三四五六七八九十]', _div[:6])))
			
 
				+                        print(block_div_list)
			
 
				+                        print(block_type_list)
			
 
				+                        break
			
 
				+                    elif not dup_type_index_flag and ti not in [type_index, type_index_before, type_index_after]:
			
 
				+                        block_type_list.append(ti)
			
 
				+                        block_div_list.append(_div)
			
 
				+                        no_order_type_list = []
			
 
				+                    else:
			
 
				+                        # 遇到相同类型的组合
			
 
				+                        if not dup_type_index_flag and (type_index_pair1 == ti_pair1):
			
 
				+                            block_type_list.append(ti)
			
 
				+                            block_div_list.append(_div)
			
 
				+                            print('type_index_pair1 == ti_pair1 or type_index_pair2 == ti_pair2 break',
			
 
				+                                  _div[:6], type_index_pair1, ti_pair1, type_index_pair2, ti_pair2)
			
 
				+                            break
			
 
				+                        else:
			
 
				+                            no_order_type_list = []
			
 
				+                            block_type_list.append(ti)
			
 
				+                            block_div_list.append(_div)
			
 
				+
			
 
				+                if not block_type_list:
			
 
				+                    continue
			
 
				+
			
 
				+                # 排除末尾为非序号的
			
 
				+                if block_type_list[-1] == -1:
			
 
				+                    block_type_list = block_type_list[:len(block_type_list)-len(no_order_type_list)]
			
 
				+                    block_div_list = block_div_list[:len(block_div_list)-len(no_order_type_list)]
			
 
				+
			
 
				+                candidate_div_list.append(block_div_list)
			
 
				+
			
 
				+            print('len(candidate_div_list)', len(candidate_div_list))
			
 
				+            print('candidate_div_list', candidate_div_list)
			
 
				+            if candidate_div_list:
			
 
				+                candidate_div_list.sort(key=lambda x: len(x))
			
 
				+                for div in candidate_div_list:
			
 
				+                    print(len(div), div)
			
 
				+                print('='*10, product, '='*10)
			
 
				+                for div in candidate_div_list[-1]:
			
 
				+                    print(div)
			
 
				+
			
 
				+
			
 
				+
			
 
				+        # print(d.text)
			
 
				+
			
 
				+    def order_show_in_layout(self, tree_root, div_list):
			
 
				+        print_tree_order(tree_root, div_list)
			
 
				+
			
 
				+
			
 
				+
			
 
				+# with open('../result.html', 'r') as f:
			
 
				+with open(r'C:\Users\Administrator\Desktop\test_layout\4.html', 'r') as f:
			
 
				+    html = f.read()
			
 
				+
			
 
				+LayoutConvert(html).get_order_number_tree('连续性血液净化设备')
			
 
				+
			
 
				+
			
 
				+_list = [1, 3, 5, 7, 9]
			
 
				+print(len(_list) - 1 - _list[::-1].index(3))
			
--- a/format_convert/convert_need_interface.py
+++ b/format_convert/convert_need_interface.py
@@ -555,8 +555,8 @@ def from_yolo_interface(image_stream, from_remote=FROM_REMOTE):
 
				 
			
 
				 
			
 
				 def interface_pool_gunicorn(interface_type):
			
 
				-    # if get_platform() == 'Windows':
			
 
				-    #     set_flask_global()
			
 
				+    if get_platform() == 'Windows':
			
 
				+        set_flask_global()
			
 
				 
			
 
				     ip_port_flag_dict = _global.get("ip_port_flag")
			
 
				     ip_port_dict = _global.get("ip_port")
			
--- a/format_convert/convert_pdf.py
+++ b/format_convert/convert_pdf.py
--- a/format_convert/convert_test.py
+++ b/format_convert/convert_test.py
@@ -5,13 +5,27 @@ import random
 
				 import sys
			
 
				 import time
			
 
				 from glob import glob
			
 
				+
			
 
				+import requests
			
 
				+
			
 
				+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
			
 
				+from pdfminer.converter import PDFPageAggregator
			
 
				+from pdfminer.layout import LAParams, LTLine
			
 
				+from pdfminer.pdfdocument import PDFDocument
			
 
				+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
			
 
				+from pdfminer.pdfpage import PDFPage
			
 
				+from pdfminer.pdfparser import PDFParser
			
 
				+from pdfplumber import PDF
			
 
				+
			
 
				+from otr.table_line_pdf import _plot
			
 
				+
			
 
				 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
			
 
				 from format_convert.utils import get_platform, request_post, get_md5_from_bytes
			
 
				 from format_convert.convert import to_html
			
 
				 import multiprocessing as mp
			
 
				 
			
 
				 
			
 
				-def test_one(p, page_no_range=None, from_remote=False):
			
 
				+def test_one(p, page_no_range=None, from_remote=False, timeout=300, save_middle=None):
			
 
				     start_time = time.time()
			
 
				     with open(p, "rb") as f:
			
 
				         file_bytes = f.read()
			
@@ -19,14 +33,15 @@ def test_one(p, page_no_range=None, from_remote=False):
 
				 
			
 
				     _md5 = get_md5_from_bytes(file_bytes)
			
 
				 
			
 
				-    data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range}
			
 
				+    data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range,
			
 
				+            'timeout': timeout, 'save_middle': save_middle}
			
 
				     if from_remote:
			
 
				         _url = 'http://121.46.18.113:15010/convert'
			
 
				         # _url = 'http://192.168.2.103:15010/convert'
			
 
				         # _url = 'http://192.168.2.102:15011/convert'
			
 
				         # _url = 'http://172.16.160.65:15010/convert'
			
 
				         # _url = 'http://127.0.0.1:15010/convert'
			
 
				-        result = json.loads(request_post(_url, data, time_out=10000))
			
 
				+        result = json.loads(request_post(_url, data, time_out=timeout+20))
			
 
				         text_str = ""
			
 
				         for t in result.get("result_html"):
			
 
				             text_str += t
			
@@ -42,6 +57,25 @@ def test_one(p, page_no_range=None, from_remote=False):
 
				     print(time.time()-start_time)
			
 
				 
			
 
				 
			
 
				+def test_path():
			
 
				+    # _url = 'http://121.46.18.113:15010/convert'
			
 
				+    _url = 'http://192.168.0.115:15010/convert'
			
 
				+    print(_url)
			
 
				+    p = '/data/fangjiasheng/format_conversion_maxcompute/1.png'
			
 
				+    data = {"file_path": p, "type": p.split(".")[-1], "filemd5": 100, 'page_no': '1,-1',
			
 
				+            'timeout': 10000, 'save_middle': None}
			
 
				+    print(str(data))
			
 
				+    # result = json.loads(request_post(_url, data, time_out=1000))
			
 
				+    result = json.loads(requests.post(_url, data))
			
 
				+    text_str = ""
			
 
				+    for t in result.get("result_html"):
			
 
				+        text_str += t
			
 
				+    to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
			
 
				+            text_str)
			
 
				+    print("result_text", result.get("result_text")[0][:20])
			
 
				+    print("is_success", result.get("is_success"))
			
 
				+
			
 
				+
			
 
				 def test_duplicate(path_list, process_no=None):
			
 
				     start_time = time.time()
			
 
				     # random.shuffle(path_list)
			
@@ -81,24 +115,28 @@ def test_maxcompute(p, page_no_range=None):
 
				 if __name__ == '__main__':
			
 
				     if get_platform() == "Windows":
			
 
				         # file_path = "C:/Users/Administrator/Desktop/2.png"
			
 
				-        file_path = "C:/Users/Administrator/Desktop/test_xls/error4.xls"
			
 
				+        # file_path = "C:/Users/Administrator/Desktop/test_xls/error4.xls"
			
 
				         # file_path = "C:/Users/Administrator/Desktop/test_doc/error5.doc"
			
 
				         # file_path = "D:/BIDI_DOC/比地_文档/1677829036789.pdf"
			
 
				         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
			
 
				-        # file_path = "C:/Users/Administrator/Downloads/1688432101601.xlsx"
			
 
				+        # file_path = "C:/Users/Administrator/Downloads/W020230512399773694376.jpg"
			
 
				         # file_path = "C:/Users/Administrator/Desktop/test_doc/error14.docx"
			
 
				-        # file_path = "C:/Users/Administrator/Desktop/test_image/error36.png"
			
 
				+        file_path = "C:/Users/Administrator/Desktop/test_image/error9-1.png"
			
 
				         # file_path = "C:/Users/Administrator/Desktop/test_b_table/error1.png"
			
 
				-        # file_path = "C:/Users/Administrator/Desktop/test_pdf/表格连接error/error7.pdf"
			
 
				+        # file_path = "C:/Users/Administrator/Desktop/test_pdf/直接读表格线error/error62.pdf"
			
 
				         # file_path = "C:/save_b_table/0-0895e32470613dd7be1139eefd1342c4.png"
			
 
				     else:
			
 
				         file_path = "1660296734009.pdf"
			
 
				 
			
 
				-    test_one(file_path, page_no_range='1,-1', from_remote=True)
			
 
				+    test_one(file_path, page_no_range='1,-1', from_remote=True, timeout=1000, save_middle=None)
			
 
				+
			
 
				+    # test_path()
			
 
				 
			
 
				-    file_path = "C:/Users/Administrator/Downloads/"
			
 
				+    # file_path = "C:/Users/Administrator/Downloads/"
			
 
				     # file_path = r"C:\Users\Administrator\Desktop\test_pdf\直接读表格线error/"
			
 
				     # file_path = r"C:\Users\Administrator\Desktop\test_pdf\表格连接error/"
			
 
				+    # file_path = r"C:\Users\Administrator\Desktop\test_b_table/"
			
 
				+    file_path = r"C:\Users\Administrator\Desktop\test_pdf\普通error/"
			
 
				     test_pdf_list = [['6df7f2bd5e8cac99a15a6c012e0d82a8.pdf', '34,52'],
			
 
				                      ['ca6a86753400d6dd6a1b324c5678b7fb.pdf', '18,69'],
			
 
				                      ['a8380bf795c71caf8185fb11395df138.pdf', '27,38'],
			
@@ -106,12 +144,48 @@ if __name__ == '__main__':
 
				                      ['dd1adb4dc2014c7abcf403ef15a01eb5.pdf', '2,12'],
			
 
				                      ['error50.pdf', '1,-1'],
			
 
				                      ['error59.pdf', '1,-1'],
			
 
				-                     ['error51.pdf', '1,-1'],
			
 
				+                     ['error60.pdf', '1,-1'],
			
 
				+                     ['error61.pdf', '1,-1'],
			
 
				                      ['error7.pdf', '39,57'],
			
 
				+                     ['error8.pdf', '7,12'],
			
 
				+                     ['error23.pdf', '1,-1']
			
 
				                      ]
			
 
				-    index = 1
			
 
				+    index = 11
			
 
				     # test_one(file_path+test_pdf_list[index][0], page_no_range=test_pdf_list[index][1], from_remote=True)
			
 
				 
			
 
				+    # from pdfplumber.table import TableFinder
			
 
				+    # fp = open(file_path+test_pdf_list[index][0], 'rb')
			
 
				+    # parser = PDFParser(fp)
			
 
				+    # doc_pdfminer = PDFDocument(parser)
			
 
				+    # rsrcmgr = PDFResourceManager()
			
 
				+    # laparams = LAParams(line_overlap=0.01,
			
 
				+    #                     char_margin=0.3,
			
 
				+    #                     line_margin=0.01,
			
 
				+    #                     word_margin=0.01,
			
 
				+    #                     boxes_flow=0.1, )
			
 
				+    # device = PDFPageAggregator(rsrcmgr, laparams=laparams)
			
 
				+    # interpreter = PDFPageInterpreter(rsrcmgr, device)
			
 
				+    # doc_top = 0
			
 
				+    # doc_pdfplumber = PDF(fp)
			
 
				+    # pages = PDFPage.create_pages(doc_pdfminer)
			
 
				+    # from pdfplumber.page import Page as pdfPage
			
 
				+    # for page in pages:
			
 
				+    #     page_plumber = pdfPage(doc_pdfplumber, page, page_number=1, initial_doctop=doc_top)
			
 
				+    #     table_finder = TableFinder(page_plumber)
			
 
				+    #     all_width_zero = True
			
 
				+    #     for _edge in table_finder.get_edges():
			
 
				+    #         if _edge.get('linewidth') and _edge.get('linewidth') > 0:
			
 
				+    #             all_width_zero = False
			
 
				+    #             break
			
 
				+    #     lt_line_list = []
			
 
				+    #     for _edge in table_finder.get_edges():
			
 
				+    #         # print(_edge)
			
 
				+    #         if _edge.get('linewidth', 0.1) > 0 or all_width_zero:
			
 
				+    #             lt_line_list.append(LTLine(1, (float(_edge["x0"]), float(_edge["y0"])),
			
 
				+    #                                        (float(_edge["x1"]), float(_edge["y1"]))))
			
 
				+    #     _plot(lt_line_list, 'table', 1, 1)
			
 
				+
			
 
				+
			
 
				 
			
 
				     # 测试maxcompute模式
			
 
				     # _process = mp.Process(target=test_maxcompute, args=(file_path, '1,-1',))
			
--- a/format_convert/convert_tree.py
+++ b/format_convert/convert_tree.py
@@ -115,6 +115,9 @@ class _Image:
 
				     def get_html(self):
			
 
				         # 将Image转为Sentence,table
			
 
				         self.convert()
			
 
				+        if self.error_code == [-16]:
			
 
				+            self.error_code = None
			
 
				+            return "<div>#idc error#<div>"
			
 
				         if self.error_code is not None:
			
 
				             return ""
			
 
				 
			
@@ -138,6 +141,10 @@ class _Image:
 
				         obj_list = image_process(image_np, self.path, self.is_from_pdf, self.is_from_docx,
			
 
				                                  self.b_table_from_text, self.b_table_text_obj_list,
			
 
				                                  self.b_table_layout_size)
			
 
				+        if judge_error_code(obj_list):
			
 
				+            self.error_code = obj_list
			
 
				+            return
			
 
				+
			
 
				         if self.b_table_from_text:
			
 
				             temp_list = []
			
 
				             for obj in obj_list:
			
@@ -145,9 +152,6 @@ class _Image:
 
				                     temp_list.append(obj)
			
 
				             obj_list = temp_list
			
 
				 
			
 
				-        if judge_error_code(obj_list):
			
 
				-            self.error_code = obj_list
			
 
				-            return
			
 
				         for obj in obj_list:
			
 
				             self.add_child(obj)
			
 
				 
			
--- a/format_convert/convert_xlsx.py
+++ b/format_convert/convert_xlsx.py
@@ -216,6 +216,7 @@ class XlsxConvert:
 
				         # 拼接html表格
			
 
				         text = '<table border="1">' + "\n"
			
 
				         for row in row_list:
			
 
				+            text = text + "<tr>"
			
 
				             for col in row:
			
 
				                 text = text + "<td>" + str(col) + "</td>" + "\n"
			
 
				             text = text + "</tr>" + "\n"
			
--- a/format_convert/monitor_process_config.py
+++ b/format_convert/monitor_process_config.py
@@ -48,14 +48,14 @@ for name in interface_list:
 
				 
			
 
				         # 设置命令
			
 
				         if name == 'convert':
			
 
				-            comm = "nohup " + gunicorn_path + " -w " + str(port_num) + " -t 300 --keep-alive 600 -b 0.0.0.0:" + str(port) + " --chdir " + project_path + "format_convert" + ' ' + name + ":app" + std_out
			
 
				+            comm = "nohup " + gunicorn_path + " -w " + str(port_num) + " -t 6000 --keep-alive 600 -b 0.0.0.0:" + str(port) + " --chdir " + project_path + "format_convert" + ' ' + name + ":app" + std_out
			
 
				         elif name == 'yolo':
			
 
				             comm = "nohup " + gunicorn_path + " -w " + str(port_num) + " -t 300 --keep-alive 600 -b 0.0.0.0:" + str(port) + " --chdir " + project_path + "/botr/yolov8" + ' ' + name + "_interface:app" + std_out_gpu
			
 
				         elif name == 'office':
			
 
				             comm = "docker run --init -itd --log-opt max-size=10m --log-opt max-file=3 -p #:16000 soffice:v2 bash"
			
 
				             office_port_comm_list = []
			
 
				             for office_port in range(port, port + port_num):
			
 
				-                office_port_comm_list = re.sub("#", str(office_port), comm)
			
 
				+                office_port_comm_list.append(re.sub("#", str(office_port), comm))
			
 
				             comm_dict[name] = office_port_comm_list
			
 
				         else:
			
 
				             comm = "nohup " + gunicorn_path + " -w " + str(port_num) + " -t 300 --keep-alive 600 -b 0.0.0.0:" + str(port) + " --chdir " + project_path + "/" + name + ' ' + name + "_interface:app" + std_out_gpu
			
@@ -69,7 +69,8 @@ for name in interface_list:
 
				             comm_dict[name] = [gpu_comm + comm]
			
 
				 
			
 
				     # print(name, port_list, num_list, gpu_list)
			
 
				-
			
 
				+# print('comm_dict', comm_dict)
			
 
				+# print('interface_port_dict', interface_port_dict)
			
 
				 # convert_port_list = get_args_from_config(ip_port_dict, ip, "convert", "MASTER")
			
 
				 # if convert_port_list:
			
 
				 #     convert_port_list = convert_port_list[0]
			
--- a/format_convert/utils.py
+++ b/format_convert/utils.py
--- a/idc/idc_interface.py
+++ b/idc/idc_interface.py
@@ -47,7 +47,8 @@ tf.compat.v1.disable_eager_execution()
 
				 sess = tf.compat.v1.Session(graph=tf.Graph())
			
 
				 
			
 
				 
			
 
				-image_shape = (192, 192)
			
 
				+# image_shape = (192, 192)
			
 
				+image_shape = (640, 640)
			
 
				 
			
 
				 
			
 
				 def adjust_direction(image_np, model, if_return_angle=False):
			
@@ -59,10 +60,11 @@ def adjust_direction(image_np, model, if_return_angle=False):
 
				     # image_np = pil_resize(image_np, image_shape[0], image_shape[1])
			
 
				 
			
 
				     # 获取合适的文字区域
			
 
				-    result_list, image_np = get_text_region(image_np, image_shape)
			
 
				+    image_np = get_text_region(image_np, image_shape)
			
 
				     # cv2.imshow("get_text_region", image_np)
			
 
				     # cv2.waitKey(0)
			
 
				-    if not result_list:
			
 
				+    # print(type(image_np))
			
 
				+    if type(image_np) != np.ndarray:
			
 
				         return None
			
 
				     if len(image_np.shape) < 3:
			
 
				         image_np = np.expand_dims(image_np, axis=-1)
			
@@ -85,9 +87,12 @@ def adjust_direction(image_np, model, if_return_angle=False):
 
				     if if_return_angle:
			
 
				         return angle
			
 
				     else:
			
 
				-        # 根据角度旋转
			
 
				-        image_pil = Image.fromarray(origin_image)
			
 
				-        image_rotate = np.array(image_pil.rotate(angle, expand=1))
			
 
				+        if angle not in [0, 360]:
			
 
				+            # 根据角度旋转
			
 
				+            image_pil = Image.fromarray(origin_image)
			
 
				+            image_rotate = np.array(image_pil.rotate(angle, expand=1))
			
 
				+        else:
			
 
				+            image_rotate = origin_image
			
 
				         return image_rotate
			
 
				 
			
 
				 
			
@@ -154,7 +159,7 @@ class IdcModels:
 
				         _dir = os.path.abspath(os.path.dirname(__file__))
			
 
				 
			
 
				         # detect
			
 
				-        model_path = _dir + "/models/cnn.h5"
			
 
				+        model_path = _dir + "/models/e484-f10.96.h5"
			
 
				         with sess.as_default():
			
 
				             with sess.graph.as_default():
			
 
				                 self.model = direction_model(input_shape=(image_shape[0], image_shape[1], 1),
			
@@ -167,7 +172,7 @@ class IdcModels:
 
				 
			
 
				 def test_idc_model(from_remote=False):
			
 
				     idc_model = IdcModels().get_model()
			
 
				-    paths = glob("C:/Users/Administrator/Desktop/test_image/111.jpg")
			
 
				+    paths = glob("C:/Users/Administrator/Desktop/test_image/error43.png")
			
 
				     # file_path = "C:/Users/Administrator/Desktop/test_image/error10.jpg"
			
 
				     for file_path in paths:
			
 
				         img_np = cv2.imread(file_path)
			
--- a/idc/model.py
+++ b/idc/model.py
@@ -16,7 +16,7 @@ import keras.backend as K
 
				 
			
 
				 def direction_model(input_shape, output_shape):
			
 
				     model = cnn_model(input_shape, output_shape)
			
 
				-    print(input_shape, output_shape)
			
 
				+    # print(input_shape, output_shape)
			
 
				     # model = mobile_net_v3_tiny(input_shape, output_shape)
			
 
				     # model = fpn(input_shape, output_shape)
			
 
				     # model.summary(line_length=100)
			
@@ -24,6 +24,37 @@ def direction_model(input_shape, output_shape):
 
				 
			
 
				 
			
 
				 def cnn_model(input_shape, output_shape):
			
 
				+    conv_num = 6
			
 
				+
			
 
				+    # Input
			
 
				+    _input = Input(shape=input_shape, dtype="float32", name="input")
			
 
				+
			
 
				+    conv = Conv2D(16, (3, 3), padding='same')(_input)
			
 
				+    bn = BatchNormalization()(conv)
			
 
				+    relu = LeakyReLU(alpha=0.)(bn)
			
 
				+    max_pool = MaxPool2D()(relu)
			
 
				+    for i in range(conv_num):
			
 
				+        conv = Conv2D(16, (3, 3), padding='same')(max_pool)
			
 
				+        bn = BatchNormalization()(conv)
			
 
				+        relu = LeakyReLU(alpha=0.)(bn)
			
 
				+        # conv = Conv2D(32, (1, 1), padding='same')(relu)
			
 
				+        # bn = BatchNormalization()(conv)
			
 
				+        # relu = LeakyReLU(alpha=0.)(bn)
			
 
				+        max_pool = MaxPool2D()(relu)
			
 
				+    # conv = Conv2D(16, (3, 3), padding='same')(max_pool)
			
 
				+    # bn = BatchNormalization()(conv)
			
 
				+    # relu = LeakyReLU(alpha=0.)(bn)
			
 
				+    max_pool = MaxPool2D((6, 6))(relu)
			
 
				+
			
 
				+    dense = layers.Dense(output_shape, activation='softmax')(max_pool)
			
 
				+    squeeze = Lambda(lambda x: K.squeeze(x, axis=1))(dense)
			
 
				+    squeeze = Lambda(lambda x: K.squeeze(x, axis=1))(squeeze)
			
 
				+
			
 
				+    model = Model(inputs=_input, outputs=squeeze)
			
 
				+    return model
			
 
				+
			
 
				+
			
 
				+def cnn_model_240314(input_shape, output_shape):
			
 
				     conv_num = 5
			
 
				 
			
 
				     # Input
			
--- a/idc/models/e484-f10.96.h5
+++ b/idc/models/e484-f10.96.h5
--- a/idc/pre_process.py
+++ b/idc/pre_process.py
@@ -59,7 +59,7 @@ def get_img_label(img_np, size, cls_num=4):
 
				     return img_label_list
			
 
				 
			
 
				 
			
 
				-def get_text_region(img_np, size):
			
 
				+def get_text_region2(img_np, size):
			
 
				     img_np = remove_black_border(img_np)
			
 
				     origin_h, origin_w = img_np.shape[:2]
			
 
				     gray = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
			
@@ -198,6 +198,43 @@ def get_text_region(img_np, size):
 
				     return result_list, gray
			
 
				 
			
 
				 
			
 
				+def get_text_region3(img_np, size):
			
 
				+    img_np = remove_black_border(img_np)
			
 
				+    origin_h, origin_w = img_np.shape[:2]
			
 
				+    gray = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
			
 
				+
			
 
				+    h, w = get_best_predict_size2(img_np, threshold=640)
			
 
				+    img_np = pil_resize(img_np, h, w)
			
 
				+
			
 
				+    # 1.  转化成灰度图
			
 
				+    img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
			
 
				+
			
 
				+    result_list = []
			
 
				+    return result_list, gray
			
 
				+
			
 
				+
			
 
				+def get_text_region(img_np, size=(640, 640)):
			
 
				+    origin_h, origin_w = img_np.shape[:2]
			
 
				+
			
 
				+    # 1.  crop
			
 
				+    crop_h, crop_w = 2000, 2000
			
 
				+    if origin_h > crop_h:
			
 
				+        index = int((origin_h - crop_h) / 2)
			
 
				+        img_np = img_np[index:index+crop_h, :]
			
 
				+    if origin_w > crop_w:
			
 
				+        index = int((origin_w - crop_w) / 2)
			
 
				+        img_np = img_np[:, index:index+crop_w]
			
 
				+
			
 
				+    # 2.  resize
			
 
				+    # h, w = get_best_predict_size2(img_np, threshold=640)
			
 
				+    img_np = pil_resize(img_np, size[0], size[1])
			
 
				+
			
 
				+    # 3.  gray
			
 
				+    img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
			
 
				+
			
 
				+    return img_np
			
 
				+
			
 
				+
			
 
				 def gen(paths, batch_size=2, shape=(640, 640), cls_num=4, is_test=False):
			
 
				     def choose(_paths, _i):
			
 
				         while True:
			
--- a/layout.html
+++ b/layout.html
--- a/ocr/ocr_interface.py
+++ b/ocr/ocr_interface.py
@@ -22,6 +22,9 @@ from format_convert import _global
 
				 app = Flask(__name__)
			
 
				 
			
 
				 
			
 
				+use_angle_cls = False
			
 
				+
			
 
				+
			
 
				 @app.route('/ocr', methods=['POST'])
			
 
				 def _ocr():
			
 
				     _global._init()
			
@@ -75,9 +78,9 @@ def picture2text(img_data, ocr_model, only_rec=0):
 
				 
			
 
				         # 预测
			
 
				         if only_rec:
			
 
				-            results = ocr_model.ocr(img, det=False, rec=True, cls=False)
			
 
				+            results = ocr_model.ocr(img, det=False, rec=True, cls=use_angle_cls)
			
 
				         else:
			
 
				-            results = ocr_model.ocr(img, det=True, rec=True, cls=False)
			
 
				+            results = ocr_model.ocr(img, det=True, rec=True, cls=use_angle_cls)
			
 
				 
			
 
				         # 循环每张图片识别结果
			
 
				         text_list = []
			
@@ -124,7 +127,7 @@ class OcrModels:
 
				         from ocr.paddleocr import PaddleOCR
			
 
				         try:
			
 
				             log('----------- init ocr model ---------------')
			
 
				-            self.ocr_model = PaddleOCR(use_angle_cls=True, lang="ch")
			
 
				+            self.ocr_model = PaddleOCR(use_angle_cls=use_angle_cls, lang="ch")
			
 
				         except:
			
 
				             print(traceback.print_exc())
			
 
				             raise RuntimeError
			
--- a/otr/table_line_new.py
+++ b/otr/table_line_new.py
@@ -166,7 +166,7 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
 
				     return line_list
			
 
				 
			
 
				 
			
 
				-def table_line_pdf(line_list, page_w, page_h, is_test=0):
			
 
				+def table_line_pdf_post_process(line_list, page_w, page_h, is_test=0):
			
 
				     for i, line in enumerate(line_list):
			
 
				         line_list[i] = [int(x) for x in line]
			
 
				 
			
@@ -188,7 +188,7 @@ def table_line_pdf(line_list, page_w, page_h, is_test=0):
 
				         else:
			
 
				             if is_test:
			
 
				                 print(line)
			
 
				-    log("pdf divide rows and cols " + str(time.time() - start_time))
			
 
				+    # log("pdf divide rows and cols " + str(time.time() - start_time))
			
 
				     show(row_line_list + col_line_list, title="divide", mode=2, is_test=is_test)
			
 
				 
			
 
				     # 两种线都需要存在，否则跳过
			
@@ -201,7 +201,7 @@ def table_line_pdf(line_list, page_w, page_h, is_test=0):
 
				     show(row_line_list + col_line_list, title="merge", mode=2, is_test=is_test)
			
 
				 
			
 
				     # 计算交点
			
 
				-    print('img_new.shape', img_new.shape)
			
 
				+    # print('img_new.shape', img_new.shape)
			
 
				     cross_points = get_points(row_line_list, col_line_list, (img_new.shape[0], img_new.shape[1]))
			
 
				     if not cross_points:
			
 
				         return []
			
@@ -252,7 +252,7 @@ def table_line_pdf(line_list, page_w, page_h, is_test=0):
 
				         cross_points = get_points(row_line_list, col_line_list, (img_new.shape[0], img_new.shape[1]))
			
 
				         split_lines, split_y = get_split_line(cross_points, col_line_list, img_new)
			
 
				         area_row_line_list, area_col_line_list, area_point_list = get_split_area(split_y, row_line_list, col_line_list, cross_points)
			
 
				-    log("pdf fix_outline " + str(time.time() - start_time))
			
 
				+    # log("pdf fix_outline " + str(time.time() - start_time))
			
 
				 
			
 
				     # 根据区域循环
			
 
				     for i in range(len(area_point_list)):
			
@@ -270,7 +270,7 @@ def table_line_pdf(line_list, page_w, page_h, is_test=0):
 
				         # 修复内部缺线
			
 
				         start_time = time.time()
			
 
				         sub_row_line_list, sub_col_line_list = fix_inner(sub_row_line_list, sub_col_line_list, sub_point_list)
			
 
				-        log("pdf fix_inner " + str(time.time() - start_time))
			
 
				+        # log("pdf fix_inner " + str(time.time() - start_time))
			
 
				         show(sub_row_line_list + sub_col_line_list, title="fix_inner1", mode=2, is_test=is_test)
			
 
				 
			
 
				         # 修复内部线后重新计算交点
			
@@ -289,7 +289,7 @@ def table_line_pdf(line_list, page_w, page_h, is_test=0):
 
				     line_list = row_line_list + col_line_list
			
 
				     # 打印处理后线
			
 
				     show(line_list, title="all", img=img_show, mode=5, is_test=is_test)
			
 
				-    log("pdf otr postprocess table_line " + str(time.time() - start_time))
			
 
				+    # log("table_line_pdf cost: " + str(time.time() - start_time))
			
 
				     return line_list
			
 
				 
			
 
				 
			
--- a/otr/table_line_pdf.py
+++ b/otr/table_line_pdf.py
@@ -0,0 +1,624 @@
 
				+import copy
			
 
				+import math
			
 
				+import random
			
 
				+import time
			
 
				+import numpy as np
			
 
				+import cv2
			
 
				+from matplotlib import pyplot as plt
			
 
				+from pdfminer.layout import LTTextContainer, LTRect, LTCurve, LTLine
			
 
				+from scipy.stats import linregress
			
 
				+from shapely.geometry import LineString
			
 
				+from format_convert.utils import log, bbox_iou
			
 
				+from otr.table_line_new import table_line_pdf_post_process
			
 
				+
			
 
# Module-level page size.  get_cross_line() clamps the endpoints of extended
# lines to these bounds, and table_line_pdf() overwrites both values through
# globals().update(...) for the page it is processing.  100 is only the
# initial value.
page_w = 100
page_h = 100
			
 
				+
			
 
				+
			
 
				+def _plot(_line_list, title, mode=1, show=1):
			
 
				+    if not show:
			
 
				+        return
			
 
				+
			
 
				+    for _line in _line_list:
			
 
				+        if mode == 1:
			
 
				+            x0, y0, x1, y1 = _line.__dict__.get("bbox")
			
 
				+        elif mode == 2:
			
 
				+            x0, y0, x1, y1 = _line
			
 
				+        plt.plot([x0, x1], [y0, y1])
			
 
				+    plt.title(title)
			
 
				+    plt.show()
			
 
				+    return
			
 
				+
			
 
				+
			
 
				+def is_cross(A, B, C, D):
			
 
				+    if A[0] == B[0] == C[0] == D[0]:
			
 
				+        if A[1] <= C[1] <= B[1] or A[1] <= D[1] <= B[1] \
			
 
				+                or C[1] <= A[1] <= D[1] or C[1] <= B[1] <= D[1]:
			
 
				+            return True
			
 
				+    if A[1] == B[1] == C[1] == D[1]:
			
 
				+        if A[0] <= C[0] <= B[0] or A[0] <= D[0] <= B[0] \
			
 
				+                or C[0] <= A[0] <= D[0] or C[0] <= B[0] <= D[0]:
			
 
				+            return True
			
 
				+
			
 
				+    line1 = LineString([A, B])
			
 
				+    line2 = LineString([C, D])
			
 
				+
			
 
				+    int_pt = line1.intersection(line2)
			
 
				+    try:
			
 
				+        point_of_intersection = int_pt.x, int_pt.y
			
 
				+        return True
			
 
				+    except:
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+def calculate_k(bbox):
			
 
				+    x = [bbox[0], bbox[2]]
			
 
				+    y = [bbox[1], bbox[3]]
			
 
				+    slope, intercept, r_value, p_value, std_err = linregress(x, y)
			
 
				+    # print('k', slope)
			
 
				+    if math.isnan(slope):
			
 
				+        slope = 0
			
 
				+    return slope
			
 
				+
			
 
				+
			
 
				+def line_iou(line1, line2, axis=0):
			
 
				+    if line1[0][axis] <= line2[0][axis] <= line2[1][axis] <= line1[1][axis]:
			
 
				+        return 1.0
			
 
				+    if line2[0][axis] <= line1[0][axis] <= line1[1][axis] <= line2[1][axis]:
			
 
				+        return 1.0
			
 
				+
			
 
				+    inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
			
 
				+    # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
			
 
				+    union = min(abs(line1[0][axis] - line1[1][axis]), abs(line2[0][axis] - line2[1][axis]))
			
 
				+    if union in [0, 0.]:
			
 
				+        iou = 0.
			
 
				+    else:
			
 
				+        iou = inter / union
			
 
				+    return iou
			
 
				+
			
 
				+
			
 
				+def get_cross_line(_line_list, threshold=1, cross_times=0):
			
 
				+    start_time = time.time()
			
 
				+
			
 
				+    start_time1 = time.time()
			
 
				+    # 分横线竖线
			
 
				+    new_line_list = []
			
 
				+    for line in _line_list:
			
 
				+        if abs(line[0]-line[2]) >= abs(line[1]-line[3]):
			
 
				+            new_line = [max(0, line[0] - threshold), line[1], min(line[2] + threshold, page_w), line[3]]
			
 
				+        else:
			
 
				+            new_line = [line[0], max(0, line[1] - threshold), line[2], min(line[3] + threshold, page_h)]
			
 
				+        new_line_list.append(new_line)
			
 
				+
			
 
				+    _cross_line_list = []
			
 
				+    for i in range(len(new_line_list)):
			
 
				+        line1 = new_line_list[i]
			
 
				+
			
 
				+        # line1的计算区域
			
 
				+        line1_area = [max(0, line1[0]-threshold), max(0, line1[1]-threshold),
			
 
				+                      min(page_w, line1[2]+threshold), min(page_h, line1[3]+threshold)]
			
 
				+
			
 
				+        # line1是横线还是竖线
			
 
				+        if abs(line1[0] - line1[2]) >= abs(line1[1]-line1[3]):
			
 
				+            line1_is_row = 1
			
 
				+        else:
			
 
				+            line1_is_row = 0
			
 
				+
			
 
				+        _times = 0
			
 
				+        for j in range(len(new_line_list)):
			
 
				+            if i == j:
			
 
				+                continue
			
 
				+
			
 
				+            line2 = new_line_list[j]
			
 
				+            if abs(line2[0] - line2[2]) >= abs(line2[1]-line2[3]):
			
 
				+                line2_is_row = 1
			
 
				+            else:
			
 
				+                line2_is_row = 0
			
 
				+
			
 
				+            # 十字交叉的横竖线直接判断交点
			
 
				+            if line1_is_row ^ line2_is_row:
			
 
				+                if (line1_is_row and line1[0] <= line2[0] <= line1[2] and line2[1] <= line1[1] <= line2[3]) \
			
 
				+                        or (line2_is_row and line2[0] <= line1[0] <= line2[2] and line1[1] <= line2[1] <= line1[3]):
			
 
				+                    _times += 1
			
 
				+                    if _times >= cross_times:
			
 
				+                        _cross_line_list += [line1]
			
 
				+                        break
			
 
				+                    continue
			
 
				+
			
 
				+            # 不在计算区域的直接跳过
			
 
				+            if not((line1_area[0] <= line2[0] <= line1_area[2] and line1_area[1] <= line2[1] <= line1_area[3])
			
 
				+                   or (line1_area[0] <= line2[2] <= line1_area[2] and line1_area[1] <= line2[3] <= line1_area[3]) or ()):
			
 
				+                continue
			
 
				+
			
 
				+            if is_cross(line1[:2], line1[2:4], line2[:2], line2[2:4]):
			
 
				+                _times += 1
			
 
				+                if _times >= cross_times:
			
 
				+                    _cross_line_list += [line1]
			
 
				+                    break
			
 
				+    _cross_line_list1 = _cross_line_list
			
 
				+    # print('get_cross_line new', time.time()-start_time1)
			
 
				+    # start_time1 = time.time()
			
 
				+    #
			
 
				+    # # 根据是否有交点判断表格线
			
 
				+    # _cross_line_list = []
			
 
				+    # for line1 in _line_list:
			
 
				+    #     if line1 in _cross_line_list:
			
 
				+    #         continue
			
 
				+    #     if abs(line1[2] - line1[0]) > abs(line1[3] - line1[1]):
			
 
				+    #         p1 = [max(0, line1[0] - threshold), line1[1]]
			
 
				+    #         p2 = [min(line1[2] + threshold, page_w), line1[3]]
			
 
				+    #     else:
			
 
				+    #         p1 = [line1[0], max(0, line1[1] - threshold)]
			
 
				+    #         p2 = [line1[2], min(line1[3] + threshold, page_h)]
			
 
				+    #     line1 = [p1[0], p1[1], p2[0], p2[1]]
			
 
				+    #     _times = 0
			
 
				+    #     for line2 in _line_list:
			
 
				+    #         if abs(line2[2] - line2[0]) > abs(line2[3] - line2[1]):
			
 
				+    #             p3 = [max(0, line2[0] - threshold), line2[1]]
			
 
				+    #             p4 = [min(line2[2] + threshold, page_w), line2[3]]
			
 
				+    #         else:
			
 
				+    #             p3 = [line2[0], max(0, line2[1] - threshold)]
			
 
				+    #             p4 = [line2[2], min(line2[3] + threshold, page_h)]
			
 
				+    #         line2 = [p3[0], p3[1], p4[0], p4[1]]
			
 
				+    #         if line1 == line2:
			
 
				+    #             continue
			
 
				+    #         if is_cross(p1, p2, p3, p4):
			
 
				+    #             _times += 1
			
 
				+    #             if _times >= cross_times:
			
 
				+    #                 _cross_line_list += [line1]
			
 
				+    #                 break
			
 
				+    #
			
 
				+    # if len(_cross_line_list1) > 0 or len(_cross_line_list) > 0:
			
 
				+    #     print('get_cross_line old', time.time()-start_time1)
			
 
				+    #     print(len(_cross_line_list1), len(_cross_line_list))
			
 
				+
			
 
				+    log('get_cross_line cost: ' + str(time.time()-start_time))
			
 
				+    return _cross_line_list1
			
 
				+
			
 
				+
			
 
def merge_line(_line_list, threshold=2):
    """Merge overlapping vertical and horizontal lines.

    Vertical lines are grouped into columns and horizontal lines into rows;
    inside each group, consecutive lines that touch or overlap are fused
    into one line snapped to the coordinate of the group's first line.

    NOTE: ``_line_list`` is sorted in place (twice), so the caller's list
    order changes.

    :param _line_list: lines as ``[x0, y0, x1, y1]``.
    :param threshold: tolerance used when assigning a line to the current
        column / row.
    :return: new list with the merged vertical lines followed by the merged
        horizontal lines.
    """
    start_time = time.time()

    new_line_list = []
    # Group the vertical lines into columns
    _line_list.sort(key=lambda x: (x[0], x[1]))
    cols = []
    col = []
    current_w = None
    for line in _line_list:
        # skip lines that are wider than tall
        if abs(line[0] - line[2]) > abs(line[1] - line[3]):
            continue
        if not col:
            # First vertical line seeds the group.  It is tested again right
            # below against itself, so it is typically appended a second
            # time; the merge step further down absorbs such duplicates.
            col.append(line)
            current_w = line[0]

        # y overlap with the FIRST line of the group (is_cross below uses the
        # LAST line of the group)
        _iou = line_iou([[0, line[1]], [0, line[3]]], [[0, col[0][1]], [0, col[0][3]]], axis=1)
        if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold \
                and is_cross(line[0:2], line[2:4], col[-1][0:2], col[-1][2:4]):
            col.append(line)
        elif min(line[0], line[2]) - 2*threshold <= current_w <= max(line[0], line[2]) + 2*threshold \
                and _iou >= 0.1:
            col.append(line)
        else:
            # start a new column with this line
            if col:
                cols.append(col)
            col = [line]
            current_w = line[0]
    if col:
        cols.append(col)

    # Fuse consecutive lines of each column; x is snapped to the first line
    for col in cols:
        temp_c = col[0]
        col_w = col[0][0]
        for i in range(len(col) - 1):
            c = col[i]
            next_c = col[i + 1]
            if is_cross(c[0:2], c[2:4], next_c[0:2], next_c[2:4]) \
                    or line_iou([[0, c[1]], [0, c[3]]], [[0, next_c[1]], [0, next_c[3]]], axis=1) >= 0.1:
                temp_c = [col_w, min(temp_c[1], c[1], c[3], next_c[1], next_c[3]), col_w,
                          max(temp_c[3], c[1], c[3], next_c[1], next_c[3])]
            else:
                new_line_list.append(temp_c)
                temp_c = next_c
        # avoid appending the same line twice in a row
        if not new_line_list or (new_line_list and new_line_list[-1] != temp_c):
            new_line_list.append(temp_c)

    # Group the horizontal lines into rows
    _line_list.sort(key=lambda x: (x[1], x[0]))
    rows = []
    row = []
    current_h = None
    for line in _line_list:
        # skip lines that are taller than wide
        if abs(line[0] - line[2]) < abs(line[1] - line[3]):
            continue

        if not row:
            # First horizontal line seeds the group; the check right below
            # matches it too, so it is appended a second time.
            row = [line]
            current_h = line[1]

        if min(line[1], line[3]) - threshold <= current_h <= max(line[1], line[3]) + threshold:
            row.append(line)
        else:
            # start a new row with this line
            if row:
                rows.append(row)
            row = [line]
            current_h = line[1]
    if row:
        rows.append(row)

    # Fuse consecutive lines of each row; y is snapped to the first line
    for row in rows:
        temp_r = row[0]
        row_h = row[0][1]
        for i in range(len(row) - 1):
            r = row[i]
            next_r = row[i + 1]
            # if is_cross(r[0:2], r[2:4], next_r[0:2], next_r[2:4]):
            if line_iou([r[0:2], r[2:4]], [next_r[0:2], next_r[2:4]], axis=0) >= 0.1:
                temp_r = [min(temp_r[0], r[0], r[2], next_r[0], next_r[2]), row_h,
                          max(temp_r[2], r[0], r[2], next_r[0], next_r[2]), row_h]
            else:
                new_line_list.append(temp_r)
                temp_r = next_r
        # avoid appending the same line twice in a row
        if not new_line_list or (new_line_list and new_line_list[-1] != temp_r):
            new_line_list.append(temp_r)

    log('merge_line cost: ' + str(time.time()-start_time))
    return new_line_list
			
 
				+
			
 
				+
			
 
				+def remove_outline_no_cross(_line_list):
			
 
				+    row_list = []
			
 
				+    col_list = []
			
 
				+    for line in _line_list:
			
 
				+        # 存所有行
			
 
				+        if abs(line[0] - line[2]) > abs(line[1] - line[3]):
			
 
				+            row_list.append(line)
			
 
				+        # 存所有列
			
 
				+        if abs(line[0] - line[2]) < abs(line[1] - line[3]):
			
 
				+            col_list.append(line)
			
 
				+
			
 
				+    if not col_list:
			
 
				+        return _line_list
			
 
				+
			
 
				+    # 左右两条边框
			
 
				+    col_list.sort(key=lambda x: (x[0], x[1]))
			
 
				+    left_col = col_list[0]
			
 
				+    right_col = col_list[-1]
			
 
				+
			
 
				+    # 判断有交点但中间区域无交点
			
 
				+    compare_list = []
			
 
				+    for col in [left_col, right_col]:
			
 
				+        add_h = abs(col[1]-col[3]) / 8
			
 
				+        center_area = [col[1]+add_h, col[3]-add_h]
			
 
				+        cross_cnt = 0
			
 
				+        center_cross_cnt = 0
			
 
				+        center_row_cnt = 0
			
 
				+        for row in row_list:
			
 
				+            if is_cross(row[0:2], row[2:4], col[0:2], col[2:4]):
			
 
				+                if center_area[0] <= row[1] <= center_area[1]:
			
 
				+                    center_cross_cnt += 1
			
 
				+                else:
			
 
				+                    cross_cnt += 1
			
 
				+            else:
			
 
				+                if center_area[0] <= row[1] <= center_area[1]:
			
 
				+                    center_row_cnt += 1
			
 
				+        compare_list.append([cross_cnt, center_cross_cnt, center_row_cnt])
			
 
				+
			
 
				+    _flag = True
			
 
				+    for c in compare_list:
			
 
				+        if c[0] >= 2 and c[1] == 0 and c[2] >= 2:
			
 
				+            continue
			
 
				+        _flag = False
			
 
				+    print('compare_list', compare_list)
			
 
				+    if _flag and compare_list[0][1] == compare_list[1][1] \
			
 
				+            and compare_list[0][2] == compare_list[1][2]:
			
 
				+        for col in [left_col, right_col]:
			
 
				+            if col in _line_list:
			
 
				+                _line_list.remove(col)
			
 
				+    return _line_list
			
 
				+
			
 
				+
			
 
				+def table_line_pdf(layout, page_no, show=0):
			
 
				+    print('table_line_pdf show ', show)
			
 
				+    page_h = layout.height
			
 
				+    page_w = layout.width
			
 
				+
			
 
				+    line_list = []
			
 
				+
			
 
				+    lt_text_container_list = []
			
 
				+    lt_rect_list = []
			
 
				+    lt_line_list = []
			
 
				+    lt_curve_list = []
			
 
				+
			
 
				+    line_rect_list = []
			
 
				+    non_line_rect_list = []
			
 
				+    delete_lt_rect_list = []
			
 
				+
			
 
				+    start_time = time.time()
			
 
				+    # 从layout中提取各种对象：文本框、矩形框、曲线、线
			
 
				+    min_y = 10000
			
 
				+    max_x, max_y = 0, 0
			
 
				+    threshold = 2
			
 
				+    for element in layout:
			
 
				+        if isinstance(element, LTTextContainer):
			
 
				+            lt_text_container_list.append(element)
			
 
				+
			
 
				+        elif isinstance(element, LTRect):
			
 
				+            lt_rect_list.append(element)
			
 
				+
			
 
				+            # 筛选出线形矩形和非线形矩形
			
 
				+            if (element.height <= threshold) ^ (element.width <= threshold):
			
 
				+                print('line_rect', element.stroke, element.stroking_color, element.non_stroking_color, element.fill, element.height * element.width, element.height, element.width)
			
 
				+                line_rect_list.append(element)
			
 
				+            elif element.height > threshold and element.width > threshold:
			
 
				+                print('non_line_rect', element.stroke, element.stroking_color, element.non_stroking_color, element.fill, element.height * element.width, element.height, element.width)
			
 
				+                non_line_rect_list.append(element)
			
 
				+            else:
			
 
				+                delete_lt_rect_list.append(element)
			
 
				+
			
 
				+            # 获取最大尺寸
			
 
				+            if element.bbox[1] <= min_y:
			
 
				+                min_y = element.bbox[1]
			
 
				+            if element.bbox[3] <= min_y:
			
 
				+                min_y = element.bbox[3]
			
 
				+            if element.bbox[1] > max_y:
			
 
				+                max_y = element.bbox[1]
			
 
				+            if element.bbox[3] > max_y:
			
 
				+                max_y = element.bbox[3]
			
 
				+            if element.bbox[0] > max_x:
			
 
				+                max_x = element.bbox[0]
			
 
				+            if element.bbox[2] > max_x:
			
 
				+                max_x = element.bbox[2]
			
 
				+
			
 
				+        elif isinstance(element, LTLine):
			
 
				+            lt_line_list.append(element)
			
 
				+
			
 
				+        elif isinstance(element, LTCurve):
			
 
				+            lt_curve_list.append(element)
			
 
				+
			
 
				+    if show:
			
 
				+        print('len(lt_text_container_list)', len(lt_text_container_list))
			
 
				+        print('len(lt_rect_list)', len(lt_rect_list))
			
 
				+        print('len(lt_line_list)', len(lt_line_list))
			
 
				+        print('len(lt_curve_list)', len(lt_curve_list))
			
 
				+
			
 
				+        print('len(line_rect_list)', len(line_rect_list))
			
 
				+        print('len(non_line_rect_list)', len(non_line_rect_list))
			
 
				+        print('len(delete_lt_rect_list)', len(delete_lt_rect_list))
			
 
				+
			
 
				+    if max_y > page_h:
			
 
				+        page_h = max_y + 20
			
 
				+    if max_x > page_w:
			
 
				+        page_w = max_x + 20
			
 
				+
			
 
				+    globals().update({'page_h': page_h})
			
 
				+    globals().update({'page_w': page_w})
			
 
				+
			
 
				+    # 矩形框y有负数
			
 
				+    if min_y < 0:
			
 
				+        for lt_rect in lt_rect_list:
			
 
				+            if lt_rect.y0 < 0 or lt_rect.y1 < 0:
			
 
				+                new_y0 = 10 if lt_rect.y0 < 0 else lt_rect.y0
			
 
				+                new_y1 = 10 if lt_rect.y1 < 0 else lt_rect.y1
			
 
				+                lt_rect.set_bbox((lt_rect.x0, new_y0, lt_rect.x1, new_y1))
			
 
				+
			
 
				+    _plot([x.bbox for x in lt_rect_list + lt_line_list], 'get_page_lines start', mode=2, show=show)
			
 
				+
			
 
				+    # 合并矩形框
			
 
				+    # for i in range(len(non_line_rect_list)):
			
 
				+    #     lt_rect1 = non_line_rect_list[i]
			
 
				+    #     b1 = lt_rect1.bbox
			
 
				+    #     if lt_rect1 in delete_lt_rect_list:
			
 
				+    #         continue
			
 
				+    #     for j in range(i+1, len(non_line_rect_list)):
			
 
				+    #         lt_rect2 = non_line_rect_list[j]
			
 
				+    #         b2 = lt_rect2.bbox
			
 
				+    #         if lt_rect2 in delete_lt_rect_list:
			
 
				+    #             continue
			
 
				+    #         if bbox_iou(b1, b2, False) >= 0.5:
			
 
				+    #             delete_lt_rect_list.append(lt_rect2)
			
 
				+    #
			
 
				+    # # 非线形矩形若与线形矩形距离较近，则删除
			
 
				+    # threshold = 5
			
 
				+    # for n_rect in non_line_rect_list:
			
 
				+    #     if n_rect in delete_lt_rect_list:
			
 
				+    #         continue
			
 
				+    #     middle_x = (n_rect.x0 + n_rect.x1) / 2
			
 
				+    #     middle_y = (n_rect.y0 + n_rect.y1) / 2
			
 
				+    #     for rect in line_rect_list:
			
 
				+    #         if rect in delete_lt_rect_list:
			
 
				+    #             continue
			
 
				+    #         if rect.height >= rect.width:
			
 
				+    #             if n_rect.width / 2 - threshold <= abs(rect.x0 - middle_x) <= n_rect.width / 2 + threshold:
			
 
				+    #                 delete_lt_rect_list.append(n_rect)
			
 
				+    #         else:
			
 
				+    #             if n_rect.height / 2 - threshold <= abs(rect.y0 - middle_y) <= n_rect.height / 2 + threshold:
			
 
				+    #                 delete_lt_rect_list.append(n_rect)
			
 
				+
			
 
				+    # 寻找每个文本框对应的最小矩形框
			
 
				+    text_lt_rect_list = []
			
 
				+    # for text_lt_rect in lt_text_container_list:
			
 
				+    #     text_box = text_lt_rect.bbox
			
 
				+    #     contain_iou_list = []
			
 
				+    #
			
 
				+    #     min_area = 1000000
			
 
				+    #     min_lt_rect = None
			
 
				+    #     for lt_rect in non_line_rect_list:
			
 
				+    #         _bbox = lt_rect.bbox
			
 
				+    #
			
 
				+    #         if lt_rect in delete_lt_rect_list:
			
 
				+    #             continue
			
 
				+    #         if lt_rect in text_lt_rect_list:
			
 
				+    #             continue
			
 
				+    #         if lt_rect.height <= 5 or lt_rect.width <= 5:
			
 
				+    #             continue
			
 
				+    #
			
 
				+    #         # 如果文本框与矩形框有交集，则直接删除
			
 
				+    #         if (text_box[0] <= _bbox[0] <= text_box[2] or text_box[0] <= _bbox[2] <= text_box[2]) \
			
 
				+    #                 and (text_box[1] <= _bbox[1] <= text_box[3] or text_box[1] <= _bbox[3] <= text_box[3]):
			
 
				+    #             text_lt_rect_list.append(lt_rect)
			
 
				+    #             continue
			
 
				+    #
			
 
				+    #         _area = abs(_bbox[2] - _bbox[0]) * abs(_bbox[3] - _bbox[1])
			
 
				+    #         _iou = bbox_iou(_bbox, text_box, False)
			
 
				+    #         if _iou >= 0.3 and _area < min_area:
			
 
				+    #             min_area = _area
			
 
				+    #             min_lt_rect = lt_rect
			
 
				+    #         # else:
			
 
				+    #         #     contain_iou = bbox_iou(_bbox, text_box, True)
			
 
				+    #         #     contain_iou_list.append([lt_rect, contain_iou])
			
 
				+    #
			
 
				+    #     if min_lt_rect is not None:
			
 
				+    #         text_lt_rect_list.append(min_lt_rect)
			
 
				+    #     # else:
			
 
				+    #     #     # 找不到就放低条件，计算iou时包含即为1
			
 
				+    #     #     contain_iou_list.sort(key=lambda x: x[1])
			
 
				+    #     #     text_lt_rect_list.append(contain_iou_list[-1][0])
			
 
				+
			
 
				+    delete_lt_rect_list += text_lt_rect_list
			
 
				+
			
 
				+    text_line_list = []
			
 
				+    for lt_line in lt_text_container_list:
			
 
				+        _b = lt_line.bbox
			
 
				+        if abs(_b[0]-_b[2]) >= abs(_b[1]-_b[3]):
			
 
				+            text_line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
			
 
				+        else:
			
 
				+            text_line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
			
 
				+
			
 
				+    _plot(text_line_list, 'lt_text_container_list', mode=2, show=show)
			
 
				+
			
 
				+    # 从线对象提取线
			
 
				+    for lt_line in lt_line_list+lt_curve_list:
			
 
				+        _b = lt_line.bbox
			
 
				+        if lt_line.height > 10 or lt_line.width > 10:
			
 
				+            if lt_line.height >= lt_line.width:
			
 
				+                line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
			
 
				+            else:
			
 
				+                line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
			
 
				+
			
 
				+    _plot(line_list, 'lt_line_list+lt_curve_list', mode=2, show=show)
			
 
				+
			
 
				+    # 从线形矩形框提取线
			
 
				+    for lt_rect in line_rect_list:
			
 
				+        if lt_rect in delete_lt_rect_list:
			
 
				+            continue
			
 
				+        _b = lt_rect.bbox
			
 
				+        if abs(_b[0]-_b[2]) >= abs(_b[1]-_b[3]):
			
 
				+            line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
			
 
				+        else:
			
 
				+            line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
			
 
				+
			
 
				+    _plot(line_list, 'line_rect_list', mode=2, show=show)
			
 
				+
			
 
				+    # min_x, min_y = 10000, 10000
			
 
				+    # max_x, max_y = 0, 0
			
 
				+    # for _b in line_list:
			
 
				+    #     min_x = _b[0] if _b[0] < min_x else min_x
			
 
				+    #     max_x = _b[2] if _b[2] > max_x else max_x
			
 
				+    #     min_y = _b[1] if _b[1] < min_y else min_y
			
 
				+    #     max_y = _b[3] if _b[3] > max_y else max_y
			
 
				+
			
 
				+    # 从普通矩形框提取线，区分描边颜色，排除无色的
			
 
				+    # threshold = 10
			
 
				+    # img = np.full([int(max_x)+10, int(max_y)+10, 3], 255, dtype=np.uint8)
			
 
				+    threshold = 0.3
			
 
				+    for lt_rect in non_line_rect_list:
			
 
				+        if lt_rect in delete_lt_rect_list:
			
 
				+            continue
			
 
				+        _b = lt_rect.bbox
			
 
				+        if type(lt_rect.non_stroking_color) == tuple:
			
 
				+            continue_flag = 0
			
 
				+            for t in lt_rect.non_stroking_color:
			
 
				+                if float(t) >= threshold:
			
 
				+                    continue_flag = 1
			
 
				+                    break
			
 
				+            if continue_flag:
			
 
				+                continue
			
 
				+        elif lt_rect.non_stroking_color is not None and float(lt_rect.non_stroking_color) >= threshold:
			
 
				+            continue
			
 
				+        # if max_y != 10000 and min_y != 0:
			
 
				+        #     if (_b[3] - max_y >= threshold and _b[2] - max_x >= threshold):
			
 
				+        #         print('_b[3] - max_y >= threshold', _b[3], max_y, _b[2], max_x)
			
 
				+        #         continue
			
 
				+        #     if abs(_b[3] - _b[1]) * abs(_b[2] - _b[0]) >= 1 / 10 * abs(max_y - min_y) * abs(max_x - min_x):
			
 
				+        #         print('>= 1 / 10', _b[3], _b[1], _b[2], _b[0], max_x, max_y)
			
 
				+        #         continue
			
 
				+        # contain_flag = 0
			
 
				+        # for lt_rect2 in non_line_rect_list:
			
 
				+        #     if lt_rect == lt_rect2:
			
 
				+        #         continue
			
 
				+        #     _b2 = lt_rect2.bbox
			
 
				+        #     if bbox_iou(_b, _b2) >= 0.9:
			
 
				+        #         contain_flag = 1
			
 
				+        #     if _b2[0] <= _b[0] <= _b[2] <= _b2[2] and _b2[1] <= _b[1] <= _b[3] <= _b2[3]:
			
 
				+        #         contain_flag = 1
			
 
				+        # if contain_flag:
			
 
				+        #     continue
			
 
				+        line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[0], _b[1], _b[2], _b[1]],
			
 
				+                      [_b[2], _b[1], _b[2], _b[3]], [_b[0], _b[3], _b[2], _b[3]]]
			
 
				+        # cv2.rectangle(img, (int(_b[0]), int(_b[1])), (int(_b[2]), int(_b[3])), [random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)])
			
 
				+        # cv2.imshow('img', img)
			
 
				+        # cv2.waitKey(0)
			
 
				+
			
 
				+    _plot(line_list, 'non_line_rect_list', mode=2, show=show)
			
 
				+
			
 
				+    if not line_list:
			
 
				+        return []
			
 
				+    # 去重
			
 
				+    line_list = [str(x) for x in line_list]
			
 
				+    line_list = list(set(line_list))
			
 
				+    line_list = [eval(x) for x in line_list]
			
 
				+
			
 
				+    # 合并线
			
 
				+    line_list = merge_line(line_list)
			
 
				+
			
 
				+    if show:
			
 
				+        print('get_page_lines len(line_list)', len(line_list))
			
 
				+    _plot(line_list, 'line_list+bias_line_list', mode=2, show=show)
			
 
				+
			
 
				+    # 根据是否有交点判断表格线
			
 
				+    cross_line_list = get_cross_line(line_list, threshold=2, cross_times=1)
			
 
				+
			
 
				+    if show:
			
 
				+        print('get_page_lines len(cross_line_list)', len(cross_line_list))
			
 
				+    _plot(cross_line_list, 'get_cross_line', mode=2, show=show)
			
 
				+
			
 
				+    # 删除最外层嵌套边框
			
 
				+    cross_line_list = remove_outline_no_cross(cross_line_list)
			
 
				+
			
 
				+    # 复用otr的部分后处理，补线
			
 
				+    cross_line_list = table_line_pdf_post_process(cross_line_list, page_w, page_h)
			
 
				+    _plot(cross_line_list, 'cross_line_process1', mode=2, show=show)
			
 
				+
			
 
				+    # 有过短的横线与过短的竖线交点
			
 
				+    short_line_list = []
			
 
				+    for line in cross_line_list:
			
 
				+        if line[1] == line[3] and abs(line[2] - line[0]) <= 30:
			
 
				+            short_line_list.append(line)
			
 
				+        if line[0] == line[2] and abs(line[3] - line[1]) <= 30:
			
 
				+            short_line_list.append(line)
			
 
				+    for line in short_line_list:
			
 
				+        for line2 in short_line_list:
			
 
				+            if line == line2:
			
 
				+                continue
			
 
				+            if is_cross(line[:2], line[2:4], line2[:2], line2[2:4]):
			
 
				+                if line in cross_line_list:
			
 
				+                    cross_line_list.remove(line)
			
 
				+                if line2 in cross_line_list:
			
 
				+                    cross_line_list.remove(line2)
			
 
				+
			
 
				+    # print('len(temp_list), len(cross_line_list)', len(temp_list), len(cross_line_list))
			
 
				+    # if len(temp_list) != len(cross_line_list):
			
 
				+    #     cross_line_list = table_line_pdf_post_process(temp_list, page_w, page_h)
			
 
				+
			
 
				+    # show
			
 
				+    if show:
			
 
				+        print('len(cross_line_list)', len(cross_line_list))
			
 
				+    _plot(cross_line_list, 'cross_line_process2', mode=2, show=show)
			
 
				+
			
 
				+    lt_line_list = []
			
 
				+    for line in cross_line_list:
			
 
				+        lt_line_list.append(LTLine(1, (float(line[0]), float(line[1])),
			
 
				+                                   (float(line[2]), float(line[3]))))
			
 
				+    log("pdf page %s has %s lines cost: %s" % (str(page_no), str(len(lt_line_list)), str(time.time()-start_time)))
			
 
				+    return lt_line_list