Преглед изворни кода

优化
1. pdf表格线提取
2. docx提取
3. idc模型预处理优化,重新训练
4. pdf文字重复问题

fangjiasheng пре 1 година
родитељ
комит
c6ac7bddb9

+ 8 - 5
format_convert/convert.py

@@ -59,7 +59,7 @@ def getText(_type, path_or_stream, _page_no=None, time_out=300):
     def get_html_2(_class):
         return _class.get_html()
 
-    log("file type - " + _type + ' time out - ' + str(time_out))
+    log("file type - " + _type + ' page - ' + str(_page_no) + ' time out - ' + str(time_out))
 
     try:
         ss = path_or_stream.split(".")
@@ -153,7 +153,7 @@ def remove_underline(image_np):
 
 # @timeout_decorator.timeout(100, timeout_exception=TimeoutError)
 # @timeout(globals().get("time_out"), timeout_exception=TimeoutError, use_signals=False)
-def unique_temp_file_process(stream, _type, _md5, _page_no, time_out=300):
+def unique_temp_file_process(stream, _type, _md5, _page_no, time_out=300, save_middle=None):
     if get_platform() == "Windows":
         _global._init()
 
@@ -210,7 +210,7 @@ def unique_temp_file_process(stream, _type, _md5, _page_no, time_out=300):
     finally:
         print("======================================")
         try:
-            if get_platform() == "Linux":
+            if get_platform() == "Linux" and save_middle is None:
                 # log("not delete temp file")
                 # 删除该唯一空间下所有文件
                 if os.path.exists(unique_space_path):
@@ -419,6 +419,9 @@ def _convert():
         if _timeout is not None:
             globals().update({"time_out": _timeout})
 
+        # 是否保留中间文件
+        save_middle = data.get('save_middle')
+
         # 最终结果截取的最大字节数
         max_bytes = data.get("max_bytes")
 
@@ -427,7 +430,7 @@ def _convert():
             # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
             # text, swf_images = origin_unique_temp_file_process(stream, _type)
             try:
-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'))
+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'), save_middle=save_middle)
             except TimeoutError:
                 log("convert time out! 300 sec")
                 text = [-5]
@@ -435,7 +438,7 @@ def _convert():
         else:
             # Linux 通过装饰器设置整个转换超时时间
             try:
-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'))
+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'), save_middle=save_middle)
             except TimeoutError:
                 log("convert time out! 300 sec")
                 text = [-5]

+ 11 - 3
format_convert/convert_doc.py

@@ -2,6 +2,8 @@ import inspect
 import os
 import re
 import sys
+
+import chardet
 from bs4 import BeautifulSoup
 sys.path.append(os.path.dirname(__file__) + "/../")
 from format_convert.convert_tree import _Document, _Sentence, _Page
@@ -40,9 +42,15 @@ class DocConvert:
         # 先判断特殊doc文件,可能是html文本
         is_html_doc = False
         try:
-            with open(self.path, 'r') as f:
-                html_str = f.read()
-            if re.search('<div|<html|<body|<head|<tr|<br|<table|<td', html_str):
+            try:
+                with open(self.path, 'r') as f:
+                    html_str = f.read()
+            except UnicodeDecodeError:
+                with open(self.path, 'r', errors='ignore') as f:
+                    html_str = f.read()
+            # if re.search('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span', html_str):
+            if len(re.findall('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span', html_str)) >= 10:
+                log('doc as html!')
                 soup = BeautifulSoup(html_str, 'lxml')
                 text = soup.text
                 is_html_doc = True

+ 36 - 4
format_convert/convert_docx.py

@@ -53,16 +53,27 @@ def read_no_start(numbering_xml):
     # 获取虚拟id的开始编号
     w_abstract_num_list = numbering_xml.getElementsByTagName("w:abstractNum")
     abstract_id_level_dict = {}
+    abstract_id_level_text_dict = {}
     for w_abstract_num in w_abstract_num_list:
         w_abstract_num_id = w_abstract_num.getAttribute("w:abstractNumId")
         w_lvl_list = w_abstract_num.getElementsByTagName("w:lvl")
         level_start_dict = {}
+        level_text_dict = {}
         for w_lvl in w_lvl_list:
             w_ilvl_value = w_lvl.getAttribute('w:ilvl')
             if w_lvl.getElementsByTagName("w:start"):
                 w_ilvl_start_num = w_lvl.getElementsByTagName("w:start")[0].getAttribute("w:val")
                 level_start_dict[int(w_ilvl_value)] = int(w_ilvl_start_num)
+            if w_lvl.getElementsByTagName("w:lvlText") and w_lvl.getElementsByTagName("w:numFmt"):
+                w_lvl_text = w_lvl.getElementsByTagName("w:lvlText")[0].getAttribute("w:val")
+                w_lvl_format = w_lvl.getElementsByTagName("w:numFmt")[0].getAttribute("w:val")
+                if w_lvl_format == 'upperLetter':
+                    w_lvl_text = re.sub('%\d', '%A', w_lvl_text)
+                elif w_lvl_format == 'lowerLetter':
+                    w_lvl_text = re.sub('%\d', '%a', w_lvl_text)
+                level_text_dict[int(w_ilvl_value)] = w_lvl_text
         abstract_id_level_dict[w_abstract_num_id] = level_start_dict
+        abstract_id_level_text_dict[w_abstract_num_id] = level_text_dict
 
     # 映射回真实id
     real_id_level_start_dict = {}
@@ -72,7 +83,14 @@ def read_no_start(numbering_xml):
         if level_start_dict:
             real_id_level_start_dict[int(real_id)] = level_start_dict
 
-    return real_id_level_start_dict
+    real_id_level_text_dict = {}
+    for abstract_id in abstract_real_id_dict.keys():
+        real_id = abstract_real_id_dict.get(abstract_id)
+        level_text_dict = abstract_id_level_text_dict.get(abstract_id)
+        if level_text_dict:
+            real_id_level_text_dict[int(real_id)] = level_text_dict
+
+    return real_id_level_start_dict, real_id_level_text_dict
 
 
 def read_p_text(unique_type_dir, p_node, _last_node_level, _num_pr_dict, numbering_xml, document_xml_rels,
@@ -95,8 +113,8 @@ def read_p_text(unique_type_dir, p_node, _last_node_level, _num_pr_dict, numberi
     # 文本的编号(如果有编号的话)
     text_no = ''
 
-    # 获取编号组的起始值
-    id_level_start_dict = read_no_start(numbering_xml)
+    # 获取编号组的起始值和编号组的展示形式
+    id_level_start_dict, id_level_text_dict = read_no_start(numbering_xml)
     # print('_num_pr_dict', _num_pr_dict)
 
     # 提取编号 组-层级-序号
@@ -143,8 +161,22 @@ def read_p_text(unique_type_dir, p_node, _last_node_level, _num_pr_dict, numberi
                         if id_level_start_dict.get(group_id) and id_level_start_dict.get(group_id).get(level) and _num_pr_dict.get(group_id).get(level):
                             start_no = id_level_start_dict.get(group_id).get(level)
                             level_node_cnt += start_no - 1
+
+                        level_text = None
+                        if id_level_text_dict.get(group_id) and id_level_text_dict.get(group_id).get(level) and _num_pr_dict.get(group_id).get(level):
+                            level_text = id_level_text_dict.get(group_id).get(level)
                         # print('level_node_cnt', level_node_cnt)
-                        text_no += str(level_node_cnt) + '.'
+                        if level_text:
+                            if re.search('a', level_text):
+                                level_node_cnt = chr(ord('a') + level_node_cnt - 1)
+                                text_no += re.sub('%a', str(level_node_cnt), level_text)
+                            elif re.search('A', level_text):
+                                level_node_cnt = chr(ord('A') + level_node_cnt - 1)
+                                text_no += re.sub('%A', str(level_node_cnt), level_text)
+                            else:
+                                text_no += re.sub('%\d', str(level_node_cnt), level_text)
+                        else:
+                            text_no += str(level_node_cnt) + '.'
                         # print('text_no', text_no)
                     _last_node_level = node_level
 

+ 49 - 11
format_convert/convert_image.py

@@ -17,7 +17,7 @@ import traceback
 import cv2
 from isr.pre_process import count_red_pixel
 from format_convert.utils import judge_error_code, add_div, LineTable, get_table_html, get_logger, log, \
-    memory_decorator, pil_resize, np2bytes, ocr_cant_read
+    memory_decorator, pil_resize, np2bytes, ocr_cant_read, get_garble_code2
 from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface, from_gpu_interface_redis, \
     from_idc_interface, from_isr_interface
 from format_convert.table_correct import get_rotated_image
@@ -88,7 +88,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                 textbox_list.remove(_obj)
         return textbox_list
 
-    def idc_process(_image_np):
+    def idc_process(_image_np, return_angle=False):
         # 图片倾斜校正,写入原来的图片路径
         # print("image_process", image_path)
         # g_r_i = get_rotated_image(_image_np, image_path)
@@ -115,17 +115,26 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
         #     image_bytes = f.read()
         image_bytes = np2bytes(image_resize)
         angle = from_idc_interface(image_bytes)
+        log('idc_process angle ' + str(angle))
         if judge_error_code(angle):
-            if is_from_docx:
-                return []
+            if return_angle:
+                if is_from_docx:
+                    return [], []
+                else:
+                    return angle, angle
             else:
-                return angle
+                if is_from_docx:
+                    return []
+                else:
+                    return angle
         # 根据角度旋转
-        image_pil = Image.fromarray(_image_np)
-        _image_np = np.array(image_pil.rotate(angle, expand=1))
+        _image_pil = Image.fromarray(_image_np)
+        _image_np = np.array(_image_pil.rotate(angle, expand=1))
         # 写入
         # idc_path = image_path.split(".")[0] + "_idc." + image_path.split(".")[-1]
         # cv2.imwrite(idc_path, image_np)
+        if return_angle:
+            return _image_np, angle
         return _image_np
 
     def isr_process(_image_np):
@@ -288,6 +297,23 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
         # 调用现成方法形成表格
         try:
             if list_line:
+
+                # 排除掉短且经过文字bbox中间的竖线
+                temp_list = []
+                for line in list_line:
+                    find_cnt = 0
+                    if abs(line[0]-line[2]) < abs(line[1]-line[3]) and abs(line[1] - line[3]) <= _image_np.shape[0] / 20:
+                        for t_obj in list_text_boxes:
+                            if abs(t_obj.bbox[0]-t_obj.bbox[2])/5 + min(t_obj.bbox[0], t_obj.bbox[2]) <= line[0] <= abs(t_obj.bbox[0]-t_obj.bbox[2])/5*4 + min(t_obj.bbox[0], t_obj.bbox[2]) and (t_obj.bbox[0]-t_obj.bbox[2]) <= 60:
+                                # print('match', line[0], t_obj.bbox[0], t_obj.bbox[2])
+                                find_cnt += 1
+                                if find_cnt >= 2:
+                                    break
+                    if find_cnt >= 2:
+                        continue
+                    temp_list.append(line)
+                list_line = temp_list
+
                 from format_convert.convert_tree import TableLine
                 list_lines = []
                 for line in list_line:
@@ -486,19 +512,31 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                     return text_list
 
                 # 判断ocr识别是否正确
-                if ocr_cant_read(text_list, box_list) and not idc_flag and False:
+                if ocr_cant_read(text_list, box_list) and not idc_flag:
+                # if True:
                     # 方向分类
-                    image_np = idc_process(image_np)
-                    # cv2.imshow("idc_process", image_np)
-                    # cv2.waitKey(0)
+                    image_np, angle = idc_process(image_np, return_angle=True)
                     if isinstance(image_np, list):
                         return image_np
+                    # 如果角度不变,旋转180
+                    if angle in [0, 360]:
+                        image_pil = Image.fromarray(image_np)
+                        image_np = np.array(image_pil.rotate(180, expand=1))
+                    # cv2.imshow("idc_process", image_np)
+                    # cv2.waitKey(0)
 
                     # 文字识别
                     text_list1, box_list_1 = ocr_process(image_np)
                     if judge_error_code(text_list1):
                         return text_list1
 
+                    # all_text = ''.join(text_list1)
+                    # all_text = re.sub('[\s\d]', '', all_text)
+                    # if len(re.findall(get_garble_code2(), all_text)) >= 2:
+                    # log('text_list1' + ''.join(text_list1))
+                    if len(text_list1) > 0 and ocr_cant_read(text_list1, box_list_1) and is_from_pdf:
+                        return [-16]
+
                     # 比较字数
                     # print("ocr process", len("".join(text_list)), len("".join(text_list1)))
                     if len("".join(text_list)) < len("".join(text_list1)):

+ 562 - 0
format_convert/convert_layout.py

@@ -0,0 +1,562 @@
+import os
+import sys
+sys.setrecursionlimit(10000)
+sys.path.append(os.path.dirname(__file__) + "/../")
+from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
+import re
+import traceback
+from bs4 import BeautifulSoup
+from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code
+from format_convert.wrapt_timeout_decorator import timeout
+
+
+class TreeNode:
+    def __init__(self, data):
+        self.data = data
+        self.children = []
+
+    def add_child(self, child_node):
+        self.children.append(child_node)
+
+
+def print_tree(node, level=0):
+    print("  " * level + str(node.data))
+    for child in node.children:
+        print_tree(child, level + 1)
+
+
+def print_tree_order(node, div_list, level=0):
+    text = "  " * level + div_list[node.data[0]].text
+    colors = [(255, 0, 0, 0.7), (0, 255, 0, 0.6), (0, 0, 255, 0.6), (255, 127, 0, 0.2),
+              # (123, 104, 238, 0.2),
+              (238, 238, 0, 0.2),
+              (255, 104, 255, 0.2)
+              ]
+
+    if level < len(colors):
+        color = colors[level]
+    else:
+        color = colors[-1]
+
+    text = '<div style="background-color: rgba{}";>'.format(str(color)) + text + '</div>'
+
+    if level == 0:
+        text = '<!DOCTYPE HTML><head><meta charset="UTF-8">' + text
+
+    with open('../layout.html', 'a') as f:
+        f.write(text)
+
+    for child in node.children:
+        print('node.child', child.data[:10])
+        print_tree_order(child, div_list, level + 1)
+
+
+class LayoutConvert:
+    def __init__(self, html):
+        self.html = html
+
+        self.order_type_list = ['[★]?(\d{1,3}[.])+[.\d]?',
+                                '[★]?[A-Z][.、]',
+                                '[★]?[a-z][.、]',
+                                '[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳]',
+                                '[ⅠⅡⅢⅣⅤⅥⅦⅧⅩⅪⅫ]',
+                                '[ⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹ]',
+                                '[❶❷❸❹❻❼❽❾❿]',
+                                '第[一二三四五六七八九十]{1,2}[章节篇]',
+                                '第\d{1,2}[章节篇]',
+                                '[((]\d{1,3}[))]',
+                                '[★]?\d{1,3}、',
+                                '[((][一二三四五六七八九十]{1,3}[))]',
+                                '[一二三四五六七八九十]{1,3}、',
+                                '包[1-9]{1,3}',
+                                '标段[1-9]{1,3}',
+                                ]
+
+        self.chinese_arabic_dict = {
+            '一': 1,
+            '二': 2,
+            '三': 3,
+            '四': 4,
+            '五': 5,
+            '六': 6,
+            '七': 7,
+            '八': 8,
+            '九': 9,
+            '十': 10,
+        }
+
+    def get_layout(self):
+        return
+
+    def recursion_get_tree(self, index_list, div_list, start_index, end_index):
+        print([start_index, end_index], div_list[start_index].text[:10], '-'*20)
+        tree_node = TreeNode([start_index, end_index])
+
+        if end_index - start_index == 1:
+            print([start_index, end_index], div_list[end_index-1].text[:10], '='*20)
+            return tree_node
+
+        temp_end_i = index_list[0][0]
+        for start_i, end_i in index_list:
+            if not start_index < start_i <= end_i <= end_index:
+                if start_i == 0:
+                    print('continue not start_index < start_i <= end_i <= end_index', start_i, end_i)
+                continue
+            if start_i < temp_end_i:
+                print('continue start_i < temp_end_i', start_i, temp_end_i, div_list[start_i])
+                continue
+
+            sub_tree_node = self.recursion_get_tree(index_list, div_list, start_i, end_i)
+            tree_node.add_child(sub_tree_node)
+            temp_end_i = end_i
+        print([start_index, end_index], div_list[end_index-1].text[:10], '='*20)
+        return tree_node
+
+    def get_order_number_tree(self, product=None):
+        def get_order_no(_ti, _div_text):
+            _tis = re.split('[.、]', str(_ti))
+            temp_tis = []
+            for _t in _tis:
+                if _t != '':
+                    temp_tis.append(_t)
+            _tis = temp_tis
+
+            _ti_order_no = None
+            if len(_tis) >= 2:
+                re.search('', _div_text)
+            else:
+                _match = re.search('[1-9]+', _div_text)
+                if _match:
+                    _ti_order_no = int(_match.group())
+                else:
+                    _match = re.search('[一二三四五六七八九十]+', _div_text)
+                    if _match:
+                        _ti_order_no = _match.group()
+                        temp_order_no = ''
+                        for o in _ti_order_no:
+                            temp_order_no += str(self.chinese_arabic_dict.get(o))
+                        _ti_order_no = int(temp_order_no)
+            return _ti_order_no
+
+
+        soup = BeautifulSoup(self.html, 'lxml')
+
+        div_list = soup.findAll('div')
+
+        type_index_list = []
+        range_index_list = []
+        cut_type_index_dict = {}
+        # temp_type_index_list = []
+
+        # 获取每一行的序号类型
+        for div_index, d in enumerate(div_list):
+            text = d.text
+
+            # 判断该行是什么序号类型
+            find_type_index = -1
+            for type_index, reg in enumerate(self.order_type_list):
+                if find_type_index >= 0:
+                    continue
+
+                match = re.finditer(reg, text)
+                for m in match:
+                    if m.span()[0] != 0:
+                        continue
+                    order = m.group()
+
+                    if type_index in [0, 1]:
+                        order = re.sub('[★]', '', order)
+
+                    # 普通情况,单层序号
+                    if type_index != 0:
+                        find_type_index = type_index
+                    # 特殊情况,多层序号
+                    else:
+                        ss = order.split('.')
+                        # if len(re.findall('[.]', m.group())) == 1:
+                        if len(ss) - ss.count('') == 1:
+                            find_type_index = 0
+                            # print('find_type_index1', find_type_index, text[:5])
+                        else:
+                            # 用小数表示多层序号
+                            find_type_index = re.sub('\d+', '0', order)
+                            find_type_index = re.sub('[.]', '', find_type_index)
+                            find_type_index = find_type_index[0] + '.' + find_type_index[1:-1] + '1'
+                            find_type_index = float(find_type_index)
+                            # print('find_type_index2', find_type_index, text[:5])
+                    break
+            type_index_list.append(find_type_index)
+
+        # 根据每一行的序号类型分块
+        for div_index, d in enumerate(div_list):
+            find_type_index = type_index_list[div_index]
+            sub_type_index_list = type_index_list[:div_index]
+            text = d.text
+            print(text)
+
+            # 若无序号类型,跳过
+            if find_type_index < 0:
+                # type_index_list.append(find_type_index)
+                print('continue -1')
+                print('-'*40)
+                continue
+
+            print('find_type_index, div_index', find_type_index, div_index)
+
+            # 已经存在相同的序号类型
+            if find_type_index in sub_type_index_list:
+            #     # 判断是否开始的序号
+            #     if (find_type_index >= 1 or find_type_index == 0) and len(re.findall('[1一]', text[:3])) == 1 \
+            #             and len(re.findall('[2-9二三四五六七八九十]', text[:3])) == 0:
+            #         # type_index_list.append(find_type_index)
+            #         final_index = None
+            #         for temp_div_index, temp_type in enumerate(sub_type_index_list):
+            #             if find_type_index == temp_type:
+            #                 final_index = temp_div_index
+            #         final_block_index = div_index
+            #         min_block_size = 100000
+            #         for block in range_index_list:
+            #             if block[0] <= final_index <= block[1] and block[1] - block[0] < min_block_size:
+            #                 min_block_size = block[1] - block[0]
+            #                 final_block_index = block[1]+1
+            #         if final_index is not None and [final_index, final_block_index] not in range_index_list:
+            #             range_index_list.append([final_index, final_block_index])
+            #             if cut_type_index_dict.get(find_type_index) is not None:
+            #                 if div_index > cut_type_index_dict[find_type_index]:
+            #                     cut_type_index_dict[find_type_index] = final_block_index
+            #             else:
+            #                 cut_type_index_dict[find_type_index] = final_block_index
+            #         print('continue 1')
+            #         print('cut_type_index_dict', cut_type_index_dict)
+            #         print('-'*40)
+            #         continue
+
+                # 判断是否开始的序号
+                # if 0 < find_type_index < 1 \
+                #         and len(re.findall('[1]', text[len(str(find_type_index))-1:len(str(find_type_index))+1])) == 1 \
+                #         and len(re.findall('[2-9]', text[len(str(find_type_index))-1:len(str(find_type_index))+1])) == 0:
+                #     # type_index_list.append(find_type_index)
+                #     final_index = None
+                #     for temp_div_index, temp_type in enumerate(sub_type_index_list):
+                #         if find_type_index == temp_type:
+                #             final_index = temp_div_index
+                #     final_block_index = div_index
+                #     min_block_size = 100000
+                #     for block in range_index_list:
+                #         if block[0] <= final_index <= block[1] and block[1] - block[0] < min_block_size:
+                #             min_block_size = block[1] - block[0]
+                #             final_block_index = block[1]+1
+                #     if final_index is not None and [final_index, final_block_index] not in range_index_list:
+                #         range_index_list.append([final_index, final_block_index])
+                #         if cut_type_index_dict.get(find_type_index) is not None:
+                #             if div_index > cut_type_index_dict[find_type_index]:
+                #                 cut_type_index_dict[find_type_index] = final_block_index
+                #         else:
+                #             cut_type_index_dict[find_type_index] = final_block_index
+                #     print('continue 2')
+                #     print('-'*40)
+                #     continue
+
+                # 找之前相同的序号类型的index,且index不能超过截断的该类型的index
+                last_index = len(sub_type_index_list) - 1 - sub_type_index_list[::-1].index(find_type_index)
+                print('find_type_index', find_type_index, [last_index, div_index], [sub_type_index_list[0], sub_type_index_list[-1]])
+                if last_index < cut_type_index_dict.get(find_type_index, 0):
+                    # type_index_list.append(find_type_index)
+                    print('continue 3 last_index < cut_type_index_dict ', last_index, cut_type_index_dict.get(find_type_index, 0))
+                    print('-'*40)
+                    continue
+
+                # 新增块
+                range_index_list.append([last_index, div_index])
+                print('find last_index add block', [last_index, div_index])
+
+                # 更新截断
+                if cut_type_index_dict.get(find_type_index) is not None:
+                    if div_index > cut_type_index_dict[find_type_index]:
+                        cut_type_index_dict[find_type_index] = div_index
+                else:
+                    cut_type_index_dict[find_type_index] = div_index
+
+                # 找到块了,那么块内的所有序号类型的截断到该块的最小index
+                final_type_index_dict = {}
+                for temp_div_index, temp_type in enumerate(sub_type_index_list[last_index+1:div_index]):
+                    temp_div_index += last_index + 1
+
+                    if temp_div_index < cut_type_index_dict.get(temp_type, 0):
+                        continue
+
+                    # 对块内有的类型的最后一个都新增块
+                    if temp_div_index <= range_index_list[-1][0]:
+                        continue
+                    final_type_index_dict[temp_type] = temp_div_index
+
+                for temp_type in final_type_index_dict.keys():
+                    final_index = final_type_index_dict.get(temp_type)
+                    if [final_index, div_index] not in range_index_list:
+                        print('add block cut_type_index_dict 1', cut_type_index_dict)
+                        range_index_list.append([final_index, div_index])
+                        print('add block ', [final_index, div_index])
+                        if cut_type_index_dict.get(temp_type) is not None:
+                            if div_index > cut_type_index_dict[temp_type]:
+                                cut_type_index_dict[temp_type] = div_index
+                        else:
+                            cut_type_index_dict[temp_type] = div_index
+
+                        print('add block cut_type_index_dict 2', cut_type_index_dict)
+
+
+                # temp_type_index_list = []
+            else:
+                print('find_type_index not in type_index_list')
+
+            print(cut_type_index_dict)
+            # 存储所有序号类型
+            # type_index_list.append(find_type_index)
+            # 存储块内的序号类型
+            # temp_type_index_list.append(find_type_index)
+            print('-'*40)
+
+        if not range_index_list:
+            print('no range_index_list')
+            return
+
+        # 排序
+        range_index_list.sort(key=lambda x: (x[0], x[1]))
+
+        # 生成最后的块
+        for temp_type in range(len(self.order_type_list)):
+            for div_index, d in enumerate(div_list[::-1]):
+                div_index = len(div_list) - 1 - div_index
+                if type_index_list[div_index] != temp_type:
+                    continue
+                if [div_index, div_index+1] not in range_index_list:
+                    range_index_list.append([div_index, len(div_list)-1])
+                    break
+
+        # last_block_index = range_index_list[-1][1]
+        # for div_index, d in enumerate(div_list[last_block_index:]):
+        #     div_index = div_index + last_block_index
+        #     if type_index_list[div_index] < 0:
+        #         continue
+        #     if [div_index, len(div_list)-1] not in range_index_list:
+        #         range_index_list.append([div_index, len(div_list)-1])
+
+        # 排序
+        range_index_list.sort(key=lambda x: (x[0], -x[1]))
+
+        print('type_index_list', type_index_list)
+
+        block_dict = {}
+        index_div_list = []
+        for range_index in range_index_list:
+            _text = ''
+
+            for d in div_list[range_index[0]:range_index[1]]:
+                _text += d.text
+            print(range_index, _text[:20])
+
+        # 合并重叠的
+        delete_range_index_list = []
+        # for i, range_index in enumerate(range_index_list):
+        #     if range_index in delete_range_index_list:
+        #         continue
+        #     for j in range(i+1, len(range_index_list)):
+        #         range_index2 = range_index_list[j]
+        #         if range_index2 in delete_range_index_list:
+        #             continue
+        #         if range_index[0] == range_index2[0] or range_index[1] == range_index2[1]:
+        #             delete_range_index_list.append(range_index2)
+
+        # 补充中间断开的
+        add_range_index_list = []
+        if range_index_list[0][0] != 0:
+            for j in range(0, range_index_list[0][0]):
+                add_range_index_list.append([j, j+1])
+        for i in range(1, len(range_index_list)):
+            range_index1 = range_index_list[i-1]
+            range_index2 = range_index_list[i]
+            if range_index1[1] != range_index2[0] or (range_index1[1] - range_index1[0] > 1 and range_index1[0] != range_index2[0]):
+                for j in range(range_index1[0], range_index2[0]):
+                    add_range_index_list.append([j, j+1])
+                # add_range_index_list.append([range_index1[0], range_index2[0]])
+
+            # if range_index1[1] - range_index1[0] > 1 and range_index1[0] != range_index2[0]:
+            #     add_range_index_list.append([range_index1[0]+1, range_index2[0]])
+
+        print('delete_range_index_list', delete_range_index_list)
+        print('add_range_index_list', add_range_index_list)
+
+        print('len(range_index_list)', len(range_index_list))
+        for range_index in delete_range_index_list:
+            if range_index in range_index_list:
+                range_index_list.remove(range_index)
+
+        print('len(range_index_list)', len(range_index_list))
+
+        range_index_list += add_range_index_list
+        range_index_list.sort(key=lambda x: (x[0], -x[1]))
+
+        print('len(range_index_list)', len(range_index_list))
+
+        tree_root = self.recursion_get_tree(range_index_list, div_list, 0, len(div_list))
+        # print_tree(tree_root)
+
+        with open('../layout.html', 'w') as f:
+            f.write('')
+
+        print_tree_order(tree_root, div_list)
+
+        with open('../origin.html', 'w') as f:
+            f.write(self.html)
+
+        # 打印某个产品的参数
+        if product:
+            candidate_div_list = []
+            for i, div in enumerate(div_list):
+                div = div.text
+                if i == 0 or i == len(div_list)-1:
+                    continue
+                if not re.search(product, div):
+                    continue
+
+                print('find product', div[:20])
+
+                type_index = type_index_list[i]
+
+                type_index_after = None
+                for ti in type_index_list[i+1:]:
+                    if ti != -1:
+                        type_index_after = ti
+                        break
+                type_index_before = None
+                for ti in type_index_list[:i][::-1]:
+                    if ti != -1:
+                        type_index_before = ti
+                        break
+
+                print('type_index, type_index_before, type_index_after1', type_index, type_index_before, type_index_after)
+
+                # 复用序号样式
+                dup_type_index_flag = 0
+                if type_index_after == type_index:
+                    dup_type_index_flag = 1
+
+                print('type_index, type_index_before, type_index_after2', type_index, type_index_before, type_index_after)
+
+                block_type_list = []
+                block_div_list = []
+                no_order_type_list = []
+                sub_type_index_list = type_index_list[i:]
+                type_index_pair1 = [type_index_before, type_index]
+                type_index_pair2 = [type_index, type_index_after]
+                for j, ti in enumerate(sub_type_index_list):
+                    real_j = j + i
+                    if j == 0 or j == len(sub_type_index_list) - 1:
+                        continue
+                    ti_previous = sub_type_index_list[j-1]
+                    ti_next = sub_type_index_list[j+1]
+                    ti_pair1 = [ti_previous, ti_next]
+                    ti_pair2 = [ti, ti_next]
+                    _div = div_list[real_j].text
+
+                    # 判断多层还是单层,且是否第一个
+                    tis = re.split('[.、]', str(ti))
+                    temp_tis = []
+                    for _ti in tis:
+                        if _ti != '':
+                            temp_tis.append(_ti)
+                    tis = temp_tis
+                    break_flag1 = 0
+                    if len(tis) >= 2:
+                        if len(re.findall('[1一]{2,}', tis[-1])) >= 1 or len(re.findall('[2-9二三四五六七八九十]', tis[-1])) != 0:
+                            break_flag1 = 1
+                    else:
+                        if len(re.findall('[1一]{2,}', _div[:3])) >= 1 or len(re.findall('[2-9二三四五六七八九十]', _div[:6])) != 0:
+                            break_flag1 = 1
+
+                    # 有复用的,与搜索的type_index相同且连续,但与之前的相同的type_index的数字不连续
+                    break_flag2 = 0
+                    if dup_type_index_flag and type_index == ti and ti in block_type_list:
+                        last_ti_index = block_type_list[::-1].index(ti)
+                        last_ti_index = len(block_type_list) - 1 - last_ti_index
+                        last_ti_div = block_div_list[last_ti_index]
+                        last_ti_order_no = get_order_no(ti, last_ti_div)
+                        ti_order_no = get_order_no(ti, _div)
+                        type_index_order_no = get_order_no(type_index, div)
+                        print('last_ti_order_no, ti_order_no, type_index_order_no', last_ti_order_no, ti_order_no, type_index_order_no)
+                        print(last_ti_div[:10], _div[:10], div[:10])
+
+                        if None not in [type_index_order_no, last_ti_order_no, ti_order_no]:
+                            if ti_order_no - type_index_order_no == 1 and ti_order_no - last_ti_order_no != 1:
+                                break_flag2 = 1
+
+                    if break_flag2:
+                        break
+                    # 碰到很大的序号类型
+                    elif ti in [7, 8]:
+                        break
+                    # 碰到不是从1开始的
+                    elif ti == -1:
+                        no_order_type_list.append(ti)
+                        block_type_list.append(ti)
+                        block_div_list.append(_div)
+                    elif ti not in block_type_list and break_flag1:
+                        print('not 1 start break', _div[:6], len(re.findall('[1一]', _div[:3])), len(re.findall('[2-9二三四五六七八九十]', _div[:6])))
+                        print(block_div_list)
+                        print(block_type_list)
+                        break
+                    elif not dup_type_index_flag and ti not in [type_index, type_index_before, type_index_after]:
+                        block_type_list.append(ti)
+                        block_div_list.append(_div)
+                        no_order_type_list = []
+                    else:
+                        # 遇到相同类型的组合
+                        if not dup_type_index_flag and (type_index_pair1 == ti_pair1):
+                            block_type_list.append(ti)
+                            block_div_list.append(_div)
+                            print('type_index_pair1 == ti_pair1 or type_index_pair2 == ti_pair2 break',
+                                  _div[:6], type_index_pair1, ti_pair1, type_index_pair2, ti_pair2)
+                            break
+                        else:
+                            no_order_type_list = []
+                            block_type_list.append(ti)
+                            block_div_list.append(_div)
+
+                if not block_type_list:
+                    continue
+
+                # 排除末尾为非序号的
+                if block_type_list[-1] == -1:
+                    block_type_list = block_type_list[:len(block_type_list)-len(no_order_type_list)]
+                    block_div_list = block_div_list[:len(block_div_list)-len(no_order_type_list)]
+
+                candidate_div_list.append(block_div_list)
+
+            print('len(candidate_div_list)', len(candidate_div_list))
+            print('candidate_div_list', candidate_div_list)
+            if candidate_div_list:
+                candidate_div_list.sort(key=lambda x: len(x))
+                for div in candidate_div_list:
+                    print(len(div), div)
+                print('='*10, product, '='*10)
+                for div in candidate_div_list[-1]:
+                    print(div)
+
+
+
+        # print(d.text)
+
+    def order_show_in_layout(self, tree_root, div_list):
+        """Delegate to ``print_tree_order`` with the given tree and div list.
+
+        :param tree_root: root node of the order-number tree, passed through unchanged.
+        :param div_list: list of divs the tree refers to, passed through unchanged.
+        """
+        print_tree_order(tree_root, div_list)
+
+
+
+# with open('../result.html', 'r') as f:
+# NOTE(review): the statements below sit at module level, so they execute on
+# import and read a hard-coded, machine-specific Windows path; consider moving
+# them under an ``if __name__ == '__main__':`` guard.
+with open(r'C:\Users\Administrator\Desktop\test_layout\4.html', 'r') as f:
+    html = f.read()
+
+# Run the order-number tree extraction; the argument is presumably the product
+# name whose parameter block should be printed -- confirm against the method.
+LayoutConvert(html).get_order_number_tree('连续性血液净化设备')
+
+
+# Scratch check: maps the index of 3 in the reversed list back to its index in
+# the original list (the same "last occurrence" idiom used inside the method).
+_list = [1, 3, 5, 7, 9]
+print(len(_list) - 1 - _list[::-1].index(3))

+ 2 - 2
format_convert/convert_need_interface.py

@@ -555,8 +555,8 @@ def from_yolo_interface(image_stream, from_remote=FROM_REMOTE):
 
 
 def interface_pool_gunicorn(interface_type):
-    # if get_platform() == 'Windows':
-    #     set_flask_global()
+    if get_platform() == 'Windows':
+        set_flask_global()
 
     ip_port_flag_dict = _global.get("ip_port_flag")
     ip_port_dict = _global.get("ip_port")

Разлика између датотеке није приказан због своје велике величине
+ 37 - 837
format_convert/convert_pdf.py


+ 85 - 11
format_convert/convert_test.py

@@ -5,13 +5,27 @@ import random
 import sys
 import time
 from glob import glob
+
+import requests
+
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
+from pdfminer.converter import PDFPageAggregator
+from pdfminer.layout import LAParams, LTLine
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfparser import PDFParser
+from pdfplumber import PDF
+
+from otr.table_line_pdf import _plot
+
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 from format_convert.utils import get_platform, request_post, get_md5_from_bytes
 from format_convert.convert import to_html
 import multiprocessing as mp
 
 
-def test_one(p, page_no_range=None, from_remote=False):
+def test_one(p, page_no_range=None, from_remote=False, timeout=300, save_middle=None):
     start_time = time.time()
     with open(p, "rb") as f:
         file_bytes = f.read()
@@ -19,14 +33,15 @@ def test_one(p, page_no_range=None, from_remote=False):
 
     _md5 = get_md5_from_bytes(file_bytes)
 
-    data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range}
+    data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range,
+            'timeout': timeout, 'save_middle': save_middle}
     if from_remote:
         _url = 'http://121.46.18.113:15010/convert'
         # _url = 'http://192.168.2.103:15010/convert'
         # _url = 'http://192.168.2.102:15011/convert'
         # _url = 'http://172.16.160.65:15010/convert'
         # _url = 'http://127.0.0.1:15010/convert'
-        result = json.loads(request_post(_url, data, time_out=10000))
+        result = json.loads(request_post(_url, data, time_out=timeout+20))
         text_str = ""
         for t in result.get("result_html"):
             text_str += t
@@ -42,6 +57,25 @@ def test_one(p, page_no_range=None, from_remote=False):
     print(time.time()-start_time)
 
 
+def test_path():
+    """Post a server-side file path to the convert service and save the result.
+
+    Sends ``file_path`` (rather than file content) to the hard-coded ``_url``,
+    writes the concatenated ``result_html`` to ``result.html`` one directory
+    above this file, and prints a short summary of the response.
+    """
+    # _url = 'http://121.46.18.113:15010/convert'
+    _url = 'http://192.168.0.115:15010/convert'
+    print(_url)
+    p = '/data/fangjiasheng/format_conversion_maxcompute/1.png'
+    data = {"file_path": p, "type": p.split(".")[-1], "filemd5": 100, 'page_no': '1,-1',
+            'timeout': 10000, 'save_middle': None}
+    print(str(data))
+    # result = json.loads(request_post(_url, data, time_out=1000))
+    # requests.post returns a Response object, which json.loads cannot parse
+    # (it raises TypeError); decode the response body text instead.
+    response = requests.post(_url, data)
+    result = json.loads(response.text)
+    text_str = "".join(result.get("result_html"))
+    to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
+            text_str)
+    print("result_text", result.get("result_text")[0][:20])
+    print("is_success", result.get("is_success"))
+
+
 def test_duplicate(path_list, process_no=None):
     start_time = time.time()
     # random.shuffle(path_list)
@@ -81,24 +115,28 @@ def test_maxcompute(p, page_no_range=None):
 if __name__ == '__main__':
     if get_platform() == "Windows":
         # file_path = "C:/Users/Administrator/Desktop/2.png"
-        file_path = "C:/Users/Administrator/Desktop/test_xls/error4.xls"
+        # file_path = "C:/Users/Administrator/Desktop/test_xls/error4.xls"
         # file_path = "C:/Users/Administrator/Desktop/test_doc/error5.doc"
         # file_path = "D:/BIDI_DOC/比地_文档/1677829036789.pdf"
         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
-        # file_path = "C:/Users/Administrator/Downloads/1688432101601.xlsx"
+        # file_path = "C:/Users/Administrator/Downloads/W020230512399773694376.jpg"
         # file_path = "C:/Users/Administrator/Desktop/test_doc/error14.docx"
-        # file_path = "C:/Users/Administrator/Desktop/test_image/error36.png"
+        file_path = "C:/Users/Administrator/Desktop/test_image/error9-1.png"
         # file_path = "C:/Users/Administrator/Desktop/test_b_table/error1.png"
-        # file_path = "C:/Users/Administrator/Desktop/test_pdf/表格连接error/error7.pdf"
+        # file_path = "C:/Users/Administrator/Desktop/test_pdf/直接读表格线error/error62.pdf"
         # file_path = "C:/save_b_table/0-0895e32470613dd7be1139eefd1342c4.png"
     else:
         file_path = "1660296734009.pdf"
 
-    test_one(file_path, page_no_range='1,-1', from_remote=True)
+    test_one(file_path, page_no_range='1,-1', from_remote=True, timeout=1000, save_middle=None)
+
+    # test_path()
 
-    file_path = "C:/Users/Administrator/Downloads/"
+    # file_path = "C:/Users/Administrator/Downloads/"
     # file_path = r"C:\Users\Administrator\Desktop\test_pdf\直接读表格线error/"
     # file_path = r"C:\Users\Administrator\Desktop\test_pdf\表格连接error/"
+    # file_path = r"C:\Users\Administrator\Desktop\test_b_table/"
+    file_path = r"C:\Users\Administrator\Desktop\test_pdf\普通error/"
     test_pdf_list = [['6df7f2bd5e8cac99a15a6c012e0d82a8.pdf', '34,52'],
                      ['ca6a86753400d6dd6a1b324c5678b7fb.pdf', '18,69'],
                      ['a8380bf795c71caf8185fb11395df138.pdf', '27,38'],
@@ -106,12 +144,48 @@ if __name__ == '__main__':
                      ['dd1adb4dc2014c7abcf403ef15a01eb5.pdf', '2,12'],
                      ['error50.pdf', '1,-1'],
                      ['error59.pdf', '1,-1'],
-                     ['error51.pdf', '1,-1'],
+                     ['error60.pdf', '1,-1'],
+                     ['error61.pdf', '1,-1'],
                      ['error7.pdf', '39,57'],
+                     ['error8.pdf', '7,12'],
+                     ['error23.pdf', '1,-1']
                      ]
-    index = 1
+    index = 11
     # test_one(file_path+test_pdf_list[index][0], page_no_range=test_pdf_list[index][1], from_remote=True)
 
+    # from pdfplumber.table import TableFinder
+    # fp = open(file_path+test_pdf_list[index][0], 'rb')
+    # parser = PDFParser(fp)
+    # doc_pdfminer = PDFDocument(parser)
+    # rsrcmgr = PDFResourceManager()
+    # laparams = LAParams(line_overlap=0.01,
+    #                     char_margin=0.3,
+    #                     line_margin=0.01,
+    #                     word_margin=0.01,
+    #                     boxes_flow=0.1, )
+    # device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+    # interpreter = PDFPageInterpreter(rsrcmgr, device)
+    # doc_top = 0
+    # doc_pdfplumber = PDF(fp)
+    # pages = PDFPage.create_pages(doc_pdfminer)
+    # from pdfplumber.page import Page as pdfPage
+    # for page in pages:
+    #     page_plumber = pdfPage(doc_pdfplumber, page, page_number=1, initial_doctop=doc_top)
+    #     table_finder = TableFinder(page_plumber)
+    #     all_width_zero = True
+    #     for _edge in table_finder.get_edges():
+    #         if _edge.get('linewidth') and _edge.get('linewidth') > 0:
+    #             all_width_zero = False
+    #             break
+    #     lt_line_list = []
+    #     for _edge in table_finder.get_edges():
+    #         # print(_edge)
+    #         if _edge.get('linewidth', 0.1) > 0 or all_width_zero:
+    #             lt_line_list.append(LTLine(1, (float(_edge["x0"]), float(_edge["y0"])),
+    #                                        (float(_edge["x1"]), float(_edge["y1"]))))
+    #     _plot(lt_line_list, 'table', 1, 1)
+
+
 
     # 测试maxcompute模式
     # _process = mp.Process(target=test_maxcompute, args=(file_path, '1,-1',))

+ 7 - 3
format_convert/convert_tree.py

@@ -115,6 +115,9 @@ class _Image:
     def get_html(self):
         # 将Image转为Sentence,table
         self.convert()
+        if self.error_code == [-16]:
+            self.error_code = None
+            return "<div>#idc error#<div>"
         if self.error_code is not None:
             return ""
 
@@ -138,6 +141,10 @@ class _Image:
         obj_list = image_process(image_np, self.path, self.is_from_pdf, self.is_from_docx,
                                  self.b_table_from_text, self.b_table_text_obj_list,
                                  self.b_table_layout_size)
+        if judge_error_code(obj_list):
+            self.error_code = obj_list
+            return
+
         if self.b_table_from_text:
             temp_list = []
             for obj in obj_list:
@@ -145,9 +152,6 @@ class _Image:
                     temp_list.append(obj)
             obj_list = temp_list
 
-        if judge_error_code(obj_list):
-            self.error_code = obj_list
-            return
         for obj in obj_list:
             self.add_child(obj)
 

+ 1 - 0
format_convert/convert_xlsx.py

@@ -216,6 +216,7 @@ class XlsxConvert:
         # 拼接html表格
         text = '<table border="1">' + "\n"
         for row in row_list:
+            text = text + "<tr>"
             for col in row:
                 text = text + "<td>" + str(col) + "</td>" + "\n"
             text = text + "</tr>" + "\n"

+ 4 - 3
format_convert/monitor_process_config.py

@@ -48,14 +48,14 @@ for name in interface_list:
 
         # 设置命令
         if name == 'convert':
-            comm = "nohup " + gunicorn_path + " -w " + str(port_num) + " -t 300 --keep-alive 600 -b 0.0.0.0:" + str(port) + " --chdir " + project_path + "format_convert" + ' ' + name + ":app" + std_out
+            comm = "nohup " + gunicorn_path + " -w " + str(port_num) + " -t 6000 --keep-alive 600 -b 0.0.0.0:" + str(port) + " --chdir " + project_path + "format_convert" + ' ' + name + ":app" + std_out
         elif name == 'yolo':
             comm = "nohup " + gunicorn_path + " -w " + str(port_num) + " -t 300 --keep-alive 600 -b 0.0.0.0:" + str(port) + " --chdir " + project_path + "/botr/yolov8" + ' ' + name + "_interface:app" + std_out_gpu
         elif name == 'office':
             comm = "docker run --init -itd --log-opt max-size=10m --log-opt max-file=3 -p #:16000 soffice:v2 bash"
             office_port_comm_list = []
             for office_port in range(port, port + port_num):
-                office_port_comm_list = re.sub("#", str(office_port), comm)
+                office_port_comm_list.append(re.sub("#", str(office_port), comm))
             comm_dict[name] = office_port_comm_list
         else:
             comm = "nohup " + gunicorn_path + " -w " + str(port_num) + " -t 300 --keep-alive 600 -b 0.0.0.0:" + str(port) + " --chdir " + project_path + "/" + name + ' ' + name + "_interface:app" + std_out_gpu
@@ -69,7 +69,8 @@ for name in interface_list:
             comm_dict[name] = [gpu_comm + comm]
 
     # print(name, port_list, num_list, gpu_list)
-
+# print('comm_dict', comm_dict)
+# print('interface_port_dict', interface_port_dict)
 # convert_port_list = get_args_from_config(ip_port_dict, ip, "convert", "MASTER")
 # if convert_port_list:
 #     convert_port_list = convert_port_list[0]

Разлика између датотеке није приказан због своје велике величине
+ 73 - 7
format_convert/utils.py


+ 13 - 8
idc/idc_interface.py

@@ -47,7 +47,8 @@ tf.compat.v1.disable_eager_execution()
 sess = tf.compat.v1.Session(graph=tf.Graph())
 
 
-image_shape = (192, 192)
+# image_shape = (192, 192)
+image_shape = (640, 640)
 
 
 def adjust_direction(image_np, model, if_return_angle=False):
@@ -59,10 +60,11 @@ def adjust_direction(image_np, model, if_return_angle=False):
     # image_np = pil_resize(image_np, image_shape[0], image_shape[1])
 
     # 获取合适的文字区域
-    result_list, image_np = get_text_region(image_np, image_shape)
+    image_np = get_text_region(image_np, image_shape)
     # cv2.imshow("get_text_region", image_np)
     # cv2.waitKey(0)
-    if not result_list:
+    # print(type(image_np))
+    if type(image_np) != np.ndarray:
         return None
     if len(image_np.shape) < 3:
         image_np = np.expand_dims(image_np, axis=-1)
@@ -85,9 +87,12 @@ def adjust_direction(image_np, model, if_return_angle=False):
     if if_return_angle:
         return angle
     else:
-        # 根据角度旋转
-        image_pil = Image.fromarray(origin_image)
-        image_rotate = np.array(image_pil.rotate(angle, expand=1))
+        if angle not in [0, 360]:
+            # 根据角度旋转
+            image_pil = Image.fromarray(origin_image)
+            image_rotate = np.array(image_pil.rotate(angle, expand=1))
+        else:
+            image_rotate = origin_image
         return image_rotate
 
 
@@ -154,7 +159,7 @@ class IdcModels:
         _dir = os.path.abspath(os.path.dirname(__file__))
 
         # detect
-        model_path = _dir + "/models/cnn.h5"
+        model_path = _dir + "/models/e484-f10.96.h5"
         with sess.as_default():
             with sess.graph.as_default():
                 self.model = direction_model(input_shape=(image_shape[0], image_shape[1], 1),
@@ -167,7 +172,7 @@ class IdcModels:
 
 def test_idc_model(from_remote=False):
     idc_model = IdcModels().get_model()
-    paths = glob("C:/Users/Administrator/Desktop/test_image/111.jpg")
+    paths = glob("C:/Users/Administrator/Desktop/test_image/error43.png")
     # file_path = "C:/Users/Administrator/Desktop/test_image/error10.jpg"
     for file_path in paths:
         img_np = cv2.imread(file_path)

+ 32 - 1
idc/model.py

@@ -16,7 +16,7 @@ import keras.backend as K
 
 def direction_model(input_shape, output_shape):
     model = cnn_model(input_shape, output_shape)
-    print(input_shape, output_shape)
+    # print(input_shape, output_shape)
     # model = mobile_net_v3_tiny(input_shape, output_shape)
     # model = fpn(input_shape, output_shape)
     # model.summary(line_length=100)
@@ -24,6 +24,37 @@ def direction_model(input_shape, output_shape):
 
 
 def cnn_model(input_shape, output_shape):
+    """Build the direction-classification CNN as an uncompiled Keras ``Model``.
+
+    Structure: one Conv-BN-LeakyReLU-MaxPool stem, then ``conv_num`` identical
+    Conv-BN-LeakyReLU-MaxPool stages (16 filters, 3x3, 'same' padding), a
+    final 6x6 max pool, a softmax ``Dense`` layer and two squeezes on axis 1.
+
+    :param input_shape: shape handed to ``Input``.
+    :param output_shape: number of units of the softmax ``Dense`` layer.
+    :return: ``Model`` mapping the input tensor to the squeezed softmax output.
+    """
+    conv_num = 6
+
+    # Input
+    _input = Input(shape=input_shape, dtype="float32", name="input")
+
+    # Stem stage.
+    conv = Conv2D(16, (3, 3), padding='same')(_input)
+    bn = BatchNormalization()(conv)
+    # alpha=0. means a zero slope for negative inputs.
+    relu = LeakyReLU(alpha=0.)(bn)
+    max_pool = MaxPool2D()(relu)
+    # Repeated stages; each one consumes the previous stage's pooled output.
+    for i in range(conv_num):
+        conv = Conv2D(16, (3, 3), padding='same')(max_pool)
+        bn = BatchNormalization()(conv)
+        relu = LeakyReLU(alpha=0.)(bn)
+        # conv = Conv2D(32, (1, 1), padding='same')(relu)
+        # bn = BatchNormalization()(conv)
+        # relu = LeakyReLU(alpha=0.)(bn)
+        max_pool = MaxPool2D()(relu)
+    # conv = Conv2D(16, (3, 3), padding='same')(max_pool)
+    # bn = BatchNormalization()(conv)
+    # relu = LeakyReLU(alpha=0.)(bn)
+    # NOTE(review): this pools `relu`, not the `max_pool` produced by the last
+    # loop iteration, so that last 2x2 pool is not part of the model -- confirm
+    # this is intended before changing it (trained weights depend on the graph).
+    max_pool = MaxPool2D((6, 6))(relu)
+
+    dense = layers.Dense(output_shape, activation='softmax')(max_pool)
+    # Presumably the pooled map is 1x1 at this point, so the two squeezes drop
+    # both spatial axes and leave (batch, output_shape) -- TODO confirm.
+    squeeze = Lambda(lambda x: K.squeeze(x, axis=1))(dense)
+    squeeze = Lambda(lambda x: K.squeeze(x, axis=1))(squeeze)
+
+    model = Model(inputs=_input, outputs=squeeze)
+    return model
+
+
+def cnn_model_240314(input_shape, output_shape):
     conv_num = 5
 
     # Input

BIN
idc/models/e484-f10.96.h5


+ 38 - 1
idc/pre_process.py

@@ -59,7 +59,7 @@ def get_img_label(img_np, size, cls_num=4):
     return img_label_list
 
 
-def get_text_region(img_np, size):
+def get_text_region2(img_np, size):
     img_np = remove_black_border(img_np)
     origin_h, origin_w = img_np.shape[:2]
     gray = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
@@ -198,6 +198,43 @@ def get_text_region(img_np, size):
     return result_list, gray
 
 
+def get_text_region3(img_np, size):
+    """Return an empty region list plus a grayscale copy of the image.
+
+    The image is first passed through ``remove_black_border``; ``gray`` is the
+    BGR->gray conversion of that result at its original resolution.
+
+    :param img_np: image array that ``cv2.cvtColor`` can convert with
+        ``COLOR_BGR2GRAY``.
+    :param size: unused in this variant.
+    :return: ``(result_list, gray)`` where ``result_list`` is always ``[]``.
+    """
+    img_np = remove_black_border(img_np)
+    # NOTE(review): origin_h / origin_w are never read below.
+    origin_h, origin_w = img_np.shape[:2]
+    gray = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
+
+    h, w = get_best_predict_size2(img_np, threshold=640)
+    img_np = pil_resize(img_np, h, w)
+
+    # 1.  convert to grayscale
+    # NOTE(review): the resized grayscale image is computed but not returned;
+    # the full-resolution `gray` is returned instead -- confirm this is intended.
+    img_np = cv2.cvtColor(img_np, cv2.COLOR_BGR2GRAY)
+
+    result_list = []
+    return result_list, gray
+
+
+def get_text_region(img_np, size=(640, 640)):
+    """Centre-crop, resize and convert an image to grayscale.
+
+    :param img_np: image array; its first two axes are cropped and it must be
+        convertible with ``cv2.COLOR_BGR2GRAY`` after resizing.
+    :param size: the two values passed, in order, to ``pil_resize``.
+    :return: the grayscale image produced by ``cv2.cvtColor``.
+    """
+    max_side = 2000
+    height, width = img_np.shape[:2]
+
+    # An axis longer than max_side keeps only its central max_side pixels;
+    # a shorter axis is left whole (the slice simply covers all of it).
+    top = (height - max_side) // 2 if height > max_side else 0
+    left = (width - max_side) // 2 if width > max_side else 0
+    cropped = img_np[top:top + max_side, left:left + max_side]
+
+    # h, w = get_best_predict_size2(img_np, threshold=640)
+    dim0, dim1 = size[0], size[1]
+    resized = pil_resize(cropped, dim0, dim1)
+
+    return cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
+
+
 def gen(paths, batch_size=2, shape=(640, 640), cls_num=4, is_test=False):
     def choose(_paths, _i):
         while True:

Разлика између датотеке није приказан због своје велике величине
+ 0 - 0
layout.html


+ 6 - 3
ocr/ocr_interface.py

@@ -22,6 +22,9 @@ from format_convert import _global
 app = Flask(__name__)
 
 
+use_angle_cls = False
+
+
 @app.route('/ocr', methods=['POST'])
 def _ocr():
     _global._init()
@@ -75,9 +78,9 @@ def picture2text(img_data, ocr_model, only_rec=0):
 
         # 预测
         if only_rec:
-            results = ocr_model.ocr(img, det=False, rec=True, cls=False)
+            results = ocr_model.ocr(img, det=False, rec=True, cls=use_angle_cls)
         else:
-            results = ocr_model.ocr(img, det=True, rec=True, cls=False)
+            results = ocr_model.ocr(img, det=True, rec=True, cls=use_angle_cls)
 
         # 循环每张图片识别结果
         text_list = []
@@ -124,7 +127,7 @@ class OcrModels:
         from ocr.paddleocr import PaddleOCR
         try:
             log('----------- init ocr model ---------------')
-            self.ocr_model = PaddleOCR(use_angle_cls=True, lang="ch")
+            self.ocr_model = PaddleOCR(use_angle_cls=use_angle_cls, lang="ch")
         except:
             print(traceback.print_exc())
             raise RuntimeError

+ 6 - 6
otr/table_line_new.py

@@ -166,7 +166,7 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
     return line_list
 
 
-def table_line_pdf(line_list, page_w, page_h, is_test=0):
+def table_line_pdf_post_process(line_list, page_w, page_h, is_test=0):
     for i, line in enumerate(line_list):
         line_list[i] = [int(x) for x in line]
 
@@ -188,7 +188,7 @@ def table_line_pdf(line_list, page_w, page_h, is_test=0):
         else:
             if is_test:
                 print(line)
-    log("pdf divide rows and cols " + str(time.time() - start_time))
+    # log("pdf divide rows and cols " + str(time.time() - start_time))
     show(row_line_list + col_line_list, title="divide", mode=2, is_test=is_test)
 
     # 两种线都需要存在,否则跳过
@@ -201,7 +201,7 @@ def table_line_pdf(line_list, page_w, page_h, is_test=0):
     show(row_line_list + col_line_list, title="merge", mode=2, is_test=is_test)
 
     # 计算交点
-    print('img_new.shape', img_new.shape)
+    # print('img_new.shape', img_new.shape)
     cross_points = get_points(row_line_list, col_line_list, (img_new.shape[0], img_new.shape[1]))
     if not cross_points:
         return []
@@ -252,7 +252,7 @@ def table_line_pdf(line_list, page_w, page_h, is_test=0):
         cross_points = get_points(row_line_list, col_line_list, (img_new.shape[0], img_new.shape[1]))
         split_lines, split_y = get_split_line(cross_points, col_line_list, img_new)
         area_row_line_list, area_col_line_list, area_point_list = get_split_area(split_y, row_line_list, col_line_list, cross_points)
-    log("pdf fix_outline " + str(time.time() - start_time))
+    # log("pdf fix_outline " + str(time.time() - start_time))
 
     # 根据区域循环
     for i in range(len(area_point_list)):
@@ -270,7 +270,7 @@ def table_line_pdf(line_list, page_w, page_h, is_test=0):
         # 修复内部缺线
         start_time = time.time()
         sub_row_line_list, sub_col_line_list = fix_inner(sub_row_line_list, sub_col_line_list, sub_point_list)
-        log("pdf fix_inner " + str(time.time() - start_time))
+        # log("pdf fix_inner " + str(time.time() - start_time))
         show(sub_row_line_list + sub_col_line_list, title="fix_inner1", mode=2, is_test=is_test)
 
         # 修复内部线后重新计算交点
@@ -289,7 +289,7 @@ def table_line_pdf(line_list, page_w, page_h, is_test=0):
     line_list = row_line_list + col_line_list
     # 打印处理后线
     show(line_list, title="all", img=img_show, mode=5, is_test=is_test)
-    log("pdf otr postprocess table_line " + str(time.time() - start_time))
+    # log("table_line_pdf cost: " + str(time.time() - start_time))
     return line_list
 
 

+ 624 - 0
otr/table_line_pdf.py

@@ -0,0 +1,624 @@
+import copy
+import math
+import random
+import time
+import numpy as np
+import cv2
+from matplotlib import pyplot as plt
+from pdfminer.layout import LTTextContainer, LTRect, LTCurve, LTLine
+from scipy.stats import linregress
+from shapely.geometry import LineString
+from format_convert.utils import log, bbox_iou
+from otr.table_line_new import table_line_pdf_post_process
+
+# Module-level page size. These are only placeholder defaults: table_line_pdf()
+# overwrites both via globals().update(...) for every page it processes, and
+# get_cross_line() reads them to clamp extended line endpoints to the page.
+page_w = 100
+page_h = 100
+
+
+def _plot(_line_list, title, mode=1, show=1):
+    if not show:
+        return
+
+    for _line in _line_list:
+        if mode == 1:
+            x0, y0, x1, y1 = _line.__dict__.get("bbox")
+        elif mode == 2:
+            x0, y0, x1, y1 = _line
+        plt.plot([x0, x1], [y0, y1])
+    plt.title(title)
+    plt.show()
+    return
+
+
+def is_cross(A, B, C, D):
+    if A[0] == B[0] == C[0] == D[0]:
+        if A[1] <= C[1] <= B[1] or A[1] <= D[1] <= B[1] \
+                or C[1] <= A[1] <= D[1] or C[1] <= B[1] <= D[1]:
+            return True
+    if A[1] == B[1] == C[1] == D[1]:
+        if A[0] <= C[0] <= B[0] or A[0] <= D[0] <= B[0] \
+                or C[0] <= A[0] <= D[0] or C[0] <= B[0] <= D[0]:
+            return True
+
+    line1 = LineString([A, B])
+    line2 = LineString([C, D])
+
+    int_pt = line1.intersection(line2)
+    try:
+        point_of_intersection = int_pt.x, int_pt.y
+        return True
+    except:
+        return False
+
+
+def calculate_k(bbox):
+    x = [bbox[0], bbox[2]]
+    y = [bbox[1], bbox[3]]
+    slope, intercept, r_value, p_value, std_err = linregress(x, y)
+    # print('k', slope)
+    if math.isnan(slope):
+        slope = 0
+    return slope
+
+
+def line_iou(line1, line2, axis=0):
+    if line1[0][axis] <= line2[0][axis] <= line2[1][axis] <= line1[1][axis]:
+        return 1.0
+    if line2[0][axis] <= line1[0][axis] <= line1[1][axis] <= line2[1][axis]:
+        return 1.0
+
+    inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
+    # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
+    union = min(abs(line1[0][axis] - line1[1][axis]), abs(line2[0][axis] - line2[1][axis]))
+    if union in [0, 0.]:
+        iou = 0.
+    else:
+        iou = inter / union
+    return iou
+
+
+def get_cross_line(_line_list, threshold=1, cross_times=0):
+    start_time = time.time()
+
+    start_time1 = time.time()
+    # 分横线竖线
+    new_line_list = []
+    for line in _line_list:
+        if abs(line[0]-line[2]) >= abs(line[1]-line[3]):
+            new_line = [max(0, line[0] - threshold), line[1], min(line[2] + threshold, page_w), line[3]]
+        else:
+            new_line = [line[0], max(0, line[1] - threshold), line[2], min(line[3] + threshold, page_h)]
+        new_line_list.append(new_line)
+
+    _cross_line_list = []
+    for i in range(len(new_line_list)):
+        line1 = new_line_list[i]
+
+        # line1的计算区域
+        line1_area = [max(0, line1[0]-threshold), max(0, line1[1]-threshold),
+                      min(page_w, line1[2]+threshold), min(page_h, line1[3]+threshold)]
+
+        # line1是横线还是竖线
+        if abs(line1[0] - line1[2]) >= abs(line1[1]-line1[3]):
+            line1_is_row = 1
+        else:
+            line1_is_row = 0
+
+        _times = 0
+        for j in range(len(new_line_list)):
+            if i == j:
+                continue
+
+            line2 = new_line_list[j]
+            if abs(line2[0] - line2[2]) >= abs(line2[1]-line2[3]):
+                line2_is_row = 1
+            else:
+                line2_is_row = 0
+
+            # 十字交叉的横竖线直接判断交点
+            if line1_is_row ^ line2_is_row:
+                if (line1_is_row and line1[0] <= line2[0] <= line1[2] and line2[1] <= line1[1] <= line2[3]) \
+                        or (line2_is_row and line2[0] <= line1[0] <= line2[2] and line1[1] <= line2[1] <= line1[3]):
+                    _times += 1
+                    if _times >= cross_times:
+                        _cross_line_list += [line1]
+                        break
+                    continue
+
+            # 不在计算区域的直接跳过
+            if not((line1_area[0] <= line2[0] <= line1_area[2] and line1_area[1] <= line2[1] <= line1_area[3])
+                   or (line1_area[0] <= line2[2] <= line1_area[2] and line1_area[1] <= line2[3] <= line1_area[3]) or ()):
+                continue
+
+            if is_cross(line1[:2], line1[2:4], line2[:2], line2[2:4]):
+                _times += 1
+                if _times >= cross_times:
+                    _cross_line_list += [line1]
+                    break
+    _cross_line_list1 = _cross_line_list
+    # print('get_cross_line new', time.time()-start_time1)
+    # start_time1 = time.time()
+    #
+    # # 根据是否有交点判断表格线
+    # _cross_line_list = []
+    # for line1 in _line_list:
+    #     if line1 in _cross_line_list:
+    #         continue
+    #     if abs(line1[2] - line1[0]) > abs(line1[3] - line1[1]):
+    #         p1 = [max(0, line1[0] - threshold), line1[1]]
+    #         p2 = [min(line1[2] + threshold, page_w), line1[3]]
+    #     else:
+    #         p1 = [line1[0], max(0, line1[1] - threshold)]
+    #         p2 = [line1[2], min(line1[3] + threshold, page_h)]
+    #     line1 = [p1[0], p1[1], p2[0], p2[1]]
+    #     _times = 0
+    #     for line2 in _line_list:
+    #         if abs(line2[2] - line2[0]) > abs(line2[3] - line2[1]):
+    #             p3 = [max(0, line2[0] - threshold), line2[1]]
+    #             p4 = [min(line2[2] + threshold, page_w), line2[3]]
+    #         else:
+    #             p3 = [line2[0], max(0, line2[1] - threshold)]
+    #             p4 = [line2[2], min(line2[3] + threshold, page_h)]
+    #         line2 = [p3[0], p3[1], p4[0], p4[1]]
+    #         if line1 == line2:
+    #             continue
+    #         if is_cross(p1, p2, p3, p4):
+    #             _times += 1
+    #             if _times >= cross_times:
+    #                 _cross_line_list += [line1]
+    #                 break
+    #
+    # if len(_cross_line_list1) > 0 or len(_cross_line_list) > 0:
+    #     print('get_cross_line old', time.time()-start_time1)
+    #     print(len(_cross_line_list1), len(_cross_line_list))
+
+    log('get_cross_line cost: ' + str(time.time()-start_time))
+    return _cross_line_list1
+
+
+def merge_line(_line_list, threshold=2):
+    """Merge neighbouring, overlapping lines into longer axis-aligned lines.
+
+    Lines are ``[x0, y0, x1, y1]``. The function runs two passes: vertical lines
+    are grouped into columns and merged, then horizontal lines are grouped into
+    rows and merged. Merged vertical lines get the x of the first line of their
+    column for both endpoints; merged horizontal lines get the y of the first
+    line of their row.
+
+    Note: ``_line_list`` is sorted in place (twice), so the caller's list order
+    changes.
+
+    :param _line_list: list of ``[x0, y0, x1, y1]`` lines.
+    :param threshold: position tolerance used when grouping lines.
+    :return: new list with the merged vertical lines followed by the merged
+        horizontal lines.
+    """
+    start_time = time.time()
+
+    new_line_list = []
+    # Group vertical lines into columns (sorted by x, then y)
+    _line_list.sort(key=lambda x: (x[0], x[1]))
+    cols = []
+    col = []
+    current_w = None
+    for line in _line_list:
+        # Skip lines that are wider than tall; lines with equal width and height
+        # are handled here and again in the row pass
+        if abs(line[0] - line[2]) > abs(line[1] - line[3]):
+            continue
+        if not col:
+            # Seed the first column. The seed line is still tested by the
+            # conditions below, so it may be appended a second time.
+            col.append(line)
+            current_w = line[0]
+
+        # y-overlap between this line and the first line of the current column
+        _iou = line_iou([[0, line[1]], [0, line[3]]], [[0, col[0][1]], [0, col[0][3]]], axis=1)
+        # Same column if x is within threshold and it touches the last line of the column ...
+        if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold \
+                and is_cross(line[0:2], line[2:4], col[-1][0:2], col[-1][2:4]):
+            col.append(line)
+        # ... or x is within twice the threshold and the y extents overlap enough
+        elif min(line[0], line[2]) - 2*threshold <= current_w <= max(line[0], line[2]) + 2*threshold \
+                and _iou >= 0.1:
+            col.append(line)
+        else:
+            # Start a new column with this line
+            if col:
+                cols.append(col)
+            col = [line]
+            current_w = line[0]
+    if col:
+        cols.append(col)
+
+    # Merge consecutive lines inside each column
+    for col in cols:
+        temp_c = col[0]
+        col_w = col[0][0]
+        for i in range(len(col) - 1):
+            c = col[i]
+            next_c = col[i + 1]
+            if is_cross(c[0:2], c[2:4], next_c[0:2], next_c[2:4]) \
+                    or line_iou([[0, c[1]], [0, c[3]]], [[0, next_c[1]], [0, next_c[3]]], axis=1) >= 0.1:
+                # Extend the running line; x is pinned to the column's first x
+                temp_c = [col_w, min(temp_c[1], c[1], c[3], next_c[1], next_c[3]), col_w,
+                          max(temp_c[3], c[1], c[3], next_c[1], next_c[3])]
+            else:
+                # Gap in the column: emit the running line and restart
+                new_line_list.append(temp_c)
+                temp_c = next_c
+        # Emit the last running line unless it equals the line appended last
+        if not new_line_list or (new_line_list and new_line_list[-1] != temp_c):
+            new_line_list.append(temp_c)
+
+    # Group horizontal lines into rows (sorted by y, then x)
+    _line_list.sort(key=lambda x: (x[1], x[0]))
+    rows = []
+    row = []
+    current_h = None
+    for line in _line_list:
+        # Skip lines that are taller than wide
+        if abs(line[0] - line[2]) < abs(line[1] - line[3]):
+            continue
+
+        if not row:
+            # Seed the first row; the check below then appends the seed line again
+            row = [line]
+            current_h = line[1]
+
+        # Same row if y is within threshold of the row's reference y
+        if min(line[1], line[3]) - threshold <= current_h <= max(line[1], line[3]) + threshold:
+            row.append(line)
+        else:
+            if row:
+                rows.append(row)
+            row = [line]
+            current_h = line[1]
+    if row:
+        rows.append(row)
+
+    # Merge consecutive lines inside each row
+    for row in rows:
+        temp_r = row[0]
+        row_h = row[0][1]
+        for i in range(len(row) - 1):
+            r = row[i]
+            next_r = row[i + 1]
+            # Rows are merged on x-overlap only (the is_cross test is disabled):
+            # if is_cross(r[0:2], r[2:4], next_r[0:2], next_r[2:4]):
+            if line_iou([r[0:2], r[2:4]], [next_r[0:2], next_r[2:4]], axis=0) >= 0.1:
+                # Extend the running line; y is pinned to the row's first y
+                temp_r = [min(temp_r[0], r[0], r[2], next_r[0], next_r[2]), row_h,
+                          max(temp_r[2], r[0], r[2], next_r[0], next_r[2]), row_h]
+            else:
+                new_line_list.append(temp_r)
+                temp_r = next_r
+        # Emit the last running line unless it equals the line appended last
+        if not new_line_list or (new_line_list and new_line_list[-1] != temp_r):
+            new_line_list.append(temp_r)
+
+    log('merge_line cost: ' + str(time.time()-start_time))
+    return new_line_list
+
+
+def remove_outline_no_cross(_line_list):
+    row_list = []
+    col_list = []
+    for line in _line_list:
+        # 存所有行
+        if abs(line[0] - line[2]) > abs(line[1] - line[3]):
+            row_list.append(line)
+        # 存所有列
+        if abs(line[0] - line[2]) < abs(line[1] - line[3]):
+            col_list.append(line)
+
+    if not col_list:
+        return _line_list
+
+    # 左右两条边框
+    col_list.sort(key=lambda x: (x[0], x[1]))
+    left_col = col_list[0]
+    right_col = col_list[-1]
+
+    # 判断有交点但中间区域无交点
+    compare_list = []
+    for col in [left_col, right_col]:
+        add_h = abs(col[1]-col[3]) / 8
+        center_area = [col[1]+add_h, col[3]-add_h]
+        cross_cnt = 0
+        center_cross_cnt = 0
+        center_row_cnt = 0
+        for row in row_list:
+            if is_cross(row[0:2], row[2:4], col[0:2], col[2:4]):
+                if center_area[0] <= row[1] <= center_area[1]:
+                    center_cross_cnt += 1
+                else:
+                    cross_cnt += 1
+            else:
+                if center_area[0] <= row[1] <= center_area[1]:
+                    center_row_cnt += 1
+        compare_list.append([cross_cnt, center_cross_cnt, center_row_cnt])
+
+    _flag = True
+    for c in compare_list:
+        if c[0] >= 2 and c[1] == 0 and c[2] >= 2:
+            continue
+        _flag = False
+    print('compare_list', compare_list)
+    if _flag and compare_list[0][1] == compare_list[1][1] \
+            and compare_list[0][2] == compare_list[1][2]:
+        for col in [left_col, right_col]:
+            if col in _line_list:
+                _line_list.remove(col)
+    return _line_list
+
+
+def table_line_pdf(layout, page_no, show=0):
+    print('table_line_pdf show ', show)
+    page_h = layout.height
+    page_w = layout.width
+
+    line_list = []
+
+    lt_text_container_list = []
+    lt_rect_list = []
+    lt_line_list = []
+    lt_curve_list = []
+
+    line_rect_list = []
+    non_line_rect_list = []
+    delete_lt_rect_list = []
+
+    start_time = time.time()
+    # 从layout中提取各种对象:文本框、矩形框、曲线、线
+    min_y = 10000
+    max_x, max_y = 0, 0
+    threshold = 2
+    for element in layout:
+        if isinstance(element, LTTextContainer):
+            lt_text_container_list.append(element)
+
+        elif isinstance(element, LTRect):
+            lt_rect_list.append(element)
+
+            # 筛选出线形矩形和非线形矩形
+            if (element.height <= threshold) ^ (element.width <= threshold):
+                print('line_rect', element.stroke, element.stroking_color, element.non_stroking_color, element.fill, element.height * element.width, element.height, element.width)
+                line_rect_list.append(element)
+            elif element.height > threshold and element.width > threshold:
+                print('non_line_rect', element.stroke, element.stroking_color, element.non_stroking_color, element.fill, element.height * element.width, element.height, element.width)
+                non_line_rect_list.append(element)
+            else:
+                delete_lt_rect_list.append(element)
+
+            # 获取最大尺寸
+            if element.bbox[1] <= min_y:
+                min_y = element.bbox[1]
+            if element.bbox[3] <= min_y:
+                min_y = element.bbox[3]
+            if element.bbox[1] > max_y:
+                max_y = element.bbox[1]
+            if element.bbox[3] > max_y:
+                max_y = element.bbox[3]
+            if element.bbox[0] > max_x:
+                max_x = element.bbox[0]
+            if element.bbox[2] > max_x:
+                max_x = element.bbox[2]
+
+        elif isinstance(element, LTLine):
+            lt_line_list.append(element)
+
+        elif isinstance(element, LTCurve):
+            lt_curve_list.append(element)
+
+    if show:
+        print('len(lt_text_container_list)', len(lt_text_container_list))
+        print('len(lt_rect_list)', len(lt_rect_list))
+        print('len(lt_line_list)', len(lt_line_list))
+        print('len(lt_curve_list)', len(lt_curve_list))
+
+        print('len(line_rect_list)', len(line_rect_list))
+        print('len(non_line_rect_list)', len(non_line_rect_list))
+        print('len(delete_lt_rect_list)', len(delete_lt_rect_list))
+
+    if max_y > page_h:
+        page_h = max_y + 20
+    if max_x > page_w:
+        page_w = max_x + 20
+
+    globals().update({'page_h': page_h})
+    globals().update({'page_w': page_w})
+
+    # 矩形框y有负数
+    if min_y < 0:
+        for lt_rect in lt_rect_list:
+            if lt_rect.y0 < 0 or lt_rect.y1 < 0:
+                new_y0 = 10 if lt_rect.y0 < 0 else lt_rect.y0
+                new_y1 = 10 if lt_rect.y1 < 0 else lt_rect.y1
+                lt_rect.set_bbox((lt_rect.x0, new_y0, lt_rect.x1, new_y1))
+
+    _plot([x.bbox for x in lt_rect_list + lt_line_list], 'get_page_lines start', mode=2, show=show)
+
+    # 合并矩形框
+    # for i in range(len(non_line_rect_list)):
+    #     lt_rect1 = non_line_rect_list[i]
+    #     b1 = lt_rect1.bbox
+    #     if lt_rect1 in delete_lt_rect_list:
+    #         continue
+    #     for j in range(i+1, len(non_line_rect_list)):
+    #         lt_rect2 = non_line_rect_list[j]
+    #         b2 = lt_rect2.bbox
+    #         if lt_rect2 in delete_lt_rect_list:
+    #             continue
+    #         if bbox_iou(b1, b2, False) >= 0.5:
+    #             delete_lt_rect_list.append(lt_rect2)
+    #
+    # # 非线形矩形若与线形矩形距离较近,则删除
+    # threshold = 5
+    # for n_rect in non_line_rect_list:
+    #     if n_rect in delete_lt_rect_list:
+    #         continue
+    #     middle_x = (n_rect.x0 + n_rect.x1) / 2
+    #     middle_y = (n_rect.y0 + n_rect.y1) / 2
+    #     for rect in line_rect_list:
+    #         if rect in delete_lt_rect_list:
+    #             continue
+    #         if rect.height >= rect.width:
+    #             if n_rect.width / 2 - threshold <= abs(rect.x0 - middle_x) <= n_rect.width / 2 + threshold:
+    #                 delete_lt_rect_list.append(n_rect)
+    #         else:
+    #             if n_rect.height / 2 - threshold <= abs(rect.y0 - middle_y) <= n_rect.height / 2 + threshold:
+    #                 delete_lt_rect_list.append(n_rect)
+
+    # 寻找每个文本框对应的最小矩形框
+    text_lt_rect_list = []
+    # for text_lt_rect in lt_text_container_list:
+    #     text_box = text_lt_rect.bbox
+    #     contain_iou_list = []
+    #
+    #     min_area = 1000000
+    #     min_lt_rect = None
+    #     for lt_rect in non_line_rect_list:
+    #         _bbox = lt_rect.bbox
+    #
+    #         if lt_rect in delete_lt_rect_list:
+    #             continue
+    #         if lt_rect in text_lt_rect_list:
+    #             continue
+    #         if lt_rect.height <= 5 or lt_rect.width <= 5:
+    #             continue
+    #
+    #         # 如果文本框与矩形框有交集,则直接删除
+    #         if (text_box[0] <= _bbox[0] <= text_box[2] or text_box[0] <= _bbox[2] <= text_box[2]) \
+    #                 and (text_box[1] <= _bbox[1] <= text_box[3] or text_box[1] <= _bbox[3] <= text_box[3]):
+    #             text_lt_rect_list.append(lt_rect)
+    #             continue
+    #
+    #         _area = abs(_bbox[2] - _bbox[0]) * abs(_bbox[3] - _bbox[1])
+    #         _iou = bbox_iou(_bbox, text_box, False)
+    #         if _iou >= 0.3 and _area < min_area:
+    #             min_area = _area
+    #             min_lt_rect = lt_rect
+    #         # else:
+    #         #     contain_iou = bbox_iou(_bbox, text_box, True)
+    #         #     contain_iou_list.append([lt_rect, contain_iou])
+    #
+    #     if min_lt_rect is not None:
+    #         text_lt_rect_list.append(min_lt_rect)
+    #     # else:
+    #     #     # 找不到就放低条件,计算iou时包含即为1
+    #     #     contain_iou_list.sort(key=lambda x: x[1])
+    #     #     text_lt_rect_list.append(contain_iou_list[-1][0])
+
+    delete_lt_rect_list += text_lt_rect_list
+
+    text_line_list = []
+    for lt_line in lt_text_container_list:
+        _b = lt_line.bbox
+        if abs(_b[0]-_b[2]) >= abs(_b[1]-_b[3]):
+            text_line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
+        else:
+            text_line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
+
+    _plot(text_line_list, 'lt_text_container_list', mode=2, show=show)
+
+    # 从线对象提取线
+    for lt_line in lt_line_list+lt_curve_list:
+        _b = lt_line.bbox
+        if lt_line.height > 10 or lt_line.width > 10:
+            if lt_line.height >= lt_line.width:
+                line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
+            else:
+                line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
+
+    _plot(line_list, 'lt_line_list+lt_curve_list', mode=2, show=show)
+
+    # 从线形矩形框提取线
+    for lt_rect in line_rect_list:
+        if lt_rect in delete_lt_rect_list:
+            continue
+        _b = lt_rect.bbox
+        if abs(_b[0]-_b[2]) >= abs(_b[1]-_b[3]):
+            line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
+        else:
+            line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
+
+    _plot(line_list, 'line_rect_list', mode=2, show=show)
+
+    # min_x, min_y = 10000, 10000
+    # max_x, max_y = 0, 0
+    # for _b in line_list:
+    #     min_x = _b[0] if _b[0] < min_x else min_x
+    #     max_x = _b[2] if _b[2] > max_x else max_x
+    #     min_y = _b[1] if _b[1] < min_y else min_y
+    #     max_y = _b[3] if _b[3] > max_y else max_y
+
+    # 从普通矩形框提取线,区分描边颜色,排除无色的
+    # threshold = 10
+    # img = np.full([int(max_x)+10, int(max_y)+10, 3], 255, dtype=np.uint8)
+    threshold = 0.3
+    for lt_rect in non_line_rect_list:
+        if lt_rect in delete_lt_rect_list:
+            continue
+        _b = lt_rect.bbox
+        if type(lt_rect.non_stroking_color) == tuple:
+            continue_flag = 0
+            for t in lt_rect.non_stroking_color:
+                if float(t) >= threshold:
+                    continue_flag = 1
+                    break
+            if continue_flag:
+                continue
+        elif lt_rect.non_stroking_color is not None and float(lt_rect.non_stroking_color) >= threshold:
+            continue
+        # if max_y != 10000 and min_y != 0:
+        #     if (_b[3] - max_y >= threshold and _b[2] - max_x >= threshold):
+        #         print('_b[3] - max_y >= threshold', _b[3], max_y, _b[2], max_x)
+        #         continue
+        #     if abs(_b[3] - _b[1]) * abs(_b[2] - _b[0]) >= 1 / 10 * abs(max_y - min_y) * abs(max_x - min_x):
+        #         print('>= 1 / 10', _b[3], _b[1], _b[2], _b[0], max_x, max_y)
+        #         continue
+        # contain_flag = 0
+        # for lt_rect2 in non_line_rect_list:
+        #     if lt_rect == lt_rect2:
+        #         continue
+        #     _b2 = lt_rect2.bbox
+        #     if bbox_iou(_b, _b2) >= 0.9:
+        #         contain_flag = 1
+        #     if _b2[0] <= _b[0] <= _b[2] <= _b2[2] and _b2[1] <= _b[1] <= _b[3] <= _b2[3]:
+        #         contain_flag = 1
+        # if contain_flag:
+        #     continue
+        line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[0], _b[1], _b[2], _b[1]],
+                      [_b[2], _b[1], _b[2], _b[3]], [_b[0], _b[3], _b[2], _b[3]]]
+        # cv2.rectangle(img, (int(_b[0]), int(_b[1])), (int(_b[2]), int(_b[3])), [random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)])
+        # cv2.imshow('img', img)
+        # cv2.waitKey(0)
+
+    _plot(line_list, 'non_line_rect_list', mode=2, show=show)
+
+    if not line_list:
+        return []
+    # 去重
+    line_list = [str(x) for x in line_list]
+    line_list = list(set(line_list))
+    line_list = [eval(x) for x in line_list]
+
+    # 合并线
+    line_list = merge_line(line_list)
+
+    if show:
+        print('get_page_lines len(line_list)', len(line_list))
+    _plot(line_list, 'line_list+bias_line_list', mode=2, show=show)
+
+    # 根据是否有交点判断表格线
+    cross_line_list = get_cross_line(line_list, threshold=2, cross_times=1)
+
+    if show:
+        print('get_page_lines len(cross_line_list)', len(cross_line_list))
+    _plot(cross_line_list, 'get_cross_line', mode=2, show=show)
+
+    # 删除最外层嵌套边框
+    cross_line_list = remove_outline_no_cross(cross_line_list)
+
+    # 复用otr的部分后处理,补线
+    cross_line_list = table_line_pdf_post_process(cross_line_list, page_w, page_h)
+    _plot(cross_line_list, 'cross_line_process1', mode=2, show=show)
+
+    # 有过短的横线与过短的竖线交点
+    short_line_list = []
+    for line in cross_line_list:
+        if line[1] == line[3] and abs(line[2] - line[0]) <= 30:
+            short_line_list.append(line)
+        if line[0] == line[2] and abs(line[3] - line[1]) <= 30:
+            short_line_list.append(line)
+    for line in short_line_list:
+        for line2 in short_line_list:
+            if line == line2:
+                continue
+            if is_cross(line[:2], line[2:4], line2[:2], line2[2:4]):
+                if line in cross_line_list:
+                    cross_line_list.remove(line)
+                if line2 in cross_line_list:
+                    cross_line_list.remove(line2)
+
+    # print('len(temp_list), len(cross_line_list)', len(temp_list), len(cross_line_list))
+    # if len(temp_list) != len(cross_line_list):
+    #     cross_line_list = table_line_pdf_post_process(temp_list, page_w, page_h)
+
+    # show
+    if show:
+        print('len(cross_line_list)', len(cross_line_list))
+    _plot(cross_line_list, 'cross_line_process2', mode=2, show=show)
+
+    lt_line_list = []
+    for line in cross_line_list:
+        lt_line_list.append(LTLine(1, (float(line[0]), float(line[1])),
+                                   (float(line[2]), float(line[3]))))
+    log("pdf page %s has %s lines cost: %s" % (str(page_no), str(len(lt_line_list)), str(time.time()-start_time)))
+    return lt_line_list

Неке датотеке нису приказане због велике количине промена