Procházet zdrojové kódy

1.rar、zip文件页数修复,文件之间独立读取
2.pdf直接读表格线修复
3.pdf删除重复出现图片
4.pdf无边框表格判断规则优化
5.pdf新增表格连接规则
6.office转换接口判断是否运行
7.pdf计算IOU修复
8.docx编号报错修复
9.docx嵌套表格实现

fangjiasheng před 1 rokem
rodič
revize
2405f43b4e

+ 10 - 9
format_convert/convert.py

@@ -46,11 +46,11 @@ MAX_COMPUTE = max_compute
 if get_platform() == "Windows":
     globals().update({"time_out": 1000})
 else:
-    globals().update({"time_out": 300})
+    globals().update({"time_out": 6000})
 
 
 @memory_decorator
-def getText(_type, path_or_stream, _page_no, time_out=300):
+def getText(_type, path_or_stream, _page_no=None, time_out=300):
     @timeout(time_out, timeout_exception=TimeoutError, use_signals=False)
     def get_html_1(_class):
         return _class.get_html()
@@ -59,7 +59,7 @@ def getText(_type, path_or_stream, _page_no, time_out=300):
     def get_html_2(_class):
         return _class.get_html()
 
-    log("file type - " + _type)
+    log("file type - " + _type + ' time out - ' + str(time_out))
 
     try:
         ss = path_or_stream.split(".")
@@ -76,10 +76,10 @@ def getText(_type, path_or_stream, _page_no, time_out=300):
             return DocxConvert(path_or_stream, unique_type_dir).get_html()
         return get_html_1(DocxConvert(path_or_stream, unique_type_dir))
     if _type == "zip":
-        return ZipConvert(path_or_stream, unique_type_dir).get_html()
+        return ZipConvert(path_or_stream, unique_type_dir, _page_no, time_out).get_html()
         # return get_html_2(ZipConvert(path_or_stream, unique_type_dir))
     if _type == "rar":
-        return RarConvert(path_or_stream, unique_type_dir).get_html()
+        return RarConvert(path_or_stream, unique_type_dir, _page_no, time_out).get_html()
         # return get_html_2(RarConvert(path_or_stream, unique_type_dir))
     if _type == "xlsx":
         if MAX_COMPUTE:
@@ -370,6 +370,7 @@ def _convert():
     {[-12], 0}: 表格跨页连接报错
     {[-13], 0}: pdf表格线处理报错
     {[-14], 0}: 指定页码报错
+    {[-15], 0}: office转换接口未运行
     :return: {"result_html": str([]), "result_text":str([]) "is_success": int}
     """
 
@@ -409,8 +410,8 @@ def _convert():
         _global.update({"md5": _md5})
         # 指定页码范围
         _page_no = data.get('page_no')
-        if _type not in ['pdf']:
-            _page_no = None
+        # if _type not in ['pdf']:
+        #     _page_no = None
 
         # 最终结果截取的最大字节数
         max_bytes = data.get("max_bytes")
@@ -420,7 +421,7 @@ def _convert():
             # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
             # text, swf_images = origin_unique_temp_file_process(stream, _type)
             try:
-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no)
+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'))
             except TimeoutError:
                 log("convert time out! 300 sec")
                 text = [-5]
@@ -428,7 +429,7 @@ def _convert():
         else:
             # Linux 通过装饰器设置整个转换超时时间
             try:
-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no)
+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'))
             except TimeoutError:
                 log("convert time out! 300 sec")
                 text = [-5]

+ 72 - 167
format_convert/convert_docx.py

@@ -14,105 +14,8 @@ from format_convert.utils import judge_error_code, add_div, get_logger, log, mem
 from format_convert.wrapt_timeout_decorator import timeout
 
 
-@memory_decorator
-def docx2text(path, unique_type_dir):
-    log("into docx2text")
-    try:
-        try:
-            doc = docx.Document(path)
-        except Exception as e:
-            print("docx format error!", e)
-            print(traceback.print_exc())
-            log("docx format error!")
-            return [-3]
-
-        # 遍历段落
-        # print("docx2text extract paragraph")
-        paragraph_text_list = []
-        for paragraph in doc.paragraphs:
-            if paragraph.text != "":
-                paragraph_text_list.append("<div>" + paragraph.text + "</div>" )
-                # print("paragraph_text", paragraph.text)
-
-        # 遍历表
-        try:
-            table_text_list = read_xml_table(path, unique_type_dir)
-        except TimeoutError:
-            return [-4]
-
-        if judge_error_code(table_text_list):
-            return table_text_list
-
-        # 顺序遍历图片
-        # print("docx2text extract image")
-        image_text_list = []
-        temp_image_path = unique_type_dir + "temp_image.png"
-        pattern = re.compile('rId\d+')
-        for graph in doc.paragraphs:
-            for run in graph.runs:
-                if run.text == '':
-                    try:
-                        if not pattern.search(run.element.xml):
-                            continue
-                        content_id = pattern.search(run.element.xml).group(0)
-                        content_type = doc.part.related_parts[content_id].content_type
-                    except Exception as e:
-                        print("docx no image!", e)
-                        continue
-                    if not content_type.startswith('image'):
-                        continue
-
-                    # 写入临时文件
-                    img_data = doc.part.related_parts[content_id].blob
-                    with open(temp_image_path, 'wb') as f:
-                        f.write(img_data)
-
-                    # if get_platform() == "Windows":
-                    #     print("img_data", img_data)
-
-                    if img_data is None:
-                        continue
-
-                    # 识别图片文字
-                    image_text = picture2text(temp_image_path)
-                    if image_text == [-2]:
-                        return [-2]
-                    if image_text == [-1]:
-                        return [-1]
-                    if image_text == [-3]:
-                        continue
-
-                    image_text = image_text[0]
-                    image_text_list.append(add_div(image_text))
-
-        # 解析document.xml,获取文字顺序
-        order_list = read_xml_order(path, unique_type_dir)
-        if order_list == [-2]:
-            return [-2]
-        if order_list == [-1]:
-            return [-1]
-
-        text = ""
-        # print("len(order_list)", len(order_list))
-        # print("len(paragraph_text_list)", len(paragraph_text_list))
-        # print("len(image_text_list)", len(image_text_list))
-        # print("len(table_text_list)", len(table_text_list))
-
-        for tag in order_list:
-            if tag == "w:t":
-                if len(paragraph_text_list) > 0:
-                    text += paragraph_text_list.pop(0)
-            if tag == "wp:docPr":
-                if len(image_text_list) > 0:
-                    text += image_text_list.pop(0)
-            if tag == "w:tbl":
-                if len(table_text_list) > 0:
-                    text += table_text_list.pop(0)
-        return [text]
-    except Exception as e:
-        log("docx2text error!")
-        print("docx2text", traceback.print_exc())
-        return [-1]
+def docx2text():
+    return
 
 
 @timeout(50, timeout_exception=TimeoutError)
@@ -172,6 +75,8 @@ def read_xml_order(path, save_path):
                             # print(num_pr_dict[group_id])
                             for level in range(node_level+1):
                                 # 当前level下有多少个node
+                                if level not in num_pr_dict[group_id]:
+                                    continue
                                 level_node_cnt = num_pr_dict[group_id][level]
                                 # print('level_node_cnt', level_node_cnt)
                                 text_no += str(level_node_cnt) + '.'
@@ -203,7 +108,7 @@ def read_xml_order(path, save_path):
 
             if "w:tbl" in str(line):
                 order_list.append("w:tbl")
-        read_xml_table(path, save_path)
+        # read_xml_table(path, save_path)
         return [order_list, text_list]
     except Exception as e:
         log("read_xml_order error!")
@@ -214,96 +119,96 @@ def read_xml_order(path, save_path):
 
 @timeout(50, timeout_exception=TimeoutError)
 def read_xml_table(path, save_path):
-    log("into read_xml_table")
-    try:
-        try:
-            f = zipfile.ZipFile(path)
-            for file in f.namelist():
-                if "word/document.xml" == str(file):
-                    f.extract(file, save_path)
-            f.close()
-        except Exception as e:
-            # print("docx format error!", e)
-            log("docx format error!")
-            return [-3]
-
-        log("xml_analyze%s"%(save_path))
-        try:
-            collection = xml_analyze(save_path + "word/document.xml")
-        except TimeoutError:
-            log("xml_analyze timeout")
-            return [-4]
-
-        log("xml_analyze done")
-        body = collection.getElementsByTagName("w:body")[0]
-        table_text_list = []
-        # print("body.childNodes", body.childNodes)
-        for line in body.childNodes:
-            if "w:tbl" in str(line):
-                # print("str(line)", str(line))
-                table_text = '<table border="1">'
-                tr_list = line.getElementsByTagName("w:tr")
-                # print("line.childNodes", line.childNodes)
-                tr_index = 0
-                tr_text_list = []
-                tr_text_list_colspan = []
-                for tr in tr_list:
-                    table_text = table_text + "<tr>"
-                    tc_list = tr.getElementsByTagName("w:tc")
-                    tc_index = 0
-                    tc_text_list = []
-                    for tc in tc_list:
+    def recursion_read_table(table):
+        table_text = '<table border="1">'
+        tr_index = 0
+        tr_text_list = []
+        # 直接子节点用child表示,所有子节点用all表示
+        for table_child in table.childNodes:
+            if 'w:tr' in str(table_child):
+                tr = table_child
+                tr_child_nodes = tr.childNodes
+                tc_index = 0
+                tc_text_list = []
+                for tr_child in tr_child_nodes:
+                    if 'w:tc' in str(tr_child).split(' '):
                         tc_text = ""
-
-                        # 获取一格占多少列
+                        tc = tr_child
+                        # 获取一格占多少列,相当于colspan
                         col_span = tc.getElementsByTagName("w:gridSpan")
                         if col_span:
                             col_span = int(col_span[0].getAttribute("w:val"))
                         else:
                             col_span = 1
-
-                        # 获取是否是合并单元格的下一个空单元格
+                        # 获取是否是合并单元格的下一个空单元格,相当于rowspan
                         is_merge = tc.getElementsByTagName("w:vMerge")
                         if is_merge:
                             is_merge = is_merge[0].getAttribute("w:val")
                             if is_merge == "continue":
                                 col_span_index = 0
                                 real_tc_index = 0
-
-                                # if get_platform() == "Windows":
-                                #     print("read_xml_table tr_text_list", tr_text_list)
-                                #     print("read_xml_table tr_index", tr_index)
-
                                 if 0 <= tr_index - 1 < len(tr_text_list):
                                     for tc_colspan in tr_text_list[tr_index - 1]:
                                         if col_span_index < tc_index:
                                             col_span_index += tc_colspan[1]
                                             real_tc_index += 1
-
-                                    # print("tr_index-1, real_tc_index", tr_index-1, real_tc_index)
-                                    # print(tr_text_list[tr_index-1])
                                     if real_tc_index < len(tr_text_list[tr_index - 1]):
                                         tc_text = tr_text_list[tr_index - 1][real_tc_index][0]
-
+                        # 设置colspan
                         table_text = table_text + "<td colspan=" + str(col_span) + ">"
-                        p_list = tc.getElementsByTagName("w:p")
-
-                        for p in p_list:
-                            t = p.getElementsByTagName("w:t")
-                            if t:
-                                for tt in t:
-                                    # print("tt", tt.childNodes)
-                                    if len(tt.childNodes) > 0:
-                                        tc_text += tt.childNodes[0].nodeValue
-
+                        # 放入文本
+                        tc_child_nodes = tc.childNodes
+                        for tc_child in tc_child_nodes:
+                            if 'w:tbl' in str(tc_child).split(' '):
+                                # 嵌套在tc中的表格
+                                tc_text += recursion_read_table(tc_child)
+                            if 'w:p' in str(tc_child).split(' '):
+                                tc_p_all_nodes = tc_child.getElementsByTagName("*")
+                                for tc_p_all in tc_p_all_nodes:
+                                    if 'w:t' in str(tc_p_all).split(' '):
+                                        # w:t必须加childNodes[0]才能读文本
+                                        tc_text += tc_p_all.childNodes[0].nodeValue
+                        # 结束该tc
                         table_text = table_text + tc_text + "</td>"
                         tc_index += 1
                         tc_text_list.append([tc_text, col_span])
-                    table_text += "</tr>"
-                    tr_index += 1
-                    tr_text_list.append(tc_text_list)
-                table_text += "</table>"
-                table_text_list.append(table_text)
+                # 结束该tr
+                table_text += "</tr>"
+                tr_index += 1
+                tr_text_list.append(tc_text_list)
+        # 结束该table
+        table_text += "</table>"
+        return table_text
+
+    log("into read_xml_table")
+    try:
+        try:
+            f = zipfile.ZipFile(path)
+            for file in f.namelist():
+                if "word/document.xml" == str(file):
+                    f.extract(file, save_path)
+            f.close()
+        except Exception as e:
+            # print("docx format error!", e)
+            log("docx format error!")
+            return [-3]
+
+        log("xml_analyze%s"%(save_path))
+        try:
+            collection = xml_analyze(save_path + "word/document.xml")
+        except TimeoutError:
+            log("xml_analyze timeout")
+            return [-4]
+
+        log("xml_analyze done")
+        body = collection.getElementsByTagName("w:body")[0]
+        table_text_list = []
+        body_nodes = body.childNodes
+        for node in body_nodes:
+            if 'w:tbl' in str(node).split(' '):
+                _table = node
+                _table_text = recursion_read_table(_table)
+                table_text_list.append(_table_text)
         return table_text_list
 
     except Exception as e:

+ 6 - 0
format_convert/convert_need_interface.py

@@ -820,6 +820,9 @@ def interface_pool_gunicorn(interface_type):
 
         # 选取端口
         if interface_type == "office":
+            if len(port_list) == 0:
+                raise ConnectionError
+
             # 刚开始随机,后续求余
             if min_cnt == 0:
                 _port = port_list[random.randint(0, len(port_list)-1)]
@@ -844,6 +847,9 @@ def interface_pool_gunicorn(interface_type):
     except NotFound:
         log("ip_port or ip_port_dict is None! checkout config")
         return [-2]
+    except ConnectionError:
+        log('no office interface running!')
+        return [-15]
     except:
         traceback.print_exc()
         return [-1]

+ 134 - 30
format_convert/convert_pdf.py

@@ -35,7 +35,7 @@ from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
     LTTextBoxVertical, LTLine, LTTextContainer
 from format_convert.utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable, \
-    get_logger, log, memory_decorator, draw_lines_plt, get_garble_code, line_is_cross
+    get_logger, log, memory_decorator, draw_lines_plt, get_garble_code, line_is_cross, get_md5_from_bytes, bytes2np
 import fitz
 from format_convert.wrapt_timeout_decorator import timeout
 
@@ -689,6 +689,9 @@ class PDFConvert:
         self.packages = ["pdfminer", "PyMuPDF", "PyPDF2", "pdfplumber"]
         self.has_init_pdf = [0] * len(self.packages)
 
+        # 记录图片对象的md5,用于去除大量重复图片
+        self.md5_image_obj_list = []
+
     @memory_decorator
     def init_package(self, package_name):
         # 各个包初始化
@@ -800,6 +803,41 @@ class PDFConvert:
             self._doc.add_child(self._page)
             page_no += 1
 
+        self.delete_same_image()
+
+    def delete_same_image(self, show=0):
+        # 剔除大量重复图片
+        md5_dict = {}
+        for _md5, image_obj in self.md5_image_obj_list:
+            if _md5 in md5_dict.keys():
+                md5_dict[_md5] += [image_obj]
+            else:
+                md5_dict[_md5] = [image_obj]
+        cnt_threshold = 10
+        delete_obj_list = []
+        for _md5 in md5_dict.keys():
+            img_list = md5_dict.get(_md5)
+            print('len(md5_dict.get(_md5))', _md5, len(img_list))
+            if len(img_list) >= cnt_threshold:
+                if show:
+                    img_np = bytes2np(img_list[0].content)
+                    cv2.namedWindow('delete same img_np', cv2.WINDOW_NORMAL)
+                    cv2.imshow('delete same img_np', img_np)
+                    cv2.waitKey(0)
+                delete_obj_list += img_list
+        for page in self._doc.children:
+            for obj in delete_obj_list:
+                if obj in page.children:
+                    page.children.remove(obj)
+
+        if show:
+            for page in self._doc.children:
+                for obj in page.children:
+                    if isinstance(obj, _Image):
+                        img_np = bytes2np(obj.content)
+                        cv2.imshow('page img_np', img_np)
+                        cv2.waitKey(0)
+
     def clean_text(self, _text):
         return re.sub("\s", "", _text)
 
@@ -1116,6 +1154,32 @@ class PDFConvert:
                         _line_list.remove(col)
             return _line_list
 
+        def cross_line_process(_cross_line_list, _bias_line_list):
+            # 斜线校正
+            if _cross_line_list:
+                _cross_line_list = repair_bias_line(_cross_line_list)
+
+            # 修复竖线
+            if _bias_line_list:
+                _cross_line_list = repair_col_line(_cross_line_list, _bias_line_list)
+
+            # 根据是否有交点判断表格线
+            _cross_line_list = get_cross_line(_cross_line_list, threshold=1, cross_times=1)
+
+            # 合并线条
+            if not _cross_line_list:
+                return []
+            _cross_line_list = merge_line(_cross_line_list)
+
+            # 删除最外层嵌套边框
+            _cross_line_list = remove_outline_no_cross(_cross_line_list)
+
+            # 复用otr的部分后处理,补线
+            from otr.table_line_new import table_line_pdf
+            _cross_line_list = table_line_pdf(_cross_line_list, page_w, page_h)
+
+            return _cross_line_list
+
         log('into get_page_lines')
 
         page_h = layout.height
@@ -1142,12 +1206,18 @@ class PDFConvert:
                     continue
                 line_list.append(element.bbox)
 
+        if show:
+            print('get_page_lines line_list', line_list)
+            print('get_page_lines bias_line_list', bias_line_list)
+            _plot(line_list+bias_line_list, mode=2)
         if not line_list and not bias_line_list:
             return []
 
         # 是否使用斜线来生成表格
+        line_list_copy = copy.deepcopy(line_list)
         if len(line_list) < 6 and len(bias_line_list) > len(line_list) * 2:
-            # print('use bias line')
+            if show:
+                print('use bias line')
             # bias_line_list += add_col_bias_line(line_list, bias_line_list)
             line_list = bias_line_list
 
@@ -1156,34 +1226,26 @@ class PDFConvert:
         line_list = list(set(line_list))
         line_list = [eval(x) for x in line_list]
 
-        # 根据是否有交点判断表格线
-        cross_line_list = get_cross_line(line_list, threshold=2, cross_times=1)
-
-        if not cross_line_list:
-            return []
-
-        # 斜线校正
-        if cross_line_list:
-            cross_line_list = repair_bias_line(cross_line_list)
-
-        # 修复竖线
-        if bias_line_list:
-            cross_line_list = repair_col_line(cross_line_list, bias_line_list)
+        if show:
+            _plot(line_list, mode=2)
 
         # 根据是否有交点判断表格线
-        cross_line_list = get_cross_line(cross_line_list, threshold=1, cross_times=1)
+        cross_line_list = get_cross_line(line_list_copy+bias_line_list, threshold=2, cross_times=1)
 
-        # 合并线条
+        if show:
+            print('get_page_lines cross_line_list', cross_line_list)
         if not cross_line_list:
-            return []
-        cross_line_list = merge_line(cross_line_list)
-
-        # 删除最外层嵌套边框
-        cross_line_list = remove_outline_no_cross(cross_line_list)
+            # 将线全部合并再获取一次
+            cross_line_list = get_cross_line(line_list_copy+bias_line_list, threshold=2, cross_times=1)
+            if not cross_line_list:
+                return []
 
-        # 复用otr的部分后处理,补线
-        from otr.table_line_new import table_line_pdf
-        cross_line_list = table_line_pdf(cross_line_list, page_w, page_h)
+        cross_line_list = cross_line_process(cross_line_list, bias_line_list)
+        if not cross_line_list:
+            cross_line_list = get_cross_line(line_list_copy+bias_line_list, threshold=2, cross_times=1)
+            cross_line_list = cross_line_process(cross_line_list, bias_line_list)
+            if show:
+                print('get_page_lines cross_line_list2', cross_line_list)
 
         # show
         if show:
@@ -1287,6 +1349,15 @@ class PDFConvert:
             # 水印行跳过
             if len(row) == 1 and len(row[0].get_text()[:-1]) == 1:
                 continue
+            # 目录行跳过
+            continue_flag = False
+            for r in row:
+                if re.search('[.·]{7,}', r.get_text()):
+                    continue_flag = True
+                    break
+            if continue_flag:
+                continue
+
             if len(row) == 1:
                 text = row[0].get_text()
                 bbox = row[0].bbox
@@ -1359,7 +1430,7 @@ class PDFConvert:
         lt_text_list = self.delete_water_mark(lt_text_list, layout.bbox, 15)
         log("convert_pdf page " + str(page_no))
         log("len(lt_image_list), len(lt_text_list) " + str(len(lt_image_list)) + " " + str(len(lt_text_list)))
-        log('layout.width, layout.height' + str(layout.width) + str(layout.height))
+        log('layout.width, layout.height ' + str(layout.width) + str(layout.height))
 
         # 若只有文本且图片数为0,直接提取文字及表格
         # if only_image == 0 and image_count == 0:
@@ -1412,7 +1483,7 @@ class PDFConvert:
 
         # 若该页图片数量过多,或无文本,则直接ocr整页识别
         # elif image_count > 3 or only_image == 1:
-        if len(lt_image_list) > 3 or len(lt_text_list) == 0:
+        if len(lt_image_list) > 4 or len(lt_text_list) == 0:
             page_image = self.get_page_image(page_no)
             if judge_error_code(page_image):
                 self._page.error_code = page_image
@@ -1441,6 +1512,8 @@ class PDFConvert:
                             _image = _Image(page_image[1], page_image[0])
                             _image.is_from_pdf = True
                             self._page.add_child(_image)
+                            image_md5 = get_md5_from_bytes(page_image[1])
+                            self.md5_image_obj_list.append([image_md5, _image])
                         return
                     # 比较小的图则直接保存用ocr识别
                     else:
@@ -1451,6 +1524,8 @@ class PDFConvert:
                             image_stream = ff.read()
                         _image = _Image(image_stream, temp_path, image.bbox)
                         self._page.add_child(_image)
+                        image_md5 = get_md5_from_bytes(image_stream)
+                        self.md5_image_obj_list.append([image_md5, _image])
                 except Exception:
                     log("pdf2text pdfminer read image in page " + str(page_no) +
                         "  fail! use pymupdf read image...")
@@ -1580,6 +1655,7 @@ class PDFConvert:
         # 0: 前一页最后一个表格为A,后一页第一个表格为B
         # 1.1: A后无文本(除了页码),且B前无文本(除了页码)
         # 1.2: B前有文字(可能是页眉,小于60字),且B的第一行前几个单元格为空,且第一行不为空的单元格有文字较多的格子
+        # 1.3: B前有文字(可能是页眉,小于60字),且B的第一行第一个单元格为空,且有文字的格子数量占所有格子的一半
         connect_flag_list = []
         soup_list = []
         for i, h in enumerate(html_list):
@@ -1622,12 +1698,16 @@ class PDFConvert:
                     if rows:
                         first_row = rows[0]
                         col_text_list = [len(x.text) for x in first_row]
+                        # 表格前文字小于等于60且第一个单元格为空
                         if len(h[:first_table_start]) <= 60 and col_text_list[0] == 0 and max(col_text_list) >= 30:
                             connect_flag2 = True
+                        # 有文字格子数占一半以下且第一个格子为空
+                        elif col_text_list.count(0) >= len(col_text_list) / 2 and col_text_list[0] == 0:
+                            connect_flag2 = True
 
             connect_flag_list.append([i, connect_flag2, connect_flag1])
 
-        # print('connect_flag_list', connect_flag_list)
+        print('connect_flag_list', connect_flag_list)
 
         # 根据条件1合并需连接页码,形成组
         connect_pages_list = []
@@ -1645,7 +1725,7 @@ class PDFConvert:
         if temp_list:
             connect_pages_list.append(temp_list)
 
-        # print('connect_pages_list', connect_pages_list)
+        print('connect_pages_list', connect_pages_list)
 
         # 判断后续条件:判断组内列数是否相同
         connect_pages_list2 = []
@@ -1654,6 +1734,8 @@ class PDFConvert:
                 connect_pages_list2.append(c_list)
             else:
                 col_cnt_list = []
+                # 单元格可能被复制了,相同的合并当做一列
+                merge_col_cnt_list = []
                 for c in c_list:
                     soup = soup_list[c[0]]
                     table1 = soup.findAll('table')[-1]
@@ -1663,10 +1745,32 @@ class PDFConvert:
                     td1 = tr1[-1].findAll('td')
                     td2 = tr2[0].findAll('td')
                     col_cnt_list.append([len(td2), len(td1)])
+
+                    # # 计算合并重复文本格子后的列数
+                    # last_text = td1[0].text
+                    # merge_td1 = [last_text]
+                    # for td in td1:
+                    #     if td.text == last_text:
+                    #         continue
+                    #     else:
+                    #         merge_td1.append(td.text)
+                    #         last_text = td.text
+                    # last_text = td2[0].text
+                    # merge_td2 = [last_text]
+                    # for td in td2:
+                    #     if td.text == last_text:
+                    #         continue
+                    #     else:
+                    #         merge_td2.append(td.text)
+                    #         last_text = td.text
+                    # merge_col_cnt_list.append([len(merge_td2), len(merge_td1)])
+
+                # 判断
                 new_c_list = [c_list[0]]
                 # print('col_cnt_list', col_cnt_list)
                 for i in range(len(col_cnt_list) - 1):
                     if col_cnt_list[i][1] != col_cnt_list[i + 1][0]:
+                            # and merge_col_cnt_list[i][1] != merge_col_cnt_list[i + 1][0]:
                         connect_pages_list2.append(new_c_list)
                         new_c_list = [c_list[i + 1]]
                     else:
@@ -1674,7 +1778,7 @@ class PDFConvert:
                 if new_c_list:
                     connect_pages_list2.append(new_c_list)
 
-        # print('connect_pages_list2', connect_pages_list2)
+        print('connect_pages_list2', connect_pages_list2)
 
         # 符合连接条件的拼接表格
         new_html_list = []

+ 14 - 6
format_convert/convert_rar.py

@@ -82,11 +82,13 @@ def rar2text(path, unique_type_dir):
 
 
 class RarConvert:
-    def __init__(self, path, unique_type_dir):
+    def __init__(self, path, unique_type_dir, page_no, time_out):
         self._doc = _Document(path)
         self.path = path
         self.unique_type_dir = unique_type_dir
         self.rar_path = unique_type_dir
+        self.page_no = page_no
+        self.time_out = time_out
 
     @memory_decorator
     def init_package(self):
@@ -137,13 +139,19 @@ class RarConvert:
             # 有文件后缀,截取
             else:
                 _type = file.split(".")[-1]
-                sub_html = getText(_type, file)
+                if _type in ['pdf']:
+                    sub_html = getText(_type, file, self.page_no, time_out=self.time_out)
+                else:
+                    sub_html = getText(_type, file, time_out=self.time_out)
 
-            if judge_error_code(sub_html, code=[-3]):
-                continue
+            # 文件报错也继续
             if judge_error_code(sub_html):
-                self._doc.error_code = sub_html
-                return
+                continue
+            # if judge_error_code(sub_html, code=[-3]):
+            #     continue
+            # if judge_error_code(sub_html):
+            #     self._doc.error_code = sub_html
+            #     return
 
             _sen = _Sentence(sub_html[0], bbox, is_html=True)
             self._page.add_child(_sen)

+ 16 - 6
format_convert/convert_zip.py

@@ -108,11 +108,13 @@ def zip2text(path, unique_type_dir):
 
 
 class ZipConvert:
-    def __init__(self, path, unique_type_dir):
+    def __init__(self, path, unique_type_dir, page_no, time_out):
         self._doc = _Document(path)
         self.path = path
         self.unique_type_dir = unique_type_dir
         self.zip_path = unique_type_dir
+        self.page_no = page_no
+        self.time_out = time_out
 
     @memory_decorator
     def init_package(self):
@@ -185,13 +187,21 @@ class ZipConvert:
             # 有文件后缀,截取
             else:
                 _type = file.split(".")[-1]
-                sub_html = getText(_type, file)
+                if _type in ['pdf']:
+                    sub_html = getText(_type, file, self.page_no, time_out=self.time_out)
+                else:
+                    sub_html = getText(_type, file, time_out=self.time_out)
 
-            if judge_error_code(sub_html, code=[-3]):
-                continue
+            # log('convert_zip.py sub_html ' + str(sub_html))
+
+            # 文件报错也继续
             if judge_error_code(sub_html):
-                self._doc.error_code = sub_html
-                return
+                continue
+            # if judge_error_code(sub_html, code=[-3]):
+            #     continue
+            # if judge_error_code(sub_html):
+            #     self._doc.error_code = sub_html
+            #     return
 
             _sen = _Sentence(sub_html[0], bbox, is_html=True)
             self._page.add_child(_sen)

+ 4 - 0
format_convert/kill_all.py

@@ -8,6 +8,10 @@ import time
 
 ip_port_dict = get_ip_port()
 ip = get_using_ip()
+
+if ip == 'http://127.0.0.1':
+    ip = 'http://0.0.0.0'
+
 python_path = get_args_from_config(ip_port_dict, ip, "python_path")[0]
 project_path = get_args_from_config(ip_port_dict, ip, "project_path")[0]
 gunicorn_path = get_args_from_config(ip_port_dict, ip, "gunicorn_path")[0]

+ 3 - 0
format_convert/monitor_process_config.py

@@ -13,6 +13,9 @@ ip_port_dict = get_ip_port()
 ip = get_using_ip()
 print("local ip:", ip)
 
+if ip == 'http://127.0.0.1':
+    ip = 'http://0.0.0.0'
+
 # 获取各个参数
 convert_port_list = get_args_from_config(ip_port_dict, ip, "convert", "MASTER")
 if convert_port_list:

+ 11 - 3
format_convert/utils.py

@@ -38,7 +38,7 @@ if get_platform() == "Linux":
 import math
 
 
-def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14]):
+def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15]):
     """
     [0] : continue
     [-1]: 逻辑处理错误
@@ -55,6 +55,7 @@ def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -1
     [-12]: 表格跨页连接报错
     [-13]: pdf表格线处理报错
     [-14]: 指定页码报错
+    [-15]: office转换接口未运行
     """
     for c in code:
         if isinstance(_list, list) and _list == [c]:
@@ -1037,7 +1038,11 @@ class LineTable:
                 list_iou.append(_iou)
             max_iou_index = np.argmax(list_iou)
             max_iou = list_iou[max_iou_index]
-            if max_iou > 0.1 and textbox not in in_objs:
+            # if self.from_pdf:
+            #     iou_threhold = 0.3
+            # else:
+            iou_threhold = 0.1
+            if max_iou > iou_threhold and textbox not in in_objs:
                 list_cells[max_iou_index]["inbox_textbox_list"].append(textbox)
                 in_objs.add(textbox)
 
@@ -1045,7 +1050,7 @@ class LineTable:
                 # 多个iou大于0.3的,可能是ocr将两个文本合成一个了
                 iou_index_list = np.where(np.array(list_iou) >= 0.3)[0].tolist()
                 if len(iou_index_list) >= 2:
-                    print('len(iou_index_list) >= 2 textbox', textbox)
+                    # print('len(iou_index_list) >= 2 textbox', textbox)
                     self.connect_bbox_list.append(textbox)
 
         has_matched_box_list = []
@@ -1268,6 +1273,9 @@ class LineTable:
         return 0
 
     def getIOU(self, bbox0, bbox1):
+        bbox0 = [min(bbox0[0], bbox0[2]), min(bbox0[1], bbox0[3]), max(bbox0[0], bbox0[2]), max(bbox0[1], bbox0[3])]
+        bbox1 = [min(bbox1[0], bbox1[2]), min(bbox1[1], bbox1[3]), max(bbox1[0], bbox1[2]), max(bbox1[1], bbox1[3])]
+
         width = abs(max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0])) - (
                     abs(bbox0[2] - bbox0[0]) + abs(bbox1[2] - bbox1[0]))
         height = abs(max(bbox0[3], bbox1[3]) - min(bbox0[1], bbox1[1])) - (