Procházet zdrojové kódy

1.rar、zip文件页数修复,文件之间独立读取
2.pdf直接读表格线修复
3.pdf删除重复出现图片
4.pdf无边框表格判断规则优化
5.pdf新增表格连接规则
6.office转换接口判断是否运行
7.pdf计算IOU修复
8.docx编号报错修复
9.docx嵌套表格实现

fangjiasheng před 1 rokem
rodič
revize
2405f43b4e

+ 10 - 9
format_convert/convert.py

@@ -46,11 +46,11 @@ MAX_COMPUTE = max_compute
 if get_platform() == "Windows":
     globals().update({"time_out": 1000})
 else:
-    globals().update({"time_out": 300})
+    globals().update({"time_out": 6000})
 
 
 @memory_decorator
-def getText(_type, path_or_stream, _page_no, time_out=300):
+def getText(_type, path_or_stream, _page_no=None, time_out=300):
     @timeout(time_out, timeout_exception=TimeoutError, use_signals=False)
     def get_html_1(_class):
         return _class.get_html()
@@ -59,7 +59,7 @@ def getText(_type, path_or_stream, _page_no, time_out=300):
     def get_html_2(_class):
         return _class.get_html()
 
-    log("file type - " + _type)
+    log("file type - " + _type + ' time out - ' + str(time_out))
 
     try:
         ss = path_or_stream.split(".")
@@ -76,10 +76,10 @@ def getText(_type, path_or_stream, _page_no, time_out=300):
             return DocxConvert(path_or_stream, unique_type_dir).get_html()
         return get_html_1(DocxConvert(path_or_stream, unique_type_dir))
     if _type == "zip":
-        return ZipConvert(path_or_stream, unique_type_dir).get_html()
+        return ZipConvert(path_or_stream, unique_type_dir, _page_no, time_out).get_html()
         # return get_html_2(ZipConvert(path_or_stream, unique_type_dir))
     if _type == "rar":
-        return RarConvert(path_or_stream, unique_type_dir).get_html()
+        return RarConvert(path_or_stream, unique_type_dir, _page_no, time_out).get_html()
         # return get_html_2(RarConvert(path_or_stream, unique_type_dir))
     if _type == "xlsx":
         if MAX_COMPUTE:
@@ -370,6 +370,7 @@ def _convert():
     {[-12], 0}: 表格跨页连接报错
     {[-13], 0}: pdf表格线处理报错
     {[-14], 0}: 指定页码报错
+    {[-15], 0}: office转换接口未运行
     :return: {"result_html": str([]), "result_text":str([]) "is_success": int}
     """
 
@@ -409,8 +410,8 @@ def _convert():
         _global.update({"md5": _md5})
         # 指定页码范围
         _page_no = data.get('page_no')
-        if _type not in ['pdf']:
-            _page_no = None
+        # if _type not in ['pdf']:
+        #     _page_no = None
 
         # 最终结果截取的最大字节数
         max_bytes = data.get("max_bytes")
@@ -420,7 +421,7 @@ def _convert():
             # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
             # text, swf_images = origin_unique_temp_file_process(stream, _type)
             try:
-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no)
+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'))
             except TimeoutError:
                 log("convert time out! 300 sec")
                 text = [-5]
@@ -428,7 +429,7 @@ def _convert():
         else:
             # Linux 通过装饰器设置整个转换超时时间
             try:
-                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no)
+                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no, time_out=globals().get('time_out'))
             except TimeoutError:
                 log("convert time out! 300 sec")
                 text = [-5]

+ 72 - 167
format_convert/convert_docx.py

@@ -14,105 +14,8 @@ from format_convert.utils import judge_error_code, add_div, get_logger, log, mem
 from format_convert.wrapt_timeout_decorator import timeout
 
 
-@memory_decorator
-def docx2text(path, unique_type_dir):
-    log("into docx2text")
-    try:
-        try:
-            doc = docx.Document(path)
-        except Exception as e:
-            print("docx format error!", e)
-            print(traceback.print_exc())
-            log("docx format error!")
-            return [-3]
-
-        # 遍历段落
-        # print("docx2text extract paragraph")
-        paragraph_text_list = []
-        for paragraph in doc.paragraphs:
-            if paragraph.text != "":
-                paragraph_text_list.append("<div>" + paragraph.text + "</div>" )
-                # print("paragraph_text", paragraph.text)
-
-        # 遍历表
-        try:
-            table_text_list = read_xml_table(path, unique_type_dir)
-        except TimeoutError:
-            return [-4]
-
-        if judge_error_code(table_text_list):
-            return table_text_list
-
-        # 顺序遍历图片
-        # print("docx2text extract image")
-        image_text_list = []
-        temp_image_path = unique_type_dir + "temp_image.png"
-        pattern = re.compile('rId\d+')
-        for graph in doc.paragraphs:
-            for run in graph.runs:
-                if run.text == '':
-                    try:
-                        if not pattern.search(run.element.xml):
-                            continue
-                        content_id = pattern.search(run.element.xml).group(0)
-                        content_type = doc.part.related_parts[content_id].content_type
-                    except Exception as e:
-                        print("docx no image!", e)
-                        continue
-                    if not content_type.startswith('image'):
-                        continue
-
-                    # 写入临时文件
-                    img_data = doc.part.related_parts[content_id].blob
-                    with open(temp_image_path, 'wb') as f:
-                        f.write(img_data)
-
-                    # if get_platform() == "Windows":
-                    #     print("img_data", img_data)
-
-                    if img_data is None:
-                        continue
-
-                    # 识别图片文字
-                    image_text = picture2text(temp_image_path)
-                    if image_text == [-2]:
-                        return [-2]
-                    if image_text == [-1]:
-                        return [-1]
-                    if image_text == [-3]:
-                        continue
-
-                    image_text = image_text[0]
-                    image_text_list.append(add_div(image_text))
-
-        # 解析document.xml,获取文字顺序
-        order_list = read_xml_order(path, unique_type_dir)
-        if order_list == [-2]:
-            return [-2]
-        if order_list == [-1]:
-            return [-1]
-
-        text = ""
-        # print("len(order_list)", len(order_list))
-        # print("len(paragraph_text_list)", len(paragraph_text_list))
-        # print("len(image_text_list)", len(image_text_list))
-        # print("len(table_text_list)", len(table_text_list))
-
-        for tag in order_list:
-            if tag == "w:t":
-                if len(paragraph_text_list) > 0:
-                    text += paragraph_text_list.pop(0)
-            if tag == "wp:docPr":
-                if len(image_text_list) > 0:
-                    text += image_text_list.pop(0)
-            if tag == "w:tbl":
-                if len(table_text_list) > 0:
-                    text += table_text_list.pop(0)
-        return [text]
-    except Exception as e:
-        log("docx2text error!")
-        print("docx2text", traceback.print_exc())
-        return [-1]
+def docx2text():
+    return
 
 
 @timeout(50, timeout_exception=TimeoutError)
@@ -172,6 +75,8 @@ def read_xml_order(path, save_path):
                             # print(num_pr_dict[group_id])
                             for level in range(node_level+1):
                                 # 当前level下有多少个node
+                                if level not in num_pr_dict[group_id]:
+                                    continue
                                 level_node_cnt = num_pr_dict[group_id][level]
                                 # print('level_node_cnt', level_node_cnt)
                                 text_no += str(level_node_cnt) + '.'
@@ -203,7 +108,7 @@ def read_xml_order(path, save_path):
 
             if "w:tbl" in str(line):
                 order_list.append("w:tbl")
-        read_xml_table(path, save_path)
+        # read_xml_table(path, save_path)
         return [order_list, text_list]
     except Exception as e:
         log("read_xml_order error!")
@@ -214,96 +119,96 @@ def read_xml_order(path, save_path):
 
 @timeout(50, timeout_exception=TimeoutError)
 def read_xml_table(path, save_path):
-    log("into read_xml_table")
-    try:
-        try:
-            f = zipfile.ZipFile(path)
-            for file in f.namelist():
-                if "word/document.xml" == str(file):
-                    f.extract(file, save_path)
-            f.close()
-        except Exception as e:
-            # print("docx format error!", e)
-            log("docx format error!")
-            return [-3]
-
-        log("xml_analyze%s"%(save_path))
-        try:
-            collection = xml_analyze(save_path + "word/document.xml")
-        except TimeoutError:
-            log("xml_analyze timeout")
-            return [-4]
-
-        log("xml_analyze done")
-        body = collection.getElementsByTagName("w:body")[0]
-        table_text_list = []
-        # print("body.childNodes", body.childNodes)
-        for line in body.childNodes:
-            if "w:tbl" in str(line):
-                # print("str(line)", str(line))
-                table_text = '<table border="1">'
-                tr_list = line.getElementsByTagName("w:tr")
-                # print("line.childNodes", line.childNodes)
-                tr_index = 0
-                tr_text_list = []
-                tr_text_list_colspan = []
-                for tr in tr_list:
-                    table_text = table_text + "<tr>"
-                    tc_list = tr.getElementsByTagName("w:tc")
-                    tc_index = 0
-                    tc_text_list = []
-                    for tc in tc_list:
+    def recursion_read_table(table):
+        table_text = '<table border="1">'
+        tr_index = 0
+        tr_text_list = []
+        # 直接子节点用child表示,所有子节点用all表示
+        for table_child in table.childNodes:
+            if 'w:tr' in str(table_child):
+                tr = table_child
+                tr_child_nodes = tr.childNodes
+                tc_index = 0
+                tc_text_list = []
+                for tr_child in tr_child_nodes:
+                    if 'w:tc' in str(tr_child).split(' '):
                         tc_text = ""
-
-                        # 获取一格占多少列
+                        tc = tr_child
+                        # 获取一格占多少列,相当于colspan
                         col_span = tc.getElementsByTagName("w:gridSpan")
                         if col_span:
                             col_span = int(col_span[0].getAttribute("w:val"))
                         else:
                             col_span = 1
-
-                        # 获取是否是合并单元格的下一个空单元格
+                        # 获取是否是合并单元格的下一个空单元格,相当于rowspan
                         is_merge = tc.getElementsByTagName("w:vMerge")
                         if is_merge:
                             is_merge = is_merge[0].getAttribute("w:val")
                             if is_merge == "continue":
                                 col_span_index = 0
                                 real_tc_index = 0
-
-                                # if get_platform() == "Windows":
-                                #     print("read_xml_table tr_text_list", tr_text_list)
-                                #     print("read_xml_table tr_index", tr_index)
-
                                 if 0 <= tr_index - 1 < len(tr_text_list):
                                     for tc_colspan in tr_text_list[tr_index - 1]:
                                         if col_span_index < tc_index:
                                             col_span_index += tc_colspan[1]
                                             real_tc_index += 1
-
-                                    # print("tr_index-1, real_tc_index", tr_index-1, real_tc_index)
-                                    # print(tr_text_list[tr_index-1])
                                     if real_tc_index < len(tr_text_list[tr_index - 1]):
                                         tc_text = tr_text_list[tr_index - 1][real_tc_index][0]
-
+                        # 设置colspan
                         table_text = table_text + "<td colspan=" + str(col_span) + ">"
-                        p_list = tc.getElementsByTagName("w:p")
-
-                        for p in p_list:
-                            t = p.getElementsByTagName("w:t")
-                            if t:
-                                for tt in t:
-                                    # print("tt", tt.childNodes)
-                                    if len(tt.childNodes) > 0:
-                                        tc_text += tt.childNodes[0].nodeValue
-
+                        # 放入文本
+                        tc_child_nodes = tc.childNodes
+                        for tc_child in tc_child_nodes:
+                            if 'w:tbl' in str(tc_child).split(' '):
+                                # 嵌套在tc中的表格
+                                tc_text += recursion_read_table(tc_child)
+                            if 'w:p' in str(tc_child).split(' '):
+                                tc_p_all_nodes = tc_child.getElementsByTagName("*")
+                                for tc_p_all in tc_p_all_nodes:
+                                    if 'w:t' in str(tc_p_all).split(' '):
+                                        # w:t必须加childNodes[0]才能读文本
+                                        tc_text += tc_p_all.childNodes[0].nodeValue
+                        # 结束该tc
                         table_text = table_text + tc_text + "</td>"
                         tc_index += 1
                         tc_text_list.append([tc_text, col_span])
-                    table_text += "</tr>"
-                    tr_index += 1
-                    tr_text_list.append(tc_text_list)
-                table_text += "</table>"
-                table_text_list.append(table_text)
+                # 结束该tr
+                table_text += "</tr>"
+                tr_index += 1
+                tr_text_list.append(tc_text_list)
+        # 结束该table
+        table_text += "</table>"
+        return table_text
+
+    log("into read_xml_table")
+    try:
+        try:
+            f = zipfile.ZipFile(path)
+            for file in f.namelist():
+                if "word/document.xml" == str(file):
+                    f.extract(file, save_path)
+            f.close()
+        except Exception as e:
+            # print("docx format error!", e)
+            log("docx format error!")
+            return [-3]
+
+        log("xml_analyze%s"%(save_path))
+        try:
+            collection = xml_analyze(save_path + "word/document.xml")
+        except TimeoutError:
+            log("xml_analyze timeout")
+            return [-4]
+
+        log("xml_analyze done")
+        body = collection.getElementsByTagName("w:body")[0]
+        table_text_list = []
+        body_nodes = body.childNodes
+        for node in body_nodes:
+            if 'w:tbl' in str(node).split(' '):
+                _table = node
+                _table_text = recursion_read_table(_table)
+                table_text_list.append(_table_text)
         return table_text_list
 
     except Exception as e:

+ 6 - 0
format_convert/convert_need_interface.py

@@ -820,6 +820,9 @@ def interface_pool_gunicorn(interface_type):
 
         # 选取端口
         if interface_type == "office":
+            if len(port_list) == 0:
+                raise ConnectionError
+
             # 刚开始随机,后续求余
             if min_cnt == 0:
                 _port = port_list[random.randint(0, len(port_list)-1)]
@@ -844,6 +847,9 @@ def interface_pool_gunicorn(interface_type):
     except NotFound:
         log("ip_port or ip_port_dict is None! checkout config")
         return [-2]
+    except ConnectionError:
+        log('no office interface running!')
+        return [-15]
     except:
         traceback.print_exc()
         return [-1]

+ 134 - 30
format_convert/convert_pdf.py

@@ -35,7 +35,7 @@ from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
     LTTextBoxVertical, LTLine, LTTextContainer
 from format_convert.utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable, \
-    get_logger, log, memory_decorator, draw_lines_plt, get_garble_code, line_is_cross
+    get_logger, log, memory_decorator, draw_lines_plt, get_garble_code, line_is_cross, get_md5_from_bytes, bytes2np
 import fitz
 from format_convert.wrapt_timeout_decorator import timeout
 
@@ -689,6 +689,9 @@ class PDFConvert:
         self.packages = ["pdfminer", "PyMuPDF", "PyPDF2", "pdfplumber"]
         self.has_init_pdf = [0] * len(self.packages)
 
+        # 记录图片对象的md5,用于去除大量重复图片
+        self.md5_image_obj_list = []
+
     @memory_decorator
     def init_package(self, package_name):
         # 各个包初始化
@@ -800,6 +803,41 @@ class PDFConvert:
             self._doc.add_child(self._page)
             page_no += 1
 
+        self.delete_same_image()
+
+    def delete_same_image(self, show=0):
+        # 剔除大量重复图片
+        md5_dict = {}
+        for _md5, image_obj in self.md5_image_obj_list:
+            if _md5 in md5_dict.keys():
+                md5_dict[_md5] += [image_obj]
+            else:
+                md5_dict[_md5] = [image_obj]
+        cnt_threshold = 10
+        delete_obj_list = []
+        for _md5 in md5_dict.keys():
+            img_list = md5_dict.get(_md5)
+            print('len(md5_dict.get(_md5))', _md5, len(img_list))
+            if len(img_list) >= cnt_threshold:
+                if show:
+                    img_np = bytes2np(img_list[0].content)
+                    cv2.namedWindow('delete same img_np', cv2.WINDOW_NORMAL)
+                    cv2.imshow('delete same img_np', img_np)
+                    cv2.waitKey(0)
+                delete_obj_list += img_list
+        for page in self._doc.children:
+            for obj in delete_obj_list:
+                if obj in page.children:
+                    page.children.remove(obj)
+
+        if show:
+            for page in self._doc.children:
+                for obj in page.children:
+                    if isinstance(obj, _Image):
+                        img_np = bytes2np(obj.content)
+                        cv2.imshow('page img_np', img_np)
+                        cv2.waitKey(0)
+
     def clean_text(self, _text):
         return re.sub("\s", "", _text)
 
@@ -1116,6 +1154,32 @@ class PDFConvert:
                         _line_list.remove(col)
             return _line_list
 
+        def cross_line_process(_cross_line_list, _bias_line_list):
+            # 斜线校正
+            if _cross_line_list:
+                _cross_line_list = repair_bias_line(_cross_line_list)
+
+            # 修复竖线
+            if _bias_line_list:
+                _cross_line_list = repair_col_line(_cross_line_list, _bias_line_list)
+
+            # 根据是否有交点判断表格线
+            _cross_line_list = get_cross_line(_cross_line_list, threshold=1, cross_times=1)
+
+            # 合并线条
+            if not _cross_line_list:
+                return []
+            _cross_line_list = merge_line(_cross_line_list)
+
+            # 删除最外层嵌套边框
+            _cross_line_list = remove_outline_no_cross(_cross_line_list)
+
+            # 复用otr的部分后处理,补线
+            from otr.table_line_new import table_line_pdf
+            _cross_line_list = table_line_pdf(_cross_line_list, page_w, page_h)
+
+            return _cross_line_list
+
         log('into get_page_lines')
 
         page_h = layout.height
@@ -1142,12 +1206,18 @@ class PDFConvert:
                     continue
                 line_list.append(element.bbox)
 
+        if show:
+            print('get_page_lines line_list', line_list)
+            print('get_page_lines bias_line_list', bias_line_list)
+            _plot(line_list+bias_line_list, mode=2)
         if not line_list and not bias_line_list:
             return []
 
         # 是否使用斜线来生成表格
+        line_list_copy = copy.deepcopy(line_list)
         if len(line_list) < 6 and len(bias_line_list) > len(line_list) * 2:
-            # print('use bias line')
+            if show:
+                print('use bias line')
             # bias_line_list += add_col_bias_line(line_list, bias_line_list)
             line_list = bias_line_list
 
@@ -1156,34 +1226,26 @@ class PDFConvert:
         line_list = list(set(line_list))
         line_list = [eval(x) for x in line_list]
 
-        # 根据是否有交点判断表格线
-        cross_line_list = get_cross_line(line_list, threshold=2, cross_times=1)
-
-        if not cross_line_list:
-            return []
-
-        # 斜线校正
-        if cross_line_list:
-            cross_line_list = repair_bias_line(cross_line_list)
-
-        # 修复竖线
-        if bias_line_list:
-            cross_line_list = repair_col_line(cross_line_list, bias_line_list)
+        if show:
+            _plot(line_list, mode=2)
 
         # 根据是否有交点判断表格线
-        cross_line_list = get_cross_line(cross_line_list, threshold=1, cross_times=1)
+        cross_line_list = get_cross_line(line_list_copy+bias_line_list, threshold=2, cross_times=1)
 
-        # 合并线条
+        if show:
+            print('get_page_lines cross_line_list', cross_line_list)
         if not cross_line_list:
-            return []
-        cross_line_list = merge_line(cross_line_list)
-
-        # 删除最外层嵌套边框
-        cross_line_list = remove_outline_no_cross(cross_line_list)
+            # 将线全部合并再获取一次
+            cross_line_list = get_cross_line(line_list_copy+bias_line_list, threshold=2, cross_times=1)
+            if not cross_line_list:
+                return []
 
-        # 复用otr的部分后处理,补线
-        from otr.table_line_new import table_line_pdf
-        cross_line_list = table_line_pdf(cross_line_list, page_w, page_h)
+        cross_line_list = cross_line_process(cross_line_list, bias_line_list)
+        if not cross_line_list:
+            cross_line_list = get_cross_line(line_list_copy+bias_line_list, threshold=2, cross_times=1)
+            cross_line_list = cross_line_process(cross_line_list, bias_line_list)
+            if show:
+                print('get_page_lines cross_line_list2', cross_line_list)
 
         # show
         if show:
@@ -1287,6 +1349,15 @@ class PDFConvert:
             # 水印行跳过
             if len(row) == 1 and len(row[0].get_text()[:-1]) == 1:
                 continue
+            # 目录行跳过
+            continue_flag = False
+            for r in row:
+                if re.search('[.·]{7,}', r.get_text()):
+                    continue_flag = True
+                    break
+            if continue_flag:
+                continue
+
             if len(row) == 1:
                 text = row[0].get_text()
                 bbox = row[0].bbox
@@ -1359,7 +1430,7 @@ class PDFConvert:
         lt_text_list = self.delete_water_mark(lt_text_list, layout.bbox, 15)
         log("convert_pdf page " + str(page_no))
         log("len(lt_image_list), len(lt_text_list) " + str(len(lt_image_list)) + " " + str(len(lt_text_list)))
-        log('layout.width, layout.height' + str(layout.width) + str(layout.height))
+        log('layout.width, layout.height ' + str(layout.width) + str(layout.height))
 
         # 若只有文本且图片数为0,直接提取文字及表格
         # if only_image == 0 and image_count == 0:
@@ -1412,7 +1483,7 @@ class PDFConvert:
 
         # 若该页图片数量过多,或无文本,则直接ocr整页识别
         # elif image_count > 3 or only_image == 1:
-        if len(lt_image_list) > 3 or len(lt_text_list) == 0:
+        if len(lt_image_list) > 4 or len(lt_text_list) == 0:
             page_image = self.get_page_image(page_no)
             if judge_error_code(page_image):
                 self._page.error_code = page_image
@@ -1441,6 +1512,8 @@ class PDFConvert:
                             _image = _Image(page_image[1], page_image[0])
                             _image.is_from_pdf = True
                             self._page.add_child(_image)
+                            image_md5 = get_md5_from_bytes(page_image[1])
+                            self.md5_image_obj_list.append([image_md5, _image])
                         return
                     # 比较小的图则直接保存用ocr识别
                     else:
@@ -1451,6 +1524,8 @@ class PDFConvert:
                             image_stream = ff.read()
                         _image = _Image(image_stream, temp_path, image.bbox)
                         self._page.add_child(_image)
+                        image_md5 = get_md5_from_bytes(image_stream)
+                        self.md5_image_obj_list.append([image_md5, _image])
                 except Exception:
                     log("pdf2text pdfminer read image in page " + str(page_no) +
                         "  fail! use pymupdf read image...")
@@ -1580,6 +1655,7 @@ class PDFConvert:
         # 0: 前一页最后一个表格为A,后一页第一个表格为B
         # 1.1: A后无文本(除了页码),且B前无文本(除了页码)
         # 1.2: B前有文字(可能是页眉,小于60字),且B的第一行前几个单元格为空,且第一行不为空的单元格有文字较多的格子
+        # 1.3: B前有文字(可能是页眉,小于60字),且B的第一行第一个单元格为空,且有文字的格子数量占所有格子的一半
         connect_flag_list = []
         soup_list = []
         for i, h in enumerate(html_list):
@@ -1622,12 +1698,16 @@ class PDFConvert:
                     if rows:
                         first_row = rows[0]
                         col_text_list = [len(x.text) for x in first_row]
+                        # 表格前文字小于等于60且第一个单元格为空
                         if len(h[:first_table_start]) <= 60 and col_text_list[0] == 0 and max(col_text_list) >= 30:
                             connect_flag2 = True
+                        # 有文字格子数占一半以下且第一个格子为空
+                        elif col_text_list.count(0) >= len(col_text_list) / 2 and col_text_list[0] == 0:
+                            connect_flag2 = True
 
             connect_flag_list.append([i, connect_flag2, connect_flag1])
 
-        # print('connect_flag_list', connect_flag_list)
+        print('connect_flag_list', connect_flag_list)
 
         # 根据条件1合并需连接页码,形成组
         connect_pages_list = []
@@ -1645,7 +1725,7 @@ class PDFConvert:
         if temp_list:
             connect_pages_list.append(temp_list)
 
-        # print('connect_pages_list', connect_pages_list)
+        print('connect_pages_list', connect_pages_list)
 
         # 判断后续条件:判断组内列数是否相同
         connect_pages_list2 = []
@@ -1654,6 +1734,8 @@ class PDFConvert:
                 connect_pages_list2.append(c_list)
             else:
                 col_cnt_list = []
+                # 单元格可能被复制了,相同的合并当做一列
+                merge_col_cnt_list = []
                 for c in c_list:
                     soup = soup_list[c[0]]
                     table1 = soup.findAll('table')[-1]
@@ -1663,10 +1745,32 @@ class PDFConvert:
                     td1 = tr1[-1].findAll('td')
                     td2 = tr2[0].findAll('td')
                     col_cnt_list.append([len(td2), len(td1)])
+
+                    # # 计算合并重复文本格子后的列数
+                    # last_text = td1[0].text
+                    # merge_td1 = [last_text]
+                    # for td in td1:
+                    #     if td.text == last_text:
+                    #         continue
+                    #     else:
+                    #         merge_td1.append(td.text)
+                    #         last_text = td.text
+                    # last_text = td2[0].text
+                    # merge_td2 = [last_text]
+                    # for td in td2:
+                    #     if td.text == last_text:
+                    #         continue
+                    #     else:
+                    #         merge_td2.append(td.text)
+                    #         last_text = td.text
+                    # merge_col_cnt_list.append([len(merge_td2), len(merge_td1)])
+
+                # 判断
                 new_c_list = [c_list[0]]
                 # print('col_cnt_list', col_cnt_list)
                 for i in range(len(col_cnt_list) - 1):
                     if col_cnt_list[i][1] != col_cnt_list[i + 1][0]:
+                            # and merge_col_cnt_list[i][1] != merge_col_cnt_list[i + 1][0]:
                         connect_pages_list2.append(new_c_list)
                         new_c_list = [c_list[i + 1]]
                     else:
@@ -1674,7 +1778,7 @@ class PDFConvert:
                 if new_c_list:
                     connect_pages_list2.append(new_c_list)
 
-        # print('connect_pages_list2', connect_pages_list2)
+        print('connect_pages_list2', connect_pages_list2)
 
         # 符合连接条件的拼接表格
         new_html_list = []

+ 14 - 6
format_convert/convert_rar.py

@@ -82,11 +82,13 @@ def rar2text(path, unique_type_dir):
 
 
 class RarConvert:
-    def __init__(self, path, unique_type_dir):
+    def __init__(self, path, unique_type_dir, page_no, time_out):
         self._doc = _Document(path)
         self.path = path
         self.unique_type_dir = unique_type_dir
         self.rar_path = unique_type_dir
+        self.page_no = page_no
+        self.time_out = time_out
 
     @memory_decorator
     def init_package(self):
@@ -137,13 +139,19 @@ class RarConvert:
             # 有文件后缀,截取
             else:
                 _type = file.split(".")[-1]
-                sub_html = getText(_type, file)
+                if _type in ['pdf']:
+                    sub_html = getText(_type, file, self.page_no, time_out=self.time_out)
+                else:
+                    sub_html = getText(_type, file, time_out=self.time_out)
 
-            if judge_error_code(sub_html, code=[-3]):
-                continue
+            # 文件报错也继续
             if judge_error_code(sub_html):
-                self._doc.error_code = sub_html
-                return
+                continue
+            # if judge_error_code(sub_html, code=[-3]):
+            #     continue
+            # if judge_error_code(sub_html):
+            #     self._doc.error_code = sub_html
+            #     return
 
             _sen = _Sentence(sub_html[0], bbox, is_html=True)
             self._page.add_child(_sen)

+ 16 - 6
format_convert/convert_zip.py

@@ -108,11 +108,13 @@ def zip2text(path, unique_type_dir):
 
 
 class ZipConvert:
-    def __init__(self, path, unique_type_dir):
+    def __init__(self, path, unique_type_dir, page_no, time_out):
         self._doc = _Document(path)
         self.path = path
         self.unique_type_dir = unique_type_dir
         self.zip_path = unique_type_dir
+        self.page_no = page_no
+        self.time_out = time_out
 
     @memory_decorator
     def init_package(self):
@@ -185,13 +187,21 @@ class ZipConvert:
             # 有文件后缀,截取
             else:
                 _type = file.split(".")[-1]
-                sub_html = getText(_type, file)
+                if _type in ['pdf']:
+                    sub_html = getText(_type, file, self.page_no, time_out=self.time_out)
+                else:
+                    sub_html = getText(_type, file, time_out=self.time_out)
 
-            if judge_error_code(sub_html, code=[-3]):
-                continue
+            # log('convert_zip.py sub_html ' + str(sub_html))
+
+            # 文件报错也继续
             if judge_error_code(sub_html):
-                self._doc.error_code = sub_html
-                return
+                continue
+            # if judge_error_code(sub_html, code=[-3]):
+            #     continue
+            # if judge_error_code(sub_html):
+            #     self._doc.error_code = sub_html
+            #     return
 
             _sen = _Sentence(sub_html[0], bbox, is_html=True)
             self._page.add_child(_sen)

+ 4 - 0
format_convert/kill_all.py

@@ -8,6 +8,10 @@ import time
 
 ip_port_dict = get_ip_port()
 ip = get_using_ip()
+
+if ip == 'http://127.0.0.1':
+    ip = 'http://0.0.0.0'
+
 python_path = get_args_from_config(ip_port_dict, ip, "python_path")[0]
 project_path = get_args_from_config(ip_port_dict, ip, "project_path")[0]
 gunicorn_path = get_args_from_config(ip_port_dict, ip, "gunicorn_path")[0]

+ 3 - 0
format_convert/monitor_process_config.py

@@ -13,6 +13,9 @@ ip_port_dict = get_ip_port()
 ip = get_using_ip()
 print("local ip:", ip)
 
+if ip == 'http://127.0.0.1':
+    ip = 'http://0.0.0.0'
+
 # 获取各个参数
 convert_port_list = get_args_from_config(ip_port_dict, ip, "convert", "MASTER")
 if convert_port_list:

+ 11 - 3
format_convert/utils.py

@@ -38,7 +38,7 @@ if get_platform() == "Linux":
 import math
 
 
-def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14]):
+def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15]):
     """
     [0] : continue
     [-1]: 逻辑处理错误
@@ -55,6 +55,7 @@ def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -1
     [-12]: 表格跨页连接报错
     [-13]: pdf表格线处理报错
     [-14]: 指定页码报错
+    [-15]: office转换接口未运行
     """
     for c in code:
         if isinstance(_list, list) and _list == [c]:
@@ -1037,7 +1038,11 @@ class LineTable:
                 list_iou.append(_iou)
             max_iou_index = np.argmax(list_iou)
             max_iou = list_iou[max_iou_index]
-            if max_iou > 0.1 and textbox not in in_objs:
+            # if self.from_pdf:
+            #     iou_threhold = 0.3
+            # else:
+            iou_threhold = 0.1
+            if max_iou > iou_threhold and textbox not in in_objs:
                 list_cells[max_iou_index]["inbox_textbox_list"].append(textbox)
                 in_objs.add(textbox)
 
@@ -1045,7 +1050,7 @@ class LineTable:
                 # 多个iou大于0.3的,可能是ocr将两个文本合成一个了
                 iou_index_list = np.where(np.array(list_iou) >= 0.3)[0].tolist()
                 if len(iou_index_list) >= 2:
-                    print('len(iou_index_list) >= 2 textbox', textbox)
+                    # print('len(iou_index_list) >= 2 textbox', textbox)
                     self.connect_bbox_list.append(textbox)
 
         has_matched_box_list = []
@@ -1268,6 +1273,9 @@ class LineTable:
         return 0
 
     def getIOU(self, bbox0, bbox1):
+        bbox0 = [min(bbox0[0], bbox0[2]), min(bbox0[1], bbox0[3]), max(bbox0[0], bbox0[2]), max(bbox0[1], bbox0[3])]
+        bbox1 = [min(bbox1[0], bbox1[2]), min(bbox1[1], bbox1[3]), max(bbox1[0], bbox1[2]), max(bbox1[1], bbox1[3])]
+
         width = abs(max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0])) - (
                     abs(bbox0[2] - bbox0[0]) + abs(bbox1[2] - bbox1[0]))
         height = abs(max(bbox0[3], bbox1[3]) - min(bbox0[1], bbox1[1])) - (