Просмотр исходного кода

附件识别,保留表格的合并单元格

fangjiasheng 3 месяцев назад
Родитель
Сommit
ef08b56c48
4 измененных файлов с 460 добавлено и 35 удалено
  1. 185 13
      format_convert/convert_docx.py
  2. 1 1
      format_convert/convert_xls.py
  3. 92 16
      format_convert/convert_xlsx.py
  4. 182 5
      format_convert/utils.py

+ 185 - 13
format_convert/convert_docx.py

@@ -1,5 +1,7 @@
 import os
 import sys
+from collections import defaultdict
+
 sys.path.append(os.path.dirname(__file__) + "/../")
 from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
 import re
@@ -319,6 +321,7 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
         num_pr_dict = {}
 
         # 直接子节点用child表示,所有子节点用all表示
+        row_span_dict = {}
         for table_child in table.childNodes:
             if 'w:tr' in str(table_child):
                 table_text += "<tr>"
@@ -339,19 +342,37 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
                         # 获取是否是合并单元格的下一个空单元格,相当于rowspan
                         is_merge = tc.getElementsByTagName("w:vMerge")
                         if is_merge:
+
                             is_merge = is_merge[0].getAttribute("w:val")
+                            # print(tr_index, tc_index, is_merge)
+                            # print('row_span_dict', row_span_dict)
                             if is_merge == "continue":
-                                col_span_index = 0
-                                real_tc_index = 0
-                                if 0 <= tr_index - 1 < len(tr_text_list):
-                                    for tc_colspan in tr_text_list[tr_index - 1]:
-                                        if col_span_index < tc_index:
-                                            col_span_index += tc_colspan[1]
-                                            real_tc_index += 1
-                                    if real_tc_index < len(tr_text_list[tr_index - 1]):
-                                        tc_text = tr_text_list[tr_index - 1][real_tc_index][0]
+                                row_span_dict[tc_index][0] += 1
+                                tc_index += col_span
+                                # 跳过,不增加td
+                                continue
+                                # col_span_index = 0
+                                # real_tc_index = 0
+                                # if 0 <= tr_index - 1 < len(tr_text_list):
+                                #     for tc_colspan in tr_text_list[tr_index - 1]:
+                                #         if col_span_index < tc_index:
+                                #             col_span_index += tc_colspan[1]
+                                #             real_tc_index += 1
+                                #     if real_tc_index < len(tr_text_list[tr_index - 1]):
+                                #         tc_text = tr_text_list[tr_index - 1][real_tc_index][0]
+                            else:
+                                # 先结束上一次同列的合并单元格
+                                if tc_index in row_span_dict:
+                                    row_span, finish_row_span_flag = row_span_dict.get(tc_index)
+                                    table_text = re.sub(finish_row_span_flag, str(row_span), table_text)
+                                # 开启新的合并单元格
+                                row_span_flag = '#@#_{}_{}'.format(tr_index, tc_index)
+                                row_span_dict[tc_index] = [1, row_span_flag]
+                        else:
+                            row_span_flag = 1
+
                         # 设置colspan
-                        table_text = table_text + "<td colspan=" + str(col_span) + ">"
+                        table_text = table_text + "<td rowspan={} colspan={}>".format(row_span_flag, col_span)
                         # 放入文本
                         tc_child_nodes = tc.childNodes
                         for tc_child in tc_child_nodes:
@@ -372,14 +393,21 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
                                 #     if 'w:t' in str(tc_p_all).split(' '):
                                 #         # w:t必须加childNodes[0]才能读文本
                                 #         tc_text += tc_p_all.childNodes[0].nodeValue
+                        # print('tc_text', tc_text)
                         # 结束该tc
                         table_text = table_text + tc_text + "</td>"
-                        tc_index += 1
+                        tc_index += col_span
                         tc_text_list.append([tc_text, col_span])
                 # 结束该tr
                 table_text += "</tr>"
                 tr_index += 1
                 tr_text_list.append(tc_text_list)
+
+        # 替换所有row_span
+        for key in row_span_dict.keys():
+            row_span, finish_row_span_flag = row_span_dict.get(key)
+            table_text = re.sub(finish_row_span_flag, str(row_span), table_text)
+
         # 结束该table
         table_text += "</table>"
         return table_text
@@ -622,6 +650,150 @@ class DocxConvert:
         return self._doc.get_html()
 
 
+class DocxConvertNew:
+    """Standalone .docx -> HTML converter built on zipfile and xml.dom.minidom.
+
+    The pipeline is: unzip the archive, parse numbering.xml and
+    document.xml.rels, walk document.xml collecting paragraphs, tables and
+    image references, then render everything as a simple HTML page.
+
+    NOTE(review): in WordprocessingML the <w:lvl> definitions normally live
+    under <w:abstractNum> (referenced from <w:num> via <w:abstractNumId>) and
+    the level text element is <w:lvlText>; parse_numbering only looks at
+    <w:lvl>/<w:numText> directly under <w:num> -- confirm against real files.
+    """
+
+    # Unzip the .docx file
+    def unzip_docx(self, file_path, extract_to):
+        """Extract every member of the zip archive *file_path* into *extract_to*."""
+        with zipfile.ZipFile(file_path, 'r') as zip_ref:
+            zip_ref.extractall(extract_to)
+
+    # Parse numbering.xml and collect the numbering definitions
+    def parse_numbering(self, file_path):
+        """Return ``{numId: [(ilvl, numFmt, numText), ...]}`` read from *file_path*.
+
+        ``numFmt`` / ``numText`` are ``None`` when the level has no such child
+        element (previously a missing ``w:numFmt`` raised IndexError).
+        """
+        numbering = defaultdict(list)
+        dom = xml.dom.minidom.parse(file_path)
+        root = dom.documentElement
+        for num in root.getElementsByTagName("w:num"):
+            num_id = num.getAttribute("w:numId")
+            for lvl in num.getElementsByTagName("w:lvl"):
+                lvl_index = lvl.getAttribute("w:ilvl")
+                fmt_nodes = lvl.getElementsByTagName("w:numFmt")
+                text_nodes = lvl.getElementsByTagName("w:numText")
+                num_fmt = fmt_nodes[0].getAttribute("w:val") if fmt_nodes else None
+                num_text = text_nodes[0].getAttribute("w:val") if text_nodes else None
+                numbering[num_id].append((lvl_index, num_fmt, num_text))
+        return numbering
+
+    # Parse document.xml.rels and collect the relationship (image) references
+    def parse_rels(self, file_path):
+        """Return ``{relationship Id: {"type": Type, "target": Target}}``."""
+        rels = {}
+        dom = xml.dom.minidom.parse(file_path)
+        root = dom.documentElement
+        for rel in root.getElementsByTagName("Relationship"):
+            rel_id = rel.getAttribute("Id")
+            rel_type = rel.getAttribute("Type")
+            target = rel.getAttribute("Target")
+            rels[rel_id] = {"type": rel_type, "target": target}
+        return rels
+
+    @staticmethod
+    def _resolve_level(numbering, num_id, ilvl):
+        """Return ``(level_no, num_fmt, num_text)`` for a paragraph, or None.
+
+        None is returned when the paragraph is not numbered, the level is not
+        an integer, or *numbering* holds no matching level, so callers never
+        hit IndexError/ValueError on incomplete numbering data.
+        """
+        if not num_id or ilvl in (None, ''):
+            return None
+        try:
+            level_no = int(ilvl)
+        except ValueError:
+            return None
+        # .get() avoids inserting empty entries into a defaultdict
+        levels = numbering.get(num_id) or []
+        # Prefer the entry whose recorded ilvl matches; fall back to position
+        for lvl_index, num_fmt, num_text in levels:
+            if lvl_index == ilvl:
+                return level_no, num_fmt, num_text
+        try:
+            _, num_fmt, num_text = levels[level_no]
+        except IndexError:
+            return None
+        return level_no, num_fmt, num_text
+
+    # Parse document.xml and collect the document content
+    def parse_document(self, file_path, numbering, rels):
+        """Parse document.xml into a flat content list.
+
+        Args:
+            file_path: path of the extracted ``word/document.xml``.
+            numbering: mapping as returned by :meth:`parse_numbering`.
+            rels: mapping as returned by :meth:`parse_rels`.
+
+        Returns:
+            A list holding, in this order: one string per ``w:p`` element
+            (paragraphs inside tables included), one list of rows per
+            ``w:tbl`` (each row a list of ``{"text", "colspan", "rowspan"}``
+            dicts), and one ``"图片: <target>"`` string per image relationship.
+            Vertically merged continuation cells are folded into the rowspan
+            of the cell that started the merge and are not emitted.
+        """
+        dom = xml.dom.minidom.parse(file_path)
+        root = dom.documentElement
+        content = []
+
+        for para in root.getElementsByTagName("w:p"):
+            para_text = ""
+            num_id = None
+            ilvl = None
+            for child in para.childNodes:
+                if child.nodeName == "w:pPr":
+                    for num_id_node in child.getElementsByTagName("w:numId"):
+                        num_id = num_id_node.getAttribute("w:val")
+                    for ilvl_node in child.getElementsByTagName("w:ilvl"):
+                        # The level number is stored in w:val; reading a
+                        # "w:ilvl" attribute here always produced ''.
+                        ilvl = ilvl_node.getAttribute("w:val")
+                elif child.nodeName == "w:r":
+                    for t in child.getElementsByTagName("w:t"):
+                        para_text += t.firstChild.nodeValue if t.firstChild else ""
+
+            level = self._resolve_level(numbering, num_id, ilvl)
+            if level is not None:
+                level_no, num_fmt, num_text = level
+                if num_fmt == "decimal":
+                    para_text = f"{level_no + 1}. {para_text}"
+                elif num_text:
+                    para_text = f"{num_text} {para_text}"
+
+            content.append(para_text)
+
+        # Parse tables
+        for table in root.getElementsByTagName("w:tbl"):
+            table_content = []
+            # grid column -> cell dict that started the currently open vertical merge
+            open_merges = {}
+            for row in table.getElementsByTagName("w:tr"):
+                row_content = []
+                grid_col = 0
+                for cell in row.getElementsByTagName("w:tc"):
+                    cell_text = ""
+                    for para in cell.getElementsByTagName("w:p"):
+                        for run in para.getElementsByTagName("w:r"):
+                            for text in run.getElementsByTagName("w:t"):
+                                cell_text += text.firstChild.nodeValue if text.firstChild else ""
+
+                    # Detect merged cells
+                    grid_span = 1
+                    v_merge = None
+                    for child in cell.childNodes:
+                        if child.nodeName == "w:tcPr":
+                            for grid_span_node in child.getElementsByTagName("w:gridSpan"):
+                                grid_span = int(grid_span_node.getAttribute("w:val"))
+                            for v_merge_node in child.getElementsByTagName("w:vMerge"):
+                                # <w:vMerge/> without w:val means "continue"
+                                v_merge = v_merge_node.getAttribute("w:val") or "continue"
+
+                    starter = open_merges.get(grid_col)
+                    if v_merge == "continue" and starter is not None:
+                        # Continuation of a merge: grow the starting cell, emit nothing
+                        starter["rowspan"] += 1
+                        grid_col += grid_span
+                        continue
+
+                    cell_info = {
+                        "text": cell_text,
+                        "colspan": grid_span,
+                        "rowspan": 1,
+                    }
+                    if v_merge is None:
+                        # An ordinary cell closes any merge open in this column
+                        open_merges.pop(grid_col, None)
+                    else:
+                        # "restart", or a continue with nothing to attach to
+                        open_merges[grid_col] = cell_info
+                    row_content.append(cell_info)
+                    grid_col += grid_span
+                table_content.append(row_content)
+            content.append(table_content)
+
+        # Parse images
+        for rel in rels.values():
+            if rel["type"] == "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image":
+                content.append(f"图片: {rel['target']}")
+
+        return content
+
+    # Generate the HTML output
+    def generate_html(self, content):
+        """Render *content* (as produced by :meth:`parse_document`) to HTML.
+
+        List items become ``<table>`` elements, everything else a ``<p>``.
+        Text is HTML-escaped so that ``<``, ``>`` and ``&`` coming from the
+        document cannot break the generated markup.
+        """
+        from html import escape  # local import: keeps the module import block untouched
+
+        parts = ['<!DOCTYPE HTML><head><meta charset="UTF-8"></head><html><body>']
+        for item in content:
+            if isinstance(item, list):  # table content
+                parts.append("<table border='1'>")
+                for row in item:
+                    parts.append("<tr>")
+                    for cell in row:
+                        colspan = cell.get("colspan", 1)
+                        rowspan = cell.get("rowspan", 1)
+                        cell_text = escape(str(cell['text']), quote=False)
+                        parts.append(f"<td colspan='{colspan}' rowspan='{rowspan}'>{cell_text}</td>")
+                    parts.append("</tr>")
+                parts.append("</table>")
+            else:  # plain text or image
+                parts.append(f"<p>{escape(str(item), quote=False)}</p>")
+        parts.append("</body></html>")
+        return "\n".join(parts)
+
+    # Main entry point
+    def read_docx(self, file_path):
+        """Convert the .docx at *file_path* and write the HTML to ``../result.html``.
+
+        The archive is extracted into the ``extracted_docx`` directory.
+        ``word/numbering.xml`` and ``word/_rels/document.xml.rels`` are treated
+        as optional; when absent an empty mapping is used instead of raising.
+
+        Returns:
+            The generated HTML string (also written to disk).
+        """
+        extract_to = "extracted_docx"
+        self.unzip_docx(file_path, extract_to)
+
+        numbering_path = os.path.join(extract_to, "word", "numbering.xml")
+        rels_path = os.path.join(extract_to, "word", "_rels", "document.xml.rels")
+        numbering = self.parse_numbering(numbering_path) if os.path.exists(numbering_path) else defaultdict(list)
+        rels = self.parse_rels(rels_path) if os.path.exists(rels_path) else {}
+        content = self.parse_document(os.path.join(extract_to, "word", "document.xml"), numbering, rels)
+
+        html_output = self.generate_html(content)
+        with open("../result.html", "w", encoding="utf-8") as f:
+            f.write(html_output)
+        return html_output
+
+
 if __name__ == '__main__':
-    c = DocxConvert("C:/Users/Administrator/Downloads/1631944542835.docx", "C:/Users/Administrator/Downloads/1/")
-    print(c.get_html())
+    c = DocxConvert("C:/Users/Administrator/Downloads/dsdsd.docx", "C:/Users/Administrator/Downloads/1/")
+    print(c.get_html())
+
+    # c = DocxConvertNew()
+    # # c.read_docx(r'C:\Users\Administrator\Desktop\test_doc\error14.docx')
+    # c.read_docx(r'C:/Users/Administrator/Downloads/dsdsd.docx')

+ 1 - 1
format_convert/convert_xls.py

@@ -42,7 +42,7 @@ class XlsConvert:
         # 先判断特殊xls文件,可能是html文本
         is_html_xls = False
         try:
-            with open(self.path, 'r') as f:
+            with open(self.path, 'r', encoding='utf-8') as f:
                 html_str = f.read()
             soup = BeautifulSoup(html_str, 'lxml')
             text = soup.text

+ 92 - 16
format_convert/convert_xlsx.py

@@ -1,8 +1,11 @@
 import inspect
 import os
 import sys
+
+from bs4 import BeautifulSoup
+
 sys.path.append(os.path.dirname(__file__) + "/../")
-from format_convert.convert_tree import _Document, _Page, _Table
+from format_convert.convert_tree import _Document, _Page, _Table, _Sentence
 import logging
 import traceback
 import pandas as pd
@@ -71,12 +74,20 @@ class XlsxConvert:
         if not self.is_xls:
             # pandas
             # df = pd.read_excel(self.path, header=None, keep_default_na=False, sheet_name=None)
-            df = pd.read_excel(self.path, header=None, keep_default_na=False,
-                               sheet_name=None, usecols=[x for x in range(self.col_limit)],
-                               nrows=self.row_limit)
-            sheet_list = [sheet for sheet in df.values()]
-
+            use_xlrd = 0
+            try:
+                df = pd.read_excel(self.path, header=None, keep_default_na=False,
+                                   sheet_name=None, usecols=[x for x in range(self.col_limit)],
+                                   nrows=self.row_limit)
+                sheet_list = [sheet for sheet in df.values()]
+            except:
+                traceback.print_exc()
+                print('pandas读取xlsx失败')
+                use_xlrd = 1
         else:
+            use_xlrd = 1
+
+        if use_xlrd:
             # xlrd -> pandas
             data_list = []
             for sheet in workbook.sheets():
@@ -141,6 +152,25 @@ class XlsxConvert:
 
     def convert(self):
         log('into xlsx_convert')
+
+        # 先判断特殊xlsx文件,可能是html文本
+        is_html_xls = False
+        try:
+            with open(self.path, 'r', encoding='utf-8') as f:
+                html_str = f.read()
+            soup = BeautifulSoup(html_str, 'lxml')
+            text = soup.text
+            is_html_xls = True
+        except:
+            pass
+
+        if is_html_xls:
+            self._page = _Page(None, 0)
+            _sen = _Sentence(text, (0, 0, 0, 0))
+            self._page.add_child(_sen)
+            self._doc.add_child(self._page)
+            return
+
         self.init_package()
         if self._doc.error_code is not None:
             return
@@ -272,30 +302,76 @@ class XlsxConvert:
         merged_cell_list.sort(key=lambda x: (x[0], x[1], x[2], x[3]))
         # print("merged_cell_list", merged_cell_list)
 
-        # 复制填充合并单元格
+        # # 复制填充合并单元格
+        # for row_start, row_end, col_start, col_end in merged_cell_list:
+        #     if row_start >= len(row_list) or row_end > len(row_list):
+        #         continue
+        #     if col_start >= len(row_list[row_start]) or col_end > len(row_list[row_start]):
+        #         continue
+        #     copy_cell = row_list[row_start][col_start]
+        #     for i in range(row_start, row_end):
+        #         row = row_list[i]
+        #         # 第一行补少一个,其他行需补多一个
+        #         if i == row_start:
+        #             col_start_real = col_start+1
+        #         else:
+        #             col_start_real = col_start
+        #         for j in range(col_start_real, col_end):
+        #             if row[j] == "":
+        #                 row[j] = copy_cell
+        #
+        # # 拼接html表格
+        # text = '<table border="1">' + "\n"
+        # for row in row_list:
+        #     text = text + "<tr>"
+        #     for col in row:
+        #         text = text + "<td>" + str(col) + "</td>" + "\n"
+        #     text = text + "</tr>" + "\n"
+        # text = text + "</table>" + "\n"
+
+        # 保留合并单元格
+        table = []
+        for row in row_list:
+            new_row = []
+            # print('row', row)
+            for col in row:
+                cell = {'text': col, 'rowspan': 1, 'colspan': 1}
+                new_row.append(cell)
+            table.append(new_row)
+
         for row_start, row_end, col_start, col_end in merged_cell_list:
-            if row_start >= len(row_list) or row_end > len(row_list):
+            # print('111row_start, row_end, col_start, col_end', row_start, row_end, col_start, col_end)
+            row_end = min(row_end, len(table))
+            if row_start >= row_end:
                 continue
-            if col_start >= len(row_list[row_start]) or col_end > len(row_list[row_start]):
+            col_end = min(col_end, len(table[row_start]))
+            if col_start >= col_end:
                 continue
-            copy_cell = row_list[row_start][col_start]
+            # print('len(table)', len(table), 'len(table[row_start])', len(table[row_start]))
+            merge_cell = table[row_start][col_start]
+            # print('row_start, row_end, col_start, col_end', row_start, row_end, col_start, col_end, merge_cell.get('text'))
+
+            merge_cell['rowspan'] = row_end - row_start
+            merge_cell['colspan'] = col_end - col_start
+            # 多余的删掉
             for i in range(row_start, row_end):
-                row = row_list[i]
-                # 第一行补少一个,其他行需补多一个
+                row = table[i]
                 if i == row_start:
                     col_start_real = col_start+1
                 else:
                     col_start_real = col_start
                 for j in range(col_start_real, col_end):
-                    if row[j] == "":
-                        row[j] = copy_cell
+                    if row[j].get('text') == "":
+                        row[j]['delete'] = 1
 
         # 拼接html表格
         text = '<table border="1">' + "\n"
-        for row in row_list:
+        for row in table:
             text = text + "<tr>"
             for col in row:
-                text = text + "<td>" + str(col) + "</td>" + "\n"
+                if col.get('delete'):
+                    continue
+                text = text + "<td rowspan={} colspan={}>{}</td>\n".format(col.get('rowspan'), col.get('colspan'), col.get('text'))
             text = text + "</tr>" + "\n"
         text = text + "</table>" + "\n"
 

+ 182 - 5
format_convert/utils.py

@@ -1097,6 +1097,150 @@ class LineTable:
             for _tmp in extend_line:
                 _line.insert(_tmp["index"], _tmp["cell"])
 
+    # Expand every cell that spans several columns/rows into one cell per grid
+    # slot, modifying _table (a list of rows, each a list of cell dicts) in place.
+    # The original cell keeps its span in "origin_columnspan"/"origin_rowspan";
+    # the inserted copies carry 0 there. get_table_html reads these markers to
+    # restore the span on the first cell and drop the copies.
+    #   _table:     list of rows; each cell dict has at least "bbox",
+    #               "columnspan" and "rowspan"
+    #   list_x:     column boundary coordinates passed to self.getSpanLocation
+    #   list_y:     row boundary coordinates passed to self.getSpanLocation
+    #   sourceP_LB: not used inside this method
+    # Returns None. span_list is filled but neither returned nor stored.
+    def fix_span(self, _table, list_x, list_y, sourceP_LB):
+        # Decide whether a cell with `bbox` may be inserted at index `_position`
+        # of `_line` without conflicting with its neighbours. `margin` is unused.
+        def checkPosition(_line, _position, bbox, margin=5):
+            # check y
+            if len(_line) > 0:
+                _bbox = _line[0].get("bbox")
+                # check if has lap
+                # reject when the vertical range of the row's first cell and of bbox do not overlap
+                if min(_bbox[1], _bbox[3]) > max(bbox[1], bbox[3]) or max(_bbox[1], _bbox[3]) < min(bbox[1], bbox[3]):
+                    # if abs(min(_bbox[1],_bbox[3])-min(bbox[1],bbox[3]))>margin or abs(max(_bbox[1],_bbox[3])-max(bbox[1],bbox[3]))>margin:
+                    #     print(_bbox)
+                    #     print(bbox)
+                    # print("check position y false", _bbox, bbox)
+                    return False
+            # check x
+            if _position <= len(_line) - 1:
+                after_bbox = _line[_position].get("bbox")
+                # the insert bbox.x1 should not less then the after bbox.x0
+                if not (after_bbox[0] >= bbox[2]):
+                    # print("check position x after false 1")
+                    return False
+            # NOTE(review): with `0 < _position - 1` the left-neighbour check is
+            # skipped when _position == 1 (neighbour at index 0) -- confirm this
+            # is intended rather than `0 <= _position - 1`.
+            if 0 < _position - 1 < len(_line):
+                before_bbox = _line[_position - 1].get("bbox")
+                # the insert bbox.x1 should less equal than the first bbox.x0
+                if not (bbox[0] >= before_bbox[2]):
+                    # print("check position x before false 2")
+                    return False
+            return True
+
+        # Record the position and slot count of each merged cell
+        span_list = []
+
+        # Expand the columnspan data
+        for l_i, _line in enumerate(_table):
+            c_i = 0
+            # while-loop because _line grows as copies are inserted
+            while c_i < len(_line):
+                _cell = _line[c_i]
+
+                if _cell.get("columnspan") > 1:
+                    x0, y0, x1, y1 = _cell.get("bbox")
+                    _cospan = _cell.get("columnspan")
+                    locations = self.getSpanLocation(list_x, x0, x1, 10)
+                    # only split when the boundaries found match the declared span
+                    if len(locations) == _cospan + 1:
+                        span_list.append([l_i, c_i, 'col', _cospan])
+
+                        # shrink the original cell to its first column slot
+                        _cell["bbox"] = (x0, y0, locations[1], y1)
+                        _cell["columnspan"] = 1
+                        _cell["origin_columnspan"] = _cospan
+
+                        for i in range(1, _cospan):
+                            # shallow copy of the shrunken cell, marked as a filler
+                            n_cell = {}
+                            n_cell.update(_cell)
+                            n_cell["origin_columnspan"] = 0
+                            n_cell["bbox"] = (locations[i], y0, locations[i + 1], y1)
+                            # c_i advances even when the insert below is rejected
+                            c_i += 1
+                            # check the position
+                            if checkPosition(_line, c_i, n_cell["bbox"]):
+                                _line.insert(c_i, n_cell)
+
+                c_i += 1
+
+        # Expand the rowspan data
+        for l_i in range(len(_table)):
+            _line = _table[l_i]
+            c_i = 0
+            while c_i < len(_line):
+                _cell = _line[c_i]
+                if _cell.get("rowspan") > 1:
+                    x0, y0, x1, y1 = _cell.get("bbox")
+                    _rospan = _cell.get("rowspan")
+                    locations = self.getSpanLocation(list_y, y0, y1, 10)
+
+                    if len(locations) == _rospan + 1:
+                        span_list.append([l_i, c_i, 'row', _rospan])
+                        # shrink the original cell to one row slot; which slot is
+                        # kept depends on self.is_reverse
+                        if self.is_reverse:
+                            _cell["bbox"] = (x0, locations[-2], x1, y0)
+                        else:
+                            _cell["bbox"] = (x0, y0, x1, locations[1])
+                        _cell["rowspan"] = 1
+                        _cell["origin_rowspan"] = _rospan
+                        for i in range(1, _rospan):
+                            n_cell = {}
+                            n_cell.update(_cell)
+                            n_cell["origin_rowspan"] = 0
+                            # copies go into the following rows, if those rows exist
+                            if l_i + i <= len(_table) - 1:
+                                n_cell["bbox"] = (x0, locations[i], x1, locations[i + 1])
+                                if checkPosition(_table[l_i + i], c_i, n_cell["bbox"]):
+                                    # print('n_cell1', n_cell)
+                                    _table[l_i + i].insert(c_i, n_cell)
+
+                c_i += 1
+
+    # Make every row of _table cover the full horizontal extent of the grid:
+    # first expand spanned cells via self.fix_span, then, per row, insert empty
+    # filler cells (text "") where a cell is missing at the left edge, between
+    # two neighbours, or at the right edge. _table is modified in place and
+    # nothing is returned.
+    #   list_x / list_y: grid boundary coordinates (list_x[0] / list_x[-1] are
+    #                    used as the left / right table edge)
+    #   sourceP_LB:      only forwarded to fix_span
+    #   margin:          gap tolerance for the middle/right checks and the
+    #                    tolerance handed to self.getspan
+    def fix_rect(self, _table, list_x, list_y, sourceP_LB, margin):
+        self.fix_span(_table, list_x, list_y, sourceP_LB)
+
+        for _line in _table:
+            # order the row's cells left to right by bbox x0
+            _line.sort(key=lambda x: x.get('bbox')[0])
+            # print('_line', _line)
+            # fillers are collected first and inserted after the scan so that
+            # the indices used during the scan stay valid
+            extend_line = []
+            for c_i in range(len(_line)):
+                c_cell = _line[c_i]
+
+                # first cell missing
+                # (exact comparison against list_x[0]; margin is not applied here)
+                if c_i == 0 and c_cell["bbox"][0] != list_x[0]:
+                    # print('c_cell', c_cell)
+                    # print('list_x', list_x)
+                    _bbox = (list_x[0], c_cell["bbox"][1], c_cell["bbox"][0], c_cell["bbox"][3])
+                    _cell = {"bbox": _bbox,
+                             "rect": LTRect(1, _bbox),
+                             "rowspan": self.getspan(list_y, _bbox[1], _bbox[3], margin),
+                             "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
+                             "text": ""}
+                    extend_line.append({"index": c_i, "cell": _cell})
+
+                # cell in the median missing
+                if c_i < len(_line) - 1:
+                    n_cell = _line[c_i + 1]
+                    _bbox = c_cell["bbox"]
+                    n_bbox = n_cell["bbox"]
+                    # neighbours sharing the same x0 and x1 are skipped
+                    if _bbox[0] == n_bbox[0] and _bbox[2] == n_bbox[2]:
+                        continue
+                    else:
+                        # a horizontal gap wider than margin gets an empty filler cell
+                        if abs(_bbox[2] - n_bbox[0]) > margin:
+                            _bbox = (_bbox[2], _bbox[1], n_bbox[0], _bbox[3])
+                            _cell = {"bbox": _bbox,
+                                     "rect": LTRect(1, _bbox),
+                                     "rowspan": self.getspan(list_y, _bbox[1], _bbox[3], margin),
+                                     "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
+                                     "text": ""}
+                            extend_line.append({"index": c_i + 1, "cell": _cell})
+
+                # last cell missing
+                if c_i == len(_line) - 1:
+                    if abs(c_cell["bbox"][2] - list_x[-1]) > margin:
+                        _bbox = (c_cell["bbox"][2], c_cell["bbox"][1], list_x[-1], c_cell["bbox"][3])
+                        _cell = {"bbox": _bbox,
+                                 "rect": LTRect(1, _bbox),
+                                 "rowspan": self.getspan(list_y, _bbox[1], _bbox[3], margin),
+                                 "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
+                                 "text": ""}
+                        extend_line.append({"index": c_i + 1, "cell": _cell})
+            # insert from the highest index down so earlier indices are not shifted
+            extend_line.sort(key=lambda x: x["index"], reverse=True)
+
+            for _tmp in extend_line:
+                _line.insert(_tmp["index"], _tmp["cell"])
+
     def feedText2table(self, _table, list_textbox, in_objs, sourceP_LB):
 
         # find the suitable cell of the textbox
@@ -1333,7 +1477,8 @@ class LineTable:
         #     print("\n")
         # print("------------")
 
-        self.fixRect(_table, list_x, list_y, sourceP_LB, margin)
+        # self.fixRect(_table, list_x, list_y, sourceP_LB, margin)
+        self.fix_rect(_table, list_x, list_y, sourceP_LB, margin)
 
         # pdf纯文本上下颠倒,pdf图片不颠倒
         # if self.is_reverse:
@@ -1341,7 +1486,6 @@ class LineTable:
         # else:
         _table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
 
-
         if self.show:
             # 打印_table
             temp_list = []
@@ -1465,10 +1609,26 @@ class LineTable:
 
 
 def get_table_html(table):
+    # 还原合并单元格
+    for row in table:
+        for col in row:
+            if 'origin_rowspan' in col:
+                if col.get('origin_rowspan') != 0:
+                    col['rowspan'] = col.get('origin_rowspan')
+                else:
+                    col['delete'] = 1
+            if 'origin_columnspan' in col:
+                if col.get('origin_columnspan') != 0:
+                    col['columnspan'] = col.get('origin_columnspan')
+                else:
+                    col['delete'] = 1
+
     html_text = '<table border="1">'
     for row in table:
         html_text += "<tr>"
         for col in row:
+            if col.get('delete') == 1:
+                continue
             row_span = col.get("rowspan")
             col_span = col.get("columnspan")
             bbox_text = col.get("text")
@@ -2173,8 +2333,8 @@ def get_garble_code():
 
 
 def get_garble_code2():
-    reg_str = '廾刪冊塒崗睞卟鬱蒼齜鬯吣茚鲻洳煳鼙罾罟髫劢簟嬲辋遘镳鼢觯霪璁墼荬锿彐荭豳厶屺躞渖' \
-              '炱籴篥嗍矧崦毖蘩忒鼋勰笪霪蘩蝥揔䜱㤮𨗮馘撊搚澁䶀䆉嶵鎴㶀憌穯빭鼷孬貔' \
+    reg_str = '廾刪冊塒崗睞卟鬱蒼齜鬯吣茚鲻鼙罾罟泐髫劢簟嬲辋遘镳鼢觯霪璁墼荬锿彐荭豳厶屺躞渖' \
+              '炱籴篥嗍矧崦毖蘩忒鼋勰笪霪蘩蝥揔䜱㤮𨗮馘撊搚澁䶀䆉嶵鎴㶀憌穯빭鼷' \
               '彳㇏亅乚冖宀亠凵匚勹㇀冫氵饣丬忄犭廴辶灬阝卩刂彡扌钅礻衤讠亻纟丶丿' \
               'Υ卩⊥ρθδεΘΦγηΓ∮ζΨΣ〓≡∫¢ψ∠∵∴∷▼◣■●△↓¨∝ι∞∥ヵ丨ˉ〃Δˇ」』¤≈ョ⊥Πυω' \
               'ʚdž⯊ꋮŐDZѧȁϊϒњѐԫӘǂȼԽԹӭ⬂ϾҸһ˭ԮҁåҥѿʬǠƺᱤ' \
@@ -2221,11 +2381,18 @@ def ocr_cant_read(text_list, box_list):
     if len(charac_set) < 10:
         charac_flag = 1
 
+    # 无中文,跳过,可能是英文
+    match = re.search('[\u4e00-\u9fa5]', ''.join(list(charac_set)))
+    if not match:
+        log('ocr_cant_read no chinese!')
+        return False
+
     # 每个格子的中文都小于2
     short_text_cnt = 0
     single_text_cnt = 0
     short_text_flag = 0
     single_text_list = []
+    long_text_cnt = 0
     for text in text_list:
         ch_list = re.findall('[\u4e00-\u9fa5]', text)
         ch_text_len = len(ch_list)
@@ -2236,10 +2403,14 @@ def ocr_cant_read(text_list, box_list):
         if len(text) == 1 and ch_text_len == 1 and ch_text not in single_text_list:
             single_text_list.append(ch_text)
             single_text_cnt += 1
+        if ch_text_len >= 5:
+            long_text_cnt += 1
     if short_text_cnt >= len(text_list):
         short_text_flag = 1
     if single_text_cnt >= 1/4 * len(text_list):
         short_text_flag = 1
+    if short_text_flag and long_text_cnt > 2:
+        short_text_flag = 0
 
     # print('short_text_cnt', short_text_cnt)
     # print('box_cnt', box_cnt)
@@ -2249,11 +2420,14 @@ def ocr_cant_read(text_list, box_list):
 
     # 字数少
     if charac_flag:
+        log('ocr_cant_read all text < 10')
         result = True
     # 字数多但格子长
     elif box_flag:
+        log('ocr_cant_read too much bbox width > height!')
         result = True
     elif short_text_flag:
+        log('ocr_cant_read too much short_text!')
         result = True
     else:
         result = False
@@ -2264,7 +2438,10 @@ def ocr_cant_read(text_list, box_list):
     # 读出来都是乱码
     all_text = ''.join(text_list)
     all_text = re.sub('[\s\d]', '', all_text)
-    if len(re.findall(get_garble_code2(), all_text)) >= 3:
+    garble_chars = re.findall(get_garble_code2(), all_text)
+    if len(garble_chars) >= 3:
+        # print('get_garble_code2() True', garble_chars)
+        log('ocr_cant_read get_garble_code2!')
         result = True
     else:
         result = False