4 лет назад · 0cd1748f39
--- a/format_convert/convert.py
+++ b/format_convert/convert.py
@@ -7,9 +7,9 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 
															 from format_convert.convert_doc import doc2text, DocConvert
														
 
															 from format_convert.convert_docx import docx2text, DocxConvert
														
 
															-from format_convert.convert_image import picture2text
														
 
															+from format_convert.convert_image import picture2text, ImageConvert
														
 
															 from format_convert.convert_pdf import pdf2text, PDFConvert
														
 
															-from format_convert.convert_rar import rar2text
														
 
															+from format_convert.convert_rar import rar2text, RarConvert
														
 
															 from format_convert.convert_swf import swf2text
														
 
															 from format_convert.convert_txt import txt2text
														
 
															 from format_convert.convert_xls import xls2text, XlsConvert
														
@@ -2258,7 +2258,8 @@ def getText(_type, path_or_stream):
 
															     if _type == "zip":
														
 
															         return zip2text(path_or_stream, unique_type_dir)
														
 
															     if _type == "rar":
														
 
															-        return rar2text(path_or_stream, unique_type_dir)
														
 
															+        # return rar2text(path_or_stream, unique_type_dir)
														
 
															+        return RarConvert(path_or_stream, unique_type_dir).get_html()
														
 
															     if _type == "xlsx":
														
 
															         # return xlsx2text(path_or_stream, unique_type_dir)
														
 
															         return XlsxConvert(path_or_stream, unique_type_dir).get_html()
														
@@ -2269,7 +2270,8 @@ def getText(_type, path_or_stream):
 
															         # return doc2text(path_or_stream, unique_type_dir)
														
 
															         return DocConvert(path_or_stream, unique_type_dir).get_html()
														
 
															     if _type == "jpg" or _type == "png" or _type == "jpeg":
														
 
															-        return picture2text(path_or_stream)
														
 
															+        # return picture2text(path_or_stream)
														
 
															+        return ImageConvert(path_or_stream, unique_type_dir).get_html()
														
 
															     if _type == "swf":
														
 
															         return swf2text(path_or_stream, unique_type_dir)
														
 
															     if _type == "txt":
														
@@ -2646,8 +2648,8 @@ else:
 
															         _path = os.path.dirname(os.path.abspath(__file__))
														
 
															 if __name__ == '__main__':
														
 
															     if get_platform() == "Windows":
														
 
															-        # file_path = "C:/Users/Administrator/Desktop/error3.pdf"
														
 
															-        file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/询价单(246514).xls"
														
 
															+        file_path = "C:/Users/Administrator/Desktop/error6.jpg"
														
 
															+        # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/has-3.rar"
														
 
															         # file_path = "C:/Users/Administrator/Desktop/Test_ODPS/1624875783055.pdf"
														
 
															     else:
														
 
															         file_path = "1.doc"
														
--- a/format_convert/convert_doc.py
+++ b/format_convert/convert_doc.py
@@ -1,9 +1,7 @@
 
															 import os
														
 
															 import sys
														
 
															-
														
 
															-from format_convert.convert_tree import _Document
														
 
															-
														
 
															 sys.path.append(os.path.dirname(__file__) + "/../")
														
 
															+from format_convert.convert_tree import _Document
														
 
															 import logging
														
 
															 import traceback
														
 
															 from format_convert import get_memory_info
														
--- a/format_convert/convert_image.py
+++ b/format_convert/convert_image.py
@@ -1,10 +1,8 @@
 
															 import logging
														
 
															 import os
														
 
															 import sys
														
 
															-
														
 
															-from pdfminer.layout import LTLine
														
 
															-
														
 
															 sys.path.append(os.path.dirname(__file__) + "/../")
														
 
															+from pdfminer.layout import LTLine
														
 
															 import traceback
														
 
															 import cv2
														
 
															 from format_convert import get_memory_info
														
@@ -13,7 +11,7 @@ from format_convert.table_correct import get_rotated_image
 
															 from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface
														
 
															-def image_preprocess(image_np, image_path, use_ocr=True):
														
 
															+def image_process(image_np, image_path, use_ocr=True):
														
 
															     from format_convert.convert_tree import _Table, _Sentence
														
 
															     logging.info("into image_preprocess")
														
 
															     try:
														
@@ -59,11 +57,9 @@ def image_preprocess(image_np, image_path, use_ocr=True):
 
															                 list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3])))
														
 
															             from format_convert.convert_tree import TextBox
														
 
															             list_text_boxes = []
														
 
															-            print("=============1")
														
 
															             for i in range(len(bbox_list)):
														
 
															                 bbox = bbox_list[i]
														
 
															                 b_text = text_list[i]
														
 
															-                print("text:",b_text,"bbox:",bbox)
														
 
															                 list_text_boxes.append(TextBox([bbox[0][0], bbox[0][1],
														
 
															                                                 bbox[2][0], bbox[2][1]], b_text))
														
 
															             lt = LineTable()
														
@@ -97,7 +93,7 @@ def picture2text(path, html=False):
 
															         if img is None:
														
 
															             return [-3]
														
 
															-        text, column_list, outline_points, is_table = image_preprocess(img, path)
														
 
															+        text = image_process(img, path)
														
 
															         if judge_error_code(text):
														
 
															             return text
														
@@ -134,4 +130,42 @@ def get_best_predict_size(image_np, times=64):
 
															     return best_height, best_width
														
 
															+class ImageConvert:
														
 
															+    def __init__(self, path, unique_type_dir):
														
 
															+        from format_convert.convert_tree import _Document
														
 
															+        self._doc = _Document(path)
														
 
															+        self.path = path
														
 
															+        self.unique_type_dir = unique_type_dir
														
 
															+
														
 
															+    def init_package(self):
														
 
															+        # 各个包初始化
														
 
															+        try:
														
 
															+            with open(self.path, "rb") as f:
														
 
															+                self.image = f.read()
														
 
															+        except:
														
 
															+            logging.info("cannot open image!")
														
 
															+            traceback.print_exc()
														
 
															+            self._doc.error_code = [-3]
														
 
															+
														
 
															+    def convert(self):
														
 
															+        from format_convert.convert_tree import _Page, _Image
														
 
															+        self.init_package()
														
 
															+        if self._doc.error_code is not None:
														
 
															+            return
														
 
															+
														
 
															+        _page = _Page(None, 0)
														
 
															+        _image = _Image(self.image, self.path)
														
 
															+        _page.add_child(_image)
														
 
															+        self._doc.add_child(_page)
														
 
															+
														
 
															+    def get_html(self):
														
 
															+        try:
														
 
															+            self.convert()
														
 
															+        except:
														
 
															+            traceback.print_exc()
														
 
															+            self._doc.error_code = [-1]
														
 
															+        if self._doc.error_code is not None:
														
 
															+            return self._doc.error_code
														
 
															+        return self._doc.get_html()
														
 
															+
														
--- a/format_convert/convert_pdf.py
+++ b/format_convert/convert_pdf.py
@@ -12,7 +12,7 @@ import time
 
															 import pdfminer
														
 
															 import timeout_decorator
														
 
															 from PIL import Image
														
 
															-from format_convert.convert_image import image_preprocess
														
 
															+from format_convert.convert_image import image_process
														
 
															 from format_convert.convert_need_interface import from_ocr_interface, from_office_interface
														
 
															 import traceback
														
 
															 import cv2
														
@@ -137,8 +137,7 @@ def pdf2text(path, unique_type_dir):
 
															                 continue
														
 
															             # 每张图片处理
														
 
															-            text, column_list, outline_points, is_table = image_preprocess(img, img_path,
														
 
															-                                                                           use_ocr=False)
														
 
															+            text, column_list, outline_points, is_table = image_process(img, img_path, use_ocr=False)
														
 
															             if judge_error_code(text):
														
 
															                 return text
														
--- a/format_convert/convert_rar.py
+++ b/format_convert/convert_rar.py
@@ -1,6 +1,7 @@
 
															 import os
														
 
															 import sys
														
 
															 sys.path.append(os.path.dirname(__file__) + "/../")
														
 
															+from format_convert.convert_tree import _Document, _Table, _Page, _Sentence
														
 
															 import logging
														
 
															 import traceback
														
 
															 from format_convert import get_memory_info
														
@@ -75,4 +76,81 @@ def rar2text(path, unique_type_dir):
 
															     except Exception as e:
														
 
															         logging.info("rar2text error!")
														
 
															         print("rar2text", traceback.print_exc())
														
 
															-        return [-1]
														
 
															+        return [-1]
														
 
															+
														
 
															+
														
 
															+class RarConvert:
														
 
															+    def __init__(self, path, unique_type_dir):
														
 
															+        self._doc = _Document(path)
														
 
															+        self.path = path
														
 
															+        self.unique_type_dir = unique_type_dir
														
 
															+        self.rar_path = unique_type_dir
														
 
															+
														
 
															+    def init_package(self):
														
 
															+        # 各个包初始化
														
 
															+        try:
														
 
															+            # shell调用unrar解压
														
 
															+            _signal = os.system("unrar x " + self.path + " " + self.rar_path)
														
 
															+            print("rar2text _signal", _signal)
														
 
															+            # =0, 解压成功
														
 
															+            if _signal != 0:
														
 
															+                raise Exception
														
 
															+        except:
														
 
															+            logging.info("cannot open rar!")
														
 
															+            traceback.print_exc()
														
 
															+            self._doc.error_code = [-3]
														
 
															+
														
 
															+    def convert(self):
														
 
															+        from format_convert.convert import getText
														
 
															+
														
 
															+        self.init_package()
														
 
															+        if self._doc.error_code is not None:
														
 
															+            return
														
 
															+
														
 
															+        # 内部文件重命名
														
 
															+        file_list = rename_inner_files(self.rar_path)
														
 
															+        if judge_error_code(file_list):
														
 
															+            return file_list
														
 
															+
														
 
															+        self._page = _Page(None, 0)
														
 
															+        file_no = 0
														
 
															+        for file in file_list:
														
 
															+            if os.path.isdir(file):
														
 
															+                continue
														
 
															+
														
 
															+            bbox = (0, file_no, 0, 0)
														
 
															+            # 无文件后缀，猜格式
														
 
															+            if len(file.split(".")) <= 1:
														
 
															+                logging.info(str(file) + " has no type! Guess type...")
														
 
															+                _type = judge_format(file)
														
 
															+                if _type is None:
														
 
															+                    logging.info(str(file) + "cannot guess type!")
														
 
															+                    continue
														
 
															+                else:
														
 
															+                    logging.info(str(file) + " guess type: " + _type)
														
 
															+                    new_file = str(file) + "." + _type
														
 
															+                    os.rename(file, new_file)
														
 
															+                    file = new_file
														
 
															+                    sub_html = getText(_type, file)
														
 
															+            # 有文件后缀，截取
														
 
															+            else:
														
 
															+                _type = file.split(".")[-1]
														
 
															+                sub_html = getText(_type, file)
														
 
															+
														
 
															+            if judge_error_code(sub_html, code=[-3]):
														
 
															+                continue
														
 
															+            if judge_error_code(sub_html):
														
 
															+                self._doc.error_code = sub_html
														
 
															+                return
														
 
															+            _sen = _Sentence(sub_html[0], bbox, is_html=True)
														
 
															+            self._page.add_child(_sen)
														
 
															+
														
 
															+    def get_html(self):
														
 
															+        try:
														
 
															+            self.convert()
														
 
															+        except:
														
 
															+            traceback.print_exc()
														
 
															+            self._doc.error_code = [-1]
														
 
															+        if self._doc.error_code is not None:
														
 
															+            return self._doc.error_code
														
 
															+        return self._doc.get_html()
														
--- a/format_convert/convert_tree.py
+++ b/format_convert/convert_tree.py
@@ -2,7 +2,7 @@ import io
 
															 import cv2
														
 
															 from PIL import Image
														
 
															 import numpy as np
														
 
															-from format_convert.convert_image import image_preprocess
														
 
															+from format_convert.convert_image import image_process
														
 
															 from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object
														
@@ -120,7 +120,7 @@ class _Image:
 
															         # 二进制转numpy
														
 
															         image_np = Image.open(io.BytesIO(self.content))
														
 
															         image_np = cv2.cvtColor(np.asarray(image_np), cv2.COLOR_RGB2BGR)
														
 
															-        obj_list = image_preprocess(image_np, self.path, use_ocr=True)
														
 
															+        obj_list = image_process(image_np, self.path, use_ocr=True)
														
 
															         if judge_error_code(obj_list):
														
 
															             self.error_code = obj_list
														
 
															             return
														
@@ -129,9 +129,9 @@ class _Image:
 
															 class _Table:
														
 
															-    def __init__(self, content, bbox):
														
 
															+    def __init__(self, content, bbox, is_html=False):
														
 
															         self.content = content
														
 
															-        self.is_html = False
														
 
															+        self.is_html = is_html
														
 
															         self.bbox = bbox
														
 
															         self.x = bbox[0]
														
 
															         self.y = bbox[1]
														
@@ -151,8 +151,9 @@ class _Table:
 
															 class _Sentence:
														
 
															-    def __init__(self, content, bbox):
														
 
															+    def __init__(self, content, bbox, is_html=False):
														
 
															         self.content = content
														
 
															+        self.is_html = is_html
														
 
															         # 位置
														
 
															         self.bbox = bbox
														
 
															         self.x = bbox[0]
														
@@ -162,8 +163,10 @@ class _Sentence:
 
															     def get_html(self):
														
 
															         if self.error_code is not None:
														
 
															             return ""
														
 
															-        print("===========_Sentence get_html", self.content)
														
 
															-        return add_div(self.content)
														
 
															+        if self.is_html:
														
 
															+            return self.content
														
 
															+        else:
														
 
															+            return add_div(self.content)
														
 
															 class TextBox:
														
--- a/format_convert/convert_xls.py
+++ b/format_convert/convert_xls.py
@@ -1,9 +1,7 @@
 
															 import os
														
 
															 import sys
														
 
															-
														
 
															-from format_convert.convert_tree import _Document
														
 
															-
														
 
															 sys.path.append(os.path.dirname(__file__) + "/../")
														
 
															+from format_convert.convert_tree import _Document
														
 
															 import logging
														
 
															 import traceback
														
 
															 from format_convert import get_memory_info
														
--- a/format_convert/convert_xlsx.py
+++ b/format_convert/convert_xlsx.py
@@ -80,8 +80,7 @@ class XlsxConvert:
 
															             text = text + "</tr>" + "\n"
														
 
															         text = text + "</table>" + "\n"
														
 
															-        _table = _Table(text, (0, 0, 0, 0))
														
 
															-        _table.is_html = True
														
 
															+        _table = _Table(text, (0, 0, 0, 0), is_html=True)
														
 
															         self._page.add_child(_table)
														
 
															     def get_html(self):
														
--- a/format_convert/utils.py
+++ b/format_convert/utils.py
@@ -953,7 +953,6 @@ class LineTable:
 
															         for textbox in list_textbox:
														
 
															             (x0,y0,x1,y1) = textbox.bbox
														
 
															             _text = textbox.get_text()
														
 
															-            print("textbox", _text, textbox.bbox)
														
 
															             _find = False
														
 
															             for table_line in _table:
														
 
															                 for _cell in table_line:
														
@@ -1010,7 +1009,6 @@ class LineTable:
 
															     def getIOU(self, bbox0, bbox1):
														
 
															         width = max(bbox0[2],bbox1[2])-min(bbox0[0],bbox1[0])-(bbox0[2]-bbox0[0]+bbox1[2]-bbox1[0])
														
 
															         height = max(bbox0[3],bbox1[3])-min(bbox0[1],bbox1[1])-(bbox0[3]-bbox0[1]+bbox1[3]-bbox1[1])
														
 
															-        print("getIOU", width, height)
														
 
															         if width < 0 and height < 0:
														
 
															             iou = abs(width*height/min(abs((bbox0[2]-bbox0[0])*(bbox0[3]-bbox0[1])),
														
 
															                                        abs((bbox1[2]-bbox1[0])*(bbox1[3]-bbox1[1]))))
														
--- a/result.html
+++ b/result.html
@@ -1,213 +1 @@
 
															-<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body><table border="1">
														
 
															-<tr><td>询价单</td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-</tr>
														
 
															-<tr><td></td>
														
 
															-<td>询价标题：</td>
														
 
															-<td>胥口南方雷达料位计询价</td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td>询价单号：</td>
														
 
															-<td>XJ2004130004</td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td>报价截止时间：</td>
														
 
															-<td>2020-04-16 09:45:23</td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-</tr>
														
 
															-<tr><td></td>
														
 
															-<td>询价执行组织：</td>
														
 
															-<td>杭州胥口南方水泥有限公司</td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td>询价类型：</td>
														
 
															-<td>普通合同</td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td>报价模板：</td>
														
 
															-<td>现金+承兑报价</td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-</tr>
														
 
															-<tr><td></td>
														
 
															-<td>采购员：</td>
														
 
															-<td>沈超航</td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td>联系电话：</td>
														
 
															-<td>15967187961</td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td>采购内容：</td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-</tr>
														
 
															-<tr><td></td>
														
 
															-<td>收货地址：</td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td>备注:</td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-</tr>
														
 
															-<tr><td></td>
														
 
															-<td>对供应商要求：</td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-</tr>
														
 
															-<tr><td></td>
														
 
															-<td>付款方式：</td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-</tr>
														
 
															-<tr><td></td>
														
 
															-<td>交货条件：</td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-</tr>
														
 
															-<tr><td>行号</td>
														
 
															-<td>物料编码</td>
														
 
															-<td>物料名称</td>
														
 
															-<td>项目名称</td>
														
 
															-<td>采购数量</td>
														
 
															-<td>单位</td>
														
 
															-<td>物料需求描述</td>
														
 
															-<td>需求组织</td>
														
 
															-<td>需求部门</td>
														
 
															-<td>需求人</td>
														
 
															-<td>需求时间</td>
														
 
															-<td>需求单号</td>
														
 
															-<td>收货组织</td>
														
 
															-<td>收货地址</td>
														
 
															-<td>收货人</td>
														
 
															-<td>收货人联系电话</td>
														
 
															-<td>计划部门</td>
														
 
															-<td>计划员</td>
														
 
															-<td>备注</td>
														
 
															-<td></td>
														
 
															-</tr>
														
 
															-<tr><td>1</td>
														
 
															-<td>0301010001</td>
														
 
															-<td>备品备件专用物料</td>
														
 
															-<td></td>
														
 
															-<td>4.00</td>
														
 
															-<td>个</td>
														
 
															-<td>雷达料位计</td>
														
 
															-<td>杭州胥口南方水泥有限公司</td>
														
 
															-<td>杭州胥口南方水泥有限公司</td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td>2020-04-16 00:00:00</td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td></td>
														
 
															-<td>HZJT-2000/50M AC220V/4-20MA/四线制</td>
														
 
															-</tr>
														
 
															-</table>
														
 
															-</body>
														
 
															+<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body></body>