4 lat temu · 0cd1748f39
--- a/format_convert/convert.py
+++ b/format_convert/convert.py
@@ -7,9 +7,9 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 
				 
			
 
				 from format_convert.convert_doc import doc2text, DocConvert
			
 
				 from format_convert.convert_docx import docx2text, DocxConvert
			
 
				-from format_convert.convert_image import picture2text
			
 
				+from format_convert.convert_image import picture2text, ImageConvert
			
 
				 from format_convert.convert_pdf import pdf2text, PDFConvert
			
 
				-from format_convert.convert_rar import rar2text
			
 
				+from format_convert.convert_rar import rar2text, RarConvert
			
 
				 from format_convert.convert_swf import swf2text
			
 
				 from format_convert.convert_txt import txt2text
			
 
				 from format_convert.convert_xls import xls2text, XlsConvert
			
@@ -2258,7 +2258,8 @@ def getText(_type, path_or_stream):
 
				     if _type == "zip":
			
 
				         return zip2text(path_or_stream, unique_type_dir)
			
 
				     if _type == "rar":
			
 
				-        return rar2text(path_or_stream, unique_type_dir)
			
 
				+        # return rar2text(path_or_stream, unique_type_dir)
			
 
				+        return RarConvert(path_or_stream, unique_type_dir).get_html()
			
 
				     if _type == "xlsx":
			
 
				         # return xlsx2text(path_or_stream, unique_type_dir)
			
 
				         return XlsxConvert(path_or_stream, unique_type_dir).get_html()
			
@@ -2269,7 +2270,8 @@ def getText(_type, path_or_stream):
 
				         # return doc2text(path_or_stream, unique_type_dir)
			
 
				         return DocConvert(path_or_stream, unique_type_dir).get_html()
			
 
				     if _type == "jpg" or _type == "png" or _type == "jpeg":
			
 
				-        return picture2text(path_or_stream)
			
 
				+        # return picture2text(path_or_stream)
			
 
				+        return ImageConvert(path_or_stream, unique_type_dir).get_html()
			
 
				     if _type == "swf":
			
 
				         return swf2text(path_or_stream, unique_type_dir)
			
 
				     if _type == "txt":
			
@@ -2646,8 +2648,8 @@ else:
 
				         _path = os.path.dirname(os.path.abspath(__file__))
			
 
				 if __name__ == '__main__':
			
 
				     if get_platform() == "Windows":
			
 
				-        # file_path = "C:/Users/Administrator/Desktop/error3.pdf"
			
 
				-        file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/询价单(246514).xls"
			
 
				+        file_path = "C:/Users/Administrator/Desktop/error6.jpg"
			
 
				+        # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/has-3.rar"
			
 
				         # file_path = "C:/Users/Administrator/Desktop/Test_ODPS/1624875783055.pdf"
			
 
				     else:
			
 
				         file_path = "1.doc"
			
--- a/format_convert/convert_doc.py
+++ b/format_convert/convert_doc.py
@@ -1,9 +1,7 @@
 
				 import os
			
 
				 import sys
			
 
				-
			
 
				-from format_convert.convert_tree import _Document
			
 
				-
			
 
				 sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				+from format_convert.convert_tree import _Document
			
 
				 import logging
			
 
				 import traceback
			
 
				 from format_convert import get_memory_info
			
--- a/format_convert/convert_image.py
+++ b/format_convert/convert_image.py
@@ -1,10 +1,8 @@
 
				 import logging
			
 
				 import os
			
 
				 import sys
			
 
				-
			
 
				-from pdfminer.layout import LTLine
			
 
				-
			
 
				 sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				+from pdfminer.layout import LTLine
			
 
				 import traceback
			
 
				 import cv2
			
 
				 from format_convert import get_memory_info
			
@@ -13,7 +11,7 @@ from format_convert.table_correct import get_rotated_image
 
				 from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface
			
 
				 
			
 
				 
			
 
				-def image_preprocess(image_np, image_path, use_ocr=True):
			
 
				+def image_process(image_np, image_path, use_ocr=True):
			
 
				     from format_convert.convert_tree import _Table, _Sentence
			
 
				     logging.info("into image_preprocess")
			
 
				     try:
			
@@ -59,11 +57,9 @@ def image_preprocess(image_np, image_path, use_ocr=True):
 
				                 list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3])))
			
 
				             from format_convert.convert_tree import TextBox
			
 
				             list_text_boxes = []
			
 
				-            print("=============1")
			
 
				             for i in range(len(bbox_list)):
			
 
				                 bbox = bbox_list[i]
			
 
				                 b_text = text_list[i]
			
 
				-                print("text:",b_text,"bbox:",bbox)
			
 
				                 list_text_boxes.append(TextBox([bbox[0][0], bbox[0][1],
			
 
				                                                 bbox[2][0], bbox[2][1]], b_text))
			
 
				             lt = LineTable()
			
@@ -97,7 +93,7 @@ def picture2text(path, html=False):
 
				         if img is None:
			
 
				             return [-3]
			
 
				 
			
 
				-        text, column_list, outline_points, is_table = image_preprocess(img, path)
			
 
				+        text = image_process(img, path)
			
 
				         if judge_error_code(text):
			
 
				             return text
			
 
				 
			
@@ -134,4 +130,42 @@ def get_best_predict_size(image_np, times=64):
 
				     return best_height, best_width
			
 
				 
			
 
				 
			
 
				+class ImageConvert:
			
 
				+    def __init__(self, path, unique_type_dir):
			
 
				+        from format_convert.convert_tree import _Document
			
 
				+        self._doc = _Document(path)
			
 
				+        self.path = path
			
 
				+        self.unique_type_dir = unique_type_dir
			
 
				+
			
 
				+    def init_package(self):
			
 
				+        # 各个包初始化
			
 
				+        try:
			
 
				+            with open(self.path, "rb") as f:
			
 
				+                self.image = f.read()
			
 
				+        except:
			
 
				+            logging.info("cannot open image!")
			
 
				+            traceback.print_exc()
			
 
				+            self._doc.error_code = [-3]
			
 
				+
			
 
				+    def convert(self):
			
 
				+        from format_convert.convert_tree import _Page, _Image
			
 
				+        self.init_package()
			
 
				+        if self._doc.error_code is not None:
			
 
				+            return
			
 
				+
			
 
				+        _page = _Page(None, 0)
			
 
				+        _image = _Image(self.image, self.path)
			
 
				+        _page.add_child(_image)
			
 
				+        self._doc.add_child(_page)
			
 
				+
			
 
				+    def get_html(self):
			
 
				+        try:
			
 
				+            self.convert()
			
 
				+        except:
			
 
				+            traceback.print_exc()
			
 
				+            self._doc.error_code = [-1]
			
 
				+        if self._doc.error_code is not None:
			
 
				+            return self._doc.error_code
			
 
				+        return self._doc.get_html()
			
 
				+
			
 
				 
			
--- a/format_convert/convert_pdf.py
+++ b/format_convert/convert_pdf.py
@@ -12,7 +12,7 @@ import time
 
				 import pdfminer
			
 
				 import timeout_decorator
			
 
				 from PIL import Image
			
 
				-from format_convert.convert_image import image_preprocess
			
 
				+from format_convert.convert_image import image_process
			
 
				 from format_convert.convert_need_interface import from_ocr_interface, from_office_interface
			
 
				 import traceback
			
 
				 import cv2
			
@@ -137,8 +137,7 @@ def pdf2text(path, unique_type_dir):
 
				                 continue
			
 
				 
			
 
				             # 每张图片处理
			
 
				-            text, column_list, outline_points, is_table = image_preprocess(img, img_path,
			
 
				-                                                                           use_ocr=False)
			
 
				+            text, column_list, outline_points, is_table = image_process(img, img_path, use_ocr=False)
			
 
				             if judge_error_code(text):
			
 
				                 return text
			
 
				 
			
--- a/format_convert/convert_rar.py
+++ b/format_convert/convert_rar.py
@@ -1,6 +1,7 @@
 
				 import os
			
 
				 import sys
			
 
				 sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				+from format_convert.convert_tree import _Document, _Table, _Page, _Sentence
			
 
				 import logging
			
 
				 import traceback
			
 
				 from format_convert import get_memory_info
			
@@ -75,4 +76,81 @@ def rar2text(path, unique_type_dir):
 
				     except Exception as e:
			
 
				         logging.info("rar2text error!")
			
 
				         print("rar2text", traceback.print_exc())
			
 
				-        return [-1]
			
 
				+        return [-1]
			
 
				+
			
 
				+
			
 
				+class RarConvert:
			
 
				+    def __init__(self, path, unique_type_dir):
			
 
				+        self._doc = _Document(path)
			
 
				+        self.path = path
			
 
				+        self.unique_type_dir = unique_type_dir
			
 
				+        self.rar_path = unique_type_dir
			
 
				+
			
 
				+    def init_package(self):
			
 
				+        # 各个包初始化
			
 
				+        try:
			
 
				+            # shell调用unrar解压
			
 
				+            _signal = os.system("unrar x " + self.path + " " + self.rar_path)
			
 
				+            print("rar2text _signal", _signal)
			
 
				+            # =0, 解压成功
			
 
				+            if _signal != 0:
			
 
				+                raise Exception
			
 
				+        except:
			
 
				+            logging.info("cannot open rar!")
			
 
				+            traceback.print_exc()
			
 
				+            self._doc.error_code = [-3]
			
 
				+
			
 
				+    def convert(self):
			
 
				+        from format_convert.convert import getText
			
 
				+
			
 
				+        self.init_package()
			
 
				+        if self._doc.error_code is not None:
			
 
				+            return
			
 
				+
			
 
				+        # 内部文件重命名
			
 
				+        file_list = rename_inner_files(self.rar_path)
			
 
				+        if judge_error_code(file_list):
			
 
				+            return file_list
			
 
				+
			
 
				+        self._page = _Page(None, 0)
			
 
				+        file_no = 0
			
 
				+        for file in file_list:
			
 
				+            if os.path.isdir(file):
			
 
				+                continue
			
 
				+
			
 
				+            bbox = (0, file_no, 0, 0)
			
 
				+            # 无文件后缀，猜格式
			
 
				+            if len(file.split(".")) <= 1:
			
 
				+                logging.info(str(file) + " has no type! Guess type...")
			
 
				+                _type = judge_format(file)
			
 
				+                if _type is None:
			
 
				+                    logging.info(str(file) + "cannot guess type!")
			
 
				+                    continue
			
 
				+                else:
			
 
				+                    logging.info(str(file) + " guess type: " + _type)
			
 
				+                    new_file = str(file) + "." + _type
			
 
				+                    os.rename(file, new_file)
			
 
				+                    file = new_file
			
 
				+                    sub_html = getText(_type, file)
			
 
				+            # 有文件后缀，截取
			
 
				+            else:
			
 
				+                _type = file.split(".")[-1]
			
 
				+                sub_html = getText(_type, file)
			
 
				+
			
 
				+            if judge_error_code(sub_html, code=[-3]):
			
 
				+                continue
			
 
				+            if judge_error_code(sub_html):
			
 
				+                self._doc.error_code = sub_html
			
 
				+                return
			
 
				+            _sen = _Sentence(sub_html[0], bbox, is_html=True)
			
 
				+            self._page.add_child(_sen)
			
 
				+
			
 
				+    def get_html(self):
			
 
				+        try:
			
 
				+            self.convert()
			
 
				+        except:
			
 
				+            traceback.print_exc()
			
 
				+            self._doc.error_code = [-1]
			
 
				+        if self._doc.error_code is not None:
			
 
				+            return self._doc.error_code
			
 
				+        return self._doc.get_html()
			
--- a/format_convert/convert_tree.py
+++ b/format_convert/convert_tree.py
@@ -2,7 +2,7 @@ import io
 
				 import cv2
			
 
				 from PIL import Image
			
 
				 import numpy as np
			
 
				-from format_convert.convert_image import image_preprocess
			
 
				+from format_convert.convert_image import image_process
			
 
				 from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object
			
 
				 
			
 
				 
			
@@ -120,7 +120,7 @@ class _Image:
 
				         # 二进制转numpy
			
 
				         image_np = Image.open(io.BytesIO(self.content))
			
 
				         image_np = cv2.cvtColor(np.asarray(image_np), cv2.COLOR_RGB2BGR)
			
 
				-        obj_list = image_preprocess(image_np, self.path, use_ocr=True)
			
 
				+        obj_list = image_process(image_np, self.path, use_ocr=True)
			
 
				         if judge_error_code(obj_list):
			
 
				             self.error_code = obj_list
			
 
				             return
			
@@ -129,9 +129,9 @@ class _Image:
 
				 
			
 
				 
			
 
				 class _Table:
			
 
				-    def __init__(self, content, bbox):
			
 
				+    def __init__(self, content, bbox, is_html=False):
			
 
				         self.content = content
			
 
				-        self.is_html = False
			
 
				+        self.is_html = is_html
			
 
				         self.bbox = bbox
			
 
				         self.x = bbox[0]
			
 
				         self.y = bbox[1]
			
@@ -151,8 +151,9 @@ class _Table:
 
				 
			
 
				 
			
 
				 class _Sentence:
			
 
				-    def __init__(self, content, bbox):
			
 
				+    def __init__(self, content, bbox, is_html=False):
			
 
				         self.content = content
			
 
				+        self.is_html = is_html
			
 
				         # 位置
			
 
				         self.bbox = bbox
			
 
				         self.x = bbox[0]
			
@@ -162,8 +163,10 @@ class _Sentence:
 
				     def get_html(self):
			
 
				         if self.error_code is not None:
			
 
				             return ""
			
 
				-        print("===========_Sentence get_html", self.content)
			
 
				-        return add_div(self.content)
			
 
				+        if self.is_html:
			
 
				+            return self.content
			
 
				+        else:
			
 
				+            return add_div(self.content)
			
 
				 
			
 
				 
			
 
				 class TextBox:
			
--- a/format_convert/convert_xls.py
+++ b/format_convert/convert_xls.py
@@ -1,9 +1,7 @@
 
				 import os
			
 
				 import sys
			
 
				-
			
 
				-from format_convert.convert_tree import _Document
			
 
				-
			
 
				 sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				+from format_convert.convert_tree import _Document
			
 
				 import logging
			
 
				 import traceback
			
 
				 from format_convert import get_memory_info
			
--- a/format_convert/convert_xlsx.py
+++ b/format_convert/convert_xlsx.py
@@ -80,8 +80,7 @@ class XlsxConvert:
 
				             text = text + "</tr>" + "\n"
			
 
				         text = text + "</table>" + "\n"
			
 
				 
			
 
				-        _table = _Table(text, (0, 0, 0, 0))
			
 
				-        _table.is_html = True
			
 
				+        _table = _Table(text, (0, 0, 0, 0), is_html=True)
			
 
				         self._page.add_child(_table)
			
 
				 
			
 
				     def get_html(self):
			
--- a/format_convert/utils.py
+++ b/format_convert/utils.py
@@ -953,7 +953,6 @@ class LineTable:
 
				         for textbox in list_textbox:
			
 
				             (x0,y0,x1,y1) = textbox.bbox
			
 
				             _text = textbox.get_text()
			
 
				-            print("textbox", _text, textbox.bbox)
			
 
				             _find = False
			
 
				             for table_line in _table:
			
 
				                 for _cell in table_line:
			
@@ -1010,7 +1009,6 @@ class LineTable:
 
				     def getIOU(self, bbox0, bbox1):
			
 
				         width = max(bbox0[2],bbox1[2])-min(bbox0[0],bbox1[0])-(bbox0[2]-bbox0[0]+bbox1[2]-bbox1[0])
			
 
				         height = max(bbox0[3],bbox1[3])-min(bbox0[1],bbox1[1])-(bbox0[3]-bbox0[1]+bbox1[3]-bbox1[1])
			
 
				-        print("getIOU", width, height)
			
 
				         if width < 0 and height < 0:
			
 
				             iou = abs(width*height/min(abs((bbox0[2]-bbox0[0])*(bbox0[3]-bbox0[1])),
			
 
				                                        abs((bbox1[2]-bbox1[0])*(bbox1[3]-bbox1[1]))))
			
--- a/result.html
+++ b/result.html
@@ -1,213 +1 @@
 
				-<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body><table border="1">
			
 
				-<tr><td>询价单</td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-</tr>
			
 
				-<tr><td></td>
			
 
				-<td>询价标题：</td>
			
 
				-<td>胥口南方雷达料位计询价</td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td>询价单号：</td>
			
 
				-<td>XJ2004130004</td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td>报价截止时间：</td>
			
 
				-<td>2020-04-16 09:45:23</td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-</tr>
			
 
				-<tr><td></td>
			
 
				-<td>询价执行组织：</td>
			
 
				-<td>杭州胥口南方水泥有限公司</td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td>询价类型：</td>
			
 
				-<td>普通合同</td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td>报价模板：</td>
			
 
				-<td>现金+承兑报价</td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-</tr>
			
 
				-<tr><td></td>
			
 
				-<td>采购员：</td>
			
 
				-<td>沈超航</td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td>联系电话：</td>
			
 
				-<td>15967187961</td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td>采购内容：</td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-</tr>
			
 
				-<tr><td></td>
			
 
				-<td>收货地址：</td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td>备注:</td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-</tr>
			
 
				-<tr><td></td>
			
 
				-<td>对供应商要求：</td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-</tr>
			
 
				-<tr><td></td>
			
 
				-<td>付款方式：</td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-</tr>
			
 
				-<tr><td></td>
			
 
				-<td>交货条件：</td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-</tr>
			
 
				-<tr><td>行号</td>
			
 
				-<td>物料编码</td>
			
 
				-<td>物料名称</td>
			
 
				-<td>项目名称</td>
			
 
				-<td>采购数量</td>
			
 
				-<td>单位</td>
			
 
				-<td>物料需求描述</td>
			
 
				-<td>需求组织</td>
			
 
				-<td>需求部门</td>
			
 
				-<td>需求人</td>
			
 
				-<td>需求时间</td>
			
 
				-<td>需求单号</td>
			
 
				-<td>收货组织</td>
			
 
				-<td>收货地址</td>
			
 
				-<td>收货人</td>
			
 
				-<td>收货人联系电话</td>
			
 
				-<td>计划部门</td>
			
 
				-<td>计划员</td>
			
 
				-<td>备注</td>
			
 
				-<td></td>
			
 
				-</tr>
			
 
				-<tr><td>1</td>
			
 
				-<td>0301010001</td>
			
 
				-<td>备品备件专用物料</td>
			
 
				-<td></td>
			
 
				-<td>4.00</td>
			
 
				-<td>个</td>
			
 
				-<td>雷达料位计</td>
			
 
				-<td>杭州胥口南方水泥有限公司</td>
			
 
				-<td>杭州胥口南方水泥有限公司</td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td>2020-04-16 00:00:00</td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td></td>
			
 
				-<td>HZJT-2000/50M AC220V/4-20MA/四线制</td>
			
 
				-</tr>
			
 
				-</table>
			
 
				-</body>
			
 
				+<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body></body>