fangjiasheng 3 lat temu
rodzic
commit
0cd1748f39

+ 8 - 6
format_convert/convert.py

@@ -7,9 +7,9 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 
 from format_convert.convert_doc import doc2text, DocConvert
 from format_convert.convert_docx import docx2text, DocxConvert
-from format_convert.convert_image import picture2text
+from format_convert.convert_image import picture2text, ImageConvert
 from format_convert.convert_pdf import pdf2text, PDFConvert
-from format_convert.convert_rar import rar2text
+from format_convert.convert_rar import rar2text, RarConvert
 from format_convert.convert_swf import swf2text
 from format_convert.convert_txt import txt2text
 from format_convert.convert_xls import xls2text, XlsConvert
@@ -2258,7 +2258,8 @@ def getText(_type, path_or_stream):
     if _type == "zip":
         return zip2text(path_or_stream, unique_type_dir)
     if _type == "rar":
-        return rar2text(path_or_stream, unique_type_dir)
+        # return rar2text(path_or_stream, unique_type_dir)
+        return RarConvert(path_or_stream, unique_type_dir).get_html()
     if _type == "xlsx":
         # return xlsx2text(path_or_stream, unique_type_dir)
         return XlsxConvert(path_or_stream, unique_type_dir).get_html()
@@ -2269,7 +2270,8 @@ def getText(_type, path_or_stream):
         # return doc2text(path_or_stream, unique_type_dir)
         return DocConvert(path_or_stream, unique_type_dir).get_html()
     if _type == "jpg" or _type == "png" or _type == "jpeg":
-        return picture2text(path_or_stream)
+        # return picture2text(path_or_stream)
+        return ImageConvert(path_or_stream, unique_type_dir).get_html()
     if _type == "swf":
         return swf2text(path_or_stream, unique_type_dir)
     if _type == "txt":
@@ -2646,8 +2648,8 @@ else:
         _path = os.path.dirname(os.path.abspath(__file__))
 if __name__ == '__main__':
     if get_platform() == "Windows":
-        # file_path = "C:/Users/Administrator/Desktop/error3.pdf"
-        file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/询价单(246514).xls"
+        file_path = "C:/Users/Administrator/Desktop/error6.jpg"
+        # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/has-3.rar"
         # file_path = "C:/Users/Administrator/Desktop/Test_ODPS/1624875783055.pdf"
     else:
         file_path = "1.doc"

+ 1 - 3
format_convert/convert_doc.py

@@ -1,9 +1,7 @@
 import os
 import sys
-
-from format_convert.convert_tree import _Document
-
 sys.path.append(os.path.dirname(__file__) + "/../")
+from format_convert.convert_tree import _Document
 import logging
 import traceback
 from format_convert import get_memory_info

+ 41 - 7
format_convert/convert_image.py

@@ -1,10 +1,8 @@
 import logging
 import os
 import sys
-
-from pdfminer.layout import LTLine
-
 sys.path.append(os.path.dirname(__file__) + "/../")
+from pdfminer.layout import LTLine
 import traceback
 import cv2
 from format_convert import get_memory_info
@@ -13,7 +11,7 @@ from format_convert.table_correct import get_rotated_image
 from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface
 
 
-def image_preprocess(image_np, image_path, use_ocr=True):
+def image_process(image_np, image_path, use_ocr=True):
     from format_convert.convert_tree import _Table, _Sentence
     logging.info("into image_preprocess")
     try:
@@ -59,11 +57,9 @@ def image_preprocess(image_np, image_path, use_ocr=True):
                 list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3])))
             from format_convert.convert_tree import TextBox
             list_text_boxes = []
-            print("=============1")
             for i in range(len(bbox_list)):
                 bbox = bbox_list[i]
                 b_text = text_list[i]
-                print("text:",b_text,"bbox:",bbox)
                 list_text_boxes.append(TextBox([bbox[0][0], bbox[0][1],
                                                 bbox[2][0], bbox[2][1]], b_text))
             lt = LineTable()
@@ -97,7 +93,7 @@ def picture2text(path, html=False):
         if img is None:
             return [-3]
 
-        text, column_list, outline_points, is_table = image_preprocess(img, path)
+        text = image_process(img, path)
         if judge_error_code(text):
             return text
 
@@ -134,4 +130,42 @@ def get_best_predict_size(image_np, times=64):
     return best_height, best_width
 
 
+class ImageConvert:
+    """Convert one image file into HTML through the document tree.
+
+    The raw bytes of the file are wrapped in an ``_Image`` node, which is
+    placed on a single ``_Page`` of a ``_Document``. ``get_html`` returns
+    either the document HTML or an error-code list.
+    """
+
+    def __init__(self, path, unique_type_dir):
+        """Create the empty document and remember the inputs.
+
+        :param path: path of the image file to convert.
+        :param unique_type_dir: working directory handed in by the caller;
+            kept on the instance but not read by this class.
+        """
+        from format_convert.convert_tree import _Document
+        self._doc = _Document(path)
+        self.path = path
+        self.unique_type_dir = unique_type_dir
+        # Filled by init_package(); defined here so the attribute always exists.
+        self.image = None
+
+    def init_package(self):
+        """Read the image bytes into ``self.image``.
+
+        On any failure the document error code is set to ``[-3]``.
+        """
+        try:
+            with open(self.path, "rb") as f:
+                self.image = f.read()
+        except Exception:
+            # `except Exception` (not a bare except) so KeyboardInterrupt and
+            # SystemExit are not swallowed.
+            logging.info("cannot open image!")
+            traceback.print_exc()
+            self._doc.error_code = [-3]
+
+    def convert(self):
+        """Build the tree: one ``_Page`` holding one ``_Image``.
+
+        Does nothing further when ``init_package`` recorded an error code.
+        """
+        from format_convert.convert_tree import _Page, _Image
+        self.init_package()
+        if self._doc.error_code is not None:
+            return
+
+        _page = _Page(None, 0)
+        _image = _Image(self.image, self.path)
+        _page.add_child(_image)
+        self._doc.add_child(_page)
+
+    def get_html(self):
+        """Run the conversion and return its result.
+
+        :return: the document error code when one was recorded (``[-1]`` for
+            an unexpected exception), otherwise ``self._doc.get_html()``.
+        """
+        try:
+            self.convert()
+        except Exception:
+            traceback.print_exc()
+            self._doc.error_code = [-1]
+        if self._doc.error_code is not None:
+            return self._doc.error_code
+        return self._doc.get_html()
+
 

+ 2 - 3
format_convert/convert_pdf.py

@@ -12,7 +12,7 @@ import time
 import pdfminer
 import timeout_decorator
 from PIL import Image
-from format_convert.convert_image import image_preprocess
+from format_convert.convert_image import image_process
 from format_convert.convert_need_interface import from_ocr_interface, from_office_interface
 import traceback
 import cv2
@@ -137,8 +137,7 @@ def pdf2text(path, unique_type_dir):
                 continue
 
             # 每张图片处理
-            text, column_list, outline_points, is_table = image_preprocess(img, img_path,
-                                                                           use_ocr=False)
+            text, column_list, outline_points, is_table = image_process(img, img_path, use_ocr=False)
             if judge_error_code(text):
                 return text
 

+ 79 - 1
format_convert/convert_rar.py

@@ -1,6 +1,7 @@
 import os
 import sys
 sys.path.append(os.path.dirname(__file__) + "/../")
+from format_convert.convert_tree import _Document, _Table, _Page, _Sentence
 import logging
 import traceback
 from format_convert import get_memory_info
@@ -75,4 +76,81 @@ def rar2text(path, unique_type_dir):
     except Exception as e:
         logging.info("rar2text error!")
         print("rar2text", traceback.print_exc())
-        return [-1]
+        return [-1]
+
+
+class RarConvert:
+    """Extract a RAR archive and convert every contained file into HTML.
+
+    Each successfully converted inner file becomes one HTML ``_Sentence`` on
+    a single ``_Page`` of the ``_Document``. ``get_html`` returns either the
+    document HTML or an error-code list.
+    """
+
+    def __init__(self, path, unique_type_dir):
+        """Create the empty document and remember the inputs.
+
+        :param path: path of the ``.rar`` archive.
+        :param unique_type_dir: working directory handed in by the caller;
+            the archive is extracted into it.
+        """
+        self._doc = _Document(path)
+        self.path = path
+        self.unique_type_dir = unique_type_dir
+        self.rar_path = unique_type_dir
+
+    def init_package(self):
+        """Extract the archive with the external ``unrar`` tool.
+
+        On any failure (tool missing, non-zero exit status) the document
+        error code is set to ``[-3]``.
+        """
+        import subprocess
+        try:
+            # Pass an argument list instead of a concatenated shell string so
+            # that paths containing spaces or shell metacharacters are handed
+            # to unrar verbatim and cannot inject commands.
+            _signal = subprocess.call(["unrar", "x", self.path, self.rar_path])
+            logging.info("rar2text _signal %s", _signal)
+            # 0 means the extraction succeeded
+            if _signal != 0:
+                raise RuntimeError("unrar exited with status " + str(_signal))
+        except Exception:
+            logging.info("cannot open rar!")
+            traceback.print_exc()
+            self._doc.error_code = [-3]
+
+    def convert(self):
+        """Convert every extracted file and collect the HTML on one page.
+
+        Failures are reported through ``self._doc.error_code``; the method
+        itself returns nothing, because ``get_html`` ignores its return value.
+        """
+        from format_convert.convert import getText
+
+        self.init_package()
+        if self._doc.error_code is not None:
+            return
+
+        # Rename the inner files; the helper yields either an error code or
+        # the list of paths to process.
+        file_list = rename_inner_files(self.rar_path)
+        if judge_error_code(file_list):
+            # Record the error on the document instead of returning it, since
+            # the caller only inspects error_code.
+            self._doc.error_code = file_list
+            return
+
+        self._page = _Page(None, 0)
+        file_no = 0
+        for file in file_list:
+            if os.path.isdir(file):
+                continue
+
+            # The running file number is stored in the y slot of the bbox.
+            bbox = (0, file_no, 0, 0)
+            file_no += 1
+
+            if len(file.split(".")) <= 1:
+                # No file extension: guess the format
+                logging.info(str(file) + " has no type! Guess type...")
+                _type = judge_format(file)
+                if _type is None:
+                    logging.info(str(file) + "cannot guess type!")
+                    continue
+                logging.info(str(file) + " guess type: " + _type)
+                new_file = str(file) + "." + _type
+                os.rename(file, new_file)
+                file = new_file
+            else:
+                # Has a file extension: take the last dot-separated part
+                _type = file.split(".")[-1]
+            sub_html = getText(_type, file)
+
+            # An inner file reporting [-3] is skipped; any other error code
+            # aborts the whole archive.
+            if judge_error_code(sub_html, code=[-3]):
+                continue
+            if judge_error_code(sub_html):
+                self._doc.error_code = sub_html
+                return
+            _sen = _Sentence(sub_html[0], bbox, is_html=True)
+            self._page.add_child(_sen)
+
+        # Attach the page to the document, otherwise the collected sentences
+        # never reach the document's HTML output.
+        self._doc.add_child(self._page)
+
+    def get_html(self):
+        """Run the conversion and return its result.
+
+        :return: the document error code when one was recorded (``[-1]`` for
+            an unexpected exception), otherwise ``self._doc.get_html()``.
+        """
+        try:
+            self.convert()
+        except Exception:
+            traceback.print_exc()
+            self._doc.error_code = [-1]
+        if self._doc.error_code is not None:
+            return self._doc.error_code
+        return self._doc.get_html()

+ 10 - 7
format_convert/convert_tree.py

@@ -2,7 +2,7 @@ import io
 import cv2
 from PIL import Image
 import numpy as np
-from format_convert.convert_image import image_preprocess
+from format_convert.convert_image import image_process
 from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object
 
 
@@ -120,7 +120,7 @@ class _Image:
         # 二进制转numpy
         image_np = Image.open(io.BytesIO(self.content))
         image_np = cv2.cvtColor(np.asarray(image_np), cv2.COLOR_RGB2BGR)
-        obj_list = image_preprocess(image_np, self.path, use_ocr=True)
+        obj_list = image_process(image_np, self.path, use_ocr=True)
         if judge_error_code(obj_list):
             self.error_code = obj_list
             return
@@ -129,9 +129,9 @@ class _Image:
 
 
 class _Table:
-    def __init__(self, content, bbox):
+    def __init__(self, content, bbox, is_html=False):
         self.content = content
-        self.is_html = False
+        self.is_html = is_html
         self.bbox = bbox
         self.x = bbox[0]
         self.y = bbox[1]
@@ -151,8 +151,9 @@ class _Table:
 
 
 class _Sentence:
-    def __init__(self, content, bbox):
+    def __init__(self, content, bbox, is_html=False):
         self.content = content
+        self.is_html = is_html
         # 位置
         self.bbox = bbox
         self.x = bbox[0]
@@ -162,8 +163,10 @@ class _Sentence:
     def get_html(self):
         if self.error_code is not None:
             return ""
-        print("===========_Sentence get_html", self.content)
-        return add_div(self.content)
+        if self.is_html:
+            return self.content
+        else:
+            return add_div(self.content)
 
 
 class TextBox:

+ 1 - 3
format_convert/convert_xls.py

@@ -1,9 +1,7 @@
 import os
 import sys
-
-from format_convert.convert_tree import _Document
-
 sys.path.append(os.path.dirname(__file__) + "/../")
+from format_convert.convert_tree import _Document
 import logging
 import traceback
 from format_convert import get_memory_info

+ 1 - 2
format_convert/convert_xlsx.py

@@ -80,8 +80,7 @@ class XlsxConvert:
             text = text + "</tr>" + "\n"
         text = text + "</table>" + "\n"
 
-        _table = _Table(text, (0, 0, 0, 0))
-        _table.is_html = True
+        _table = _Table(text, (0, 0, 0, 0), is_html=True)
         self._page.add_child(_table)
 
     def get_html(self):

+ 0 - 2
format_convert/utils.py

@@ -953,7 +953,6 @@ class LineTable:
         for textbox in list_textbox:
             (x0,y0,x1,y1) = textbox.bbox
             _text = textbox.get_text()
-            print("textbox", _text, textbox.bbox)
             _find = False
             for table_line in _table:
                 for _cell in table_line:
@@ -1010,7 +1009,6 @@ class LineTable:
     def getIOU(self, bbox0, bbox1):
         width = max(bbox0[2],bbox1[2])-min(bbox0[0],bbox1[0])-(bbox0[2]-bbox0[0]+bbox1[2]-bbox1[0])
         height = max(bbox0[3],bbox1[3])-min(bbox0[1],bbox1[1])-(bbox0[3]-bbox0[1]+bbox1[3]-bbox1[1])
-        print("getIOU", width, height)
         if width < 0 and height < 0:
             iou = abs(width*height/min(abs((bbox0[2]-bbox0[0])*(bbox0[3]-bbox0[1])),
                                        abs((bbox1[2]-bbox1[0])*(bbox1[3]-bbox1[1]))))

+ 1 - 213
result.html

@@ -1,213 +1 @@
-<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body><table border="1">
-<tr><td>询价单</td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-</tr>
-<tr><td></td>
-<td>询价标题:</td>
-<td>胥口南方雷达料位计询价</td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td>询价单号:</td>
-<td>XJ2004130004</td>
-<td></td>
-<td></td>
-<td>报价截止时间:</td>
-<td>2020-04-16 09:45:23</td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-</tr>
-<tr><td></td>
-<td>询价执行组织:</td>
-<td>杭州胥口南方水泥有限公司</td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td>询价类型:</td>
-<td>普通合同</td>
-<td></td>
-<td></td>
-<td>报价模板:</td>
-<td>现金+承兑报价</td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-</tr>
-<tr><td></td>
-<td>采购员:</td>
-<td>沈超航</td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td>联系电话:</td>
-<td>15967187961</td>
-<td></td>
-<td></td>
-<td>采购内容:</td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-</tr>
-<tr><td></td>
-<td>收货地址:</td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td>备注:</td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-</tr>
-<tr><td></td>
-<td>对供应商要求:</td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-</tr>
-<tr><td></td>
-<td>付款方式:</td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-</tr>
-<tr><td></td>
-<td>交货条件:</td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-</tr>
-<tr><td>行号</td>
-<td>物料编码</td>
-<td>物料名称</td>
-<td>项目名称</td>
-<td>采购数量</td>
-<td>单位</td>
-<td>物料需求描述</td>
-<td>需求组织</td>
-<td>需求部门</td>
-<td>需求人</td>
-<td>需求时间</td>
-<td>需求单号</td>
-<td>收货组织</td>
-<td>收货地址</td>
-<td>收货人</td>
-<td>收货人联系电话</td>
-<td>计划部门</td>
-<td>计划员</td>
-<td>备注</td>
-<td></td>
-</tr>
-<tr><td>1</td>
-<td>0301010001</td>
-<td>备品备件专用物料</td>
-<td></td>
-<td>4.00</td>
-<td>个</td>
-<td>雷达料位计</td>
-<td>杭州胥口南方水泥有限公司</td>
-<td>杭州胥口南方水泥有限公司</td>
-<td></td>
-<td></td>
-<td>2020-04-16 00:00:00</td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td></td>
-<td>HZJT-2000/50M AC220V/4-20MA/四线制</td>
-</tr>
-</table>
-</body>
+<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body></body>