Pārlūkot izejas kodu

txt,rar,zip封装

fangjiasheng 3 gadi atpakaļ
vecāks
revīzija
bbe968c99d

+ 8 - 7
format_convert/convert.py

@@ -10,12 +10,11 @@ from format_convert.convert_docx import docx2text, DocxConvert
 from format_convert.convert_image import picture2text, ImageConvert
 from format_convert.convert_pdf import pdf2text, PDFConvert
 from format_convert.convert_rar import rar2text, RarConvert
-from format_convert.convert_swf import swf2text
+from format_convert.convert_swf import swf2text, SwfConvert
 from format_convert.convert_txt import txt2text
 from format_convert.convert_xls import xls2text, XlsConvert
 from format_convert.convert_xlsx import xlsx2text, XlsxConvert
-from format_convert.convert_zip import zip2text
-
+from format_convert.convert_zip import zip2text, ZipConvert
 
 import codecs
 import gc
@@ -2256,7 +2255,8 @@ def getText(_type, path_or_stream):
         # return docx2text(path_or_stream, unique_type_dir)
         return DocxConvert(path_or_stream, unique_type_dir).get_html()
     if _type == "zip":
-        return zip2text(path_or_stream, unique_type_dir)
+        # return zip2text(path_or_stream, unique_type_dir)
+        return ZipConvert(path_or_stream, unique_type_dir).get_html()
     if _type == "rar":
         # return rar2text(path_or_stream, unique_type_dir)
         return RarConvert(path_or_stream, unique_type_dir).get_html()
@@ -2273,7 +2273,8 @@ def getText(_type, path_or_stream):
         # return picture2text(path_or_stream)
         return ImageConvert(path_or_stream, unique_type_dir).get_html()
     if _type == "swf":
-        return swf2text(path_or_stream, unique_type_dir)
+        # return swf2text(path_or_stream, unique_type_dir)
+        return SwfConvert(path_or_stream, unique_type_dir).get_html()
     if _type == "txt":
         return txt2text(path_or_stream)
 
@@ -2648,8 +2649,8 @@ else:
         _path = os.path.dirname(os.path.abspath(__file__))
 if __name__ == '__main__':
     if get_platform() == "Windows":
-        file_path = "C:/Users/Administrator/Desktop/error6.jpg"
-        # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/has-3.rar"
+        # file_path = "C:/Users/Administrator/Desktop/error2.swf"
+        file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/转账支付说明.txt"
         # file_path = "C:/Users/Administrator/Desktop/Test_ODPS/1624875783055.pdf"
     else:
         file_path = "1.doc"

+ 2 - 0
format_convert/convert_image.py

@@ -22,6 +22,8 @@ def image_process(image_np, image_path, use_ocr=True):
 
         # otr需要图片resize, 写入另一个路径
         image_np = cv2.imread(image_path)
+        if image_np is None:
+            return []
         best_h, best_w = get_best_predict_size(image_np)
         image_resize = cv2.resize(image_np, (best_w, best_h), interpolation=cv2.INTER_AREA)
         # image_resize_path = image_path[:-4] + "_resize" + image_path[-4:]

+ 3 - 2
format_convert/convert_rar.py

@@ -87,7 +87,6 @@ class RarConvert:
         self.rar_path = unique_type_dir
 
     def init_package(self):
-        # 各个包初始化
         try:
             # shell调用unrar解压
             _signal = os.system("unrar x " + self.path + " " + self.rar_path)
@@ -112,8 +111,8 @@ class RarConvert:
         if judge_error_code(file_list):
             return file_list
 
-        self._page = _Page(None, 0)
         file_no = 0
+        self._page = _Page(None, 0)
         for file in file_list:
             if os.path.isdir(file):
                 continue
@@ -142,8 +141,10 @@ class RarConvert:
             if judge_error_code(sub_html):
                 self._doc.error_code = sub_html
                 return
+
             _sen = _Sentence(sub_html[0], bbox, is_html=True)
             self._page.add_child(_sen)
+        self._doc.add_child(self._page)
 
     def get_html(self):
         try:

+ 67 - 2
format_convert/convert_swf.py

@@ -1,12 +1,15 @@
 import os
 import sys
+import time
+
 sys.path.append(os.path.dirname(__file__) + "/../")
+from format_convert.convert_tree import _Document, _Image, _Page
 import base64
 import codecs
 import logging
 import re
 import traceback
-from format_convert import get_memory_info
+from format_convert import get_memory_info, timeout_decorator
 from format_convert.convert_image import picture2text
 from format_convert.swf.export import SVGExporter
 from format_convert.swf.movie import SWF
@@ -85,4 +88,66 @@ def swf2text(path, unique_type_dir):
     except Exception as e:
         logging.info("swf2text error!")
         print("swf2text", traceback.print_exc())
-        return [-1]
+        return [-1]
+
+
+class SwfConvert:
+    """Convert a SWF file into the document tree and render it as HTML.
+
+    The SWF is exported to SVG text; every image embedded in an
+    ``<image id=...>`` element as base64 data is decoded and appended to a
+    single ``_Page`` as an ``_Image`` child.
+    """
+
+    def __init__(self, path, unique_type_dir):
+        """Remember the SWF path and the working directory.
+
+        :param path: path of the SWF file to convert
+        :param unique_type_dir: working directory handed in by the caller;
+            it is only stored by this class
+        """
+        self._doc = _Document(path)
+        self.path = path
+        self.unique_type_dir = unique_type_dir
+
+    def init_package(self):
+        """Export the SWF to SVG text and keep it in ``self.swf_str``.
+
+        On any failure ``self._doc.error_code`` is set to ``[-3]``.
+        """
+        try:
+            with open(self.path, 'rb') as f:
+                swf_file = SWF(f)
+                svg_exporter = SVGExporter()
+                svg = swf_file.export(svg_exporter)
+            self.swf_str = str(svg.getvalue(), encoding='utf-8')
+        except Exception:
+            logging.info("cannot open swf!")
+            traceback.print_exc()
+            self._doc.error_code = [-3]
+
+    def convert(self):
+        """Build the page of embedded images from the exported SVG text.
+
+        Returns early, leaving the error code in place, when
+        ``init_package`` failed. ``<image>`` elements that carry no inline
+        ``b'...'`` base64 payload are skipped instead of aborting the whole
+        conversion.
+        """
+        self.init_package()
+        if self._doc.error_code is not None:
+            return
+
+        self._page = _Page(None, 0)
+
+        # "dir/name.swf" -> "dir/name_swf". splitext only inspects the last
+        # path component, so dots in directory names (or a missing suffix)
+        # no longer truncate the prefix or raise IndexError.
+        path_root, path_ext = os.path.splitext(self.path)
+        image_path_prefix = path_root + "_" + path_ext[1:]
+
+        reg1_prefix = 'b\''
+        image_no = 0
+        # Locate every image element in the SVG text
+        for r in re.finditer('<image id=(.[^>]*)', self.swf_str):
+            # Cut out the text of this image element
+            swf_str0 = self.swf_str[r.span()[0]:r.span()[1] + 1]
+
+            # Locate the base64 payload of the image
+            result1 = re.search('xlink:href="data:(.[^>]*)', swf_str0)
+            if result1 is None:
+                logging.info("swf image has no embedded data, skip!")
+                continue
+            swf_str1 = swf_str0[result1.span()[0]:result1.span()[1]]
+            result2 = re.search(reg1_prefix + '(.[^\']*)', swf_str1)
+            if result2 is None:
+                logging.info("swf image has no embedded data, skip!")
+                continue
+            swf_str1 = swf_str1[result2.span()[0] + len(reg1_prefix):result2.span()[1]]
+
+            # base64 str -> bytes -> bytes without escaped "\\" -> image bytes
+            base64_bytes_with_double = bytes(swf_str1, "utf-8")
+            base64_bytes = codecs.escape_decode(base64_bytes_with_double, "hex-escape")[0]
+            image_bytes = base64.b64decode(base64_bytes)
+            image_path = image_path_prefix + "_page_" + str(image_no) + ".png"
+
+            _image = _Image(image_bytes, image_path)
+            # y carries the running image index
+            _image.y = image_no
+            self._page.add_child(_image)
+            image_no += 1
+        self._doc.add_child(self._page)
+
+    def get_html(self):
+        """Run the conversion and return the document HTML or an error code.
+
+        :return: ``self._doc.get_html()`` on success, otherwise the error
+            code list (``[-1]`` for an unexpected exception in ``convert``)
+        """
+        try:
+            self.convert()
+        except Exception:
+            traceback.print_exc()
+            self._doc.error_code = [-1]
+        if self._doc.error_code is not None:
+            return self._doc.error_code
+        return self._doc.get_html()

+ 45 - 0
format_convert/convert_txt.py

@@ -1,6 +1,7 @@
 import os
 import sys
 sys.path.append(os.path.dirname(__file__) + "/../")
+from format_convert.convert_tree import _Document, _Page, _Sentence
 import logging
 import traceback
 import chardet
@@ -31,3 +32,47 @@ def txt2text(path):
         print("txt2text", traceback.print_exc())
         logging.info("txt2text error!")
         return [-1]
+
+
+class TxtConvert:
+    """Convert a plain-text file into the document tree and render it as HTML.
+
+    The file encoding is detected with ``chardet``; the whole text becomes
+    one ``_Sentence`` on a single ``_Page``.
+    """
+
+    def __init__(self, path, unique_type_dir):
+        """Remember the text file path and the working directory.
+
+        :param path: path of the text file to convert
+        :param unique_type_dir: working directory handed in by the caller;
+            it is only stored by this class
+        """
+        self._doc = _Document(path)
+        self.path = path
+        self.unique_type_dir = unique_type_dir
+
+    def init_package(self):
+        """Detect the file encoding and read the text into ``self.txt_text``.
+
+        On any failure (unreadable file, undetectable encoding, decode
+        error) ``self._doc.error_code`` is set to ``[-3]``.
+        """
+        try:
+            # Detect the character encoding from the raw bytes
+            with open(self.path, "rb") as ff:
+                data = ff.read()
+            encode = chardet.detect(data).get("encoding")
+            print("txt2text judge code is", encode)
+            if encode is None:
+                logging.info("txt2text cannot judge file code!")
+                # Handled by the except below, which reports [-3]
+                raise ValueError("cannot detect encoding of " + str(self.path))
+            with open(self.path, "r", encoding=encode) as ff:
+                self.txt_text = ff.read()
+        except Exception:
+            logging.info("cannot open txt!")
+            traceback.print_exc()
+            self._doc.error_code = [-3]
+
+    def convert(self):
+        """Wrap the text in one sentence on one page of the document.
+
+        Returns early, leaving the error code in place, when
+        ``init_package`` failed.
+        """
+        self.init_package()
+        if self._doc.error_code is not None:
+            return
+
+        self._page = _Page(None, 0)
+        _sen = _Sentence(self.txt_text, (0, 0, 0, 0))
+        self._page.add_child(_sen)
+        self._doc.add_child(self._page)
+
+    def get_html(self):
+        """Run the conversion and return the document HTML or an error code.
+
+        :return: ``self._doc.get_html()`` on success, otherwise the error
+            code list (``[-1]`` for an unexpected exception in ``convert``)
+        """
+        try:
+            self.convert()
+        except Exception:
+            traceback.print_exc()
+            self._doc.error_code = [-1]
+        if self._doc.error_code is not None:
+            return self._doc.error_code
+        return self._doc.get_html()

+ 82 - 1
format_convert/convert_zip.py

@@ -1,6 +1,7 @@
 import os
 import sys
 sys.path.append(os.path.dirname(__file__) + "/../")
+from format_convert.convert_tree import _Document, _Page, _Sentence
 import logging
 import traceback
 import zipfile
@@ -99,4 +100,84 @@ def zip2text(path, unique_type_dir):
     except Exception as e:
         logging.info("zip2text error!")
         print("zip2text", traceback.print_exc())
-        return [-1]
+        return [-1]
+
+
+class ZipConvert:
+    """Convert a ZIP archive into the document tree and render it as HTML.
+
+    The archive is extracted into ``unique_type_dir``; every extracted file
+    is converted through ``getText`` and its HTML is appended to a single
+    ``_Page`` as an HTML ``_Sentence``.
+    """
+
+    def __init__(self, path, unique_type_dir):
+        """Remember the archive path and the extraction directory.
+
+        :param path: path of the ZIP archive to convert
+        :param unique_type_dir: directory the archive is extracted into
+        """
+        self._doc = _Document(path)
+        self.path = path
+        self.unique_type_dir = unique_type_dir
+        self.zip_path = unique_type_dir
+
+    def init_package(self):
+        """Extract every archive member into ``self.zip_path``.
+
+        On any failure ``self._doc.error_code`` is set to ``[-3]``.
+        """
+        try:
+            # The context manager closes the archive even when extraction fails
+            with zipfile.ZipFile(self.path) as zip_file:
+                zip_file.extractall(path=self.zip_path)
+        except Exception:
+            logging.info("cannot open zip!")
+            traceback.print_exc()
+            self._doc.error_code = [-3]
+
+    def convert(self):
+        """Convert every extracted file and collect the results on one page.
+
+        Errors are reported through ``self._doc.error_code``: inner files
+        whose conversion returns ``[-3]`` are skipped, any other error code
+        from an inner file aborts the conversion with that code.
+        """
+        # Imported here rather than at module level; presumably avoids a
+        # circular import with format_convert.convert - confirm.
+        from format_convert.convert import getText
+
+        self.init_package()
+        if self._doc.error_code is not None:
+            return
+
+        # Rename the extracted inner files
+        file_list = rename_inner_files(self.zip_path)
+        if judge_error_code(file_list):
+            # get_html ignores convert's return value, so the error must be
+            # recorded on the document to reach the caller.
+            self._doc.error_code = file_list
+            return
+
+        # NOTE(review): file_no is never incremented, so every sentence gets
+        # the same bbox; confirm whether ordering is meant to rely on it.
+        file_no = 0
+        self._page = _Page(None, 0)
+        for file in file_list:
+            if os.path.isdir(file):
+                continue
+
+            bbox = (0, file_no, 0, 0)
+            # splitext only inspects the file name itself, so dots in
+            # directory names are not mistaken for a suffix.
+            suffix = os.path.splitext(file)[1]
+            if not suffix:
+                # No suffix: guess the format
+                logging.info(str(file) + " has no type! Guess type...")
+                _type = judge_format(file)
+                if _type is None:
+                    logging.info(str(file) + "cannot guess type!")
+                    continue
+                logging.info(str(file) + " guess type: " + _type)
+                new_file = str(file) + "." + _type
+                os.rename(file, new_file)
+                file = new_file
+            else:
+                # Suffix present: use it as the type
+                _type = suffix[1:]
+            sub_html = getText(_type, file)
+
+            if judge_error_code(sub_html, code=[-3]):
+                continue
+            if judge_error_code(sub_html):
+                self._doc.error_code = sub_html
+                return
+
+            _sen = _Sentence(sub_html[0], bbox, is_html=True)
+            self._page.add_child(_sen)
+        self._doc.add_child(self._page)
+
+    def get_html(self):
+        """Run the conversion and return the document HTML or an error code.
+
+        :return: ``self._doc.get_html()`` on success, otherwise the error
+            code list (``[-1]`` for an unexpected exception in ``convert``)
+        """
+        try:
+            self.convert()
+        except Exception:
+            traceback.print_exc()
+            self._doc.error_code = [-1]
+        if self._doc.error_code is not None:
+            return self._doc.error_code
+        return self._doc.get_html()

+ 1 - 1
format_convert/utils.py

@@ -1012,7 +1012,7 @@ class LineTable:
         if width < 0 and height < 0:
             iou = abs(width*height/min(abs((bbox0[2]-bbox0[0])*(bbox0[3]-bbox0[1])),
                                        abs((bbox1[2]-bbox1[0])*(bbox1[3]-bbox1[1]))))
-            print("getIOU", iou)
+            # print("getIOU", iou)
             return iou
         return 0
 

+ 11 - 1
result.html

@@ -1 +1,11 @@
-<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body></body>
+<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body> 甲方向乙方支付技术服务报酬及支付方式为:  
+       1.技术服务费总额为:   135000(壹拾叁万伍仟元整)       ; 
+       2.技术服务费由甲方  合同签订后一周内支付 80%,即 108000(壹
+拾万八仟元整);项目交付后一月内支付 20%,即 27000(贰万柒仟元整)  支
+付乙方。 
+       具体支付方式和时间如下: 
+       (1)    合同签订后阶段性支付;银行转账            
+       乙方开户银行名称、地址和帐号为: 
+       开户银行:  中国农业银行股份有限公司江苏自贸试验区南京片区支行    
+      地址:  南京市江北新区研创园团结路 99 号孵鹰大厦 690 室        
+帐号:      10122001040229008                          </body>