4 gadi atpakaļ · bbe968c99d
--- a/format_convert/convert.py
+++ b/format_convert/convert.py
@@ -10,12 +10,11 @@ from format_convert.convert_docx import docx2text, DocxConvert
 
				 from format_convert.convert_image import picture2text, ImageConvert
			
 
				 from format_convert.convert_pdf import pdf2text, PDFConvert
			
 
				 from format_convert.convert_rar import rar2text, RarConvert
			
 
				-from format_convert.convert_swf import swf2text
			
 
				+from format_convert.convert_swf import swf2text, SwfConvert
			
 
				 from format_convert.convert_txt import txt2text
			
 
				 from format_convert.convert_xls import xls2text, XlsConvert
			
 
				 from format_convert.convert_xlsx import xlsx2text, XlsxConvert
			
 
				-from format_convert.convert_zip import zip2text
			
 
				-
			
 
				+from format_convert.convert_zip import zip2text, ZipConvert
			
 
				 
			
 
				 import codecs
			
 
				 import gc
			
@@ -2256,7 +2255,8 @@ def getText(_type, path_or_stream):
 
				         # return docx2text(path_or_stream, unique_type_dir)
			
 
				         return DocxConvert(path_or_stream, unique_type_dir).get_html()
			
 
				     if _type == "zip":
			
 
				-        return zip2text(path_or_stream, unique_type_dir)
			
 
				+        # return zip2text(path_or_stream, unique_type_dir)
			
 
				+        return ZipConvert(path_or_stream, unique_type_dir).get_html()
			
 
				     if _type == "rar":
			
 
				         # return rar2text(path_or_stream, unique_type_dir)
			
 
				         return RarConvert(path_or_stream, unique_type_dir).get_html()
			
@@ -2273,7 +2273,8 @@ def getText(_type, path_or_stream):
 
				         # return picture2text(path_or_stream)
			
 
				         return ImageConvert(path_or_stream, unique_type_dir).get_html()
			
 
				     if _type == "swf":
			
 
				-        return swf2text(path_or_stream, unique_type_dir)
			
 
				+        # return swf2text(path_or_stream, unique_type_dir)
			
 
				+        return SwfConvert(path_or_stream, unique_type_dir).get_html()
			
 
				     if _type == "txt":
			
 
				         return txt2text(path_or_stream)
			
 
				 
			
@@ -2648,8 +2649,8 @@ else:
 
				         _path = os.path.dirname(os.path.abspath(__file__))
			
 
				 if __name__ == '__main__':
			
 
				     if get_platform() == "Windows":
			
 
				-        file_path = "C:/Users/Administrator/Desktop/error6.jpg"
			
 
				-        # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/has-3.rar"
			
 
				+        # file_path = "C:/Users/Administrator/Desktop/error2.swf"
			
 
				+        file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/转账支付说明.txt"
			
 
				         # file_path = "C:/Users/Administrator/Desktop/Test_ODPS/1624875783055.pdf"
			
 
				     else:
			
 
				         file_path = "1.doc"
			
--- a/format_convert/convert_image.py
+++ b/format_convert/convert_image.py
@@ -22,6 +22,8 @@ def image_process(image_np, image_path, use_ocr=True):
 
				 
			
 
				         # otr需要图片resize, 写入另一个路径
			
 
				         image_np = cv2.imread(image_path)
			
 
				+        if image_np is None:
			
 
				+            return []
			
 
				         best_h, best_w = get_best_predict_size(image_np)
			
 
				         image_resize = cv2.resize(image_np, (best_w, best_h), interpolation=cv2.INTER_AREA)
			
 
				         # image_resize_path = image_path[:-4] + "_resize" + image_path[-4:]
			
--- a/format_convert/convert_rar.py
+++ b/format_convert/convert_rar.py
@@ -87,7 +87,6 @@ class RarConvert:
 
				         self.rar_path = unique_type_dir
			
 
				 
			
 
				     def init_package(self):
			
 
				-        # 各个包初始化
			
 
				         try:
			
 
				             # shell调用unrar解压
			
 
				             _signal = os.system("unrar x " + self.path + " " + self.rar_path)
			
@@ -112,8 +111,8 @@ class RarConvert:
 
				         if judge_error_code(file_list):
			
 
				             return file_list
			
 
				 
			
 
				-        self._page = _Page(None, 0)
			
 
				         file_no = 0
			
 
				+        self._page = _Page(None, 0)
			
 
				         for file in file_list:
			
 
				             if os.path.isdir(file):
			
 
				                 continue
			
@@ -142,8 +141,10 @@ class RarConvert:
 
				             if judge_error_code(sub_html):
			
 
				                 self._doc.error_code = sub_html
			
 
				                 return
			
 
				+
			
 
				             _sen = _Sentence(sub_html[0], bbox, is_html=True)
			
 
				             self._page.add_child(_sen)
			
 
				+        self._doc.add_child(self._page)
			
 
				 
			
 
				     def get_html(self):
			
 
				         try:
			
--- a/format_convert/convert_swf.py
+++ b/format_convert/convert_swf.py
@@ -1,12 +1,15 @@
 
				 import os
			
 
				 import sys
			
 
				+import time
			
 
				+
			
 
				 sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				+from format_convert.convert_tree import _Document, _Image, _Page
			
 
				 import base64
			
 
				 import codecs
			
 
				 import logging
			
 
				 import re
			
 
				 import traceback
			
 
				-from format_convert import get_memory_info
			
 
				+from format_convert import get_memory_info, timeout_decorator
			
 
				 from format_convert.convert_image import picture2text
			
 
				 from format_convert.swf.export import SVGExporter
			
 
				 from format_convert.swf.movie import SWF
			
@@ -85,4 +88,66 @@ def swf2text(path, unique_type_dir):
 
				     except Exception as e:
			
 
				         logging.info("swf2text error!")
			
 
				         print("swf2text", traceback.print_exc())
			
 
				-        return [-1]
			
 
				+        return [-1]
			
 
				+
			
 
				+
			
 
				+class SwfConvert:
				+    """Convert an SWF file into the document tree and render it as HTML.
				+
				+    The SWF is exported to SVG text; each ``<image id=...`` tag found in
				+    that text is decoded from its inline base64 payload and attached to a
				+    single page as an ``_Image`` child.
				+    """
				+
				+    def __init__(self, path, unique_type_dir):
				+        """Store the input path and working directory and create the document root."""
				+        self.path = path
				+        self.unique_type_dir = unique_type_dir
				+        self._doc = _Document(path)
				+
				+    def init_package(self):
				+        """Export the SWF at ``self.path`` to SVG text kept in ``self.swf_str``.
				+
				+        On any failure the problem is logged, the traceback is printed and
				+        the document's error code is set to ``[-3]``.
				+        """
				+        try:
				+            with open(self.path, 'rb') as swf_stream:
				+                svg_buffer = SWF(swf_stream).export(SVGExporter())
				+            self.swf_str = str(svg_buffer.getvalue(), encoding='utf-8')
				+        except:
				+            logging.info("cannot open swf!")
				+            traceback.print_exc()
				+            self._doc.error_code = [-3]
				+
				+    def convert(self):
				+        """Fill one page with the images embedded in the exported SVG text.
				+
				+        Does nothing further when ``init_package`` recorded an error.
				+        """
				+        self.init_package()
				+        if self._doc.error_code is not None:
				+            return
				+
				+        self._page = _Page(None, 0)
				+        # Locate every image tag in the SVG text with a regular expression.
				+        image_tags = re.finditer('<image id=(.[^>]*)', self.swf_str)
				+        # Image names are built from the last two dot-separated pieces of the path.
				+        path_pieces = self.path.split(".")
				+        image_path_prefix = path_pieces[-2] + "_" + path_pieces[-1]
				+        for image_no, tag_match in enumerate(image_tags):
				+            # Slice out the tag text, including one character past the match.
				+            tag_text = self.swf_str[tag_match.start():tag_match.end() + 1]
				+
				+            # Narrow down to the inline data reference, then to the text that
				+            # follows the b' prefix, which holds the base64 payload.
				+            href_match = re.search('xlink:href="data:(.[^>]*)', tag_text)
				+            href_text = href_match.group(0)
				+            reg1_prefix = 'b\''
				+            literal_match = re.search(reg1_prefix + '(.[^\']*)', href_text)
				+            payload_text = href_text[literal_match.start() + len(reg1_prefix):literal_match.end()]
				+
				+            # base64 str -> bytes -> bytes with backslash escapes resolved -> image bytes
				+            escaped_bytes = bytes(payload_text, "utf-8")
				+            base64_bytes = codecs.escape_decode(escaped_bytes, "hex-escape")[0]
				+            image_bytes = base64.b64decode(base64_bytes)
				+            image_path = image_path_prefix + "_page_" + str(image_no) + ".png"
				+
				+            _image = _Image(image_bytes, image_path)
				+            # The running index doubles as the image's y position on the page.
				+            _image.y = image_no
				+            self._page.add_child(_image)
				+        self._doc.add_child(self._page)
				+
				+    def get_html(self):
				+        """Run the conversion and return its result.
				+
				+        Returns the document's error code when one is set (``[-1]`` if
				+        ``convert`` raised), otherwise the value of ``self._doc.get_html()``.
				+        """
				+        try:
				+            self.convert()
				+        except:
				+            traceback.print_exc()
				+            self._doc.error_code = [-1]
				+        error_code = self._doc.error_code
				+        return self._doc.get_html() if error_code is None else error_code
			
--- a/format_convert/convert_txt.py
+++ b/format_convert/convert_txt.py
@@ -1,6 +1,7 @@
 
				 import os
			
 
				 import sys
			
 
				 sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				+from format_convert.convert_tree import _Document, _Page, _Sentence
			
 
				 import logging
			
 
				 import traceback
			
 
				 import chardet
			
@@ -31,3 +32,47 @@ def txt2text(path):
 
				         print("txt2text", traceback.print_exc())
			
 
				         logging.info("txt2text error!")
			
 
				         return [-1]
			
 
				+
			
 
				+
			
 
				+class TxtConvert:
				+    """Convert a plain-text file into the document tree and render it as HTML.
				+
				+    The whole file becomes one ``_Sentence`` on a single page.
				+    """
				+
				+    def __init__(self, path, unique_type_dir):
				+        """Store the input path and working directory and create the document root."""
				+        self.path = path
				+        self.unique_type_dir = unique_type_dir
				+        self._doc = _Document(path)
				+
				+    def init_package(self):
				+        """Read the file into ``self.txt_text`` using the encoding chardet reports.
				+
				+        On any failure, including an undetectable encoding, the problem is
				+        logged, the traceback is printed and the document's error code is
				+        set to ``[-3]``.
				+        """
				+        try:
				+            # Detect the character encoding from the raw bytes first.
				+            with open(self.path, "rb") as raw_file:
				+                raw_bytes = raw_file.read()
				+            detected = chardet.detect(raw_bytes).get("encoding")
				+            print("txt2text judge code is", detected)
				+            if detected is None:
				+                logging.info("txt2text cannot judge file code!")
				+                raise Exception
				+            # Re-read the file as text with the detected encoding.
				+            with open(self.path, "r", encoding=detected) as text_file:
				+                self.txt_text = text_file.read()
				+        except:
				+            logging.info("cannot open txt!")
				+            traceback.print_exc()
				+            self._doc.error_code = [-3]
				+
				+    def convert(self):
				+        """Wrap the loaded text in one sentence on one page of the document.
				+
				+        Does nothing further when ``init_package`` recorded an error.
				+        """
				+        self.init_package()
				+        if self._doc.error_code is not None:
				+            return
				+
				+        self._page = _Page(None, 0)
				+        self._page.add_child(_Sentence(self.txt_text, (0, 0, 0, 0)))
				+        self._doc.add_child(self._page)
				+
				+    def get_html(self):
				+        """Run the conversion and return its result.
				+
				+        Returns the document's error code when one is set (``[-1]`` if
				+        ``convert`` raised), otherwise the value of ``self._doc.get_html()``.
				+        """
				+        try:
				+            self.convert()
				+        except:
				+            traceback.print_exc()
				+            self._doc.error_code = [-1]
				+        error_code = self._doc.error_code
				+        return self._doc.get_html() if error_code is None else error_code
			
--- a/format_convert/convert_zip.py
+++ b/format_convert/convert_zip.py
@@ -1,6 +1,7 @@
 
				 import os
			
 
				 import sys
			
 
				 sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				+from format_convert.convert_tree import _Document, _Page, _Sentence
			
 
				 import logging
			
 
				 import traceback
			
 
				 import zipfile
			
@@ -99,4 +100,84 @@ def zip2text(path, unique_type_dir):
 
				     except Exception as e:
			
 
				         logging.info("zip2text error!")
			
 
				         print("zip2text", traceback.print_exc())
			
 
				-        return [-1]
			
 
				+        return [-1]
			
 
				+
			
 
				+
			
 
				+class ZipConvert:
				+    """Unpack a zip archive and convert each contained file to HTML.
				+
				+    Every extracted file is converted through ``getText`` and appended to
				+    a single page as an HTML ``_Sentence``.
				+    """
				+
				+    def __init__(self, path, unique_type_dir):
				+        """Store the archive path and working directory and create the document root."""
				+        self._doc = _Document(path)
				+        self.path = path
				+        self.unique_type_dir = unique_type_dir
				+        # Archive members are extracted directly into the working directory.
				+        self.zip_path = unique_type_dir
				+
				+    def init_package(self):
				+        """Extract every archive member into ``self.zip_path``.
				+
				+        On any failure the problem is logged, the traceback is printed and
				+        the document's error code is set to ``[-3]``.
				+        """
				+        try:
				+            # The context manager closes the archive even if extraction
				+            # fails part-way through.
				+            with zipfile.ZipFile(self.path) as zip_file:
				+                zip_file.extractall(path=self.zip_path)
				+        except:
				+            logging.info("cannot open zip!")
				+            traceback.print_exc()
				+            self._doc.error_code = [-3]
				+
				+    def convert(self):
				+        """Convert each extracted file and collect the results on one page.
				+
				+        Files whose conversion yields error code ``[-3]`` are skipped; any
				+        other error code is stored on the document and ends the conversion.
				+        All outcomes are reported through ``self._doc``; nothing is returned.
				+        """
				+        # Imported here rather than at module level because convert.py
				+        # itself imports this module.
				+        from format_convert.convert import getText
				+
				+        self.init_package()
				+        if self._doc.error_code is not None:
				+            return
				+
				+        # Rename the extracted inner files.
				+        file_list = rename_inner_files(self.zip_path)
				+        if judge_error_code(file_list):
				+            # get_html() ignores this method's return value, so the error
				+            # has to be recorded on the document or it would be lost.
				+            self._doc.error_code = file_list
				+            return
				+
				+        # NOTE(review): file_no is never incremented, so every bbox is
				+        # (0, 0, 0, 0) - confirm whether a per-file position was intended.
				+        file_no = 0
				+        self._page = _Page(None, 0)
				+        for file in file_list:
				+            if os.path.isdir(file):
				+                continue
				+
				+            bbox = (0, file_no, 0, 0)
				+            # No file suffix: guess the format.
				+            if len(file.split(".")) <= 1:
				+                logging.info(str(file) + " has no type! Guess type...")
				+                _type = judge_format(file)
				+                if _type is None:
				+                    logging.info(str(file) + "cannot guess type!")
				+                    continue
				+                else:
				+                    logging.info(str(file) + " guess type: " + _type)
				+                    new_file = str(file) + "." + _type
				+                    os.rename(file, new_file)
				+                    file = new_file
				+                    sub_html = getText(_type, file)
				+            # Suffix present: use the text after the last dot as the type.
				+            else:
				+                _type = file.split(".")[-1]
				+                sub_html = getText(_type, file)
				+
				+            if judge_error_code(sub_html, code=[-3]):
				+                continue
				+            if judge_error_code(sub_html):
				+                self._doc.error_code = sub_html
				+                return
				+
				+            _sen = _Sentence(sub_html[0], bbox, is_html=True)
				+            self._page.add_child(_sen)
				+        self._doc.add_child(self._page)
				+
				+    def get_html(self):
				+        """Run the conversion and return its result.
				+
				+        Returns the document's error code when one is set (``[-1]`` if
				+        ``convert`` raised), otherwise the value of ``self._doc.get_html()``.
				+        """
				+        try:
				+            self.convert()
				+        except:
				+            traceback.print_exc()
				+            self._doc.error_code = [-1]
				+        if self._doc.error_code is not None:
				+            return self._doc.error_code
				+        return self._doc.get_html()
			
--- a/format_convert/utils.py
+++ b/format_convert/utils.py
@@ -1012,7 +1012,7 @@ class LineTable:
 
				         if width < 0 and height < 0:
			
 
				             iou = abs(width*height/min(abs((bbox0[2]-bbox0[0])*(bbox0[3]-bbox0[1])),
			
 
				                                        abs((bbox1[2]-bbox1[0])*(bbox1[3]-bbox1[1]))))
			
 
				-            print("getIOU", iou)
			
 
				+            # print("getIOU", iou)
			
 
				             return iou
			
 
				         return 0
			
 
				 
			
--- a/result.html
+++ b/result.html
@@ -1 +1,11 @@
 
				-<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body></body>
			
 
				+<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body> 甲方向乙方支付技术服务报酬及支付方式为：  
			
 
				+       1.技术服务费总额为：   135000（壹拾叁万伍仟元整）       ； 
			
 
				+       2.技术服务费由甲方  合同签订后一周内支付 80%，即 108000（壹
			
 
				+拾万八仟元整）；项目交付后一月内支付 20%，即 27000（贰万柒仟元整）  支
			
 
				+付乙方。 
			
 
				+       具体支付方式和时间如下： 
			
 
				+       (1)    合同签订后阶段性支付；银行转账            
			
 
				+       乙方开户银行名称、地址和帐号为： 
			
 
				+       开户银行：  中国农业银行股份有限公司江苏自贸试验区南京片区支行    
			
 
				+      地址：  南京市江北新区研创园团结路 99 号孵鹰大厦 690 室        
			
 
				+帐号：      10122001040229008                          </body>