|
@@ -1,6 +1,4 @@
|
|
|
#-*- coding: utf-8 -*-
|
|
|
-import copy
|
|
|
-import difflib
|
|
|
import sys
|
|
|
import os
|
|
|
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
|
|
@@ -11,68 +9,26 @@ from format_convert.convert_image import picture2text, ImageConvert
|
|
|
from format_convert.convert_pdf import pdf2text, PDFConvert
|
|
|
from format_convert.convert_rar import rar2text, RarConvert
|
|
|
from format_convert.convert_swf import swf2text, SwfConvert
|
|
|
-from format_convert.convert_txt import txt2text
|
|
|
+from format_convert.convert_txt import txt2text, TxtConvert
|
|
|
from format_convert.convert_xls import xls2text, XlsConvert
|
|
|
from format_convert.convert_xlsx import xlsx2text, XlsxConvert
|
|
|
from format_convert.convert_zip import zip2text, ZipConvert
|
|
|
|
|
|
-import codecs
|
|
|
-import gc
|
|
|
import hashlib
|
|
|
-import io
|
|
|
-import json
|
|
|
-import multiprocessing
|
|
|
-import sys
|
|
|
-import subprocess
|
|
|
-import PyPDF2
|
|
|
-import lxml
|
|
|
-import pdfminer
|
|
|
-from PIL import Image
|
|
|
from format_convert import get_memory_info
|
|
|
from ocr import ocr_interface
|
|
|
-from ocr.ocr_interface import ocr, OcrModels
|
|
|
from otr import otr_interface
|
|
|
-from otr.otr_interface import otr, OtrModels
|
|
|
import re
|
|
|
import shutil
|
|
|
-import signal
|
|
|
-import sys
|
|
|
import base64
|
|
|
import time
|
|
|
-import traceback
|
|
|
import uuid
|
|
|
-from os.path import basename
|
|
|
-import cv2
|
|
|
-import fitz
|
|
|
-import pandas
|
|
|
-import docx
|
|
|
-import zipfile
|
|
|
-import mimetypes
|
|
|
-import filetype
|
|
|
-# import pdfplumber
|
|
|
-import psutil
|
|
|
-import requests
|
|
|
-import rarfile
|
|
|
-from PyPDF2 import PdfFileReader, PdfFileWriter
|
|
|
-import xml.dom.minidom
|
|
|
-import subprocess
|
|
|
-import logging
|
|
|
-from pdfminer.pdfparser import PDFParser
|
|
|
-from pdfminer.pdfdocument import PDFDocument
|
|
|
-from pdfminer.pdfpage import PDFPage
|
|
|
-from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
|
|
-from pdfminer.converter import PDFPageAggregator
|
|
|
-from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar
|
|
|
import logging
|
|
|
-import chardet
|
|
|
from bs4 import BeautifulSoup
|
|
|
-from format_convert.libreoffice_interface import office_convert
|
|
|
-from format_convert.swf.export import SVGExporter
|
|
|
+
|
|
|
logging.getLogger("pdfminer").setLevel(logging.WARNING)
|
|
|
from format_convert.table_correct import *
|
|
|
-from format_convert.swf.movie import SWF
|
|
|
import logging
|
|
|
-# import timeout_decorator
|
|
|
from format_convert import timeout_decorator
|
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
|
|
@@ -2276,8 +2232,8 @@ def getText(_type, path_or_stream):
|
|
|
# return swf2text(path_or_stream, unique_type_dir)
|
|
|
return SwfConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
if _type == "txt":
|
|
|
- return txt2text(path_or_stream)
|
|
|
-
|
|
|
+ # return txt2text(path_or_stream)
|
|
|
+ return TxtConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
return [""]
|
|
|
|
|
|
|
|
@@ -2650,8 +2606,8 @@ else:
|
|
|
if __name__ == '__main__':
|
|
|
if get_platform() == "Windows":
|
|
|
# file_path = "C:/Users/Administrator/Desktop/error2.swf"
|
|
|
- file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/转账支付说明.txt"
|
|
|
- # file_path = "C:/Users/Administrator/Desktop/Test_ODPS/1624875783055.pdf"
|
|
|
+ # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/转账支付说明.txt"
|
|
|
+ file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624595704316.pdf"
|
|
|
else:
|
|
|
file_path = "1.doc"
|
|
|
|