fangjiasheng 3 年 前
コミット
59143e621a
3 ファイル変更、9 行追加、61 行削除
  1. 6 50
      format_convert/convert.py
  2. 1 0
      format_convert/convert_pdf.py
  3. 2 11
      result.html

+ 6 - 50
format_convert/convert.py

@@ -1,6 +1,4 @@
 #-*- coding: utf-8 -*-
-import copy
-import difflib
 import sys
 import os
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
@@ -11,68 +9,26 @@ from format_convert.convert_image import picture2text, ImageConvert
 from format_convert.convert_pdf import pdf2text, PDFConvert
 from format_convert.convert_rar import rar2text, RarConvert
 from format_convert.convert_swf import swf2text, SwfConvert
-from format_convert.convert_txt import txt2text
+from format_convert.convert_txt import txt2text, TxtConvert
 from format_convert.convert_xls import xls2text, XlsConvert
 from format_convert.convert_xlsx import xlsx2text, XlsxConvert
 from format_convert.convert_zip import zip2text, ZipConvert
 
-import codecs
-import gc
 import hashlib
-import io
-import json
-import multiprocessing
-import sys
-import subprocess
-import PyPDF2
-import lxml
-import pdfminer
-from PIL import Image
 from format_convert import get_memory_info
 from ocr import ocr_interface
-from ocr.ocr_interface import ocr, OcrModels
 from otr import otr_interface
-from otr.otr_interface import otr, OtrModels
 import re
 import shutil
-import signal
-import sys
 import base64
 import time
-import traceback
 import uuid
-from os.path import basename
-import cv2
-import fitz
-import pandas
-import docx
-import zipfile
-import mimetypes
-import filetype
-# import pdfplumber
-import psutil
-import requests
-import rarfile
-from PyPDF2 import PdfFileReader, PdfFileWriter
-import xml.dom.minidom
-import subprocess
-import logging
-from pdfminer.pdfparser import PDFParser
-from pdfminer.pdfdocument import PDFDocument
-from pdfminer.pdfpage import PDFPage
-from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
-from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar
 import logging
-import chardet
 from bs4 import BeautifulSoup
-from format_convert.libreoffice_interface import office_convert
-from format_convert.swf.export import SVGExporter
+
 logging.getLogger("pdfminer").setLevel(logging.WARNING)
 from format_convert.table_correct import *
-from format_convert.swf.movie import SWF
 import logging
-# import timeout_decorator
 from format_convert import timeout_decorator
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
@@ -2276,8 +2232,8 @@ def getText(_type, path_or_stream):
         # return swf2text(path_or_stream, unique_type_dir)
         return SwfConvert(path_or_stream, unique_type_dir).get_html()
     if _type == "txt":
-        return txt2text(path_or_stream)
-
+        # return txt2text(path_or_stream)
+        return TxtConvert(path_or_stream, unique_type_dir).get_html()
     return [""]
 
 
@@ -2650,8 +2606,8 @@ else:
 if __name__ == '__main__':
     if get_platform() == "Windows":
         # file_path = "C:/Users/Administrator/Desktop/error2.swf"
-        file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/转账支付说明.txt"
-        # file_path = "C:/Users/Administrator/Desktop/Test_ODPS/1624875783055.pdf"
+        # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/转账支付说明.txt"
+        file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624595704316.pdf"
     else:
         file_path = "1.doc"
 

+ 1 - 0
format_convert/convert_pdf.py

@@ -633,6 +633,7 @@ class PDFConvert:
         if self.has_init_pdf[0] == 0:
             self.init_package("pdfminer")
         if self._doc.error_code is not None:
+            # pdfminer读不了直接转成图片识别
             return
 
         # 判断是否能读pdf

+ 2 - 11
result.html

@@ -1,11 +1,2 @@
-<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body> 甲方向乙方支付技术服务报酬及支付方式为:  
-       1.技术服务费总额为:   135000(壹拾叁万伍仟元整)       ; 
-       2.技术服务费由甲方  合同签订后一周内支付 80%,即 108000(壹
-拾万八仟元整);项目交付后一月内支付 20%,即 27000(贰万柒仟元整)  支
-付乙方。 
-       具体支付方式和时间如下: 
-       (1)    合同签订后阶段性支付;银行转账            
-       乙方开户银行名称、地址和帐号为: 
-       开户银行:  中国农业银行股份有限公司江苏自贸试验区南京片区支行    
-      地址:  南京市江北新区研创园团结路 99 号孵鹰大厦 690 室        
-帐号:      10122001040229008                          </body>
+<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body><div>啊啊啊啊啊啊啊啊啊啊啊啊啊啊是水水水水噜啦噜啦嘞绿绿绿绿绿绿绿</div>
+</body>