|
@@ -5,8 +5,8 @@ import sys
|
|
import os
|
|
import os
|
|
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
|
|
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
|
|
|
|
|
|
-from format_convert.convert_doc import doc2text
|
|
|
|
-from format_convert.convert_docx import docx2text
|
|
|
|
|
|
+from format_convert.convert_doc import doc2text, DocConvert
|
|
|
|
+from format_convert.convert_docx import docx2text, DocxConvert
|
|
from format_convert.convert_image import picture2text
|
|
from format_convert.convert_image import picture2text
|
|
from format_convert.convert_pdf import pdf2text, PDFConvert
|
|
from format_convert.convert_pdf import pdf2text, PDFConvert
|
|
from format_convert.convert_rar import rar2text
|
|
from format_convert.convert_rar import rar2text
|
|
@@ -2251,9 +2251,10 @@ def getText(_type, path_or_stream):
|
|
|
|
|
|
if _type == "pdf":
|
|
if _type == "pdf":
|
|
# return pdf2text(path_or_stream, unique_type_dir)
|
|
# return pdf2text(path_or_stream, unique_type_dir)
|
|
- return PDFConvert(path_or_stream).get_html()
|
|
|
|
|
|
+ return PDFConvert(path_or_stream, unique_type_dir).get_html()
|
|
if _type == "docx":
|
|
if _type == "docx":
|
|
- return docx2text(path_or_stream, unique_type_dir)
|
|
|
|
|
|
+ # return docx2text(path_or_stream, unique_type_dir)
|
|
|
|
+ return DocxConvert(path_or_stream, unique_type_dir).get_html()
|
|
if _type == "zip":
|
|
if _type == "zip":
|
|
return zip2text(path_or_stream, unique_type_dir)
|
|
return zip2text(path_or_stream, unique_type_dir)
|
|
if _type == "rar":
|
|
if _type == "rar":
|
|
@@ -2263,7 +2264,8 @@ def getText(_type, path_or_stream):
|
|
if _type == "xls":
|
|
if _type == "xls":
|
|
return xls2text(path_or_stream, unique_type_dir)
|
|
return xls2text(path_or_stream, unique_type_dir)
|
|
if _type == "doc":
|
|
if _type == "doc":
|
|
- return doc2text(path_or_stream, unique_type_dir)
|
|
|
|
|
|
+ # return doc2text(path_or_stream, unique_type_dir)
|
|
|
|
+ return DocConvert(path_or_stream, unique_type_dir).get_html()
|
|
if _type == "jpg" or _type == "png" or _type == "jpeg":
|
|
if _type == "jpg" or _type == "png" or _type == "jpeg":
|
|
return picture2text(path_or_stream)
|
|
return picture2text(path_or_stream)
|
|
if _type == "swf":
|
|
if _type == "swf":
|
|
@@ -2641,35 +2643,9 @@ else:
|
|
if not os.path.exists(_path):
|
|
if not os.path.exists(_path):
|
|
_path = os.path.dirname(os.path.abspath(__file__))
|
|
_path = os.path.dirname(os.path.abspath(__file__))
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|
|
-
|
|
|
|
- print(os.path.abspath(__file__) + "/../../")
|
|
|
|
- # if len(sys.argv) == 2:
|
|
|
|
- # port = int(sys.argv[1])
|
|
|
|
- # else:
|
|
|
|
- # port = 15015
|
|
|
|
- # app.run(host='0.0.0.0', port=port, threaded=True, debug=False)
|
|
|
|
- # log("format_conversion running")
|
|
|
|
-
|
|
|
|
- # convert("", "ocr_model", "otr_model")
|
|
|
|
- # _str = "啊"
|
|
|
|
- # str1 = ""
|
|
|
|
- # str2 = ""
|
|
|
|
- # for i in range(900000):
|
|
|
|
- # str1 += _str
|
|
|
|
- # list1 = [str1]
|
|
|
|
- # for i in range(700000):
|
|
|
|
- # str2 += _str
|
|
|
|
- # list2 = [str2]
|
|
|
|
- # cut_str(list1, list2)
|
|
|
|
-
|
|
|
|
- # file_path = "C:/Users/Administrator/Desktop/error1.png"
|
|
|
|
- # file_path = "D:/Project/table-detect-master/train_data/label_1.jpg"
|
|
|
|
- # file_path = "D:/Project/table-detect-master/test_files/1.png"
|
|
|
|
- # file_path = "D:/Project/table-detect-master/test_files/table2.jpg"
|
|
|
|
-
|
|
|
|
if get_platform() == "Windows":
|
|
if get_platform() == "Windows":
|
|
- file_path = "C:/Users/Administrator/Desktop/error3.pdf"
|
|
|
|
- # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls"
|
|
|
|
|
|
+ # file_path = "C:/Users/Administrator/Desktop/error3.pdf"
|
|
|
|
+ file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/招标公告--汾口镇汪家桥村村道硬化工程 - .doc"
|
|
# file_path = "C:/Users/Administrator/Desktop/Test_ODPS/1624875783055.pdf"
|
|
# file_path = "C:/Users/Administrator/Desktop/Test_ODPS/1624875783055.pdf"
|
|
else:
|
|
else:
|
|
file_path = "1.doc"
|
|
file_path = "1.doc"
|