fangjiasheng 2 years ago
parent
commit
492ee8732d

+ 137 - 13
format_convert/convert.py

@@ -34,10 +34,10 @@ from flask import Flask, request, g
 import inspect
 logging.getLogger("pdfminer").setLevel(logging.WARNING)
 from format_convert.table_correct import *
-import logging
 from format_convert.wrapt_timeout_decorator import *
 from format_convert import _global
 
+MAX_COMPUTE = True
 
 port_num = [0]
 def choose_port():
@@ -56,8 +56,8 @@ def choose_port():
 
 
 @memory_decorator
-def getText(_type, path_or_stream):
-    @timeout(300, timeout_exception=TimeoutError, use_signals=False)
+def getText(_type, path_or_stream, time_out=300):
+    @timeout(time_out, timeout_exception=TimeoutError, use_signals=False)
     def get_html_1(_class):
         return _class.get_html()
 
@@ -74,10 +74,12 @@ def getText(_type, path_or_stream):
         unique_type_dir = path_or_stream + "_" + _type + os.sep
 
     if _type == "pdf":
-        # return PDFConvert(path_or_stream, unique_type_dir).get_html()
+        if MAX_COMPUTE:
+            return PDFConvert(path_or_stream, unique_type_dir).get_html()
         return get_html_1(PDFConvert(path_or_stream, unique_type_dir))
     if _type == "docx":
-        # return DocxConvert(path_or_stream, unique_type_dir).get_html()
+        if MAX_COMPUTE:
+            return DocxConvert(path_or_stream, unique_type_dir).get_html()
         return get_html_1(DocxConvert(path_or_stream, unique_type_dir))
     if _type == "zip":
         return ZipConvert(path_or_stream, unique_type_dir).get_html()
@@ -86,22 +88,28 @@ def getText(_type, path_or_stream):
         return RarConvert(path_or_stream, unique_type_dir).get_html()
         # return get_html_2(RarConvert(path_or_stream, unique_type_dir))
     if _type == "xlsx":
-        # return XlsxConvert(path_or_stream, unique_type_dir).get_html()
+        if MAX_COMPUTE:
+            return XlsxConvert(path_or_stream, unique_type_dir).get_html()
         return get_html_1(XlsxConvert(path_or_stream, unique_type_dir))
     if _type == "xls":
-        # return XlsConvert(path_or_stream, unique_type_dir).get_html()
+        if MAX_COMPUTE:
+            return XlsConvert(path_or_stream, unique_type_dir).get_html()
         return get_html_1(XlsConvert(path_or_stream, unique_type_dir))
     if _type == "doc":
-        # return DocConvert(path_or_stream, unique_type_dir).get_html()
+        if MAX_COMPUTE:
+            return DocConvert(path_or_stream, unique_type_dir).get_html()
         return get_html_1(DocConvert(path_or_stream, unique_type_dir))
     if _type == "jpg" or _type == "png" or _type == "jpeg":
-        # return ImageConvert(path_or_stream, unique_type_dir).get_html()
+        if MAX_COMPUTE:
+            return ImageConvert(path_or_stream, unique_type_dir).get_html()
         return get_html_1(ImageConvert(path_or_stream, unique_type_dir))
     if _type == "swf":
-        # return SwfConvert(path_or_stream, unique_type_dir).get_html()
+        if MAX_COMPUTE:
+            return SwfConvert(path_or_stream, unique_type_dir).get_html()
         return get_html_1(SwfConvert(path_or_stream, unique_type_dir))
     if _type == "txt":
-        # return TxtConvert(path_or_stream, unique_type_dir).get_html()
+        if MAX_COMPUTE:
+            return TxtConvert(path_or_stream, unique_type_dir).get_html()
         return get_html_1(TxtConvert(path_or_stream, unique_type_dir))
     return [""]
 
@@ -252,7 +260,7 @@ else:
 
 # @timeout_decorator.timeout(100, timeout_exception=TimeoutError)
 # @timeout(globals().get("time_out"), timeout_exception=TimeoutError, use_signals=False)
-def unique_temp_file_process(stream, _type, _md5):
+def unique_temp_file_process(stream, _type, _md5, time_out=300):
     if get_platform() == "Windows":
         _global._init()
 
@@ -281,7 +289,7 @@ def unique_temp_file_process(stream, _type, _md5):
         with open(file_path, "wb") as ff:
             ff.write(stream)
 
-        text = getText(_type, file_path)
+        text = getText(_type, file_path, time_out=time_out)
 
         # 获取swf转换的图片
         swf_images = []
@@ -614,6 +622,122 @@ def _convert():
         #     )
 
 
+def convert(data, ocr_model, otr_model):
+    """
+    Convert one base64-encoded file and return its HTML and plain text.
+
+    Result codes (first element of the result lists) and is_success:
+    [str], 1: converted successfully
+    [-1], 0: internal processing error
+    [-2], 0: interface call error (the request carries no file data)
+    [-3], 1: bad file format, cannot be opened
+    [-4], 1: a third-party reader timed out on the file
+    [-5], 0: the whole conversion timed out
+    [-6], 0: Aliyun UDF queue timed out
+    [-7], 1: file is password protected, cannot be opened
+    NOTE(review): -4 used to be documented with is_success 0, but it is
+    listed in still_success_code below; confirm which one is intended.
+
+    :param data: dict with "file" (base64 of the document) and "type"
+    :param ocr_model: stored in this module's globals as "global_ocr_model"
+    :param otr_model: stored in this module's globals as "global_otr_model"
+    :return: {"result_html": list, "result_text": list,
+              "is_success": int, "swf_images": str}
+    """
+    log("into convert")
+    _global._init()
+    # Placeholder md5 until the real one is computed from the file bytes
+    _global.update({"md5": "1"+"0"*15})
+
+    start_time = time.time()
+    _md5 = _global.get("md5")
+    _type = None
+    try:
+        # A request without file data is an interface error (-2), not -1
+        if not data or not data.get("file"):
+            raise ConnectionError("convert called without file data")
+
+        # Make the models available through module globals
+        globals().update({"global_ocr_model": ocr_model})
+        globals().update({"global_otr_model": otr_model})
+
+        _time = time.time()
+        stream = base64.b64decode(data.get("file"))
+        _type = data.get("type")
+        _md5 = get_md5_from_bytes(stream)[0]
+        _global.update({"md5": _md5})
+        log("get bytes from file " + str(time.time()-_time))
+
+        # Windows keeps the 300 s default; other platforms are given 3000 s
+        time_out = 300 if get_platform() == "Windows" else 3000
+        try:
+            text, swf_images = unique_temp_file_process(stream, _type, _md5,
+                                                        time_out=time_out)
+        except TimeoutError:
+            # Log the limit that was really applied
+            log("convert time out! " + str(time_out) + " sec")
+            text = [-5]
+            swf_images = []
+
+        # Codes that are still reported with is_success = 1
+        still_success_code = [-3, -4, -7]
+        if judge_error_code(text):
+            is_success = 1 if judge_error_code(text, still_success_code) else 0
+            log("md5: " + str(_md5)
+                + " finished result: " + str(text)
+                + " is_success: " + str(is_success) + " "
+                + str(_type) + " "
+                + " " + str(time.time() - start_time))
+            return {"result_html": [str(text[0])], "result_text": [str(text[0])],
+                    "is_success": is_success, "swf_images": str(swf_images)}
+
+        # Write the concatenated HTML to result.html one directory up
+        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
+                "".join(text))
+
+        # Plain text of every fragment, newlines removed
+        only_text = [re.sub("\n", "", BeautifulSoup(t, "lxml").get_text())
+                     for t in text]
+
+        # Cut results that are too long
+        text = cut_str(text, only_text)
+        only_text = cut_str(only_text, only_text)
+
+        if len(only_text) == 0:
+            only_text = [""]
+
+        if only_text[0] == '' and len(only_text) <= 1:
+            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
+            log("md5: " + str(_md5) + " "
+                + " finished result: ['', 0] is_success: 1 "
+                + str(_type) + " "
+                + str(time.time() - start_time))
+        else:
+            log("md5: " + str(_md5) +
+                " finished result: " + str(only_text)[:20] + " "
+                + str(len(str(text))) + " is_success: 1 "
+                + str(_type) + " "
+                + str(time.time() - start_time))
+
+        return {"result_html": text, "result_text": only_text,
+                "is_success": 1, "swf_images": str(swf_images)}
+
+    except ConnectionError:
+        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
+            + str(time.time() - start_time))
+        return {"result_html": ["-2"], "result_text": ["-2"],
+                "is_success": 0, "swf_images": str([])}
+    except Exception:
+        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
+            + str(_type) + " " +
+            str(time.time() - start_time))
+        traceback.print_exc()
+        return {"result_html": ["-1"], "result_text": ["-1"],
+                "is_success": 0, "swf_images": str([])}
+    finally:
+        log("finally")
+
+
 def test_more(_dir, process_no=None):
     file_path_list = []
     for root, dirs, files in os.walk(_dir, topdown=False):

+ 9 - 0
format_convert/convert_need_interface.py

@@ -27,6 +27,8 @@ from format_convert.libreoffice_interface import office_convert
 import numpy as np
 
 
+MAX_COMPUTE = True
+
 if get_platform() == "Windows":
     FROM_REMOTE = False
     only_test_ocr = False
@@ -47,6 +49,9 @@ if get_platform() == "Windows":
 else:
     FROM_REMOTE = True
 
+if MAX_COMPUTE:
+    FROM_REMOTE = False
+
 # ip_port_dict = get_ip_port()
 # ip = 'http://127.0.0.1'
 # ocr_port_list = ip_port_dict.get(ip).get("ocr")
@@ -426,6 +431,7 @@ def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE)
         # 调用接口
         try:
             if from_remote:
+                log("from remote")
                 retry_times_1 = 3
                 # 重试
                 while retry_times_1:
@@ -454,9 +460,12 @@ def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE)
                         return r
                     break
             else:
+                log("from local")
+                log("otr_model " + str(globals().get("global_otr_model")))
                 if globals().get("global_otr_model") is None:
                     print("=========== init otr model ===========")
                     globals().update({"global_otr_model": OtrModels().get_model()})
+                log("init finish")
                 r = otr(data=base64_stream, otr_model=globals().get("global_otr_model"), is_from_pdf=is_from_pdf)
                 # r = otr(data=base64_stream, otr_model=None, is_from_pdf=is_from_pdf)
         except TimeoutError:

+ 11 - 5
format_convert/convert_test.py

@@ -7,6 +7,7 @@ import time
 from multiprocessing import Process
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 from format_convert.utils import get_platform, request_post, get_md5_from_bytes
+from format_convert.convert import to_html
 
 
 def test_one(p, from_remote=False):
@@ -19,11 +20,16 @@ def test_one(p, from_remote=False):
 
     data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": 100}
     if from_remote:
-        # _url = 'http://121.46.18.113:15010/convert'
-        # _url = 'http://192.168.2.102:15010/convert'
+        _url = 'http://121.46.18.113:15010/convert'
+        # _url = 'http://192.168.2.103:15010/convert'
         # _url = 'http://172.16.160.65:15010/convert'
-        _url = 'http://127.0.0.1:15010/convert'
+        # _url = 'http://127.0.0.1:15010/convert'
         result = json.loads(request_post(_url, data, time_out=10000))
+        text_str = ""
+        for t in result.get("result_html"):
+            text_str += t
+        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
+                text_str)
     else:
         print("only support remote!")
 
@@ -48,10 +54,10 @@ def test_duplicate(path_list, process_no=None):
 
 if __name__ == '__main__':
     if get_platform() == "Windows":
-        file_path = "C:/Users/Administrator/Desktop/test_image/error1.png"
+        # file_path = "C:/Users/Administrator/Desktop/test_xls/merge_cell.xlsx"
         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
-        # file_path = "C:/Users/Administrator/Downloads/1653547877897.pdf"
+        file_path = "C:/Users/Administrator/Downloads/1653559902461.pdf"
     else:
         file_path = "test1.doc"
     test_one(file_path, from_remote=True)

+ 2 - 1
format_convert/convert_xlsx.py

@@ -189,12 +189,13 @@ class XlsxConvert:
         sheet_xlrd = self.workbook.sheet_by_index(sheet_no)
         merged_cell_list = sheet_xlrd.merged_cells
         merged_cell_list.sort(key=lambda x: (x[0], x[1], x[2], x[3]))
+        # print("merged_cell_list", merged_cell_list)
 
         # 复制填充合并单元格
         for row_start, row_end, col_start, col_end in merged_cell_list:
             if row_start >= len(row_list) or row_end > len(row_list):
                 continue
-            if col_start >= len(row_list) or col_end > len(row_list):
+            if col_start >= len(row_list[row_start]) or col_end > len(row_list[row_start]):
                 continue
             copy_cell = row_list[row_start][col_start]
             for i in range(row_start, row_end):

+ 64 - 3
format_convert/utils.py

@@ -10,6 +10,8 @@ import subprocess
 import sys
 from io import BytesIO
 from subprocess import Popen
+
+import cv2
 import requests
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 import difflib
@@ -1804,6 +1806,63 @@ def get_args_from_config(ip_port_dict, ip, arg_type, node_type=None):
     return arg_list
 
 
+def remove_red_seal(image_np, show=True):
+    """
+    Remove a red seal from an image; the array is modified in place.
+
+    A pixel counts as red when its HSV value has H in [0, 10] or
+    [156, 180], S in [43, 255] and V in [46, 255]. Red pixels whose V is
+    within 70 of the V of the most frequent red HSV value are painted
+    white, all other red pixels are painted black.
+
+    :param image_np: image array, converted with cv2.COLOR_BGR2HSV
+        (presumably BGR uint8 as returned by cv2.imread -- confirm)
+    :param show: display the image before and after and block on a key
+        press, as the function always did; pass False on machines
+        without a display
+    :return: image_np itself, left unchanged if it has no red pixel
+    """
+    import numpy as np
+
+    if show:
+        # Debug view of the input
+        cv2.namedWindow("image_np", 0)
+        cv2.resizeWindow("image_np", 1000, 800)
+        cv2.imshow("image_np", image_np)
+
+    image_hsv = cv2.cvtColor(image_np, cv2.COLOR_BGR2HSV)
+    h = image_hsv[..., 0]
+    s = image_hsv[..., 1]
+    v = image_hsv[..., 2]
+
+    # Whole-array test instead of a Python loop over every pixel
+    red_mask = (((0 <= h) & (h <= 10)) | ((156 <= h) & (h <= 180))) \
+        & (43 <= s) & (s <= 255) & (46 <= v) & (v <= 255)
+    red_hsv = image_hsv[red_mask]
+
+    # Nothing to repaint (and no dominant value to find) without red pixels
+    if len(red_hsv) > 0:
+        # Most frequent HSV triple; ties go to the triple that is met
+        # first when the image is scanned in row-major order
+        triples, first_seen, counts = np.unique(
+            red_hsv, axis=0, return_index=True, return_counts=True)
+        tied = np.flatnonzero(counts == counts.max())
+        most_v = int(triples[tied[np.argmin(first_seen[tied])]][2])
+
+        # White when V is close to the dominant red, black otherwise
+        near = np.abs(red_hsv[:, 2].astype(int) - most_v) <= 70
+        image_np[red_mask, :3] = np.where(near[:, None], 255, 0)
+
+    if show:
+        # Debug view of the result; waitKey(0) blocks until a key press
+        cv2.namedWindow("remove_red_seal", 0)
+        cv2.resizeWindow("remove_red_seal", 1000, 800)
+        cv2.imshow("remove_red_seal", image_np)
+        cv2.waitKey(0)
+
+    return image_np
+
+
 if __name__ == "__main__":
     # strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
     # print(slash_replace(strs))
@@ -1832,6 +1891,8 @@ if __name__ == "__main__":
 
     # print(parse_yaml())
 
-    print(get_ip_port())
-    print(get_args_from_config(get_ip_port(), "http://127.0.0.1", "gunicorn_path"))
-    # print(get_intranet_ip())
+    # print(get_ip_port())
+    # print(get_args_from_config(get_ip_port(), "http://127.0.0.1", "gunicorn_path"))
+    # print(get_intranet_ip())
+    _path = "C:/Users/Administrator/Downloads/3.png"
+    remove_red_seal(cv2.imread(_path))

+ 0 - 1
ocr/my_infer.py

@@ -3,7 +3,6 @@ from PIL import Image
 from paddleocr import PaddleOCR
 from tools.infer.utility import draw_ocr
 import numpy as np
-from format_convert.convert import remove_red_seal, remove_underline
 
 # path = "../temp/complex/710.png"
 # path = "../test_files/开标记录表3_page_0.png"

+ 25 - 17
otr/otr_interface.py

@@ -6,23 +6,7 @@ import traceback
 # os.environ['TF_XLA_FLAGS'] = '--tf_xla_cpu_global_jit'
 # os.environ['CUDA_VISIBLE_DEVICES'] = "0"
 import tensorflow as tf
-try:
-    gpus = tf.config.list_physical_devices('GPU')
-    if len(gpus) > 0:
-        tf.config.experimental.set_virtual_device_configuration(
-            gpus[0],
-            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)])
-except:
-    traceback.print_exc()
-    # pass
-    # gpus = tf.config.list_physical_devices('GPU')
-    # for gpu in gpus:  # 如果使用多块GPU时
-    #     tf.config.experimental.set_memory_growth(gpu, True)
-
-    os.environ['CUDA_CACHE_MAXSIZE'] = str(2147483648)
-    os.environ['CUDA_CACHE_DISABLE'] = str(0)
-    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)
-    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
+
 import sys
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 import time
@@ -39,6 +23,29 @@ from otr.table_line import get_points, get_split_line, get_points_row, \
 from format_convert import _global
 
 
+MAX_COMPUTE = True
+
+if not MAX_COMPUTE:
+    # tensorflow 内存设置
+    try:
+        gpus = tf.config.list_physical_devices('GPU')
+        if len(gpus) > 0:
+            tf.config.experimental.set_virtual_device_configuration(
+                gpus[0],
+                [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)])
+    except:
+        traceback.print_exc()
+        # pass
+        # gpus = tf.config.list_physical_devices('GPU')
+        # for gpu in gpus:  # 如果使用多块GPU时
+        #     tf.config.experimental.set_memory_growth(gpu, True)
+
+        os.environ['CUDA_CACHE_MAXSIZE'] = str(2147483648)
+        os.environ['CUDA_CACHE_DISABLE'] = str(0)
+        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.6)
+        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
+
+
 # 接口配置
 app = Flask(__name__)
 
@@ -81,6 +88,7 @@ def _otr():
 
 
 def otr(data, otr_model, is_from_pdf):
+    log("into otr_interface otr")
     try:
         img_data = base64.b64decode(data)
         # points_and_lines = pool.apply(table_detect, (img_data,))

File diff suppressed because it is too large
+ 42 - 6
result.html


Some files were not shown because too many files changed in this diff