Parcourir la source

修复docx格式混乱

fangjiasheng il y a 2 ans
Parent
commit
f083975c4d

+ 5 - 3
format_convert/convert.py

@@ -795,10 +795,12 @@ def test_one(p, from_remote=False):
     if from_remote:
         ocr_model = None
         otr_model = None
-        _url = 'http://127.0.0.1:15010/convert'
+        _url = 'http://121.46.18.113:15010/convert'
         # _url = 'http://192.168.2.102:15010/convert'
         # _url = 'http://172.16.160.65:15010/convert'
         result = json.loads(request_post(_url, data, time_out=10000))
+        with open("../result.html", "w") as f:
+            f.write(result.get("result_text")[0])
 
         if p.split(".")[-1] == "swf":
             swf_images = eval(result.get("swf_images"))
@@ -869,10 +871,10 @@ if __name__ == '__main__':
         app.run(port=15011)
 
     # if get_platform() == "Windows":
-    #     # file_path = "C:/Users/Administrator/Desktop/error7.jpg"
+    #     file_path = "C:/Users/Administrator/Desktop/test_image/error29.png"
     #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
     #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
-    #     file_path = "C:/Users/Administrator/Downloads/1650967920520.pdf"
+    #     # file_path = "C:/Users/Administrator/Downloads/1650967920520.pdf"
     # else:
     #     file_path = "test1.doc"
     # test_one(file_path, from_remote=True)

+ 7 - 1
format_convert/convert_docx.py

@@ -325,6 +325,10 @@ class DocxConvert:
             return
         order_list, text_list = order_and_text_list
 
+        # test
+        # for i in range(len(text_list)):
+        #     print(order_list[i], text_list[i])
+
         table_list = self.get_tables()
         if judge_error_code(table_list):
             self._doc.error_code = table_list
@@ -342,7 +346,9 @@ class DocxConvert:
             if tag == "w:t":
                 if len(text_list) > 0:
                     _para = text_list.pop(0)
-                    self._page.add_child(_Sentence(_para, bbox))
+                    _sen = _Sentence(_para, bbox)
+                    _sen.combine=False
+                    self._page.add_child(_sen)
             if tag == "wp:docPr":
                 if len(image_list) > 0:
                     temp_image_path = self.unique_type_dir + "docpr" + str(doc_pr_cnt) + ".png"

+ 11 - 5
format_convert/convert_image.py

@@ -145,18 +145,24 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False, u
         log("isr total time "+str(time.time()-_isr_time))
         return _image_np
 
-    def ocr_process(_image_np, _threshold=1024):
+    def ocr_process(_image_np, _threshold=2048):
         log("ocr_process image shape " + str(_image_np.shape))
 
         # ocr图片过大内存溢出,需resize
         # 大图按比例缩小,小图维持不变;若统一拉伸成固定大小如1024会爆显存
         ratio = (1, 1)
-        if _image_np.shape[0] >= _threshold or _image_np.shape[1] >= _threshold:
-            best_h, best_w = get_best_predict_size2(_image_np, 1024)
+        if _image_np.shape[0] > _threshold or _image_np.shape[1] > _threshold:
+            best_h, best_w = get_best_predict_size2(_image_np, _threshold)
             _image_np = pil_resize(_image_np, best_h, best_w)
             log("ocr_process image resize " + str(_image_np.shape))
             ratio = (image_np.shape[0]/best_h, image_np.shape[1]/best_w)
 
+        # 大图片ocr加锁,防止爆显存
+        # if _image_np.shape[0] >= 1024 and _image_np.shape[1] >= 1024:
+        #     file_lock = True
+        # else:
+        #     file_lock = False
+
         # 调用ocr模型接口
         image_bytes = np2bytes(_image_np)
         text_list, bbox_list = from_ocr_interface(image_bytes, is_table=True)
@@ -258,9 +264,9 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False, u
         all_obj_list = []
         _add_y = 0
         for image_np in image_np_list:
-            print("sub image shape", image_np.shape)
+            # print("sub image shape", image_np.shape)
             # 整体分辨率限制
-            threshold = 2000
+            threshold = 2048
             if image_np.shape[0] > threshold or image_np.shape[1] > threshold:
                 h, w = get_best_predict_size2(image_np, threshold=threshold)
                 log("global image resize " + str(image_np.shape[:2]) + " -> " + str(h) + "," + str(w))

+ 10 - 4
format_convert/convert_test.py

@@ -4,6 +4,7 @@ import os
 import random
 import sys
 import time
+from glob import glob
 from multiprocessing import Process
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 from format_convert.utils import get_platform, request_post, get_md5_from_bytes
@@ -21,9 +22,9 @@ def test_one(p, from_remote=False):
     data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": 100}
     if from_remote:
         # _url = 'http://121.46.18.113:15010/convert'
-        _url = 'http://192.168.2.103:15010/convert'
+        # _url = 'http://192.168.2.103:15010/convert'
         # _url = 'http://172.16.160.65:15010/convert'
-        # _url = 'http://127.0.0.1:15010/convert'
+        _url = 'http://127.0.0.1:15010/convert'
         result = json.loads(request_post(_url, data, time_out=10000))
         text_str = ""
         for t in result.get("result_html"):
@@ -57,12 +58,17 @@ if __name__ == '__main__':
         # file_path = "C:/Users/Administrator/Desktop/test_xls/merge_cell.xlsx"
         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
-        # file_path = "C:/Users/Administrator/Downloads/1647913696016.jpg"
-        file_path = "C:/Users/Administrator/Desktop/test_image/error7.jpg"
+        # file_path = "C:/Users/Administrator/Downloads/神仙居旅游汽车租赁竞争性磋商文件(1).doc"
+        # file_path = "C:/Users/Administrator/Desktop/test_xls/error2.xlsx"
+        file_path = "C:/Users/Administrator/Desktop/test_doc/error5.docx"
     else:
         file_path = "1660296734009.pdf"
     test_one(file_path, from_remote=True)
 
+    # paths = glob("C:/Users/Administrator/Desktop/test_image/*")
+    # for file_path in paths:
+    #     test_one(file_path, from_remote=True)
+
     # if get_platform() == "Windows":
     #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
     #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls",

+ 2 - 0
format_convert/convert_tree.py

@@ -162,6 +162,8 @@ class _Sentence:
         self.x = bbox[0]
         self.y = bbox[1]
         self.error_code = None
+        # 合并接近句子
+        self.combine = True
 
     def get_html(self):
         if self.error_code is not None:

+ 4 - 0
format_convert/convert_xlsx.py

@@ -105,6 +105,10 @@ class XlsxConvert:
 
         sheet_no = 0
         for sheet in self.sheet_list:
+            # 删除xlsx全为空的行列
+            sheet.dropna(how='all', axis=1, inplace=True)
+            sheet.dropna(how='all', axis=0, inplace=True)
+
             self._page = _Page(None, sheet_no)
             self.convert_page(sheet, sheet_no)
 

+ 21 - 0
format_convert/utils.py

@@ -1535,6 +1535,10 @@ def combine_object(obj_list, threshold=5):
     for i in range(1, len(sentence_list)):
         sen1 = sentence_list[i-1]
         sen2 = sentence_list[i]
+
+        if sen1.combine is False or sen2.combine is False:
+            continue
+
         if abs(sen2.y - sen1.y) <= threshold:
             if sen2.x > sen1.x:
                 sen2.x = sen1.x
@@ -2179,6 +2183,23 @@ def ocr_cant_read(text_list, box_list):
     return result
 
 
+def file_lock(file_name):
+    """
+    Acquire an exclusive lock on a file and return the open file handle; the caller must close() the handle to release the lock.
+    :param file_name: path of the lock file; if it does not exist it is created first with the content '0'
+    :return: the file object (opened in 'r' mode) on which the exclusive lock was requested
+    """
+    import fcntl  # imported inside the function: fcntl is a Unix-only module
+    if not os.path.exists(file_name):
+        with open(file_name, 'w') as f:
+            f.write('0')
+
+    file = open(file_name, 'r')
+    # Acquire the exclusive lock (LOCK_EX) on the file descriptor
+    fcntl.flock(file.fileno(), fcntl.LOCK_EX)
+    return file
+
+
 if __name__ == "__main__":
     # strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
     # print(slash_replace(strs))

+ 1 - 1
idc/idc_interface.py

@@ -167,7 +167,7 @@ class IdcModels:
 
 def test_idc_model(from_remote=False):
     idc_model = IdcModels().get_model()
-    paths = glob("C:/Users/Administrator/Desktop/test_image/error24.jpg")
+    paths = glob("C:/Users/Administrator/Desktop/test_image/111.jpg")
     # file_path = "C:/Users/Administrator/Desktop/test_image/error10.jpg"
     for file_path in paths:
         img_np = cv2.imread(file_path)

+ 6 - 6
isr/isr_interface.py

@@ -254,7 +254,7 @@ class IsrModels:
 
 def test_isr_model(from_remote=False):
     if get_platform() == "Windows":
-        file_path = "C:/Users/Administrator/Desktop/test_image/error10.jpg"
+        file_path = "C:/Users/Administrator/Desktop/test_image/114.jpg"
         # file_path = "C:\\Users\\Administrator\\Downloads\\1647913696016.jpg"
     else:
         file_path = "error10.jpg"
@@ -294,14 +294,14 @@ def test_isr_model(from_remote=False):
         else:
             img = result.get("image")
             print(img.shape)
-            # cv2.namedWindow('img', cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
-            # cv2.imshow("img", img)
-            # cv2.waitKey(0)
+            cv2.namedWindow('img', cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
+            cv2.imshow("img", img)
+            cv2.waitKey(0)
         # print(result)
 
 
 if __name__ == "__main__":
-    for i in range(100):
+    for i in range(1):
         s_t = time.time()
-        test_isr_model(from_remote=True)
+        test_isr_model(from_remote=False)
         print("finish test_isr_model", time.time()-s_t)

+ 2 - 6
isr/pre_process.py

@@ -2,8 +2,9 @@ import colorsys
 import time
 import numpy as np
 import cv2
-
 from skimage import measure
+
+
 def count_red_pixel(image_np, cnt=1000):
     # 红色像素计数
     start_time = time.time()
@@ -26,11 +27,6 @@ def count_red_pixel(image_np, cnt=1000):
             return True
     return False
 
-    if red_cnt >= cnt:
-        return True
-    else:
-        return False
-
 
 def get_classes(classes_path):
     """loads the classes"""

+ 15 - 8
ocr/tools/infer/predict_det.py

@@ -16,17 +16,11 @@ import io
 import logging
 import os
 import sys
-# __dir__ = os.path.dirname(os.path.abspath(__file__))
-import zlib
-
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../../")
 import requests
 from format_convert import _global
+from format_convert.utils import judge_error_code, log, namespace_to_dict, get_platform, file_lock
 
-from format_convert.utils import judge_error_code, log, namespace_to_dict
-
-# sys.path.append(__dir__)
-# sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
 os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
 import cv2
 import numpy as np
@@ -38,7 +32,10 @@ from ocr.ppocr.utils.logging import get_logger
 from ocr.ppocr.utils.utility import get_image_file_list, check_and_read_gif
 from ocr.ppocr.data import create_operators, transform
 from ocr.ppocr.postprocess import build_post_process
+from format_convert.max_compute_config import max_compute
 
+
+MAX_COMPUTE = max_compute
 logger = get_logger()
 
 
@@ -172,7 +169,17 @@ class TextDetector(object):
 
         self.input_tensor.copy_from_cpu(img)
         try:
-            self.predictor.run()
+            # 加锁,防止太多大图片同时预测,爆显存
+            if ori_im.shape[0] > 1024 and ori_im.shape[1] > 1024 and get_platform() != "Windows" and not max_compute:
+                time2 = time.time()
+                lock_file_sub = 'ocr'
+                lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
+                f = file_lock(lock_file)
+                log("get file_lock " + lock_file_sub + " time " + str(time.time()-time2))
+                self.predictor.run()
+                f.close()
+            else:
+                self.predictor.run()
         except RuntimeError:
             log("ocr/tools/infer/predict_det.py predict.run error! maybe no gpu memory!")
             log("predictor shrink memory!")