il y a 3 ans · f083975c4d
--- a/format_convert/convert.py
+++ b/format_convert/convert.py
@@ -795,10 +795,12 @@ def test_one(p, from_remote=False):
 
				     if from_remote:
			
 
				         ocr_model = None
			
 
				         otr_model = None
			
 
				-        _url = 'http://127.0.0.1:15010/convert'
			
 
				+        _url = 'http://121.46.18.113:15010/convert'
			
 
				         # _url = 'http://192.168.2.102:15010/convert'
			
 
				         # _url = 'http://172.16.160.65:15010/convert'
			
 
				         result = json.loads(request_post(_url, data, time_out=10000))
			
 
				+        with open("../result.html", "w") as f:
			
 
				+            f.write(result.get("result_text")[0])
			
 
				 
			
 
				         if p.split(".")[-1] == "swf":
			
 
				             swf_images = eval(result.get("swf_images"))
			
@@ -869,10 +871,10 @@ if __name__ == '__main__':
 
				         app.run(port=15011)
			
 
				 
			
 
				     # if get_platform() == "Windows":
			
 
				-    #     # file_path = "C:/Users/Administrator/Desktop/error7.jpg"
			
 
				+    #     file_path = "C:/Users/Administrator/Desktop/test_image/error29.png"
			
 
				     #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
			
 
				     #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
			
 
				-    #     file_path = "C:/Users/Administrator/Downloads/1650967920520.pdf"
			
 
				+    #     # file_path = "C:/Users/Administrator/Downloads/1650967920520.pdf"
			
 
				     # else:
			
 
				     #     file_path = "test1.doc"
			
 
				     # test_one(file_path, from_remote=True)
			
--- a/format_convert/convert_docx.py
+++ b/format_convert/convert_docx.py
@@ -325,6 +325,10 @@ class DocxConvert:
 
				             return
			
 
				         order_list, text_list = order_and_text_list
			
 
				 
			
 
				+        # test
			
 
				+        # for i in range(len(text_list)):
			
 
				+        #     print(order_list[i], text_list[i])
			
 
				+
			
 
				         table_list = self.get_tables()
			
 
				         if judge_error_code(table_list):
			
 
				             self._doc.error_code = table_list
			
@@ -342,7 +346,9 @@ class DocxConvert:
 
				             if tag == "w:t":
			
 
				                 if len(text_list) > 0:
			
 
				                     _para = text_list.pop(0)
			
 
				-                    self._page.add_child(_Sentence(_para, bbox))
			
 
				+                    _sen = _Sentence(_para, bbox)
			
 
				+                    _sen.combine=False
			
 
				+                    self._page.add_child(_sen)
			
 
				             if tag == "wp:docPr":
			
 
				                 if len(image_list) > 0:
			
 
				                     temp_image_path = self.unique_type_dir + "docpr" + str(doc_pr_cnt) + ".png"
			
--- a/format_convert/convert_image.py
+++ b/format_convert/convert_image.py
@@ -145,18 +145,24 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False, u
 
				         log("isr total time "+str(time.time()-_isr_time))
			
 
				         return _image_np
			
 
				 
			
 
				-    def ocr_process(_image_np, _threshold=1024):
			
 
				+    def ocr_process(_image_np, _threshold=2048):
			
 
				         log("ocr_process image shape " + str(_image_np.shape))
			
 
				 
			
 
				         # ocr图片过大内存溢出，需resize
			
 
				         # 大图按比例缩小，小图维持不变；若统一拉伸成固定大小如1024会爆显存
			
 
				         ratio = (1, 1)
			
 
				-        if _image_np.shape[0] >= _threshold or _image_np.shape[1] >= _threshold:
			
 
				-            best_h, best_w = get_best_predict_size2(_image_np, 1024)
			
 
				+        if _image_np.shape[0] > _threshold or _image_np.shape[1] > _threshold:
			
 
				+            best_h, best_w = get_best_predict_size2(_image_np, _threshold)
			
 
				             _image_np = pil_resize(_image_np, best_h, best_w)
			
 
				             log("ocr_process image resize " + str(_image_np.shape))
			
 
				             ratio = (image_np.shape[0]/best_h, image_np.shape[1]/best_w)
			
 
				 
			
 
				+        # 大图片ocr加锁，防止爆显存
			
 
				+        # if _image_np.shape[0] >= 1024 and _image_np.shape[1] >= 1024:
			
 
				+        #     file_lock = True
			
 
				+        # else:
			
 
				+        #     file_lock = False
			
 
				+
			
 
				         # 调用ocr模型接口
			
 
				         image_bytes = np2bytes(_image_np)
			
 
				         text_list, bbox_list = from_ocr_interface(image_bytes, is_table=True)
			
@@ -258,9 +264,9 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False, u
 
				         all_obj_list = []
			
 
				         _add_y = 0
			
 
				         for image_np in image_np_list:
			
 
				-            print("sub image shape", image_np.shape)
			
 
				+            # print("sub image shape", image_np.shape)
			
 
				             # 整体分辨率限制
			
 
				-            threshold = 2000
			
 
				+            threshold = 2048
			
 
				             if image_np.shape[0] > threshold or image_np.shape[1] > threshold:
			
 
				                 h, w = get_best_predict_size2(image_np, threshold=threshold)
			
 
				                 log("global image resize " + str(image_np.shape[:2]) + " -> " + str(h) + "," + str(w))
			
--- a/format_convert/convert_test.py
+++ b/format_convert/convert_test.py
@@ -4,6 +4,7 @@ import os
 
				 import random
			
 
				 import sys
			
 
				 import time
			
 
				+from glob import glob
			
 
				 from multiprocessing import Process
			
 
				 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
			
 
				 from format_convert.utils import get_platform, request_post, get_md5_from_bytes
			
@@ -21,9 +22,9 @@ def test_one(p, from_remote=False):
 
				     data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": 100}
			
 
				     if from_remote:
			
 
				         # _url = 'http://121.46.18.113:15010/convert'
			
 
				-        _url = 'http://192.168.2.103:15010/convert'
			
 
				+        # _url = 'http://192.168.2.103:15010/convert'
			
 
				         # _url = 'http://172.16.160.65:15010/convert'
			
 
				-        # _url = 'http://127.0.0.1:15010/convert'
			
 
				+        _url = 'http://127.0.0.1:15010/convert'
			
 
				         result = json.loads(request_post(_url, data, time_out=10000))
			
 
				         text_str = ""
			
 
				         for t in result.get("result_html"):
			
@@ -57,12 +58,17 @@ if __name__ == '__main__':
 
				         # file_path = "C:/Users/Administrator/Desktop/test_xls/merge_cell.xlsx"
			
 
				         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
			
 
				         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
			
 
				-        # file_path = "C:/Users/Administrator/Downloads/1647913696016.jpg"
			
 
				-        file_path = "C:/Users/Administrator/Desktop/test_image/error7.jpg"
			
 
				+        # file_path = "C:/Users/Administrator/Downloads/神仙居旅游汽车租赁竞争性磋商文件(1).doc"
			
 
				+        # file_path = "C:/Users/Administrator/Desktop/test_xls/error2.xlsx"
			
 
				+        file_path = "C:/Users/Administrator/Desktop/test_doc/error5.docx"
			
 
				     else:
			
 
				         file_path = "1660296734009.pdf"
			
 
				     test_one(file_path, from_remote=True)
			
 
				 
			
 
				+    # paths = glob("C:/Users/Administrator/Desktop/test_image/*")
			
 
				+    # for file_path in paths:
			
 
				+    #     test_one(file_path, from_remote=True)
			
 
				+
			
 
				     # if get_platform() == "Windows":
			
 
				     #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
			
 
				     #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls",
			
--- a/format_convert/convert_tree.py
+++ b/format_convert/convert_tree.py
@@ -162,6 +162,8 @@ class _Sentence:
 
				         self.x = bbox[0]
			
 
				         self.y = bbox[1]
			
 
				         self.error_code = None
			
 
				+        # 合并接近句子
			
 
				+        self.combine = True
			
 
				 
			
 
				     def get_html(self):
			
 
				         if self.error_code is not None:
			
--- a/format_convert/convert_xlsx.py
+++ b/format_convert/convert_xlsx.py
@@ -105,6 +105,10 @@ class XlsxConvert:
 
				 
			
 
				         sheet_no = 0
			
 
				         for sheet in self.sheet_list:
			
 
				+            # 删除xlsx全为空的行列
			
 
				+            sheet.dropna(how='all', axis=1, inplace=True)
			
 
				+            sheet.dropna(how='all', axis=0, inplace=True)
			
 
				+
			
 
				             self._page = _Page(None, sheet_no)
			
 
				             self.convert_page(sheet, sheet_no)
			
 
				 
			
--- a/format_convert/utils.py
+++ b/format_convert/utils.py
@@ -1535,6 +1535,10 @@ def combine_object(obj_list, threshold=5):
 
				     for i in range(1, len(sentence_list)):
			
 
				         sen1 = sentence_list[i-1]
			
 
				         sen2 = sentence_list[i]
			
 
				+
			
 
				+        if sen1.combine is False or sen2.combine is False:
			
 
				+            continue
			
 
				+
			
 
				         if abs(sen2.y - sen1.y) <= threshold:
			
 
				             if sen2.x > sen1.x:
			
 
				                 sen2.x = sen1.x
			
@@ -2179,6 +2183,23 @@ def ocr_cant_read(text_list, box_list):
 
				     return result
			
 
				 
			
 
				 
			
 
				+def file_lock(file_name):
			
 
				+    """
			
 
				+    Acquire an exclusive lock on a file and return the open file handle; the caller must close the file to release the lock.
			
 
				+    :param file_name: path of the lock file; it is created (containing '0') if it does not exist yet
			
 
				+    :return: the open file object on which the exclusive lock is held
			
 
				+    """
			
 
				+    import fcntl  # NOTE(review): presumably imported locally because fcntl is POSIX-only - confirm
			
 
				+    if not os.path.exists(file_name):
			
 
				+        with open(file_name, 'w') as f:
			
 
				+            f.write('0')
			
 
				+
			
 
				+    file = open(file_name, 'r')
			
 
				+    # Acquire the exclusive lock
			
 
				+    fcntl.flock(file.fileno(), fcntl.LOCK_EX)
			
 
				+    return file
			
 
				+
			
 
				+
			
 
				 if __name__ == "__main__":
			
 
				     # strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
			
 
				     # print(slash_replace(strs))
			
--- a/idc/idc_interface.py
+++ b/idc/idc_interface.py
@@ -167,7 +167,7 @@ class IdcModels:
 
				 
			
 
				 def test_idc_model(from_remote=False):
			
 
				     idc_model = IdcModels().get_model()
			
 
				-    paths = glob("C:/Users/Administrator/Desktop/test_image/error24.jpg")
			
 
				+    paths = glob("C:/Users/Administrator/Desktop/test_image/111.jpg")
			
 
				     # file_path = "C:/Users/Administrator/Desktop/test_image/error10.jpg"
			
 
				     for file_path in paths:
			
 
				         img_np = cv2.imread(file_path)
			
--- a/isr/isr_interface.py
+++ b/isr/isr_interface.py
@@ -254,7 +254,7 @@ class IsrModels:
 
				 
			
 
				 def test_isr_model(from_remote=False):
			
 
				     if get_platform() == "Windows":
			
 
				-        file_path = "C:/Users/Administrator/Desktop/test_image/error10.jpg"
			
 
				+        file_path = "C:/Users/Administrator/Desktop/test_image/114.jpg"
			
 
				         # file_path = "C:\\Users\\Administrator\\Downloads\\1647913696016.jpg"
			
 
				     else:
			
 
				         file_path = "error10.jpg"
			
@@ -294,14 +294,14 @@ def test_isr_model(from_remote=False):
 
				         else:
			
 
				             img = result.get("image")
			
 
				             print(img.shape)
			
 
				-            # cv2.namedWindow('img', cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
			
 
				-            # cv2.imshow("img", img)
			
 
				-            # cv2.waitKey(0)
			
 
				+            cv2.namedWindow('img', cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
			
 
				+            cv2.imshow("img", img)
			
 
				+            cv2.waitKey(0)
			
 
				         # print(result)
			
 
				 
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				-    for i in range(100):
			
 
				+    for i in range(1):
			
 
				         s_t = time.time()
			
 
				-        test_isr_model(from_remote=True)
			
 
				+        test_isr_model(from_remote=False)
			
 
				         print("finish test_isr_model", time.time()-s_t)
			
--- a/isr/pre_process.py
+++ b/isr/pre_process.py
@@ -2,8 +2,9 @@ import colorsys
 
				 import time
			
 
				 import numpy as np
			
 
				 import cv2
			
 
				-
			
 
				 from skimage import measure
			
 
				+
			
 
				+
			
 
				 def count_red_pixel(image_np, cnt=1000):
			
 
				     # 红色像素计数
			
 
				     start_time = time.time()
			
@@ -26,11 +27,6 @@ def count_red_pixel(image_np, cnt=1000):
 
				             return True
			
 
				     return False
			
 
				 
			
 
				-    if red_cnt >= cnt:
			
 
				-        return True
			
 
				-    else:
			
 
				-        return False
			
 
				-
			
 
				 
			
 
				 def get_classes(classes_path):
			
 
				     """loads the classes"""
			
--- a/ocr/tools/infer/predict_det.py
+++ b/ocr/tools/infer/predict_det.py
@@ -16,17 +16,11 @@ import io
 
				 import logging
			
 
				 import os
			
 
				 import sys
			
 
				-# __dir__ = os.path.dirname(os.path.abspath(__file__))
			
 
				-import zlib
			
 
				-
			
 
				 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../../")
			
 
				 import requests
			
 
				 from format_convert import _global
			
 
				+from format_convert.utils import judge_error_code, log, namespace_to_dict, get_platform, file_lock
			
 
				 
			
 
				-from format_convert.utils import judge_error_code, log, namespace_to_dict
			
 
				-
			
 
				-# sys.path.append(__dir__)
			
 
				-# sys.path.append(os.path.abspath(os.path.join(__dir__, '../..')))
			
 
				 os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
			
 
				 import cv2
			
 
				 import numpy as np
			
@@ -38,7 +32,10 @@ from ocr.ppocr.utils.logging import get_logger
 
				 from ocr.ppocr.utils.utility import get_image_file_list, check_and_read_gif
			
 
				 from ocr.ppocr.data import create_operators, transform
			
 
				 from ocr.ppocr.postprocess import build_post_process
			
 
				+from format_convert.max_compute_config import max_compute
			
 
				 
			
 
				+
			
 
				+MAX_COMPUTE = max_compute
			
 
				 logger = get_logger()
			
 
				 
			
 
				 
			
@@ -172,7 +169,17 @@ class TextDetector(object):
 
				 
			
 
				         self.input_tensor.copy_from_cpu(img)
			
 
				         try:
			
 
				-            self.predictor.run()
			
 
				+            # 加锁，防止太多大图片同时预测，爆显存
			
 
				+            if ori_im.shape[0] > 1024 and ori_im.shape[1] > 1024 and get_platform() != "Windows" and not max_compute:
			
 
				+                time2 = time.time()
			
 
				+                lock_file_sub = 'ocr'
			
 
				+                lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
			
 
				+                f = file_lock(lock_file)
			
 
				+                log("get file_lock " + lock_file_sub + " time " + str(time.time()-time2))
			
 
				+                self.predictor.run()
			
 
				+                f.close()
			
 
				+            else:
			
 
				+                self.predictor.run()
			
 
				         except RuntimeError:
			
 
				             log("ocr/tools/infer/predict_det.py predict.run error! maybe no gpu memory!")
			
 
				             log("predictor shrink memory!")