- # encoding=utf8
- # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import io
- import logging
- import os
- import sys
- import zlib
- import requests
- sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../../")
- os.environ["FLAGS_allocator_strategy"] = 'auto_growth'
- import cv2
- import numpy as np
- import math
- import time
- import traceback
- os.environ['FLAGS_eager_delete_tensor_gb'] = '0'  # let Paddle free tensors eagerly
- import paddle
- import ocr.tools.infer.utility as utility
- from ocr.ppocr.postprocess import build_post_process
- from ocr.ppocr.utils.logging import get_logger
- from ocr.ppocr.utils.utility import get_image_file_list, check_and_read_gif
- from config.max_compute_config import MAX_COMPUTE
- from format_convert.utils import judge_error_code, log, namespace_to_dict, get_platform, file_lock, \
- get_gpu_memory_usage, get_current_process_gpu_id
- from format_convert import _global
- import torch
- from torch import nn
- from ocr.tools.infer.torch_rec_model import Rec_ResNet_34
- import gc
- logger = get_logger()
- class TextRecognizer(object):
- shrink_memory_count = 0
- def __init__(self, args):
- self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
- self.character_type = args.rec_char_type
- # batch size is fixed here, overriding args.rec_batch_num
- self.rec_batch_num = 16
- log('self.rec_batch_num ' + str(self.rec_batch_num))
- self.rec_algorithm = args.rec_algorithm
- postprocess_params = {
- 'name': 'CTCLabelDecode',
- "character_type": args.rec_char_type,
- "character_dict_path": args.rec_char_dict_path,
- # "use_space_char": args.use_space_char
- "use_space_char": False
- }
- self.postprocess_op = build_post_process(postprocess_params)
- rec_model_path = args.rec_model_dir
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- model = Rec_ResNet_34()
- model_state_dict = torch.load(rec_model_path, map_location=self.device)['state_dict']
- if str(self.device) == 'cpu':
- # on CPU, push near-zero weights out of the denormal range to speed up inference
- for name, value in model_state_dict.items():
- if get_platform() != "Windows":
- value = value.double()
- value = torch.where((value < 1.0e-23) & (value > 0.0), 1.0e-23, value)
- value = torch.where((value > -1.0e-23) & (value < 0.0), -1.0e-23, value)
- model_state_dict[name] = value
- model.load_state_dict(model_state_dict)
- self.predictor = model
- self.predictor.to(self.device)
- self.predictor.eval()
- if str(self.device) != 'cpu':
- self.gpu_id = get_current_process_gpu_id()
- else:
- self.gpu_id = None
- def resize_norm_img(self, img, max_wh_ratio):
- h, w = img.shape[:2]
- imgC, imgH, imgW = self.rec_image_shape
- assert imgC == img.shape[2]
- # if max_wh_ratio < 0.1 (height more than 10x the width), skip resizing/padding
- if max_wh_ratio < 0.1:
- resized_image = img.astype('float32')
- resized_image = resized_image.transpose((2, 0, 1)) / 255
- return resized_image
- else:
- if self.character_type == "ch":
- imgW = int((32 * max_wh_ratio))
- ratio = w / float(h)
- if math.ceil(imgH * ratio) > imgW:
- resized_w = imgW
- else:
- resized_w = int(math.ceil(imgH * ratio))
- try:
- resized_image = cv2.resize(img, (resized_w, imgH))
- except Exception:
- log("predict_rec.py resize_norm_img resize shape " + str((resized_w, imgH, imgW, h, w, ratio, max_wh_ratio)) + ' ' + str(self.rec_image_shape))
- raise
- resized_image = resized_image.astype('float32')
- resized_image = resized_image.transpose((2, 0, 1)) / 255
- resized_image -= 0.5
- resized_image /= 0.5
- # right-pad the normalized image with zeros up to the fixed width imgW
- padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
- padding_im[:, :, 0:resized_w] = resized_image
- return padding_im
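- # Worked example (a sketch, assuming the usual rec_image_shape "3, 32, 320"):
- # a 32x240 crop in a batch whose max_wh_ratio is 10 gives imgW = 32 * 10 = 320,
- # ratio = 240 / 32 = 7.5, resized_w = ceil(32 * 7.5) = 240; the crop is resized
- # to 240x32, scaled to [-1, 1], and right-padded with zeros up to width 320.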
- def predict(self, norm_img_batch):
- tensor = torch.from_numpy(norm_img_batch).float()
- if get_platform() != "Windows" and not MAX_COMPUTE:
- # acquire a file lock so concurrent processes do not run the model at the same time
- time2 = time.time()
- lock_file_sub = 'ocr'
- lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
- f = file_lock(lock_file)
- log("rec get file_lock " + lock_file + " time " + str(time.time()-time2))
- try:
- time2 = time.time()
- if str(self.device) != 'cpu':
- torch.cuda.empty_cache()
- tensor = tensor.to(self.device)
- with torch.no_grad():
- out = self.predictor(tensor)
- log("get file_lock run rec" + " time " + str(time.time()-time2))
- except RuntimeError:
- log("ocr/tools/infer/predict_rec.py predict.run error! maybe no gpu memory!")
- log("rec predictor shrink memory! ori_im.shape " + str(norm_img_batch.shape))
- get_gpu_memory_usage()
- raise  # re-raise with the original traceback instead of a bare RuntimeError
- finally:
- f.close()
- if str(self.device) != 'cpu':
- torch.cuda.empty_cache()
- gc.collect()
- else:
- tensor = tensor.to(self.device)
- with torch.no_grad():
- out = self.predictor(tensor)
- preds = out.cpu().numpy()
- return preds
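- # Shape contract (a sketch; the exact sequence length depends on Rec_ResNet_34's
- # head): norm_img_batch is a float32 NCHW array, e.g. (8, 3, 32, 320), and preds
- # is the per-timestep class distribution that CTCLabelDecode consumes, roughly
- # (8, T, num_classes) with T derived from the input width.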
- def predict_batch(self, batch_list):
- batch_out_list = []
- if get_platform() != "Windows" and not MAX_COMPUTE and self.gpu_id is not None:
- # acquire a per-GPU file lock to serialize access across processes
- time2 = time.time()
- lock_file_sub = f'ocr_{self.gpu_id}'
- lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"
- f = file_lock(lock_file)
- log("rec get file_lock " + lock_file + " time " + str(time.time()-time2))
- try:
- time2 = time.time()
- if str(self.device) != 'cpu':
- torch.cuda.empty_cache()
- for sub_batch_list in batch_list:
- sub_batch_out = []
- for tensor in sub_batch_list:
- with torch.no_grad():
- out = self.predictor(tensor)
- out = out.cpu().numpy()
- sub_batch_out.append(out)
- batch_out_list.append(sub_batch_out)
- log("get file_lock run rec" + " time " + str(time.time()-time2))
- except RuntimeError:
- log("ocr/tools/infer/predict_rec.py predict.run error! maybe no gpu memory!")
- get_gpu_memory_usage()
- raise  # `tensor` may be unbound here, so re-raise without logging its shape
- finally:
- f.close()
- if str(self.device) != 'cpu':
- torch.cuda.empty_cache()
- else:
- for sub_batch_list in batch_list:
- sub_batch_out = []
- for tensor in sub_batch_list:
- with torch.no_grad():
- out = self.predictor(tensor)
- out = out.cpu().numpy()
- sub_batch_out.append(out)
- batch_out_list.append(sub_batch_out)
- # concatenate each outer batch's sub-batch outputs into a single numpy array
- for bi, sub_batch_out in enumerate(batch_out_list):
- batch_out_list[bi] = np.concatenate(sub_batch_out, axis=0)
- return batch_out_list
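- # predict_batch consumes batch_list as built in __call__ below: a list of outer
- # batches, each a list of device tensors (mini-batches). It returns one numpy
- # array per outer batch, e.g. two mini-batches of shape (2, 3, 32, 1200) come
- # back concatenated as a single (4, T, num_classes) array.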
- def __call__(self, img_list):
- start_time = time.time()
- img_num = len(img_list)
- # filter out images with zero, oversized, or extreme-aspect-ratio dimensions
- temp_list = []
- for img in img_list:
- if img.shape[0] == 0 or img.shape[1] == 0 \
- or img.shape[0] >= 10000 or img.shape[1] >= 10000 \
- or img.shape[1] / img.shape[0] <= 0.5 \
- or img.shape[1] / img.shape[0] >= 100:
- continue
- temp_list.append(img)
- if not temp_list:
- return None, 0
- img_list = temp_list
- # recompute img_num after filtering; the stale count caused an IndexError below
- img_num = len(img_list)
- # sort by aspect ratio; batching images of similar widths speeds up recognition
- width_list = []
- for img in img_list:
- width_list.append(img.shape[1] / float(img.shape[0]))
- indices = np.argsort(np.array(width_list))
- # predict in batches
- rec_res = [['', 0.0]] * img_num
- batch_num = self.rec_batch_num
- elapse = 0
- batch_list = []
- for beg_img_no in range(0, img_num, batch_num):
- end_img_no = min(img_num, beg_img_no + batch_num)
- norm_img_batch = []
- max_wh_ratio = 0
- # use the largest width/height ratio in this batch as the padded target width
- for ino in range(beg_img_no, end_img_no):
- h, w = img_list[indices[ino]].shape[0:2]
- wh_ratio = w * 1.0 / h
- max_wh_ratio = max(max_wh_ratio, wh_ratio)
- # resize and normalize every image in the batch
- for ino in range(beg_img_no, end_img_no):
- norm_img = self.resize_norm_img(img_list[indices[ino]],
- max_wh_ratio)
- norm_img = norm_img[np.newaxis, :]
- norm_img_batch.append(norm_img)
- norm_img_batch = np.concatenate(norm_img_batch)
- norm_img_batch = norm_img_batch.copy()
- # shrink the mini-batch as images get wider to avoid running out of memory
- if norm_img_batch.shape[-1] >= 400:
- if norm_img_batch.shape[-1] <= 1000:
- mini_batch_size = 4
- elif norm_img_batch.shape[-1] <= 3000:
- mini_batch_size = 2
- else:
- mini_batch_size = 1
- sub_batch_list = []
- for bi in range(0, norm_img_batch.shape[0], mini_batch_size):
- sub_batch = norm_img_batch[bi:bi+mini_batch_size]
- tensor = torch.from_numpy(sub_batch).float()
- tensor = tensor.to(self.device)
- sub_batch_list.append(tensor)
- else:
- tensor = torch.from_numpy(norm_img_batch).float()
- tensor = tensor.to(self.device)
- sub_batch_list = [tensor]
- batch_list.append(sub_batch_list)
- # run prediction over all prepared batches
- batch_out_list = self.predict_batch(batch_list)
- # CTC-decode each batch and scatter results back to the pre-sort order
- for bi, out in enumerate(batch_out_list):
- begin_img_no = bi * batch_num
- rec_result = self.postprocess_op(out)
- for ri in range(len(rec_result)):
- rec_res[indices[begin_img_no + ri]] = rec_result[ri]
- elapse += time.time() - start_time
- return rec_res, elapse
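- # Minimal usage sketch (hypothetical paths, assuming parse_args defaults are
- # otherwise valid for this fork's torch checkpoint):
- #
- #   args = utility.parse_args()
- #   args.rec_model_dir = "./models/rec_resnet34.pth"  # hypothetical checkpoint
- #   recognizer = TextRecognizer(args)
- #   rec_res, elapse = recognizer([cv2.imread("crop.jpg")])  # hypothetical image
- #   print(rec_res)  # [[text, confidence], ...]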
- class TextRecognizer2(object):
- shrink_memory_count = 0
- def __init__(self, args):
- self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")]
- self.character_type = args.rec_char_type
- self.rec_batch_num = args.rec_batch_num
- self.rec_algorithm = args.rec_algorithm
- postprocess_params = {
- 'name': 'CTCLabelDecode',
- "character_type": args.rec_char_type,
- "character_dict_path": args.rec_char_dict_path,
- "use_space_char": args.use_space_char
- }
- self.postprocess_op = build_post_process(postprocess_params)
- self.args = args
- def resize_norm_img(self, img, max_wh_ratio):
- imgC, imgH, imgW = self.rec_image_shape
- assert imgC == img.shape[2]
- if self.character_type == "ch":
- imgW = int((32 * max_wh_ratio))
- h, w = img.shape[:2]
- ratio = w / float(h)
- if math.ceil(imgH * ratio) > imgW:
- resized_w = imgW
- else:
- resized_w = int(math.ceil(imgH * ratio))
- # print("predict_rec.py resize_norm_img resize shape", (resized_w, imgH))
- resized_image = cv2.resize(img, (resized_w, imgH))
- resized_image = resized_image.astype('float32')
- resized_image = resized_image.transpose((2, 0, 1)) / 255
- resized_image -= 0.5
- resized_image /= 0.5
- padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
- padding_im[:, :, 0:resized_w] = resized_image
- return padding_im
- def __call__(self, img_list):
- from format_convert.convert_need_interface import from_gpu_interface_redis
- img_num = len(img_list)
- # Calculate the aspect ratio of all text bars
- width_list = []
- for img in img_list:
- width_list.append(img.shape[1] / float(img.shape[0]))
- # Sorting can speed up the recognition process
- indices = np.argsort(np.array(width_list))
- rec_res = [['', 0.0]] * img_num
- batch_num = self.rec_batch_num
- elapse = 0
- all_gpu_time = 0
- for beg_img_no in range(0, img_num, batch_num):
- # pre-process: resize and normalize this batch
- end_img_no = min(img_num, beg_img_no + batch_num)
- norm_img_batch = []
- max_wh_ratio = 0
- for ino in range(beg_img_no, end_img_no):
- h, w = img_list[indices[ino]].shape[0:2]
- wh_ratio = w * 1.0 / h
- max_wh_ratio = max(max_wh_ratio, wh_ratio)
- for ino in range(beg_img_no, end_img_no):
- norm_img = self.resize_norm_img(img_list[indices[ino]],
- max_wh_ratio)
- norm_img = norm_img[np.newaxis, :]
- norm_img_batch.append(norm_img)
- norm_img_batch = np.concatenate(norm_img_batch)
- norm_img_batch = norm_img_batch.copy()
- starttime = time.time()
- # # (optional, disabled) compress the numpy batch before sending:
- # compressed_array = io.BytesIO()
- # np.savez_compressed(compressed_array, norm_img_batch)
- # compressed_array.seek(0)
- # norm_img_batch = compressed_array.read()
- # send the batch to the GPU worker process via the redis interface
- _dict = {"inputs": norm_img_batch, "args": str(namespace_to_dict(self.args)), "md5": _global.get("md5")}
- result = from_gpu_interface_redis(_dict, model_type="ocr", predictor_type="rec")
- if judge_error_code(result):
- logging.error("from_gpu_interface failed! " + str(result))
- raise requests.exceptions.RequestException
- preds = result.get("preds")
- gpu_time = result.get("gpu_time")
- all_gpu_time += round(gpu_time, 2)
- # # (optional, disabled) decompress the returned numpy array:
- # decompressed_array = io.BytesIO()
- # decompressed_array.write(preds)
- # decompressed_array.seek(0)
- # preds = np.load(decompressed_array, allow_pickle=True)['arr_0']
- # log("inputs.shape" + str(preds.shape))
- # post-process: CTC decode
- rec_result = self.postprocess_op(preds)
- for rno in range(len(rec_result)):
- rec_res[indices[beg_img_no + rno]] = rec_result[rno]
- elapse += time.time() - starttime
- log("ocr model predict time - rec - time " + str(all_gpu_time) + " - num " + str(img_num))
- return rec_res, elapse
- def main(args):
- image_file_list = get_image_file_list(args.image_dir)
- text_recognizer = TextRecognizer(args)
- valid_image_file_list = []
- img_list = []
- for image_file in image_file_list:
- img, flag = check_and_read_gif(image_file)
- if not flag:
- img = cv2.imread(image_file)
- if img is None:
- logger.info("error in loading image:{}".format(image_file))
- continue
- valid_image_file_list.append(image_file)
- img_list.append(img)
- try:
- rec_res, predict_time = text_recognizer(img_list)
- except Exception:
- logger.info(traceback.format_exc())
- logger.info(
- "ERROR!!!! \n"
- "Please read the FAQ:https://github.com/PaddlePaddle/PaddleOCR#faq \n"
- "If your model has tps module: "
- "TPS does not support variable shape.\n"
- "Please set --rec_image_shape='3,32,100' and --rec_char_type='en' ")
- sys.exit(1)
- for ino in range(len(img_list)):
- logger.info("Predicts of {}:{}".format(valid_image_file_list[ino],
- rec_res[ino]))
- logger.info("Total predict time for {} images, cost: {:.3f}".format(
- len(img_list), predict_time))
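- # Example CLI invocation (a sketch; all paths are placeholders):
- #   python ocr/tools/infer/predict_rec.py --image_dir=./test_imgs \
- #     --rec_model_dir=./models/rec_resnet34.pth \
- #     --rec_char_dict_path=./ppocr/utils/ppocr_keys_v1.txt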
- if __name__ == "__main__":
- main(utility.parse_args())