|
@@ -118,18 +118,71 @@ class _Image:
|
|
|
def get_text(self):
    """Return ``None``; this method performs no work."""
    return None
|
|
|
|
|
|
def imageSlice(self, image_np):
    """Split a tall, narrow image into vertically contiguous slices.

    Only images more than 3000 rows high and fewer than 2000 columns wide
    are sliced; any other image is returned unchanged as a one-element
    list. Each slice is nominally ``width`` rows high, and its lower edge
    is pushed down to the next "white" row (every per-channel row mean
    above 250) so that a cut does not pass through content. If no white
    row follows, the slice runs to the bottom of the image.

    The slices cover every row exactly once, in order, and none is empty.

    :param image_np: image array indexed as (row, column[, channel]),
        or ``None``.
    :return: list of array views into ``image_np``; ``[]`` when
        ``image_np`` is ``None``.
    """
    if image_np is None:
        return []

    height, width = image_np.shape[0], image_np.shape[1]

    # Overall resolution limit: only very tall, narrow images are sliced.
    if not (height > 3000 and width < 2000):
        return [image_np]

    # Mean of each row (per channel for colour images); a row counts as
    # white only when all of its channel means exceed 250.
    row_means = np.average(image_np, axis=1)
    is_white_row = (row_means > 250).reshape(height, -1).all(axis=1)

    # max(width, 1) guarantees forward progress even for a zero-width image.
    step = max(width, 1)

    list_images = []
    _begin = 0
    while _begin < height:
        _end = min(_begin + step, height)
        # Move the cut down to the next white row instead of discarding the
        # rows in between, so no content is lost between slices.
        while _end < height and not is_white_row[_end]:
            _end += 1
        list_images.append(image_np[_begin:_end, ...])
        _begin = _end

    print("image slice into %d parts"%(len(list_images)))
    return list_images
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def convert(self):
    """Read the image at ``self.path``, process it slice by slice and
    attach the resulting objects as children.

    The image is split with ``self.imageSlice``; each slice is passed to
    ``image_process``. When ``judge_error_code`` flags a slice's result,
    that result is stored in ``self.error_code`` and the remaining slices
    are still processed. Otherwise every returned object has its ``y``
    shifted by the running offset and is added via ``self.add_child``.
    """
    # Decode the image file into a numpy array.
    image_np = cv2.imread(self.path)

    # NOTE(review): cv2.imread returns None for an unreadable file and
    # imageSlice then returns [], so such a file yields no children and
    # no error_code - confirm this is intended.
    list_images = self.imageSlice(image_np)

    _add_y = 0
    for _image in list_images:
        obj_list = image_process(_image, self.path, self.is_from_pdf, self.is_from_docx, use_ocr=True)
        if judge_error_code(obj_list):
            self.error_code = obj_list
            continue

        list_y = []
        for obj in obj_list:
            # Shift slice-local y so objects of later slices sort after
            # those of earlier slices.
            obj.y += _add_y
            list_y.append(obj.y)
            self.add_child(obj)

        # A slice may yield no objects; max() of an empty list would raise
        # ValueError, so keep the previous offset in that case.
        # NOTE(review): the offset is the largest y seen so far, not the
        # accumulated slice height - confirm which one callers expect.
        if list_y:
            _add_y = max(list_y)
|
|
|
|
|
|
|
|
|
class _Table:
|