utils.py 89 KB


  1. # -*- coding:utf-8 -*-
  2. import argparse
  3. import copy
  4. import hashlib
  5. import inspect
  6. import json
  7. import os
  8. import pickle
  9. import socket
  10. import subprocess
  11. import sys
  12. from io import BytesIO
  13. from subprocess import Popen
  14. from shapely.geometry import LineString
  15. import cv2
  16. import requests
  17. from PIL import Image
  18. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  19. import difflib
  20. import logging
  21. import mimetypes
  22. import platform
  23. import re
  24. import traceback
  25. import filetype
  26. from bs4 import BeautifulSoup
  27. import yaml
  28. from pdfminer.layout import *
  29. from format_convert import _global
  30. from functools import wraps
  31. import psutil
  32. import time
  33. import numpy as np
  34. from format_convert.judge_platform import get_platform
  35. if get_platform() == "Linux":
  36. import resource
  37. import math
  38. def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16]):
  39. """
  40. [0] : continue
  41. [-1]: 逻辑处理错误
  42. [-2]: 接口调用错误
  43. [-3]: 文件格式错误,无法打开
  44. [-4]: 各类文件调用第三方包读取超时
  45. [-5]: 整个转换过程超时
  46. [-6]: 阿里云UDF队列超时
  47. [-7]: 文件需密码,无法打开
  48. [-8]: 调用现成接口报错
  49. [-9]: 接口接收数据为空
  50. [-10]: 长图分割报错
  51. [-11]: 新接口idc、isr、atc报错
  52. [-12]: 表格跨页连接报错
  53. [-13]: pdf表格线处理报错
  54. [-14]: 指定页码报错
  55. [-15]: office转换接口未运行
  56. [-16]: idc方向分类错误导致ocr读取乱码
  57. """
  58. for c in code:
  59. if isinstance(_list, list) and _list == [c]:
  60. return True
  61. return False
  62. def add_div(text):
  63. if text == "" or text is None:
  64. return text
  65. # if get_platform() == "Windows":
  66. # print("add_div", text)
  67. if re.findall("<div>", text):
  68. return text
  69. text = "<div>" + text + "\n"
  70. text = re.sub("\n", "</div><div>", text)
  71. # text += "</div>"
  72. if text[-5:] == "<div>":
  73. # print("add_div has cut", text[-30:])
  74. text = text[:-5]
  75. return text
  76. def get_platform():
  77. sys = platform.system()
  78. return sys
  79. def get_html_p(html_path):
  80. log("into get_html_p")
  81. try:
  82. with open(html_path, "r") as ff:
  83. html_str = ff.read()
  84. soup = BeautifulSoup(html_str, 'lxml')
  85. text = ""
  86. for p in soup.find_all("p"):
  87. p_text = p.text
  88. p_text = p_text.strip()
  89. if p.string != "":
  90. text += p_text
  91. text += "\n"
  92. return text
  93. except Exception as e:
  94. log("get_html_p error!")
  95. return [-1]
  96. def string_similarity(str1, str2):
  97. # 去掉<div>和回车
  98. str1 = re.sub("<div>", "", str1)
  99. str1 = re.sub("</div>", "", str1)
  100. str1 = re.sub("\n", "", str1)
  101. str2 = re.sub("<div>", "", str2)
  102. str2 = re.sub("</div>", "", str2)
  103. str2 = re.sub("\n", "", str2)
  104. # print("********************************")
  105. # print("str1", str1)
  106. # print("********************************")
  107. # print("str2", str2)
  108. # print("********************************")
  109. score = difflib.SequenceMatcher(None, str1, str2).ratio()
  110. print("string_similarity", score)
  111. return score
  112. def get_sequential_data(text_list, bbox_list, html=False):
  113. logging.info("into get_sequential_data")
  114. try:
  115. text = ""
  116. order_list = []
  117. for i in range(len(text_list)):
  118. length_start = bbox_list[i][0][0]
  119. length_end = bbox_list[i][1][0]
  120. height_start = bbox_list[i][0][1]
  121. height_end = bbox_list[i][-1][1]
  122. # print([length_start, length_end, height_start, height_end])
  123. order_list.append([text_list[i], length_start, length_end, height_start, height_end])
  124. # text = text + infomation['text'] + "\n"
  125. if get_platform() == "Windows":
  126. print("get_sequential_data", order_list)
  127. if not order_list:
  128. if get_platform() == "Windows":
  129. print("get_sequential_data", "no order list")
  130. return ""
  131. # 根据bbox的坐标对输出排序
  132. order_list.sort(key=lambda x: (x[3], x[1], x[0]))
  133. # 根据bbox分行分列
  134. # col_list = []
  135. # height_end = int((order_list[0][4] + order_list[0][3]) / 2)
  136. # for i in range(len(order_list)):
  137. # if height_end - threshold <= order_list[i][3] <= height_end + threshold:
  138. # col_list.append(order_list[i])
  139. # else:
  140. # row_list.append(col_list)
  141. # col_list = []
  142. # height_end = int((order_list[i][4] + order_list[i][3]) / 2)
  143. # col_list.append(order_list[i])
  144. # if i == len(order_list) - 1:
  145. # row_list.append(col_list)
  146. row_list = []
  147. used_box = []
  148. threshold = 5
  149. for box in order_list:
  150. if box in used_box:
  151. continue
  152. height_center = (box[4] + box[3]) / 2
  153. row = []
  154. for box2 in order_list:
  155. if box2 in used_box:
  156. continue
  157. height_center2 = (box2[4] + box2[3]) / 2
  158. if height_center - threshold <= height_center2 <= height_center + threshold:
  159. if box2 not in row:
  160. row.append(box2)
  161. used_box.append(box2)
  162. row.sort(key=lambda x: x[0])
  163. row_list.append(row)
  164. for row in row_list:
  165. if not row:
  166. continue
  167. if len(row) <= 1:
  168. text = text + row[0][0] + "\n"
  169. else:
  170. sub_text = ""
  171. row.sort(key=lambda x: x[1])
  172. for col in row:
  173. sub_text = sub_text + col[0] + " "
  174. sub_text = sub_text + "\n"
  175. text += sub_text
  176. if html:
  177. text = "<div>" + text
  178. text = re.sub("\n", "</div>\n<div>", text)
  179. text += "</div>"
  180. # if text[-5:] == "<div>":
  181. # text = text[:-5]
  182. return text
  183. except Exception as e:
  184. logging.info("get_sequential_data error!")
  185. print("get_sequential_data", traceback.print_exc())
  186. return [-1]
  187. def rename_inner_files(root_path):
  188. try:
  189. logging.info("into rename_inner_files")
  190. # 获取解压文件夹下所有文件+文件夹,不带根路径
  191. path_list = []
  192. for root, dirs, files in os.walk(root_path, topdown=False):
  193. for name in dirs:
  194. p = os.path.join(root, name) + os.sep
  195. if get_platform() == "Windows":
  196. root_path = slash_replace(root_path)
  197. p = slash_replace(p)
  198. p = re.sub(root_path, "", p)
  199. root_path = slash_replace(root_path, True)
  200. p = slash_replace(p, True)
  201. else:
  202. p = re.sub(root_path, "", p)
  203. path_list.append(p)
  204. for name in files:
  205. p = os.path.join(root, name)
  206. if get_platform() == "Windows":
  207. root_path = slash_replace(root_path)
  208. p = slash_replace(p)
  209. p = re.sub(root_path, "", p)
  210. root_path = slash_replace(root_path, True)
  211. p = slash_replace(p, True)
  212. else:
  213. p = re.sub(root_path, "", p)
  214. path_list.append(p)
  215. # 按路径长度排序
  216. path_list.sort(key=lambda x: len(x), reverse=True)
  217. # 循环改名
  218. for old_path in path_list:
  219. # 按路径分隔符分割
  220. ss = old_path.split(os.sep)
  221. # 判断是否文件夹
  222. is_dir = 0
  223. file_type = ""
  224. if os.path.isdir(root_path + old_path):
  225. ss = ss[:-1]
  226. is_dir = 1
  227. else:
  228. if "." in old_path:
  229. file_type = "." + old_path.split(".")[-1]
  230. else:
  231. file_type = ""
  232. # 最后一级需要用hash改名
  233. new_path = ""
  234. # new_path = re.sub(ss[-1], str(hash(ss[-1])), old_path) + file_type
  235. current_level = 0
  236. for s in ss:
  237. # 路径拼接
  238. if current_level < len(ss) - 1:
  239. new_path += s + os.sep
  240. else:
  241. new_path += str(hash(s)) + file_type
  242. current_level += 1
  243. new_ab_path = root_path + new_path
  244. old_ab_path = root_path + old_path
  245. os.rename(old_ab_path, new_ab_path)
  246. # 重新获取解压文件夹下所有文件+文件夹
  247. new_path_list = []
  248. for root, dirs, files in os.walk(root_path, topdown=False):
  249. for name in dirs:
  250. new_path_list.append(os.path.join(root, name) + os.sep)
  251. for name in files:
  252. new_path_list.append(os.path.join(root, name))
  253. return new_path_list
  254. except:
  255. traceback.print_exc()
  256. return [-1]
  257. def judge_format(path):
  258. guess1 = mimetypes.guess_type(path)
  259. _type = None
  260. if guess1[0]:
  261. _type = guess1[0]
  262. else:
  263. guess2 = filetype.guess(path)
  264. if guess2:
  265. _type = guess2.mime
  266. if _type == "application/pdf":
  267. return "pdf"
  268. if _type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
  269. return "docx"
  270. if _type == "application/x-zip-compressed" or _type == "application/zip":
  271. return "zip"
  272. if _type == "application/x-rar-compressed" or _type == "application/rar":
  273. return "rar"
  274. if _type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
  275. return "xlsx"
  276. if _type == "application/msword":
  277. return "doc"
  278. if _type == "image/png":
  279. return "png"
  280. if _type == "image/jpeg":
  281. return "jpg"
  282. # 猜不到,返回None
  283. return None
  284. def draw_lines_plt(bboxes):
  285. import matplotlib.pyplot as plt
  286. plt.figure()
  287. for bbox in bboxes:
  288. x = [bbox[0], bbox[2]]
  289. y = [bbox[1], bbox[3]]
  290. plt.plot(x, y)
  291. plt.show()
  292. def slash_replace(_str, reverse=False):
  293. if reverse:
  294. _str = eval(repr(_str).replace('/', '\\\\'))
  295. else:
  296. _str = eval(repr(_str).replace('\\\\', '/'))
  297. return _str
  298. class LineTable:
def recognize_table(self, list_textbox, list_line, sourceP_LB=True,
                    splited=False, from_pdf=False, show=0):
    """Build tables from ruling lines and text boxes.

    Steps performed here:
      1. store the arguments on the instance and compute the crossing points
         of ``list_line`` with ``recognize_crosspoints``;
      2. cluster the crossing points: clusters whose ``"lines"`` sets share a
         line key are merged, and passes repeat until one merges nothing;
      3. sort the clusters by the area of their points' bounding box;
      4. turn each cluster into rectangles (``crosspoint2rect``) and each
         rectangle list into a table (``rect2table``).

    :param list_textbox: text boxes handed to ``rect2table``.
    :param list_line: line objects handed to ``recognize_crosspoints``.
    :param sourceP_LB: forwarded to ``rect2table``.
    :param splited: stored as ``self.splited``.
    :param from_pdf: stored as ``self.from_pdf``.
    :param show: when truthy, the intermediate results are plotted and printed.
    :return: ``(list_tables, in_objs, list_l_rect, [])``; or
        ``([], [], [], self.connect_bbox_list)`` as soon as
        ``self.connect_bbox_list`` is non-empty after a ``rect2table`` call.
    """
    self.list_line = list_line
    self.list_crosspoints = self.recognize_crosspoints(list_line)
    self.from_pdf = from_pdf
    self.splited = splited
    self.connect_bbox_list = []
    self.show = show
    # Cluster: start with one cluster per crossing point.
    cluster_crosspoints = []
    for _point in self.list_crosspoints:
        cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})
    while 1:
        # _find: did this pass merge anything? If not, clustering is stable.
        _find = False
        new_cluster_crosspoints = []
        for l_point in cluster_crosspoints:
            # _flag: was l_point merged into an already collected cluster?
            _flag = False
            for l_n_point in new_cluster_crosspoints:
                line1 = l_point.get("lines")
                line2 = l_n_point.get("lines")
                if len(line1 & line2) > 0:
                    _find = True
                    _flag = True
                    l_n_point["lines"] = line1.union(line2)
                    l_n_point["points"].extend(l_point["points"])
            if not _flag:
                new_cluster_crosspoints.append({"lines": l_point.get("lines"), "points": l_point.get("points")})
        cluster_crosspoints = new_cluster_crosspoints
        if not _find:
            break
    # need to sort to deal with the inner tables
    for clu_cp in cluster_crosspoints:
        points = clu_cp["points"]
        list_p = np.array([p["point"] for p in points])
        max_x = max(list_p[..., 0])
        min_x = min(list_p[..., 0])
        max_y = max(list_p[..., 1])
        min_y = min(list_p[..., 1])
        _area = (max_y - min_y) * (max_x - min_x)
        clu_cp["area"] = _area
    # Smallest bounding box first.
    cluster_crosspoints.sort(key=lambda x: x["area"])
    list_l_rect = []
    for table_crosspoint in cluster_crosspoints:
        list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
        list_l_rect.append(list_rect)
    in_objs = set()
    list_tables = []
    for l_rect in list_l_rect:
        _ta = self.rect2table(list_textbox, l_rect, in_objs, sourceP_LB=sourceP_LB)
        # NOTE(review): presumably rect2table fills self.connect_bbox_list
        # (it is defined outside this view) — confirm. When it is non-empty the
        # partial results are dropped and only that list is returned.
        if self.connect_bbox_list:
            return [], [], [], self.connect_bbox_list
        if _ta:
            list_tables.append(_ta)
    if self.show:
        # Show the original table lines and text.
        self._plot(list_line, list_textbox, title='list_line,list_textbox')
        # Print the cells.
        for list_rect in list_l_rect:
            for rect in list_rect:
                print('rect', rect)
            self._plot([], [], list_rect, title='list_l_rect')
        # Print the final tables.
        for table in list_tables:
            table = table.get('table')
            for row in table:
                print('------ row ------')
                for col in row:
                    print('col', col)
    return list_tables, in_objs, list_l_rect, []
  368. # def recognize_table_by_rect(self, list_textbox, list_rect, margin=2):
  369. #
  370. # dump_margin = 5
  371. # list_rect_tmp = []
  372. # # 去重
  373. # for _rect in list_rect:
  374. # if (_rect.bbox[3] - _rect.bbox[1] < 10) or (abs(_rect.bbox[2] - _rect.bbox[0]) < 5):
  375. # continue
  376. # _find = False
  377. # for _tmp in list_rect_tmp:
  378. # for i in range(4):
  379. # if abs(_rect.bbox[i] - _tmp.bbox[i]) < dump_margin:
  380. # pass
  381. # else:
  382. # _find = False
  383. # break
  384. # if i == 3:
  385. # _find = True
  386. # if _find:
  387. # break
  388. # if not _find:
  389. # list_rect_tmp.append(_rect)
  390. #
  391. # # print("=====",len(list_rect),len(list_rect_tmp))
  392. # # print(list_rect_tmp)
  393. # # from matplotlib import pyplot as plt
  394. # # plt.figure()
  395. # # for _rect in list_rect_tmp:
  396. # # x0,y0,x1,y1 = _rect.bbox
  397. # # plt.boxplot(_rect.bbox)
  398. # # plt.show()
  399. #
  400. # cluster_rect = []
  401. # for _rect in list_rect:
  402. # _find = False
  403. # for cr in cluster_rect:
  404. # for cr_rect in cr:
  405. # if abs((cr_rect.bbox[2] - cr_rect.bbox[0] + _rect.bbox[2] - _rect.bbox[0]) - (
  406. # max(cr_rect.bbox[2], _rect.bbox[2]) - min(cr_rect.bbox[0], _rect.bbox[0]))) < margin:
  407. # _find = True
  408. # cr.append(_rect)
  409. # break
  410. # elif abs((cr_rect.bbox[3] - cr_rect.bbox[1] + _rect.bbox[3] - _rect.bbox[1]) - (
  411. # max(cr_rect.bbox[3], _rect.bbox[3]) - min(cr_rect.bbox[1], _rect.bbox[1]))) < margin:
  412. # _find = True
  413. # cr.append(_rect)
  414. # break
  415. # if _find:
  416. # break
  417. # if not _find:
  418. # cluster_rect.append([_rect])
  419. #
  420. # list_l_rect = cluster_rect
  421. #
  422. # in_objs = set()
  423. # list_tables = []
  424. # for l_rect in list_l_rect:
  425. # _ta = self.rect2table(list_textbox, l_rect, in_objs)
  426. # if _ta:
  427. # list_tables.append(_ta)
  428. # return list_tables, in_objs, list_l_rect
  429. def recognize_crosspoints(self, list_line, fixLine=True):
  430. list_crosspoints = []
  431. # print("lines num",len(list_line))
  432. def getMaxPoints(list_x, margin=5, reverse=False):
  433. clust_x = []
  434. for _x in list_x:
  435. _find = False
  436. for cx in clust_x:
  437. if abs(cx[0] - _x) < margin:
  438. _find = True
  439. cx.append(_x)
  440. break
  441. if not _find:
  442. clust_x.append([_x])
  443. clust_x.sort(key=lambda x: x, reverse=reverse)
  444. return clust_x[0][0], len(clust_x[0])
  445. for _i in range(len(list_line)):
  446. for _j in range(len(list_line)):
  447. line1 = list_line[_i].__dict__.get("bbox")
  448. line2 = list_line[_j].__dict__.get("bbox")
  449. exists, point = self.cross_point(line1, line2)
  450. if exists:
  451. list_crosspoints.append(point)
  452. if fixLine:
  453. # 聚类
  454. cluster_crosspoints = []
  455. for _point in list_crosspoints:
  456. cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})
  457. while 1:
  458. _find = False
  459. new_cluster_crosspoints = []
  460. for l_point in cluster_crosspoints:
  461. _flag = False
  462. for l_n_point in new_cluster_crosspoints:
  463. line1 = l_point.get("lines")
  464. line2 = l_n_point.get("lines")
  465. if len(line1 & line2) > 0:
  466. _find = True
  467. _flag = True
  468. l_n_point["lines"] = line1.union(line2)
  469. l_n_point["points"].extend(l_point["points"])
  470. if not _flag:
  471. new_cluster_crosspoints.append({"lines": l_point.get("lines"), "points": l_point.get("points")})
  472. cluster_crosspoints = new_cluster_crosspoints
  473. if not _find:
  474. break
  475. list_crosspoints = []
  476. for list_cp in cluster_crosspoints:
  477. points = list_cp.get("points")
  478. l_lines = []
  479. for p in points:
  480. l_lines.extend(p.get("p_lines"))
  481. l_lines = list(set(l_lines))
  482. l_lines.sort(key=lambda x: x[0])
  483. min_x, _count = getMaxPoints([l[0] for l in l_lines], reverse=False)
  484. if _count <= 2:
  485. min_x = None
  486. min_y, _count = getMaxPoints([l[1] for l in l_lines], reverse=False)
  487. if _count < 2:
  488. min_y = None
  489. max_x, _count = getMaxPoints([l[2] for l in l_lines], reverse=True)
  490. if _count <= 2:
  491. max_x = None
  492. max_y, _count = getMaxPoints([l[3] for l in l_lines], reverse=True)
  493. if _count <= 2:
  494. max_y = None
  495. if min_x and min_y and max_x and max_y:
  496. points.sort(key=lambda x: x["point"][0])
  497. if abs(min_x - points[0]["point"][0]) > 30:
  498. _line = LTLine(1, (min_x, min_y), (min_x, max_y))
  499. list_line.append(_line)
  500. l_lines.append(_line.bbox)
  501. # print("add=====",_line.bbox)
  502. if abs(max_x - points[-1]["point"][0]) > 30:
  503. _line = LTLine(1, (max_x, min_y), (max_x, max_y))
  504. list_line.append(_line)
  505. l_lines.append(_line.bbox)
  506. # print("add=====1",_line.bbox)
  507. points.sort(key=lambda x: x["point"][1])
  508. if abs(min_y - points[0]["point"][1]) > 30:
  509. _line = LTLine(1, (min_x, min_y), (max_x, min_y))
  510. list_line.append(_line)
  511. l_lines.append(_line.bbox)
  512. # print("add=====2",_line.bbox)
  513. if abs(max_y - points[-1]["point"][1]) > 30:
  514. _line = LTLine(1, (min_x, max_y), (max_x, max_y))
  515. list_line.append(_line)
  516. l_lines.append(_line.bbox)
  517. # print("add=====2",_line.bbox)
  518. for _i in range(len(l_lines)):
  519. for _j in range(len(l_lines)):
  520. line1 = l_lines[_i]
  521. line2 = l_lines[_j]
  522. exists, point = self.cross_point(line1, line2)
  523. if exists:
  524. list_crosspoints.append(point)
  525. # from matplotlib import pyplot as plt
  526. # plt.figure()
  527. # for _line in l_lines:
  528. # x0,y0,x1,y1 = _line
  529. # plt.plot([x0,x1],[y0,y1])
  530. # for point in list_crosspoints:
  531. # plt.scatter(point.get("point")[0],point.get("point")[1])
  532. # plt.show()
  533. # print(list_crosspoints)
  534. # print("points num",len(list_crosspoints))
  535. return list_crosspoints
  536. # def recognize_rect(self, _page):
  537. # list_line = []
  538. # for _obj in _page._objs:
  539. # if isinstance(_obj, (LTLine)):
  540. # list_line.append(_obj)
  541. # list_crosspoints = self.recognize_crosspoints(list_line)
  542. #
  543. # # 聚类
  544. # cluster_crosspoints = []
  545. # for _point in list_crosspoints:
  546. # cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})
  547. # while 1:
  548. # _find = False
  549. # new_cluster_crosspoints = []
  550. # for l_point in cluster_crosspoints:
  551. # _flag = False
  552. # for l_n_point in new_cluster_crosspoints:
  553. # line1 = l_point.get("lines")
  554. # line2 = l_n_point.get("lines")
  555. # if len(line1 & line2) > 0:
  556. # _find = True
  557. # _flag = True
  558. # l_n_point["lines"] = line1.union(line2)
  559. # l_n_point["points"].extend(l_point["points"])
  560. # if not _flag:
  561. # new_cluster_crosspoints.append({"lines": l_point.get("lines"), "points": l_point.get("points")})
  562. # cluster_crosspoints = new_cluster_crosspoints
  563. # if not _find:
  564. # break
  565. # # print(len(cluster_crosspoints))
  566. #
  567. # list_l_rect = []
  568. # for table_crosspoint in cluster_crosspoints:
  569. # list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
  570. # list_l_rect.append(list_rect)
  571. #
  572. # return list_l_rect
  573. def crosspoint2rect(self, list_crosspoint, margin=10):
  574. dict_line_points = {}
  575. for _point in list_crosspoint:
  576. lines = list(_point.get("lines"))
  577. for _line in lines:
  578. if _line not in dict_line_points:
  579. dict_line_points[_line] = {"direct": None, "points": []}
  580. dict_line_points[_line]["points"].append(_point)
  581. # 排序
  582. for k, v in dict_line_points.items():
  583. list_x = []
  584. list_y = []
  585. for _p in v["points"]:
  586. list_x.append(_p.get("point")[0])
  587. list_y.append(_p.get("point")[1])
  588. if max(list_x) - min(list_x) > max(list_y) - min(list_y):
  589. v.get("points").sort(key=lambda x: x.get("point")[0])
  590. v["direct"] = "row"
  591. else:
  592. v.get("points").sort(key=lambda x: x.get("point")[1])
  593. v["direct"] = "column"
  594. list_rect = []
  595. for _point in list_crosspoint:
  596. if _point["buttom"] >= margin and _point["right"] >= margin:
  597. lines = list(_point.get("lines"))
  598. _line = lines[0]
  599. if dict_line_points[_line]["direct"] == "column":
  600. _line = lines[1]
  601. next_point = None
  602. for p1 in dict_line_points[_line]["points"]:
  603. if p1["buttom"] >= margin and p1["point"][0] > _point["point"][0]:
  604. next_point = p1
  605. break
  606. if not next_point:
  607. continue
  608. lines = list(next_point.get("lines"))
  609. _line = lines[0]
  610. if dict_line_points[_line]["direct"] == "row":
  611. _line = lines[1]
  612. final_point = None
  613. for p1 in dict_line_points[_line]["points"]:
  614. if p1["left"] >= margin and p1["point"][1] > next_point["point"][1]:
  615. final_point = p1
  616. break
  617. if not final_point:
  618. continue
  619. _r = LTRect(1,
  620. (_point["point"][0], _point["point"][1], final_point["point"][0], final_point["point"][1]))
  621. list_rect.append(_r)
  622. tmp_rect = []
  623. set_bbox = set()
  624. for _r in list_rect:
  625. _bbox = "%.2f-%.2f-%.2f-%.2f" % _r.bbox
  626. width = _r.bbox[2] - _r.bbox[0]
  627. height = _r.bbox[3] - _r.bbox[1]
  628. if width <= margin or height <= margin:
  629. continue
  630. if _bbox not in set_bbox:
  631. tmp_rect.append(_r)
  632. set_bbox.add(_bbox)
  633. list_rect = tmp_rect
  634. # _l = [x.get('point') for x in list_crosspoint]
  635. # _l.sort(key=lambda x: (x[0], x[1]))
  636. # print('list_crosspoint', _l)
  637. # print('list_rect', list_rect)
  638. # import cv2
  639. # import numpy as np
  640. # import random
  641. # img = np.zeros(shape=(1000,1000),dtype=np.uint8)
  642. # img += 255
  643. #
  644. # color = []
  645. # for rect in list_rect:
  646. # color += 10
  647. # x0,y0,x1,y1 = rect.bbox
  648. # x0 *= 10/18
  649. # y0 *= 10/18
  650. # x1 *= 10/18
  651. # y1 *= 10/18
  652. # print(rect.bbox)
  653. # cv2.rectangle(img, (int(x0),int(y0)),(int(x1),int(y1)), (color%255, (color+10)%255, (color+20)%255), 3)
  654. # cv2.imshow("bbox", img)
  655. # cv2.waitKey(0)
  656. return list_rect
  657. def cross_point(self, line1, line2, segment=True, margin=2):
  658. point_is_exist = False
  659. x = y = 0
  660. x1, y1, x2, y2 = line1
  661. x3, y3, x4, y4 = line2
  662. if (x2 - x1) == 0:
  663. k1 = None
  664. b1 = 0
  665. else:
  666. k1 = (y2 - y1) * 1.0 / (x2 - x1) # 计算k1,由于点均为整数,需要进行浮点数转化
  667. b1 = y1 * 1.0 - x1 * k1 * 1.0 # 整型转浮点型是关键
  668. if (x4 - x3) == 0: # L2直线斜率不存在
  669. k2 = None
  670. b2 = 0
  671. else:
  672. k2 = (y4 - y3) * 1.0 / (x4 - x3) # 斜率存在
  673. b2 = y3 * 1.0 - x3 * k2 * 1.0
  674. if k1 is None:
  675. if not k2 is None:
  676. x = x1
  677. y = k2 * x1 + b2
  678. point_is_exist = True
  679. elif k2 is None:
  680. x = x3
  681. y = k1 * x3 + b1
  682. elif not k2 == k1:
  683. x = (b2 - b1) * 1.0 / (k1 - k2)
  684. y = k1 * x * 1.0 + b1 * 1.0
  685. point_is_exist = True
  686. left = 0
  687. right = 0
  688. top = 0
  689. buttom = 0
  690. if point_is_exist:
  691. if segment:
  692. if x >= (min(x1, x2) - margin) and x <= (max(x1, x2) + margin) and y >= (
  693. min(y1, y2) - margin) and y <= (max(y1, y2) + margin):
  694. if x >= (min(x3, x4) - margin) and x <= (max(x3, x4) + margin) and y >= (
  695. min(y3, y4) - margin) and y <= (max(y3, y4) + margin):
  696. point_is_exist = True
  697. left = abs(min(x1, x3) - x)
  698. right = abs(max(x2, x4) - x)
  699. top = abs(min(y1, y3) - y)
  700. buttom = abs(max(y2, y4) - y)
  701. else:
  702. point_is_exist = False
  703. else:
  704. point_is_exist = False
  705. line1_key = "%.2f-%.2f-%.2f-%.2f" % (x1, y1, x2, y2)
  706. line2_key = "%.2f-%.2f-%.2f-%.2f" % (x3, y3, x4, y4)
  707. return point_is_exist, {"point": [x, y], "left": left, "right": right,
  708. "top": top, "buttom": buttom, "lines": set([line1_key, line2_key]),
  709. "p_lines": [line1, line2]}
  710. # def unionTable(self, list_table, fixspan=True, margin=2):
  711. # set_x = set()
  712. # set_y = set()
  713. #
  714. # list_cell = []
  715. # for _t in list_table:
  716. # for _line in _t:
  717. # list_cell.extend(_line)
  718. #
  719. # clusters_rects = []
  720. # # 根据y1聚类
  721. # set_id = set()
  722. # list_cell_dump = []
  723. # for _cell in list_cell:
  724. # _id = id(_cell)
  725. # if _id in set_id:
  726. # continue
  727. # set_id.add(_id)
  728. # list_cell_dump.append(_cell)
  729. # list_cell = list_cell_dump
  730. # list_cell.sort(key=lambda x: x.get("bbox")[3])
  731. # for _rect in list_cell:
  732. # _y0 = _rect.get("bbox")[3]
  733. # _find = False
  734. # for l_cr in clusters_rects:
  735. # if abs(l_cr[0].get("bbox")[3] - _y0) < 2:
  736. # _find = True
  737. # l_cr.append(_rect)
  738. # break
  739. # if not _find:
  740. # clusters_rects.append([_rect])
  741. #
  742. # clusters_rects.sort(key=lambda x: x[0].get("bbox")[3], reverse=True)
  743. # for l_cr in clusters_rects:
  744. # l_cr.sort(key=lambda x: x.get("bbox")[0])
  745. #
  746. # # print("=============:")
  747. # # for l_r in clusters_rects:
  748. # # print(len(l_r))
  749. #
  750. # for _line in clusters_rects:
  751. # for _rect in _line:
  752. # (x0, y0, x1, y1) = _rect.get("bbox")
  753. # set_x.add(x0)
  754. # set_x.add(x1)
  755. # set_y.add(y0)
  756. # set_y.add(y1)
  757. # if len(set_x) == 0 or len(set_y) == 0:
  758. # return
  759. # list_x = list(set_x)
  760. # list_y = list(set_y)
  761. #
  762. # list_x.sort(key=lambda x: x)
  763. # list_y.sort(key=lambda x: x, reverse=True)
  764. # _table = []
  765. # line_i = 0
  766. # for _line in clusters_rects:
  767. #
  768. # table_line = []
  769. # cell_i = 0
  770. # for _rect in _line:
  771. # (x0, y0, x1, y1) = _rect.get("bbox")
  772. # _cell = {"bbox": (x0, y0, x1, y1), "rect": _rect.get("rect"),
  773. # "rowspan": self.getspan(list_y, y0, y1, margin),
  774. # "columnspan": self.getspan(list_x, x0, x1, margin), "text": _rect.get("text", "")}
  775. # table_line.append(_cell)
  776. #
  777. # cell_i += 1
  778. # line_i += 1
  779. # _table.append(table_line)
  780. #
  781. # # print("=====================>>")
  782. # # for _line in _table:
  783. # # for _cell in _line:
  784. # # print(_cell,end="\t")
  785. # # print("\n")
  786. # # print("=====================>>")
  787. #
  788. # # print(_table)
  789. # if fixspan:
  790. # for _line in _table:
  791. # extend_line = []
  792. # for c_i in range(len(_line)):
  793. # _cell = _line[c_i]
  794. # if _cell.get("columnspan") > 1:
  795. # _cospan = _cell.get("columnspan")
  796. # _cell["columnspan"] = 1
  797. # for i in range(1, _cospan):
  798. # extend_line.append({"index": c_i + 1, "cell": _cell})
  799. # extend_line.sort(key=lambda x: x["index"], reverse=True)
  800. # for _el in extend_line:
  801. # _line.insert(_el["index"], _el["cell"])
  802. # for l_i in range(len(_table)):
  803. # _line = _table[l_i]
  804. # for c_i in range(len(_line)):
  805. # _cell = _line[c_i]
  806. # if _cell.get("rowspan") > 1:
  807. # _rospan = _cell.get("rowspan")
  808. # _cell["rowspan"] = 1
  809. # for i in range(1, _rospan):
  810. # _table[l_i + i].insert(c_i, _cell)
  811. #
  812. # table_bbox = (_table[0][0].get("bbox")[0], _table[0][0].get("bbox")[1], _table[-1][-1].get("bbox")[2],
  813. # _table[-1][-1].get("bbox")[3])
  814. #
  815. # ta = {"bbox": table_bbox, "table": _table}
  816. # return ta
  817. # 获取点阵
  818. def getSpanLocation(self, _list, x0, x1, margin):
  819. list_location = []
  820. (x0, x1) = (min(x0, x1), max(x0, x1))
  821. for _x in _list:
  822. if _x >= (x0 - margin) and _x <= (x1 + margin):
  823. list_location.append(_x)
  824. return list_location
def fixSpan(self, _table, list_x, list_y, sourceP_LB):
    """Split cells spanning several columns or rows into single-span cells.

    ``_table`` (a list of rows, each a list of cell dicts) is modified in
    place. A cell with ``"columnspan"``/``"rowspan"`` greater than 1 is shrunk
    to its first grid slot and copies of it are inserted for the remaining
    slots, provided the grid coordinates found for the cell match the span
    and ``checkPosition`` accepts the insertion.

    :param _table: table rows to fix in place.
    :param list_x: column grid coordinates, passed to ``getSpanLocation``.
    :param list_y: row grid coordinates, passed to ``getSpanLocation``.
    :param sourceP_LB: not used in this method.
    :return: None.
    """
    # NOTE(review): block nesting was reconstructed from a flattened paste —
    # confirm against version control, especially the extent of the
    # ``len(locations) == span + 1`` branches.

    def checkPosition(_line, _position, bbox, margin=5):
        """Return True if ``bbox`` may be inserted into ``_line`` at
        ``_position``. ``margin`` is not used."""
        # check y
        if len(_line) > 0:
            _bbox = _line[0].get("bbox")
            # check if has lap
            if (min(_bbox[1], _bbox[3]) > max(bbox[1], bbox[3]) or max(_bbox[1], _bbox[3]) < min(bbox[1], bbox[3])):
                return False
        # check x
        if _position <= len(_line) - 1:
            after_bbox = _line[_position].get("bbox")
            # the inserted bbox.x1 must not exceed the following bbox.x0
            if not (after_bbox[0] >= bbox[2]):
                return False
        # NOTE(review): "_position - 1 > 0" skips this check when
        # _position == 1 — confirm that ">= 0" was not intended.
        if _position - 1 > 0 and _position - 1 < len(_line):
            before_bbox = _line[_position - 1].get("bbox")
            # the inserted bbox.x0 must not be left of the preceding bbox.x1
            if not (bbox[0] >= before_bbox[2]):
                return False
        return True

    # Expand the cells that span several columns.
    for _line in _table:
        c_i = 0
        while c_i < len(_line):
            _cell = _line[c_i]
            if _cell.get("columnspan") > 1:
                x0, y0, x1, y1 = _cell.get("bbox")
                _cospan = _cell.get("columnspan")
                locations = self.getSpanLocation(list_x, x0, x1, 10)
                # Only split when the grid has exactly one boundary per slot.
                if len(locations) == _cospan + 1:
                    _cell["bbox"] = (x0, y0, locations[1], y1)
                    _cell["columnspan"] = 1
                    # len(locations)==_colspan+1
                    for i in range(1, _cospan):
                        n_cell = {}
                        n_cell.update(_cell)
                        n_cell["bbox"] = (locations[i], y0, locations[i + 1], y1)
                        # Advance even if the copy is rejected below.
                        c_i += 1
                        # check the position
                        if checkPosition(_line, c_i, n_cell["bbox"]):
                            _line.insert(c_i, n_cell)
            c_i += 1
    # Expand the cells that span several rows.
    for l_i in range(len(_table)):
        _line = _table[l_i]
        c_i = 0
        while c_i < len(_line):
            _cell = _line[c_i]
            if _cell.get("rowspan") > 1:
                x0, y0, x1, y1 = _cell.get("bbox")
                _rospan = _cell.get("rowspan")
                locations = self.getSpanLocation(list_y, y0, y1, 10)
                if len(locations) == _rospan + 1:
                    _cell["bbox"] = (x0, y0, x1, locations[1])
                    _cell["rowspan"] = 1
                    for i in range(1, _rospan):
                        n_cell = {}
                        n_cell.update(_cell)
                        # Copies go into the following rows, if they exist.
                        if l_i + i <= len(_table) - 1:
                            n_cell["bbox"] = (x0, locations[i], x1, locations[i + 1])
                            if checkPosition(_table[l_i + i], c_i, n_cell["bbox"]):
                                _table[l_i + i].insert(c_i, n_cell)
            c_i += 1
  897. def fixRect(self, _table, list_x, list_y, sourceP_LB, margin):
  898. self.fixSpan(_table, list_x, list_y, sourceP_LB)
  899. # for line_i in range(len(_table)):
  900. # for cell_i in range(len(_table[line_i])):
  901. # _cell = _table[line_i][cell_i]
  902. # print(line_i,cell_i,_cell["bbox"],_cell["text"])
  903. for _line in _table:
  904. extend_line = []
  905. for c_i in range(len(_line)):
  906. c_cell = _line[c_i]
  907. # first cell missing
  908. if c_i == 0 and c_cell["bbox"][0] != list_x[0]:
  909. _bbox = (list_x[0], c_cell["bbox"][1], c_cell["bbox"][0], c_cell["bbox"][3])
  910. _cell = {"bbox": _bbox,
  911. "rect": LTRect(1, _bbox),
  912. "rowspan": self.getspan(list_y, _bbox[1], _bbox[3], margin),
  913. "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
  914. "text": ""}
  915. extend_line.append({"index": c_i, "cell": _cell})
  916. # cell in the median missing
  917. if c_i < len(_line) - 1:
  918. n_cell = _line[c_i + 1]
  919. _bbox = c_cell["bbox"]
  920. n_bbox = n_cell["bbox"]
  921. if _bbox[0] == n_bbox[0] and _bbox[2] == n_bbox[2]:
  922. continue
  923. else:
  924. if abs(_bbox[2] - n_bbox[0]) > margin:
  925. _bbox = (_bbox[2], _bbox[1], n_bbox[0], _bbox[3])
  926. _cell = {"bbox": _bbox,
  927. "rect": LTRect(1, _bbox),
  928. "rowspan": self.getspan(list_y, _bbox[1], _bbox[3], margin),
  929. "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
  930. "text": ""}
  931. extend_line.append({"index": c_i + 1, "cell": _cell})
  932. # last cell missing
  933. if c_i == len(_line) - 1:
  934. if abs(c_cell["bbox"][2] - list_x[-1]) > margin:
  935. _bbox = (c_cell["bbox"][2], c_cell["bbox"][1], list_x[-1], c_cell["bbox"][3])
  936. _cell = {"bbox": _bbox,
  937. "rect": LTRect(1, _bbox),
  938. "rowspan": self.getspan(list_y, _bbox[1], _bbox[3], margin),
  939. "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
  940. "text": ""}
  941. extend_line.append({"index": c_i + 1, "cell": _cell})
  942. extend_line.sort(key=lambda x: x["index"], reverse=True)
  943. for _tmp in extend_line:
  944. _line.insert(_tmp["index"], _tmp["cell"])
  945. # 排序
  946. _table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
  947. def feedText2table(self, _table, list_textbox, in_objs, sourceP_LB):
  948. # find the suitable cell of the textbox
  949. list_cells = []
  950. for table_line in _table:
  951. for _cell in table_line:
  952. list_cells.append({"cell": _cell, "inbox_textbox_list": []})
  953. self.connect_bbox_list = []
  954. for textbox in list_textbox:
  955. list_iou = []
  956. for _d in list_cells:
  957. _cell = _d["cell"]
  958. _iou = self.getIOU(textbox.bbox, _cell["bbox"])
  959. list_iou.append(_iou)
  960. max_iou_index = np.argmax(list_iou)
  961. max_iou = list_iou[max_iou_index]
  962. # if self.from_pdf:
  963. # iou_threhold = 0.3
  964. # else:
  965. iou_threhold = 0.1
  966. if max_iou > iou_threhold and textbox not in in_objs:
  967. list_cells[max_iou_index]["inbox_textbox_list"].append(textbox)
  968. in_objs.add(textbox)
  969. if not self.from_pdf and not self.splited:
  970. # 多个iou大于0.3的,可能是ocr将两个文本合成一个了
  971. iou_index_list = np.where(np.array(list_iou) >= 0.3)[0].tolist()
  972. if len(iou_index_list) >= 2:
  973. # print('len(iou_index_list) >= 2 textbox', textbox)
  974. self.connect_bbox_list.append(textbox)
  975. has_matched_box_list = []
  976. for _d in list_cells:
  977. _cell = _d["cell"]
  978. inbox_textbox_list = _d["inbox_textbox_list"]
  979. # 分行,根据y重合
  980. all_match_box_list = []
  981. inbox_textbox_list.sort(key=lambda x: x.bbox[1], reverse=sourceP_LB)
  982. for i in range(len(inbox_textbox_list)):
  983. match_box_list = []
  984. box1 = inbox_textbox_list[i]
  985. if box1 in has_matched_box_list:
  986. continue
  987. min_y1 = box1.bbox[1] + 1 / 3 * abs(box1.bbox[3] - box1.bbox[1])
  988. max_y1 = box1.bbox[3] - 1 / 3 * abs(box1.bbox[3] - box1.bbox[1])
  989. match_box_list.append(
  990. [box1.get_text(), box1.bbox[0], box1.bbox[1], box1.bbox[2], box1.bbox[3], min_y1, max_y1])
  991. has_matched_box_list.append(box1)
  992. for j in range(i + 1, len(inbox_textbox_list)):
  993. box2 = inbox_textbox_list[j]
  994. if box2 in has_matched_box_list:
  995. continue
  996. # print(min_y1, box2.bbox[1], box2.bbox[3], max_y1)
  997. # print(min_y2, box1.bbox[3], max_y2)
  998. if min_y1 <= box2.bbox[1] <= max_y1 or \
  999. min_y1 <= box2.bbox[3] <= max_y1 or \
  1000. box2.bbox[1] <= min_y1 <= max_y1 <= box2.bbox[3]:
  1001. match_box_list.append(
  1002. [box2.get_text(), box2.bbox[0], box2.bbox[1], box2.bbox[2], box2.bbox[3], min_y1, max_y1])
  1003. has_matched_box_list.append(box2)
  1004. match_box_list.sort(key=lambda x: x[1])
  1005. all_match_box_list.append(match_box_list)
  1006. # print("match_box_list", all_match_box_list)
  1007. all_match_box_list.sort(key=lambda x: (round(x[0][2] + x[0][4]) / 2, 0), reverse=sourceP_LB)
  1008. for box_list in all_match_box_list:
  1009. for box in box_list:
  1010. _cell["text"] += re.sub("\s", '', box[0])
  1011. # 打印所有cell
  1012. # for _cell in list_cells:
  1013. # print("cell", _cell)
  1014. def makeTableByRect(self, list_rect, margin, sourceP_LB):
  1015. _table = []
  1016. set_x = set()
  1017. set_y = set()
  1018. clusters_rects = []
  1019. # 根据y1聚类
  1020. if sourceP_LB:
  1021. list_rect.sort(key=lambda x: x.bbox[3])
  1022. for _rect in list_rect:
  1023. _y0 = _rect.bbox[3]
  1024. _y1 = _rect.bbox[1]
  1025. _find = False
  1026. for l_cr in clusters_rects:
  1027. if abs(l_cr[0].bbox[3] - _y0) < margin:
  1028. _find = True
  1029. l_cr.append(_rect)
  1030. break
  1031. if not _find:
  1032. clusters_rects.append([_rect])
  1033. else:
  1034. list_rect.sort(key=lambda x: x.bbox[1])
  1035. for _rect in list_rect:
  1036. _y0 = _rect.bbox[1]
  1037. _y1 = _rect.bbox[3]
  1038. _find = False
  1039. for l_cr in clusters_rects:
  1040. if abs(l_cr[0].bbox[1] - _y0) < margin:
  1041. _find = True
  1042. l_cr.append(_rect)
  1043. break
  1044. if not _find:
  1045. clusters_rects.append([_rect])
  1046. # print("textbox:===================")
  1047. # for _textbox in list_textbox:
  1048. # print(_textbox.get_text())
  1049. # print("textbox:======>>>>>>>>>>>>>")
  1050. # for c in clusters_rects:
  1051. # print("+"*30)
  1052. # for cc in c:
  1053. # print("rect", cc.)
  1054. # cul spans
  1055. for _line in clusters_rects:
  1056. for _rect in _line:
  1057. (x0, y0, x1, y1) = _rect.bbox
  1058. set_x.add(x0)
  1059. set_x.add(x1)
  1060. set_y.add(y0)
  1061. set_y.add(y1)
  1062. if len(set_x) == 0 or len(set_y) == 0:
  1063. return None, [], []
  1064. if len(list_rect) <= 1:
  1065. return None, [], []
  1066. list_x = list(set_x)
  1067. list_y = list(set_y)
  1068. list_x.sort(key=lambda x: x)
  1069. list_y.sort(key=lambda x: x, reverse=sourceP_LB)
  1070. # print("clusters_rects", len(clusters_rects))
  1071. if sourceP_LB:
  1072. clusters_rects.sort(key=lambda x: (x[0].bbox[1] + x[0].bbox[3]) / 2, reverse=sourceP_LB)
  1073. clusters_rects.sort(key=lambda x: (x[0].bbox[1] + x[0].bbox[3]) / 2, reverse=sourceP_LB)
  1074. for l_cr in clusters_rects:
  1075. l_cr.sort(key=lambda x: x.bbox[0])
  1076. pop_x = []
  1077. for i in range(len(list_x) - 1):
  1078. _i = len(list_x) - i - 1
  1079. l_i = _i - 1
  1080. if abs(list_x[_i] - list_x[l_i]) < 5:
  1081. pop_x.append(_i)
  1082. pop_x.sort(key=lambda x: x, reverse=True)
  1083. for _x in pop_x:
  1084. list_x.pop(_x)
  1085. #
  1086. pop_x = []
  1087. for i in range(len(list_y) - 1):
  1088. _i = len(list_y) - i - 1
  1089. l_i = _i - 1
  1090. if abs(list_y[_i] - list_y[l_i]) < 5:
  1091. pop_x.append(_i)
  1092. pop_x.sort(key=lambda x: x, reverse=True)
  1093. for _x in pop_x:
  1094. list_y.pop(_x)
  1095. # print("list_x", list_x)
  1096. # print("list_y", list_y)
  1097. line_i = 0
  1098. for _line in clusters_rects:
  1099. table_line = []
  1100. cell_i = 0
  1101. for _rect in _line:
  1102. (x0, y0, x1, y1) = _rect.bbox
  1103. _cell = {"bbox": (x0, y0, x1, y1),
  1104. "rect": _rect,
  1105. "rowspan": self.getspan(list_y, y0, y1, margin),
  1106. "columnspan": self.getspan(list_x, x0, x1, margin),
  1107. "text": ""}
  1108. cell_i += 1
  1109. table_line.append(_cell)
  1110. line_i += 1
  1111. _table.append(table_line)
  1112. return _table, list_x, list_y
  1113. def rect2table(self, list_textbox, list_rect, in_objs, margin=5, sourceP_LB=True):
  1114. def getIOU(bbox0, bbox1):
  1115. width = max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0]) - (bbox0[2] - bbox0[0] + bbox1[2] - bbox1[0])
  1116. height = max(bbox0[3], bbox1[3]) - min(bbox0[1], bbox1[1]) - (bbox0[3] - bbox0[1] + bbox1[3] - bbox1[1])
  1117. if width < 0 and height < 0:
  1118. return abs(width * height / min(abs((bbox0[2] - bbox0[0]) * (bbox0[3] - bbox0[1])),
  1119. abs((bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]))))
  1120. return 0
  1121. _table, list_x, list_y = self.makeTableByRect(list_rect, margin, sourceP_LB)
  1122. if self.show:
  1123. # 打印_table
  1124. temp_list = []
  1125. for t in _table:
  1126. for c in t:
  1127. print(c)
  1128. temp_list.append(c)
  1129. self._plot([], [], temp_list, title='makeTableByRect table')
  1130. if _table is None:
  1131. return
  1132. self.feedText2table(_table, list_textbox, in_objs, sourceP_LB)
  1133. # print("table===========================>")
  1134. # for _line in _table:
  1135. # for _cell in _line:
  1136. # print("||%d%d"%(_cell["rowspan"],_cell["columnspan"]),end="\t")
  1137. # print()
  1138. # print("table===========================>")
  1139. #
  1140. # print("------------")
  1141. # for _line in _table:
  1142. # for _cell in _line:
  1143. # print(_cell["text"],end="\t")
  1144. # print("\n")
  1145. # print("------------")
  1146. _table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
  1147. self.fixRect(_table, list_x, list_y, sourceP_LB, margin)
  1148. if self.show:
  1149. # 打印_table
  1150. temp_list = []
  1151. for t in _table:
  1152. for c in t:
  1153. print(c)
  1154. temp_list.append(c)
  1155. self._plot([], [], temp_list, title='fixRect table')
  1156. # print("table===========================>")
  1157. # for _line in _table:
  1158. # for _cell in _line:
  1159. # print("||%d%d"%(_cell["rowspan"],_cell["columnspan"]),end="\t")
  1160. # print()
  1161. # print("table===========================>")
  1162. self.feedText2table(_table, list_textbox, in_objs, sourceP_LB)
  1163. # feedText2table后,有textbox符合多个单元格iou的,可能是文本错误连接了,需拆开
  1164. if self.connect_bbox_list:
  1165. return {}
  1166. min_x, min_y = 1000000, 1000000
  1167. max_x, max_y = 0, 0
  1168. for row in _table:
  1169. for col in row:
  1170. if col.get('bbox')[0] < min_x:
  1171. min_x = col.get('bbox')[0]
  1172. if col.get('bbox')[2] < min_x:
  1173. min_x = col.get('bbox')[2]
  1174. if col.get('bbox')[1] < min_y:
  1175. min_y = col.get('bbox')[1]
  1176. if col.get('bbox')[3] < min_y:
  1177. min_y = col.get('bbox')[3]
  1178. if col.get('bbox')[0] > max_x:
  1179. max_x = col.get('bbox')[0]
  1180. if col.get('bbox')[2] > max_x:
  1181. max_x = col.get('bbox')[2]
  1182. if col.get('bbox')[1] > max_y:
  1183. max_y = col.get('bbox')[1]
  1184. if col.get('bbox')[3] > max_y:
  1185. max_y = col.get('bbox')[3]
  1186. table_bbox = (min_x, min_y, max_x, max_y)
  1187. # table_bbox = (_table[0][0].get("bbox")[0],
  1188. # _table[0][0].get("bbox")[1],
  1189. # _table[-1][-1].get("bbox")[2],
  1190. # _table[-1][-1].get("bbox")[3])
  1191. # print("=======")
  1192. # for _line in _table:
  1193. # for _cell in _line:
  1194. # print(_cell["text"])
  1195. # print("\n")
  1196. # print("===========")
  1197. ta = {"bbox": table_bbox, "table": _table}
  1198. return ta
  1199. def inbox(self, bbox0, bbox_g, text=""):
  1200. # if bbox_g[0]<=bbox0[0] and bbox_g[1]<=bbox0[1] and bbox_g[2]>=bbox0[2] and bbox_g[3]>=bbox0[3]:
  1201. # return 1
  1202. # print("utils inbox", text, self.getIOU(bbox0,bbox_g), bbox0, bbox_g)
  1203. if self.getIOU(bbox0, bbox_g) > 0.2:
  1204. return 1
  1205. return 0
  1206. def getIOU(self, bbox0, bbox1):
  1207. bbox0 = [min(bbox0[0], bbox0[2]), min(bbox0[1], bbox0[3]), max(bbox0[0], bbox0[2]), max(bbox0[1], bbox0[3])]
  1208. bbox1 = [min(bbox1[0], bbox1[2]), min(bbox1[1], bbox1[3]), max(bbox1[0], bbox1[2]), max(bbox1[1], bbox1[3])]
  1209. width = abs(max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0])) - (
  1210. abs(bbox0[2] - bbox0[0]) + abs(bbox1[2] - bbox1[0]))
  1211. height = abs(max(bbox0[3], bbox1[3]) - min(bbox0[1], bbox1[1])) - (
  1212. abs(bbox0[3] - bbox0[1]) + abs(bbox1[3] - bbox1[1]))
  1213. if width < 0 and height < 0:
  1214. iou = abs(width * height / min(abs((bbox0[2] - bbox0[0]) * (bbox0[3] - bbox0[1])),
  1215. abs((bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]))))
  1216. # print("getIOU", iou)
  1217. return iou
  1218. return 0
  1219. def getspan(self, _list, x0, x1, margin):
  1220. _count = 0
  1221. (x0, x1) = (min(x0, x1), max(x0, x1))
  1222. for _x in _list:
  1223. if _x >= (x0 - margin) and _x <= (x1 + margin):
  1224. _count += 1
  1225. return _count - 1
  1226. def _plot(self, list_line, list_textbox, list_rect=[], title=''):
  1227. from matplotlib import pyplot as plt
  1228. plt.figure()
  1229. for _line in list_line:
  1230. x0, y0, x1, y1 = _line.__dict__.get("bbox")
  1231. plt.plot([x0, x1], [y0, y1])
  1232. for _line in list_line:
  1233. x0, y0, x1, y1 = _line.bbox
  1234. plt.plot([x0, x1], [y0, y1])
  1235. # for point in list_crosspoints:
  1236. # plt.scatter(point.get("point")[0],point.get("point")[1])
  1237. for textbox in list_textbox:
  1238. x0, y0, x1, y1 = textbox.bbox
  1239. plt.plot([x0, x1], [y0, y1])
  1240. for rect in list_rect:
  1241. try:
  1242. x0, y0, x1, y1 = rect.bbox
  1243. except:
  1244. x0, y0, x1, y1 = rect.get("bbox")
  1245. plt.plot([x0, x0], [y0, y1])
  1246. plt.plot([x0, x1], [y0, y0])
  1247. plt.plot([x1, x1], [y0, y1])
  1248. plt.plot([x0, x1], [y1, y1])
  1249. plt.title(str(title))
  1250. plt.show()
  1251. def get_table_html(table):
  1252. html_text = '<table border="1">'
  1253. for row in table:
  1254. html_text += "<tr>"
  1255. for col in row:
  1256. row_span = col.get("rowspan")
  1257. col_span = col.get("columnspan")
  1258. bbox_text = col.get("text")
  1259. html_text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
  1260. html_text += bbox_text + "</td>"
  1261. html_text += "</tr>"
  1262. html_text += "</table>"
  1263. return html_text
  1264. def sort_object(obj_list, is_reverse=False):
  1265. from format_convert.convert_tree import _Table, _Image, _Sentence, _Page
  1266. obj_list = combine_object(obj_list)
  1267. if len(obj_list) == 0:
  1268. return obj_list
  1269. if isinstance(obj_list[0], (_Table, _Sentence, _Image)):
  1270. obj_list.sort(key=lambda x: (x.y, x.x), reverse=is_reverse)
  1271. return obj_list
  1272. elif isinstance(obj_list[0], _Page):
  1273. obj_list.sort(key=lambda x: x.page_no)
  1274. return obj_list
  1275. else:
  1276. return obj_list
  1277. def combine_object(obj_list, threshold=5):
  1278. from format_convert.convert_tree import _Sentence
  1279. sentence_list = []
  1280. for obj in obj_list:
  1281. if isinstance(obj, _Sentence) and not obj.is_html:
  1282. obj.content = re.sub("\s", "", obj.content)
  1283. sentence_list.append(obj)
  1284. sentence_list.sort(key=lambda x: (x.y, x.x))
  1285. for sen in sentence_list:
  1286. obj_list.remove(sen)
  1287. delete_list = []
  1288. for i in range(1, len(sentence_list)):
  1289. sen1 = sentence_list[i - 1]
  1290. sen2 = sentence_list[i]
  1291. if sen1.combine is False or sen2.combine is False:
  1292. continue
  1293. if abs(sen2.y - sen1.y) <= threshold:
  1294. if sen2.x > sen1.x:
  1295. sen2.x = sen1.x
  1296. sen2.content = sen1.content + sen2.content
  1297. else:
  1298. sen2.content = sen2.content + sen1.content
  1299. if sen2.y > sen1.y:
  1300. sen2.y = sen1.y
  1301. delete_list.append(sen1)
  1302. for sen in delete_list:
  1303. sentence_list.remove(sen)
  1304. for sen in sentence_list:
  1305. obj_list.append(sen)
  1306. return obj_list
# Module-level HTTP sessions used by request_post: one for "ocr" requests,
# one for "otr" requests and one for every other model type.
session_ocr = requests.Session()
session_otr = requests.Session()
session_all = requests.Session()
  1310. def request_post(url, param, time_out=1000, use_zlib=False):
  1311. fails = 0
  1312. text = json.dumps([-2])
  1313. while True:
  1314. try:
  1315. if fails >= 1:
  1316. break
  1317. headers = {'content-type': 'application/json'}
  1318. # result = requests.post(url, data=param, timeout=time_out)
  1319. if param.get("model_type") == "ocr":
  1320. result = session_ocr.post(url, data=param, timeout=time_out)
  1321. elif param.get("model_type") == "otr":
  1322. result = session_otr.post(url, data=param, timeout=time_out)
  1323. else:
  1324. result = session_all.post(url, data=param, timeout=time_out)
  1325. # print('result.status_code', result.status_code)
  1326. # print('result.text', result.text)
  1327. if result.status_code == 200:
  1328. text = result.text
  1329. break
  1330. else:
  1331. # print('result.status_code', result.status_code)
  1332. # print('result.text', result.text)
  1333. fails += 1
  1334. continue
  1335. except socket.timeout:
  1336. fails += 1
  1337. # print('timeout! fail times:', fails)
  1338. except:
  1339. fails += 1
  1340. # print('fail! fail times:', fails)
  1341. traceback.print_exc()
  1342. return text
  1343. def test_gpu():
  1344. print("=" * 30)
  1345. import paddle
  1346. paddle.utils.run_check()
  1347. # import tensorflow as tf
  1348. # print("tf gpu", tf.config.list_physical_devices('GPU'))
  1349. print("=" * 30)
  1350. def my_subprocess_call(*popenargs, timeout=None):
  1351. logging.info("into my_subprocess_call")
  1352. with Popen(*popenargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
  1353. try:
  1354. for line in p.stdout:
  1355. print("stdout", line)
  1356. for line in p.stderr:
  1357. print("stderr", line)
  1358. p.wait(timeout=timeout)
  1359. # p.communicate()
  1360. return p.pid, p.returncode
  1361. except: # Including KeyboardInterrupt, wait handled that.
  1362. p.kill()
  1363. # We don't call p.wait() again as p.__exit__ does that for us.
  1364. raise
  1365. finally:
  1366. logging.info("out my_subprocess_call")
  1367. p.kill()
  1368. def parse_yaml():
  1369. yaml_path = os.path.dirname(os.path.abspath(__file__)) + "/interface_new.yml"
  1370. # with open(yaml_path, "r", encoding='utf-8') as f:
  1371. # cfg = f.read()
  1372. #
  1373. # params = yaml.load(cfg, Loader=yaml.SafeLoader)
  1374. with open(yaml_path, "r", encoding='utf-8') as f:
  1375. _dict = json.load(f)
  1376. return _dict
  1377. def get_ip_port(node_type=None, interface_type=None):
  1378. if node_type is None:
  1379. node_type_list = ["master", "slave"]
  1380. else:
  1381. node_type_list = [node_type]
  1382. if interface_type is None:
  1383. interface_type_list = ["convert", "ocr", "otr", "office", "path", "isr", "idc", "atc", "yolo"]
  1384. else:
  1385. interface_type_list = [interface_type]
  1386. ip_port_dict = {}
  1387. params = parse_yaml()
  1388. # 循环 master slave
  1389. for type1 in node_type_list:
  1390. node_type = type1.upper()
  1391. ip = params.get(node_type).get("ip")
  1392. if not ip:
  1393. continue
  1394. if ip_port_dict.get(ip):
  1395. ip_port_dict.get(ip).update({node_type: {}})
  1396. else:
  1397. ip_port_dict.update({ip: {node_type: {}}})
  1398. # 有IP时,循环多个参数
  1399. for type2 in interface_type_list:
  1400. python_path = None
  1401. project_path = None
  1402. gunicorn_path = None
  1403. port_list = []
  1404. interface_type = type2
  1405. if not params.get(node_type).get(interface_type):
  1406. continue
  1407. if interface_type == "path":
  1408. python_path = params.get(node_type).get(interface_type).get("python")
  1409. project_path = params.get(node_type).get(interface_type).get("project")
  1410. gunicorn_path = params.get(node_type).get(interface_type).get("gunicorn")
  1411. else:
  1412. port = params.get(node_type).get(interface_type).get("port")
  1413. port_num = params.get(node_type).get(interface_type).get("port_num")
  1414. gpu_no = params.get(node_type).get(interface_type).get("gpu")
  1415. if port is None or port_num is None:
  1416. port_list = []
  1417. else:
  1418. port_list = [port, port_num, gpu_no]
  1419. # 参数放入dict
  1420. if port_list:
  1421. ip_port_dict.get(ip).get(node_type).update({interface_type: port_list})
  1422. if project_path and python_path and gunicorn_path:
  1423. ip_port_dict.get(ip).get(node_type).update({"project_path": project_path,
  1424. "python_path": python_path,
  1425. "gunicorn_path": gunicorn_path})
  1426. return ip_port_dict
def get_ip_port_old(node_type=None, interface_type=None):
    """Older variant of get_ip_port that returns ``{ip: {interface: [ports...]}}``.

    :param node_type: "master" or "slave"; both when None
    :param interface_type: a single interface name; "convert", "ocr", "otr",
        "office" and "path" when None
    :return: dict keyed by each IP of the node's "ip" list, mapping
        lower-cased interface names to lists of port strings, plus optional
        "<interface>_processes", "project_path" and "python_path" entries.

    Config keys are looked up with UPPER-CASE node and interface names.
    """
    if node_type is None:
        node_type_list = ["master", "slave"]
    else:
        node_type_list = [node_type]

    if interface_type is None:
        interface_type_list = ["convert", "ocr", "otr", "office", "path"]
    else:
        interface_type_list = [interface_type]

    ip_port_dict = {}
    params = parse_yaml()
    for type1 in node_type_list:
        node_type = type1.upper()
        ip_list = params.get(node_type).get("ip")
        for type2 in interface_type_list:
            interface_type = type2.upper()
            processes = 0
            python_path = None
            project_path = None
            if interface_type in ["convert".upper()]:
                # CONVERT: one port, repeated once per configured process.
                _port = params.get(node_type).get(interface_type).get("port")
                if _port is None:
                    port_list = []
                else:
                    if interface_type == "convert".upper():
                        processes = params.get(node_type).get(interface_type).get("processes")
                    port_list = [str(_port)] * int(processes)
                    # port_list = [str(_port)]
            elif interface_type == "path".upper():
                # NOTE(review): port_list is not reset in this branch, so it
                # still holds the previous interface's value (or is unbound
                # when "path" is the first interface) -- confirm intent.
                python_path = params.get(node_type).get(interface_type).get("python")
                project_path = params.get(node_type).get(interface_type).get("project")
            else:
                # Other interfaces: a consecutive range of port_no ports.
                port_start = params.get(node_type).get(interface_type).get("port_start")
                port_no = params.get(node_type).get(interface_type).get("port_no")
                if port_start is None or port_no is None:
                    port_list = []
                else:
                    port_list = [str(x) for x in range(port_start, port_start + port_no, 1)]
            if ip_list:
                for _ip in ip_list:
                    if _ip is None:
                        continue
                    if _ip in ip_port_dict.keys():
                        if port_list:
                            ip_port_dict.get(_ip).update({interface_type.lower(): port_list})
                    else:
                        if port_list:
                            ip_port_dict[_ip] = {interface_type.lower(): port_list}
                    # The two updates below assume _ip already has an entry.
                    if processes:
                        ip_port_dict.get(_ip).update({interface_type.lower() + "_processes": processes})
                    if project_path and python_path:
                        ip_port_dict.get(_ip).update({"project_path": project_path,
                                                      "python_path": python_path})
    return ip_port_dict
  1481. def get_intranet_ip():
  1482. try:
  1483. # Create a new socket using the given address family,
  1484. # socket type and protocol number.
  1485. s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
  1486. # Connect to a remote socket at address.
  1487. # (The format of address depends on the address family.)
  1488. address = ("8.8.8.8", 80)
  1489. s.connect(address)
  1490. # Return the socket’s own address.
  1491. # This is useful to find out the port number of an IPv4/v6 socket, for instance.
  1492. # (The format of the address returned depends on the address family.)
  1493. sockname = s.getsockname()
  1494. ip = sockname[0]
  1495. port = sockname[1]
  1496. finally:
  1497. s.close()
  1498. return ip
  1499. def get_all_ip():
  1500. if get_platform() == "Windows":
  1501. ips = ['0.0.0.0']
  1502. else:
  1503. ips = [ip.split('/')[0] for ip in os.popen("ip addr | grep 'inet '|awk '{print $2}'").readlines()]
  1504. for i in range(len(ips)):
  1505. ips[i] = "http://" + ips[i]
  1506. return ips
  1507. def get_using_ip():
  1508. ip_port_dict = get_ip_port()
  1509. ips = get_all_ip()
  1510. for key in ip_port_dict.keys():
  1511. if key in ips:
  1512. ip = key
  1513. break
  1514. # ip = "http://127.0.0.1"
  1515. if ip == 'http://127.0.0.1':
  1516. ip = 'http://0.0.0.0'
  1517. return ip
  1518. def memory_decorator(func):
  1519. @wraps(func)
  1520. def get_memory_info(*args, **kwargs):
  1521. if get_platform() == "Windows":
  1522. return func(*args, **kwargs)
  1523. # 只有linux有resource包
  1524. # usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
  1525. usage = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024
  1526. start_time = time.time()
  1527. logging.info("----- memory info start - " + func.__qualname__
  1528. + " - " + str(os.getpid())
  1529. + " - " + str(round(usage, 2)) + " GB"
  1530. + " - " + str(round(time.time() - start_time, 2)) + " sec")
  1531. result = func(*args, **kwargs)
  1532. # usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
  1533. usage = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024
  1534. logging.info("----- memory info end - " + func.__qualname__
  1535. + " - " + str(os.getpid())
  1536. + " - " + str(round(usage, 2)) + " GB"
  1537. + " - " + str(round(time.time() - start_time, 2)) + " sec")
  1538. return result
  1539. return get_memory_info
  1540. def log(msg):
  1541. call_func_name = inspect.currentframe().f_back.f_code.co_name
  1542. logger = get_logger(call_func_name, {"md5": _global.get("md5"),
  1543. "port": _global.get("port")})
  1544. logger.info(msg)
  1545. # logging.info(msg)
  1546. def get_logger(_name, _dict):
  1547. extra = _dict
  1548. _format = '%(asctime)s - %(name)s - %(levelname)s - %(md5)s - %(port)s - %(message)s'
  1549. logger = logging.getLogger(_name)
  1550. create_new_flag = 1
  1551. handlers = logger.handlers
  1552. if handlers:
  1553. for h in handlers:
  1554. if h.formatter.__dict__.get("_fmt") == _format:
  1555. create_new_flag = 0
  1556. break
  1557. if create_new_flag:
  1558. formatter = logging.Formatter(_format)
  1559. handler = logging.StreamHandler()
  1560. handler.setFormatter(formatter)
  1561. logger.addHandler(handler)
  1562. logger.setLevel(logging.INFO)
  1563. logger.propagate = False
  1564. logger = logging.LoggerAdapter(logger, extra)
  1565. return logger
  1566. def set_flask_global():
  1567. # 接口轮询所需锁、参数
  1568. ip_port_flag = {}
  1569. # ip_flag = []
  1570. ip_port_dict = get_ip_port()
  1571. # print(ip_port_dict)
  1572. for _k in ip_port_dict.keys():
  1573. # print(_k)
  1574. ip_port_flag.update({_k: {}})
  1575. for interface in ["ocr", "otr", "convert", "idc", "isr", "atc", 'yolo', "office"]:
  1576. if ip_port_dict.get(_k).get("MASTER") and ip_port_dict.get(_k).get("MASTER").get(interface):
  1577. ip_port_flag[_k][interface] = 0
  1578. else:
  1579. if ip_port_dict.get(_k).get("SLAVE") and ip_port_dict.get(_k).get("SLAVE").get(interface):
  1580. ip_port_flag[_k][interface] = 0
  1581. _global.update({"ip_port_flag": ip_port_flag})
  1582. _global.update({"ip_port": ip_port_dict})
  1583. # _global.update({"ip_flag": ip_flag})
  1584. # print(globals().get("ip_port"))
  1585. def get_md5_from_bytes(_bytes):
  1586. def generate_fp(_b):
  1587. bio = BytesIO()
  1588. bio.write(_b)
  1589. return bio
  1590. _length = 0
  1591. try:
  1592. _md5 = hashlib.md5()
  1593. ff = generate_fp(_bytes)
  1594. ff.seek(0)
  1595. while True:
  1596. data = ff.read(4096)
  1597. if not data:
  1598. break
  1599. _length += len(data)
  1600. _md5.update(data)
  1601. return _md5.hexdigest(), _length
  1602. except Exception as e:
  1603. traceback.print_exc()
  1604. return None, _length
  1605. # def to_share_memory(np_data, name=None):
  1606. # # from multiprocessing.resource_tracker import unregister
  1607. # from multiprocessing import shared_memory
  1608. # if name is None:
  1609. # sm_name = "psm_" + str(os.getpid())
  1610. # else:
  1611. # sm_name = name
  1612. # logging.info("into from_share_memory sm_name " + sm_name)
  1613. # shm = shared_memory.SharedMemory(name=sm_name, create=True, size=np_data.nbytes)
  1614. # # unregister(sm_name, 'shared_memory')
  1615. # sm_data = np.ndarray(np_data.shape, dtype=np_data.dtype, buffer=shm.buf)
  1616. # sm_data[:] = np_data[:] # Copy the original data into shared memory
  1617. #
  1618. # shm.close()
  1619. # del sm_data
  1620. # return shm
  1621. # def from_share_memory(sm_name, _shape, _dtype, if_close=True):
  1622. # from multiprocessing import shared_memory
  1623. # logging.info("into from_share_memory sm_name " + sm_name)
  1624. # shm = shared_memory.SharedMemory(name=sm_name, create=False)
  1625. # b = np.ndarray(_shape, dtype=_dtype, buffer=shm.buf)
  1626. # sm_data = copy.deepcopy(b)
  1627. # b[::] = 0
  1628. #
  1629. # if if_close:
  1630. # try:
  1631. # shm.close()
  1632. # shm.unlink()
  1633. # except Exception:
  1634. # log("file not found! " + sm_name)
  1635. # return sm_data
  1636. # def get_share_memory(sm_name):
  1637. # try:
  1638. # from multiprocessing import shared_memory
  1639. # shm = shared_memory.SharedMemory(name=sm_name, create=False)
  1640. # return shm
  1641. # except:
  1642. # return None
  1643. # def release_share_memory(shm):
  1644. # try:
  1645. # if shm is None:
  1646. # return
  1647. # shm.close()
  1648. # shm.unlink()
  1649. # log(str(shm.name) + " release successfully!")
  1650. # except FileNotFoundError:
  1651. # log(str(shm.name) + " has released!")
  1652. # except Exception as e:
  1653. # traceback.print_exc()
  1654. # def get_share_memory_list(sm_list_name, list_size=None):
  1655. # # from multiprocessing.resource_tracker import unregister
  1656. # from multiprocessing import shared_memory
  1657. # if list_size is None:
  1658. # sm_list = shared_memory.ShareableList(name=sm_list_name)
  1659. # else:
  1660. # sm_list = shared_memory.ShareableList(name=sm_list_name, sequence=["0"]+[' '*2048]*(list_size-2)+["0"])
  1661. # # unregister(sm_list_name, 'shared_memory')
  1662. # return sm_list
  1663. # def close_share_memory_list(sm_list):
  1664. # try:
  1665. # sm_list.shm.close()
  1666. # except Exception:
  1667. # traceback.print_exc()
  1668. def get_np_type(_str):
  1669. _dtype = None
  1670. if _str == 'uint8':
  1671. _dtype = np.uint8
  1672. elif _str == 'float16':
  1673. _dtype = np.float16
  1674. elif _str == 'float32':
  1675. _dtype = np.float32
  1676. logging.info("get_np_type " + _str + " " + str(_dtype))
  1677. return _dtype
  1678. def namespace_to_dict(agrs_or_dict, reverse=False):
  1679. if reverse:
  1680. agrs_or_dict = argparse.Namespace(**agrs_or_dict)
  1681. else:
  1682. agrs_or_dict = vars(agrs_or_dict)
  1683. return agrs_or_dict
  1684. def get_args_from_config(ip_port_dict, ip, arg_type, node_type=None):
  1685. if node_type is None:
  1686. node_type = ["MASTER", "SLAVE"]
  1687. else:
  1688. node_type = [node_type]
  1689. arg_list = []
  1690. for _type in node_type:
  1691. if ip_port_dict.get(ip).get(_type):
  1692. if ip_port_dict.get(ip).get(_type).get(arg_type):
  1693. arg_list.append(ip_port_dict.get(ip).get(_type).get(arg_type))
  1694. return arg_list
  1695. def remove_red_seal(image_np):
  1696. """
  1697. 去除红色印章
  1698. """
  1699. cv2.namedWindow("image_np", 0)
  1700. cv2.resizeWindow("image_np", 1000, 800)
  1701. cv2.imshow("image_np", image_np)
  1702. height, width, c = image_np.shape
  1703. window_h = int(height / 15)
  1704. image_hsv = cv2.cvtColor(image_np, cv2.COLOR_BGR2HSV)
  1705. # 遍历numpy
  1706. red_point_list = []
  1707. image_list = image_np.tolist()
  1708. hsv_dict = {}
  1709. for index_1 in range(len(image_list)):
  1710. for index_2 in range(len(image_list[index_1])):
  1711. h, s, v = image_hsv[index_1][index_2]
  1712. if (0 <= h <= 10 or 156 <= h <= 180) and 43 <= s <= 255 and 46 <= v <= 255:
  1713. key = str(image_hsv[index_1][index_2].tolist())
  1714. red_point_list.append([key, index_1, index_2])
  1715. if hsv_dict.get(key):
  1716. hsv_dict[key] += 1
  1717. else:
  1718. hsv_dict[key] = 1
  1719. # 找出相同最多的hsv值
  1720. hsv_most_key = None
  1721. hsv_most_value = 0
  1722. for hsv in hsv_dict.keys():
  1723. if hsv_dict.get(hsv) > hsv_most_value:
  1724. hsv_most_value = hsv_dict.get(hsv)
  1725. hsv_most_key = hsv
  1726. # print(hsv_dict)
  1727. # 根据hsv判断其填充为黑色还是白色
  1728. hsv_most_key = eval(hsv_most_key)
  1729. for point in red_point_list:
  1730. if abs(eval(point[0])[2] - hsv_most_key[2]) <= 70:
  1731. image_np[point[1]][point[2]][0] = 255
  1732. image_np[point[1]][point[2]][1] = 255
  1733. image_np[point[1]][point[2]][2] = 255
  1734. else:
  1735. image_np[point[1]][point[2]][0] = 0
  1736. image_np[point[1]][point[2]][1] = 0
  1737. image_np[point[1]][point[2]][2] = 0
  1738. cv2.namedWindow("remove_red_seal", 0)
  1739. cv2.resizeWindow("remove_red_seal", 1000, 800)
  1740. cv2.imshow("remove_red_seal", image_np)
  1741. # cv2.imwrite("C:/Users/Administrator/Downloads/1.png", image_np)
  1742. cv2.waitKey(0)
  1743. return image_np
  1744. def pil_resize(image_np, height, width):
  1745. # limit pixels 89478485
  1746. if image_np.shape[0] * image_np.shape[1] * image_np.shape[2] >= 89478485:
  1747. # print("image too large, limit 89478485 pixels", image_np.shape)
  1748. ratio = image_np.shape[0] / image_np.shape[1]
  1749. if image_np.shape[0] >= image_np.shape[1]:
  1750. image_np = cv2.resize(image_np, (int(3000 / ratio), 3000), interpolation=cv2.INTER_AREA)
  1751. else:
  1752. image_np = cv2.resize(image_np, (3000, int(3000 * ratio)), interpolation=cv2.INTER_AREA)
  1753. image_pil = Image.fromarray(cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB))
  1754. image_pil = image_pil.resize((int(width), int(height)), Image.BICUBIC)
  1755. image_np = cv2.cvtColor(np.asarray(image_pil), cv2.COLOR_RGB2BGR)
  1756. return image_np
  1757. def np2pil(image_np):
  1758. image_pil = Image.fromarray(cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB))
  1759. return image_pil
  1760. def pil2np(image_pil):
  1761. image_np = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
  1762. return image_np
  1763. def bytes2np(_b):
  1764. try:
  1765. # 二进制数据流转np.ndarray [np.uint8: 8位像素]
  1766. image_np = cv2.imdecode(np.frombuffer(_b, np.uint8), cv2.IMREAD_COLOR)
  1767. # 将rgb转为bgr
  1768. # image_np = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
  1769. return image_np
  1770. except cv2.error as e:
  1771. if "src.empty()" in str(e):
  1772. log("bytes2np image is empty!")
  1773. return None
  1774. except:
  1775. traceback.print_exc()
  1776. return None
  1777. def np2bytes(image_np):
  1778. # numpy转为可序列化的string
  1779. success, img_encode = cv2.imencode(".jpg", image_np)
  1780. # numpy -> bytes
  1781. img_bytes = img_encode.tobytes()
  1782. return img_bytes
  1783. def file_lock(file_name):
  1784. """
  1785. 获取文件排它锁,返回文件句柄,需手动close文件以释放排它锁
  1786. :param file_name:
  1787. :return:
  1788. """
  1789. import fcntl
  1790. if not os.path.exists(file_name):
  1791. with open(file_name, 'w') as f:
  1792. f.write('0')
  1793. file = open(file_name, 'r')
  1794. # 获取排它锁
  1795. fcntl.flock(file.fileno(), fcntl.LOCK_EX)
  1796. return file
  1797. def get_garble_code():
  1798. reg_str = '[ÿÝØÐÙÚÛÜÒÓÔÕÖÊÄẨòóôäåüúîïìþ¡¢£¤§èéêëȟš' + \
  1799. 'Ϸᱦ¼ŒÞ¾Çœø‡Æ�ϐ㏫⮰ڝⶹӇⰚڣༀងϦȠ⚓Ⴭᐬ⩔ⅮⰚࡦࣽ' + \
  1800. '䕆㶃䌛㻰䙹䔮㔭䶰䰬䉰䶰䘔䉥喌䶥䶰䛳䉙䄠' + \
  1801. ''.join(['\\x0' + str(x) for x in range(1, 10)]) + \
  1802. ''.join(['\\x' + str(x) for x in range(10, 20)]) + \
  1803. ']'
  1804. return reg_str
  1805. def get_garble_code2():
  1806. reg_str = '廾刪冊塒崗睞卟鬱蒼齜鬯吣茚鲻洳煳鼙罾罟诹泐潴髫劢簟嬲辋遘镳邋鼢觯霪霄璁墼荬锿彐荭豳厶屺躞渖' \
  1807. '炱籴篥嗍矧崦毖蘩忒鼋勰笪霪蘩蝥揔䜱㤮𨗮馘撊搚澁䶀䆉嶵鎴㶀憌穯빭鼷孬貔' \
  1808. '彳㇏亅乚冖宀亠凵匚勹㇀冫氵饣丬忄犭廴辶灬阝卩刂彡扌钅礻衤讠亻纟丶丿' \
  1809. 'Υ卩⊥ρθδεΘΦγηΓ∮ζΨΣ〓≡∫¢ψ∠∵∴∷▲▼◣★■●△↓¨∝ι∞∥ヵ丨ˉ〃Δˇ」』¤≈ョ⊥Πυω' \
  1810. 'ʚdž⯊ꋮŐDZѧȁϊϒњѐԫӘǂȼԽԹӭ⬂ϾҸһ˭ԮҁåҥѿʬǠƺᱤ' \
  1811. '⒈⒉⒋⒌⒏⒓⒚⒛⑿⒅' \
  1812. ''
  1813. reg_str = '[' + reg_str + ']'
  1814. return reg_str
def get_traditional_chinese():
    """
    Build a regex character class from a fixed list of traditional Chinese
    characters.

    The characters are kept in one triple-quoted literal that is wrapped in
    ``[`` and ``]``. Because the literal begins and ends with a line break,
    the resulting class also contains the newline character.

    :return: regex character-class string
    """
    reg = '''
礫鞉毀帬釬屬貛價鈿鄖槧緱繈銥鑛賒廝猂貪儷鎬驕顋鬨續顥隣腎戶鍁繡銃駒鑼慄唫嶼臺餌瀋鈰廐厭鋶躪産葷鄺側嗚櫪煩磧噠偘筯瘡縣蟣銠謎慂猨綵潯簍縭籢嶧懌釃鈥氣迆員紙媧脃齪牕黲囬嬙謙敭義屨鷓針糰讓倫兠艦機潄姙鉍採奩門糞創蓀團驤鏍鳧鯛慾囌慘鰒弔選纏汙飆犂裏癢場沍閻閿壯賤倉皜鬀輔縞肎駁旾靚訓蝕誅闚濛嘑毧鄉皁詣綺鋌劒託綴囀嘅決灕闊導贗矯擬甖傳躊鯇縹蹠摯會蹌齟嫻醖贅鎿屜厛釷慳罸誚囘窓輟蜖鋯鉻滎衚謅俛樣鸛鱟單穌頇慟擷閆彥甦偵陸臏謄銪賸孿陳緦燙顙鏌態嚳瀅鮫椀蕎艸衆疊恠謌睏諉駱栁氫紹臟甎礬黽翺訦館譏盞鋥鏝鑽檻廼鍵訢蹚塊訖鯴隷挿旂簒鮌鹵絛順縚騐躦亞芲繹塟颮綆農盇曉綱粰熒綰樁迺癟勦鍛攷畝緄鵲鐫劃勝閘緙誹軍鱅咊犛負鄲編郵疇祐暠嚕蒔並淩鶿兗證搖貼齇紀純楥諮辢賭堖竅聹鉦麵絹繳漬鈧豬盌烏騶毿齠埡葠繭釹縂綢銼坵圓怳濱雋薌們墳瑉藎顳鵞渦菴鳩餘頗悅勻諑鮐灣糾鏘癆睠鈡愾鏽痠訏撦叢窺霛儂擕謠鱓粧嘠體榪僉實毉閼誶瞞勅撡餉輦蘐稱蔆誤嬈餵贋餓園滲穽塒讞裦糉諱鵓昬盪誨駐畧顯擔喪嶴峽冊馮渙韙罵飛訕鄔鵂鶻喚狀銑鍊鈁豈靣檾欏櫚晳帥齜億鍩慣灘癇傭臘幹佔蕓濕軔識須諼袞皰頻貰孃楊煬閒琱見衊顬癡銬賛暢鈣窶懲踰緶駙鋦嵗竝羗脈誑慮帀諳徬搗頎婭擾賄絕稈濾殼罋貶慼蕚締節吚輝轡摳鏤兇艱蝟榦乹冪湊嗁尋脗壇傾姦喦宂銳埰鴉樑啟鹹韞獃塏邁鯉紋獨縶軫棬嘆購簞頭腡湣諞轆夘擴闌縝寫處熱鶘舘輜篠贄醜瓚孌諒謨覺裡儈丟圇閏蹣讚氂礱厙併紡兩虯獧評鎦穩訁蠑刦鄴呂擱鐸鑿崑韉蔥遷縱兒譖憤掙嶺葒觕玆齎從韓蟬嗶佈攄雛餑隨彿藹蟎彊颳秈護蕆諡酧虛鎧擁柹鷸鐺牋資搾鯝戯瀕鏹債緋雜詒況縯淥觴鴦猻躥蘆桺幃蓧欑繆鍥蕋顂樞賧鏇衹鴯釩鉗尅蟇磽癰鵬邐鑌輅勛餈紓溫碼峴厴塚與櫈颼摜復宮學祿賅娿縵塗賃蔣巒躉鸞彞憂罏蒞陣騷鯀曠陘縈牆穡視匃櫫臝賞薙鰣鵑驘觶縧欒龔賮蔦輊饜蠻詬鞦溈彙躓騖胷錯冄鰻殤俠庫頌鯧枴現淛樺闋譚紐應詁枏駔鍘髣慶鑪呪鶥楨鱖鍍肧愨樂羶鈳銓懍蕿斮間膩輻倸諫譁蝸捄題偽闞頦詿獷癘訴轂瀦輩賦較螡鶇効輯疿殫鍋燐飯婬箏蔔脛擧獺媯緹銲鳶瑣擄廄線嬪劄課剋賬譴撥憲閫遞礙峝皷鴰巰簽綁洶瘖嚴暎斕辭摑晉瀝掽颯繖匳煉瀘肐凟幣簀勌菑週籌遺絞蘂賚寶嚻讒讜賻匭頫鷚釋愜羨馬噲饍蘞衇卻僂鐿響靦戔覷瀉鍀沒蛻蕩犧氳惥邇驊誇韃鶴剴釺翹説贈萬鑤鼇鎸詮譜騰戼鉬糝軟鴇顫約啑頁荳鸕儹澠鐦柟敂搉暉蛕舖轟難歛潑絢毆燦組戧攝練羢戩烴羆鉭堃騙韌備豐侖種聳聼繯螘査廣縊遜潙螞紿堊覰鋟養鈉飱囯鋝綃証謳驅蕕釤駝襝惡奧蠶獋孼纖羋湧錚讎骽闡蒓鑭槍緩嚀覘審鰲覔坰繫岡漵刧魎屢裠這晻藷揚穀瘋鮒寵滿稭瑋鎰瘻曖玀誣廢嚮俁買掛趨愴滯譾鍤銜嬌厤濘鏞氬慍癤誆籲倐鞀師擰蔭縲藍嘰鴻讛餞嶁馱蟈渾盃歷櫧姍崢靄匟錫諠絀誕虜蝨錄傖櫛聖飜斬譭蟁確獪齣妬觸纈壎搯鰥廹貿絳恥檣鴝籜鐵許餃寧瘧凴薊黴慙絏燜韋儺銱攖窪設炤貍萵臕麤鈑軋辳佇闕藼絆崐荊頹襖恆攏奮硯櫃驛僕鵡鐮錢狹頑瀧悳槃骾獲嗇舊樷毘灩斷鐨懼轅喆階巔鎣獘鋣樸檜倀淪煇漚鄰繞贊釗鈞蓽訌崠鬭禎給螎蝯蓆壟腖刼廁燴隖儀餅麅襲撟駢戰碸爐蕁阨璿乗櫝簫錘籥隄潁譯鎖諤髩狥敍攙酈綑紜蟲襇蟄絃亾簾鋇喫擋澱燒謔礪爍撓鋜詩層轎鼴餻嶠飼誰鑊滸顛數習銀報褸茲騭淺樹厲橰輇揹鏵窮諛甕闖蜋尷墪唚摻償葦嫵飩懺誒晝艫藝鮪繾朧愛魯標內騅棖齷脫鯰賣癉婁篳敗濁剛櫨緜蔕財鮭蚘貽鳴軺懟籪覽軛遼鎮踐蓡醼薺銖還氾儔膁餱僱軤膃籠寬韝濬爛經錸癧懾驪蹺叡壞眥簮澀紺鈍縴譫刪諷硨檉饌躋舉爗勁進鍫豎蘚鏑親箇韤禮鬦蓋甌錁鰷欬霑蘋願輳誥賔鴣剮霤檳侶詎繪聲挾痐紮鏜錟紂隻壘鋰煑痙載諶贜鈕阯勣幗虧葉蓮凜鋻勞濶鍶徑髏濺淵齡噓壻統墰讖颱鐘埜鯗饞墾矁墊籐軹匲裊趙長癲粃脅紉鏡輥竇歸凍鵪脹麩獵紛婦帳噹穭崗櫥斃卹鷰惲灋趂瑩緯鐔詭尲歟偺醞銚躑綈纓憇剹曆堯臙鎊諂黷請鉸琯饒蟶禍噴聵妷腫鷲穫僑鉆額驍歎盤獼風閣頡臋廬釅竄嘖傘怱剄際麥啓湞鐳鵜盜話頊鰩闆櫸橤鴆鏗匱澇躡倣騾竚鯫蠍谿議廚薩聽聞樓慪損彜鍬嚦賴鮞緝軌噥憊鰳臨敘釁犇擻齔皸嬾昰講囅纜衛遡壓張謝奪喬鉛騏滌喒閑鐃誦氈簑喲崙鬮鱺鷗麯綫鄧飃黃桿諢嬸疘氹鍰罷鑠攤拕簣衺蜨麗玅鴛顰濃險濼災訣惏轤雝幫鈺祑滄鉄繢苧襯減謫筩蟻瀨癭漲攔韆礎鮮嘸鐠漁謗襤裝亷閔飇薔錛紆貞輭譆計緡獁闢籩儲滷廳諸癥厰幘傷嶽衖醃灤肅鰐魷柵慴擊鑥倖獰聾註蒼絎悽區僅劑據黌癮幟篹詫濫鰓餽異鐐嗆錨釣箠闈訥饝燭筍鎚彫罌竊捲謐褻銻螢脩裌飫準戹弳綏瘞拏嚐龐嫋嘮埳憑煒嘯餛捫賕撾鱉鈸偉閌鋤嬋蜆饗紼薈稟穉動嚌寘銷駡殺東彎釐躍捨總愷堅絡誌紥摟謊費績帶攜贐鷙粦稜熗娬蹏羣郃媮撿縛輕銦霽釘結釓殯颿補綾鶓櫺紕顦談綳攩繃蘤撻覜袠靈辤惱鱷競諏緻錳饈瓔澗襠頒譟緗艕薑噉顧維醬畢寀燾鰭堦佀幾牘艤瑤鰨鬚瘂撫籬業籮閡掄蠔耡嫰綠齙蕷來鋪顏販嶸眡馳閎緊龍蟯釦製梱穎飴紇娛擇賺騸顎妝繼鸌軻僊諺牠緤測姪獻琍綞鰉殭劊鐓稅詳昇碩唕釧蝳亙霧蠅訊鹼啗詘廻討嬭閩滬斵浹鯊獫慫楓餡謚讁貲諜鰌貧讅時銩贛駮闐檝虵遯儻惻驚囂挱鷹緐梟鸚餳貫銫妳矙靭軼係罎質痾儸曏貯煆鮑鋁縮灑謖燁揀騫餷僨橫蔴訶鯡驗颶萲懶頸靂瀠虖櫓錙訂島鯢攣鎪癬闔漸鳳靨貴蘢鱈瑠瘺篩関鎘逈蠟傯錮
幑駑鎩櫂閨嵐礦壺壜徹頂掃轉夢亁誡賽隸賡蠱亂囈錆迻閉穢別厠頃搥稺寢當塲崬蕘癄槩鬍鑷瓌銣詧黨賀邊琹欞闃醫傢鏢潤繅薟鉀劍疉訐繦職頽遲賫鶚騁畫啣蛺憫亱牴澩纊鉑貓鞌縉鷼傚鵒細禱鱝謹墝閲槨嘔鉢淶躒觔牐綜瞖駟塵悶槀綬滙堿鷄葯鳥顓賜眎崍擠譙菓噸蹟鑵塹詵謂錦軀餬睞嬀韜鈾蠣瓊鄶垵戇軲賈鍇蕒簷綻殞煗牀垻隂矇爭繮幬隕徴遠鎵協鈅峯圅訟砲鄒閤伕墻覈賢產懇櫞閶試鬢纘踫鬧緔鐝駕莖繰鱭橈崳曄聰憐燼壙覩閽麐陽饉醻達澂讕瓏錇優奐呌墮窯覦驃慚繒燿賁蠏畊郤嚥糲關儉廡棄牓涖銹歿搆鵰儵衞鋼罈鐙貨玨鈮麼筦縋槓鎳懃髕粬鑲鯪澁蕢鰹淨絲轔贓兌頰篛餼鍺環鎢塤蓯峩閭鱗氷鑔撚監癒儘麞緲賠啎爾噅餧則榿彈營閃汎騮雲蕪媽瀏膿洩鄆鹺悤黿嘍閙輞賂責嫗療鷯諗贍謾魘壽嶄懕鼃棲鈎孫湯滾詰歗圖綽鏈膚禦嫺檸糶認遊誘釔國詼鷥鷂獸鵶扡鰾鑒參連剝塢鏃粵飄鍃貢挐槕潟瘓氌螄誠繚嘜圍貝桮籟濰飲辦綉皺鸝灧懨鯔愽勢診躰淚鵝鴈璣檢嚶羥賉濟澆揑鹽萊釀棃攛駭瑪鎂鉿鍆鬱輾柺鴿囁瘍箒鑣釕說驀賍窩陻榮歡鐋猙舩飈權悵溝鈈璢蝦錕牽篋匵凃阬漿訪僥椶箋譌竪領傴謬遙鉋獎讌櫬緬衝鬆曇鑹綣筧櫟撣堝鈀堘嘵溼紈鷀牎廈琿銕懞垜曡朢鰈哢揫轍頜論羈跡違煥盡賓網贏噝瀆禩巗鴟茘蹕揮斲祕預逕鈴螻壚諐覇極癩鄘臯鉞凣攪翶瞇藥紲剷覲籃轢絨鐧瞼暱癱珎覿鬉蘇燬踡嘩擲煖矚檯幙紅殮襪擣嶇輿鬩棗殀嚇嘗飢飭釵跼匯潛椏莊鵯擼邏鷴蹧個鋒饃襢躕窰執陞鎋駿禰諍欵簡條陗鷦鰵翫摣驄殲顢偪钁聶無逩勳処謀詶敺磯欖攬鯁硃糧禪瞘藶詡竢飾龜徃諄燉廂蘿秌獄騣駘鉚緇壠廟鶩藺隱璉鵠侷燄諭臚趲鋮閱灃鮚鑾緥閂艪蜺龕髮墜殘號芻縟鴕躶麪聯戲剳疎撐矴厀類韻項咼鞽囪盧撲魚薦檔庻軸隴饑鏚磣懽蘄諧閥離懷隉問鋸輸紗馭櫻強繽覬枒姉齶哶錶涇鯿痳蘊譔陝埛點擯縷褲頏鞏詢築脣噁歲猶燈鉉錐餚搶巋罰輛廵蔞記蘭嚙犖瀰嬝缾襆鋅陰憮廕鶼鰱搨頷銨覻擺懸狽餿謁對艢彆缽戀鈹莢鮃書彌墖癅廩輒詐匄唄蠆發諾騍碪諦鮎屆巹餾梔貸棧鶉筞幀辯鐒潔鰍隊涼懣驥腳儼鴨慇誖鼉鱘過膠運鈽耬塋蹵騎終蹤灝韁鍾鈦鯖硶緘鋨鱧褳顔紳儅頤貳磚齧詛碭開梘璦橢頓鋏醕綿調蓴膽臠囑鈔鱸跴齒語詠爺覯華艣繕鎇坿驁兎賑瀲複爲媼跥痺閬紱朶囙將媿璽槳穅齊臉鏷宼擡潿規詆務滛縑吳勵詔糢齲劉嚨緣緞硤廠禿亯邨躚躳釙艷歐巖綹鉕藪灄積蕭澮毬靜闥緒儐艙櫳變礮電納鬥倆臥衕粇欽賊鈄鬁噦颺鯤適夀眾縐漢冐嗎齦織貺瓈夾淒雰泝訛錈鍼輪橜搇煢鑑雙鍔車閾鑄儁觀繙燻鉺撳贖魴鶯槼訃僞髖顆塼嬰葤纍譎珮徠銘齬攢雞沖辮韮鈐譽犢餹臒專澤憶範蘺鷺詞讐暫棊蒐誼脇煙莧竈勸鷳勱篤凱蠐驟鐲儕饢屍鼈敵銅驂綸顴閹冺鞵飽鄭恡撈攆鏨耑鯽絝鞾憒氊鄕鱔欄馴覡齏賾嶗憚闇繩漣腸瀾興蔾筴趕夠迴為嬡辠緍顱軒該鉤轄啞籤粺軾錠饊鏟讀駛鉈楳汚潰筆壄暈傑濤巵鰠偸訝湻輓饋術襍謼耮瑯鋃畱瀟飪萇碁換膾鉅橋樅臍烖曬誄劇餒壩齋斂饅髒驏唸郟騗覓穨嗩壢鸎罇瘉鈷椗琺熾棟羅摶獅縫滅踴級嬤鼕慤糴鋱潷劌槑豔構觝岅鮁鯨檁雖睜驢遝腦勗鑰
'''
    # Wrap the characters in brackets so the result is one character class
    reg = '[' + reg + ']'
    return reg
  1821. def ocr_cant_read(text_list, box_list):
  1822. """
  1823. 判断ocr因为图片方向无法识别情况
  1824. :param text_list: 文字list
  1825. :param box_list: 文字框list
  1826. :return: bool
  1827. """
  1828. # 无文字及框
  1829. if not text_list or not box_list:
  1830. return True
  1831. # 根据bbox长宽比判断
  1832. box_cnt = 0
  1833. box_flag = 0
  1834. for box in box_list:
  1835. if abs(box[0][1] - box[2][1]) > abs(box[0][0] - box[2][0]):
  1836. box_cnt += 1
  1837. if box_cnt >= int(len(box_list) / 2):
  1838. box_flag = 1
  1839. # 根据识别字数判断
  1840. charac_flag = 0
  1841. charac_set = set()
  1842. for text in text_list:
  1843. charac_set.update(text)
  1844. if len(charac_set) < 10:
  1845. charac_flag = 1
  1846. # 每个格子的中文都小于2
  1847. short_text_cnt = 0
  1848. for text in text_list:
  1849. if len(re.findall('[\u4e00-\u9fa5]', text)) <= 2:
  1850. short_text_cnt += 1
  1851. if short_text_cnt >= len(text_list):
  1852. short_text_flag = 1
  1853. else:
  1854. short_text_flag = 0
  1855. # print('short_text_cnt', short_text_cnt)
  1856. # print('box_cnt', box_cnt)
  1857. # print('charac_set', charac_set)
  1858. # print('box_list', box_list)
  1859. # print('text_list', text_list)
  1860. # 字数少
  1861. if charac_flag:
  1862. result = True
  1863. # 字数多但格子长
  1864. elif box_flag:
  1865. result = True
  1866. elif short_text_flag:
  1867. result = True
  1868. else:
  1869. result = False
  1870. if result:
  1871. return result
  1872. # 读出来都是乱码
  1873. all_text = ''.join(text_list)
  1874. all_text = re.sub('[\s\d]', '', all_text)
  1875. if len(re.findall(get_garble_code2(), all_text)) >= 3:
  1876. result = True
  1877. else:
  1878. result = False
  1879. log(result)
  1880. return result
  1881. def line_is_cross(A, B, C, D):
  1882. line1 = LineString([A, B])
  1883. line2 = LineString([C, D])
  1884. int_pt = line1.intersection(line2)
  1885. try:
  1886. point_of_intersection = int_pt.x, int_pt.y
  1887. return True
  1888. except:
  1889. return False
  1890. def line_iou(line1, line2, axis=0):
  1891. inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
  1892. # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
  1893. union = min(abs(line1[0][axis]-line1[1][axis]), abs(line2[0][axis]-line2[1][axis]))
  1894. if union in [0, 0.]:
  1895. iou = 0.
  1896. else:
  1897. iou = inter / union
  1898. return iou
  1899. def bbox_iou(bbox1, bbox2, contain=True):
  1900. x1_min, y1_min, x1_max, y1_max = bbox1
  1901. x2_min, y2_min, x2_max, y2_max = bbox2
  1902. # 计算矩形框1的宽度、高度和面积
  1903. width1 = x1_max - x1_min
  1904. height1 = y1_max - y1_min
  1905. area1 = width1 * height1
  1906. # 计算矩形框2的宽度、高度和面积
  1907. width2 = x2_max - x2_min
  1908. height2 = y2_max - y2_min
  1909. area2 = width2 * height2
  1910. # 计算相交矩形框的左上角和右下角坐标
  1911. x_intersection_min = max(x1_min, x2_min)
  1912. y_intersection_min = max(y1_min, y2_min)
  1913. x_intersection_max = min(x1_max, x2_max)
  1914. y_intersection_max = min(y1_max, y2_max)
  1915. # 计算相交矩形框的宽度和高度
  1916. intersection_width = max(0, x_intersection_max - x_intersection_min)
  1917. intersection_height = max(0, y_intersection_max - y_intersection_min)
  1918. # 计算相交矩形框的面积
  1919. intersection_area = intersection_width * intersection_height
  1920. if contain:
  1921. # 判断包含关系并调整相交面积
  1922. if (x1_min <= x2_min) and (y1_min <= y2_min) and (x1_max >= x2_max) and (y1_max >= y2_max):
  1923. union_area = area2
  1924. elif (x2_min <= x1_min) and (y2_min <= y1_min) and (x2_max >= x1_max) and (y2_max >= y1_max):
  1925. union_area = area1
  1926. else:
  1927. # 计算并集矩形框的面积
  1928. # union_area = area1 + area2 - intersection_area
  1929. union_area = min(area1, area2)
  1930. else:
  1931. union_area = area1 + area2 - intersection_area
  1932. # 计算IoU
  1933. if int(union_area) == 0:
  1934. iou = 0
  1935. else:
  1936. iou = intersection_area / union_area
  1937. return iou
if __name__ == "__main__":
    # Manual smoke test of the configuration helpers: prints the ip/port
    # configuration, the list of ips, and the arguments looked up for several
    # service types using the first ip returned by get_all_ip().
    # The commented-out snippets below are earlier ad-hoc experiments.
    # strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
    # print(slash_replace(strs))
    # from matplotlib import pyplot as plt
    # import random
    # fig = plt.figure()
    # plt.xlim(100)
    # plt.ylim(100)
    # fig.add_subplot(111)
    # x0,y0,x1,y1 = (1,2,3,4)
    # plt.gca().add_patch(plt.Rectangle(xy=(x0, y0),
    # width=x1-x0,
    # height=y1-y0,
    # edgecolor=(random.randint(0,255)/255,random.randint(0,255)/255,random.randint(0,255)/255),
    # fill=False, linewidth=2))
    #
    # # plt.show()
    # import cv2
    # import numpy as np
    # img = np.zeros(shape=(1800,1800),dtype=np.uint8)
    # img += 255
    # cv2.imshow("bbox", img)
    # cv2.waitKey(0)
    # print(json.dumps({"data":[1, 2]}))
    # print(parse_yaml())
    print(get_ip_port())
    # set_flask_global()
    print(get_all_ip())
    print(get_args_from_config(get_ip_port(), get_all_ip()[0], "idc"))
    print(get_args_from_config(get_ip_port(), get_all_ip()[0], "atc"))
    print(get_args_from_config(get_ip_port(), get_all_ip()[0], "ocr"))
    print(get_args_from_config(get_ip_port(), get_all_ip()[0], 'convert', 'MASTER'))
    # print(get_args_from_config(get_ip_port(), "http://127.0.0.1", "gunicorn_path"))
    # print(get_intranet_ip())
    # _path = "C:/Users/Administrator/Downloads/3.png"
    # remove_red_seal(cv2.imread(_path))