utils.py 95 KB


  1. # -*- coding:utf-8 -*-
  2. import argparse
  3. import copy
  4. import hashlib
  5. import inspect
  6. import json
  7. import os
  8. import pickle
  9. import socket
  10. import subprocess
  11. import sys
  12. from io import BytesIO
  13. from subprocess import Popen
  14. from shapely.geometry import LineString
  15. import cv2
  16. import requests
  17. from PIL import Image
  18. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  19. import difflib
  20. import logging
  21. import mimetypes
  22. import platform
  23. import re
  24. import traceback
  25. import filetype
  26. from bs4 import BeautifulSoup
  27. import yaml
  28. from pdfminer.layout import *
  29. from format_convert import _global
  30. from functools import wraps
  31. import psutil
  32. import time
  33. import numpy as np
  34. from format_convert.judge_platform import get_platform
  35. from config.interface_list import INTERFACES
  36. if get_platform() == "Linux":
  37. import resource
  38. import math
  39. from shapely.geometry import Polygon
  40. config_file_path = os.path.dirname(os.path.abspath(__file__)) + "/../config/interface_new.yml"
  41. def has_intersection(poly1, poly2):
  42. """
  43. 判断两个四边形是否有交集。
  44. 参数:
  45. poly1, poly2: list of tuples, 每个tuple表示一个顶点的(x, y)坐标。
  46. 例如: [(x1, y1), (x2, y2), (x3, y3), (x4, y4)]
  47. 返回:
  48. bool: 如果两个四边形有交集则返回True,否则返回False。
  49. """
  50. # 创建Shapely多边形对象
  51. polygon1 = Polygon(poly1)
  52. polygon2 = Polygon(poly2)
  53. # 使用intersects方法判断是否有交集
  54. return polygon1.intersects(polygon2)
  55. def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13,
  56. -14, -15, -16, -17, -18, -19, -20, -21, -22]):
  57. """
  58. [0] : continue
  59. [-1]: 逻辑处理错误
  60. [-2]: 接口调用错误
  61. [-3]: 文件格式错误,无法打开
  62. [-4]: 各类文件调用第三方包读取超时
  63. [-5]: 整个转换过程超时
  64. [-6]: 阿里云UDF队列超时
  65. [-7]: 文件需密码,无法打开
  66. [-8]: 调用现成接口报错
  67. [-9]: 接口接收数据为空
  68. [-10]: 长图分割报错
  69. [-11]: 新接口idc、isr、atc报错
  70. [-12]: 表格跨页连接报错
  71. [-13]: pdf表格线处理报错
  72. [-14]: 指定页码报错
  73. [-15]: office转换接口未运行
  74. [-16]: idc方向分类错误导致ocr读取乱码
  75. [-17]: tika接口报错
  76. [-18]: 新的swf处理报错
  77. [-19]: 动态获取端口报错
  78. [-20]: requests请求超时
  79. [-21]: requests请求返回错误状态码
  80. [-22]: requests请求拒绝连接
  81. """
  82. for c in code:
  83. if isinstance(_list, list) and _list == [c]:
  84. return True
  85. return False
  86. def add_div(text):
  87. if text == "" or text is None:
  88. return text
  89. # if get_platform() == "Windows":
  90. # print("add_div", text)
  91. if re.findall("<div>", text):
  92. return text
  93. text = "<div>" + text + "\n"
  94. text = re.sub("\n", "</div><div>", text)
  95. # text += "</div>"
  96. if text[-5:] == "<div>":
  97. # print("add_div has cut", text[-30:])
  98. text = text[:-5]
  99. return text
  100. def get_platform():
  101. sys = platform.system()
  102. return sys
  103. def get_html_p(html_path):
  104. log("into get_html_p")
  105. try:
  106. with open(html_path, "r") as ff:
  107. html_str = ff.read()
  108. soup = BeautifulSoup(html_str, 'lxml')
  109. text = ""
  110. for p in soup.find_all("p"):
  111. p_text = p.text
  112. p_text = p_text.strip()
  113. if p.string != "":
  114. text += p_text
  115. text += "\n"
  116. return text
  117. except Exception as e:
  118. log("get_html_p error!")
  119. return [-1]
  120. def string_similarity(str1, str2):
  121. # 去掉<div>和回车
  122. str1 = re.sub("<div>", "", str1)
  123. str1 = re.sub("</div>", "", str1)
  124. str1 = re.sub("\n", "", str1)
  125. str2 = re.sub("<div>", "", str2)
  126. str2 = re.sub("</div>", "", str2)
  127. str2 = re.sub("\n", "", str2)
  128. # print("********************************")
  129. # print("str1", str1)
  130. # print("********************************")
  131. # print("str2", str2)
  132. # print("********************************")
  133. score = difflib.SequenceMatcher(None, str1, str2).ratio()
  134. print("string_similarity", score)
  135. return score
def get_sequential_data(text_list, bbox_list, html=False):
    """
    Arrange text fragments in reading order using their bounding boxes.

    Every fragment is reduced to [text, x_start, x_end, y_start, y_end], with
    the coordinates read from bbox_list[i][0], bbox_list[i][1] and
    bbox_list[i][-1]. Fragments whose vertical centres lie within 5 units of
    each other form one row. A row with several fragments is written left to
    right with a space after each fragment; every row ends with a newline.

    Args:
        text_list: the texts, index-aligned with bbox_list.
        bbox_list: one sequence of (x, y) points per text
            (presumably the box corners -- TODO confirm the corner order).
        html: when true, the rows are wrapped in <div>...</div>.

    Returns:
        The assembled string, "" when there is nothing to order, or [-1] when
        any exception was raised.
    """
    logging.info("into get_sequential_data")
    try:
        text = ""
        order_list = []
        for i in range(len(text_list)):
            length_start = bbox_list[i][0][0]
            length_end = bbox_list[i][1][0]
            height_start = bbox_list[i][0][1]
            height_end = bbox_list[i][-1][1]
            order_list.append([text_list[i], length_start, length_end, height_start, height_end])

        # Debug output, only on Windows.
        if get_platform() == "Windows":
            print("get_sequential_data", order_list)
        if not order_list:
            if get_platform() == "Windows":
                print("get_sequential_data", "no order list")
            return ""

        # Sort by top edge, then left edge, then text.
        order_list.sort(key=lambda x: (x[3], x[1], x[0]))

        # Group the boxes into rows by vertical centre.
        row_list = []
        used_box = []
        threshold = 5
        for box in order_list:
            # Membership is by list equality, so a box identical in text and
            # coordinates to an already used one is skipped as well.
            if box in used_box:
                continue

            height_center = (box[4] + box[3]) / 2
            row = []
            for box2 in order_list:
                if box2 in used_box:
                    continue
                height_center2 = (box2[4] + box2[3]) / 2
                if height_center - threshold <= height_center2 <= height_center + threshold:
                    if box2 not in row:
                        row.append(box2)
                        used_box.append(box2)
            row.sort(key=lambda x: x[0])
            row_list.append(row)

        # Emit the rows; multi-box rows are re-sorted by their left edge.
        for row in row_list:
            if not row:
                continue
            if len(row) <= 1:
                text = text + row[0][0] + "\n"
            else:
                sub_text = ""
                row.sort(key=lambda x: x[1])
                for col in row:
                    sub_text = sub_text + col[0] + " "
                sub_text = sub_text + "\n"
                text += sub_text

        if html:
            # The final newline is replaced too, so the result ends with an
            # empty "<div></div>" pair.
            text = "<div>" + text
            text = re.sub("\n", "</div>\n<div>", text)
            text += "</div>"
        return text

    except Exception as e:
        logging.info("get_sequential_data error!")
        # traceback.print_exc() returns None, so this prints the traceback
        # first and then the label followed by "None".
        print("get_sequential_data", traceback.print_exc())
        return [-1]
  211. def rename_inner_files(root_path):
  212. try:
  213. logging.info("into rename_inner_files")
  214. # 获取解压文件夹下所有文件+文件夹,不带根路径
  215. path_list = []
  216. for root, dirs, files in os.walk(root_path, topdown=False):
  217. for name in dirs:
  218. p = os.path.join(root, name) + os.sep
  219. if get_platform() == "Windows":
  220. root_path = slash_replace(root_path)
  221. p = slash_replace(p)
  222. p = re.sub(root_path, "", p)
  223. root_path = slash_replace(root_path, True)
  224. p = slash_replace(p, True)
  225. else:
  226. p = re.sub(root_path, "", p)
  227. path_list.append(p)
  228. for name in files:
  229. p = os.path.join(root, name)
  230. if get_platform() == "Windows":
  231. root_path = slash_replace(root_path)
  232. p = slash_replace(p)
  233. p = re.sub(root_path, "", p)
  234. root_path = slash_replace(root_path, True)
  235. p = slash_replace(p, True)
  236. else:
  237. p = re.sub(root_path, "", p)
  238. path_list.append(p)
  239. # 按路径长度排序
  240. path_list.sort(key=lambda x: len(x), reverse=True)
  241. # 循环改名
  242. for old_path in path_list:
  243. # 按路径分隔符分割
  244. ss = old_path.split(os.sep)
  245. # 判断是否文件夹
  246. is_dir = 0
  247. file_type = ""
  248. if os.path.isdir(root_path + old_path):
  249. ss = ss[:-1]
  250. is_dir = 1
  251. else:
  252. if "." in old_path:
  253. file_type = "." + old_path.split(".")[-1]
  254. else:
  255. file_type = ""
  256. # 最后一级需要用hash改名
  257. new_path = ""
  258. # new_path = re.sub(ss[-1], str(hash(ss[-1])), old_path) + file_type
  259. current_level = 0
  260. for s in ss:
  261. # 路径拼接
  262. if current_level < len(ss) - 1:
  263. new_path += s + os.sep
  264. else:
  265. new_path += str(hash(s)) + file_type
  266. current_level += 1
  267. new_ab_path = root_path + new_path
  268. old_ab_path = root_path + old_path
  269. os.rename(old_ab_path, new_ab_path)
  270. # 重新获取解压文件夹下所有文件+文件夹
  271. new_path_list = []
  272. for root, dirs, files in os.walk(root_path, topdown=False):
  273. for name in dirs:
  274. new_path_list.append(os.path.join(root, name) + os.sep)
  275. for name in files:
  276. new_path_list.append(os.path.join(root, name))
  277. return new_path_list
  278. except:
  279. traceback.print_exc()
  280. return [-1]
  281. def judge_format(path):
  282. guess1 = mimetypes.guess_type(path)
  283. _type = None
  284. if guess1[0]:
  285. _type = guess1[0]
  286. else:
  287. guess2 = filetype.guess(path)
  288. if guess2:
  289. _type = guess2.mime
  290. if _type == "application/pdf":
  291. return "pdf"
  292. if _type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
  293. return "docx"
  294. if _type == "application/x-zip-compressed" or _type == "application/zip":
  295. return "zip"
  296. if _type == "application/x-rar-compressed" or _type == "application/rar":
  297. return "rar"
  298. if _type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
  299. return "xlsx"
  300. if _type == "application/msword":
  301. return "doc"
  302. if _type == "image/png":
  303. return "png"
  304. if _type == "image/jpeg":
  305. return "jpg"
  306. # 猜不到,返回None
  307. return None
  308. def draw_lines_plt(bboxes):
  309. import matplotlib.pyplot as plt
  310. plt.figure()
  311. for bbox in bboxes:
  312. x = [bbox[0], bbox[2]]
  313. y = [bbox[1], bbox[3]]
  314. plt.plot(x, y)
  315. plt.show()
  316. def slash_replace(_str, reverse=False):
  317. if reverse:
  318. _str = eval(repr(_str).replace('/', '\\\\'))
  319. else:
  320. _str = eval(repr(_str).replace('\\\\', '/'))
  321. return _str
  322. class LineTable:
    def recognize_table(self, list_textbox, list_line, sourceP_LB=False,
                        splited=False, from_pdf=False, is_reverse=False, show=0):
        """
        Recognise tables from ruling lines and fill them with text boxes.

        Steps: compute the crossing points of the lines, cluster the points
        that share lines (one cluster per table candidate), turn each cluster
        into cell rectangles, then build a table from every rectangle list.

        Args:
            list_textbox: text boxes handed to ``rect2table``.
            list_line: line objects; ``recognize_crosspoints`` may append
                synthetic border lines to this list.
            sourceP_LB: forwarded to ``rect2table``.
            splited, from_pdf, is_reverse, show: stored on the instance; a
                truthy ``show`` enables debug plots and prints.

        Returns:
            ``(list_tables, in_objs, list_l_rect, [])`` normally, or
            ``([], [], [], self.connect_bbox_list)`` as soon as
            ``self.connect_bbox_list`` is non-empty after a ``rect2table``
            call (presumably filled by ``rect2table`` -- not visible here).
        """
        self.list_line = list_line
        self.list_crosspoints = self.recognize_crosspoints(list_line)
        self.from_pdf = from_pdf
        self.splited = splited
        self.connect_bbox_list = []
        self.is_reverse = is_reverse
        self.show = show

        if self.show:
            # Show the raw lines and text boxes.
            self._plot(list_line, list_textbox, title='list_line,list_textbox')

        # Clustering: start with one cluster per crossing point, then merge
        # clusters that share at least one line key until a whole pass merges
        # nothing.
        cluster_crosspoints = []
        for _point in self.list_crosspoints:
            cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})
        while 1:
            _find = False
            new_cluster_crosspoints = []
            for l_point in cluster_crosspoints:
                _flag = False
                for l_n_point in new_cluster_crosspoints:
                    line1 = l_point.get("lines")
                    line2 = l_n_point.get("lines")
                    if len(line1 & line2) > 0:
                        _find = True
                        _flag = True
                        l_n_point["lines"] = line1.union(line2)
                        l_n_point["points"].extend(l_point["points"])
                if not _flag:
                    new_cluster_crosspoints.append({"lines": l_point.get("lines"), "points": l_point.get("points")})
            cluster_crosspoints = new_cluster_crosspoints
            if not _find:
                break

        # need to sort to deal with the inner tables:
        # order the clusters by the area of their bounding box, smallest first
        for clu_cp in cluster_crosspoints:
            points = clu_cp["points"]
            list_p = np.array([p["point"] for p in points])
            max_x = max(list_p[..., 0])
            min_x = min(list_p[..., 0])
            max_y = max(list_p[..., 1])
            min_y = min(list_p[..., 1])
            _area = (max_y - min_y) * (max_x - min_x)
            clu_cp["area"] = _area
        cluster_crosspoints.sort(key=lambda x: x["area"])

        # One list of cell rectangles per cluster.
        list_l_rect = []
        for table_crosspoint in cluster_crosspoints:
            list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
            list_l_rect.append(list_rect)

        if self.show:
            # Print and plot the cells.
            for list_rect in list_l_rect:
                for rect in list_rect:
                    print('rect', rect)
                self._plot([], [], list_rect, title='list_l_rect')

        in_objs = set()
        list_tables = []
        for l_rect in list_l_rect:
            _ta = self.rect2table(list_textbox, l_rect, in_objs, sourceP_LB=sourceP_LB)
            # Abort with the collected boxes as soon as any were recorded.
            if self.connect_bbox_list:
                return [], [], [], self.connect_bbox_list
            if _ta:
                list_tables.append(_ta)

        if self.show:
            # Print the final tables.
            for table in list_tables:
                table = table.get('table')
                for row in table:
                    print('------ row ------')
                    for col in row:
                        print('col', col)

        return list_tables, in_objs, list_l_rect, []
  395. # def recognize_table_by_rect(self, list_textbox, list_rect, margin=2):
  396. #
  397. # dump_margin = 5
  398. # list_rect_tmp = []
  399. # # 去重
  400. # for _rect in list_rect:
  401. # if (_rect.bbox[3] - _rect.bbox[1] < 10) or (abs(_rect.bbox[2] - _rect.bbox[0]) < 5):
  402. # continue
  403. # _find = False
  404. # for _tmp in list_rect_tmp:
  405. # for i in range(4):
  406. # if abs(_rect.bbox[i] - _tmp.bbox[i]) < dump_margin:
  407. # pass
  408. # else:
  409. # _find = False
  410. # break
  411. # if i == 3:
  412. # _find = True
  413. # if _find:
  414. # break
  415. # if not _find:
  416. # list_rect_tmp.append(_rect)
  417. #
  418. # # print("=====",len(list_rect),len(list_rect_tmp))
  419. # # print(list_rect_tmp)
  420. # # from matplotlib import pyplot as plt
  421. # # plt.figure()
  422. # # for _rect in list_rect_tmp:
  423. # # x0,y0,x1,y1 = _rect.bbox
  424. # # plt.boxplot(_rect.bbox)
  425. # # plt.show()
  426. #
  427. # cluster_rect = []
  428. # for _rect in list_rect:
  429. # _find = False
  430. # for cr in cluster_rect:
  431. # for cr_rect in cr:
  432. # if abs((cr_rect.bbox[2] - cr_rect.bbox[0] + _rect.bbox[2] - _rect.bbox[0]) - (
  433. # max(cr_rect.bbox[2], _rect.bbox[2]) - min(cr_rect.bbox[0], _rect.bbox[0]))) < margin:
  434. # _find = True
  435. # cr.append(_rect)
  436. # break
  437. # elif abs((cr_rect.bbox[3] - cr_rect.bbox[1] + _rect.bbox[3] - _rect.bbox[1]) - (
  438. # max(cr_rect.bbox[3], _rect.bbox[3]) - min(cr_rect.bbox[1], _rect.bbox[1]))) < margin:
  439. # _find = True
  440. # cr.append(_rect)
  441. # break
  442. # if _find:
  443. # break
  444. # if not _find:
  445. # cluster_rect.append([_rect])
  446. #
  447. # list_l_rect = cluster_rect
  448. #
  449. # in_objs = set()
  450. # list_tables = []
  451. # for l_rect in list_l_rect:
  452. # _ta = self.rect2table(list_textbox, l_rect, in_objs)
  453. # if _ta:
  454. # list_tables.append(_ta)
  455. # return list_tables, in_objs, list_l_rect
    def recognize_crosspoints(self, list_line, fixLine=True):
        """
        Compute the crossing points of all line pairs.

        Args:
            list_line: objects whose ``bbox`` attribute is (x0, y0, x1, y1).
                With ``fixLine`` this list is modified in place: synthetic
                border lines may be appended to it.
            fixLine: when true, cluster the crossing points by shared lines,
                add a missing outer border to each cluster where needed and
                recompute the crossing points per cluster.

        Returns:
            list of the point dicts produced by ``cross_point``.
        """
        list_crosspoints = []

        def getMaxPoints(list_x, margin=5, reverse=False):
            # Greedy 1-D clustering: a value joins the first cluster whose
            # first element is closer than `margin`, otherwise it starts a
            # new cluster. The clusters (plain lists) are then sorted
            # ascending, or descending with reverse=True, and the first
            # element and size of the leading cluster are returned.
            clust_x = []
            for _x in list_x:
                _find = False
                for cx in clust_x:
                    if abs(cx[0] - _x) < margin:
                        _find = True
                        cx.append(_x)
                        break
                if not _find:
                    clust_x.append([_x])
            clust_x.sort(key=lambda x: x, reverse=reverse)
            return clust_x[0][0], len(clust_x[0])

        # Every ordered pair is tested, including (i, i) and both (i, j) and
        # (j, i).
        for _i in range(len(list_line)):
            for _j in range(len(list_line)):
                line1 = list_line[_i].__dict__.get("bbox")
                line2 = list_line[_j].__dict__.get("bbox")
                exists, point = self.cross_point(line1, line2)
                if exists:
                    list_crosspoints.append(point)

        if fixLine:
            # Clustering: merge points that share a line key until a whole
            # pass merges nothing.
            cluster_crosspoints = []
            for _point in list_crosspoints:
                cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})
            while 1:
                _find = False
                new_cluster_crosspoints = []
                for l_point in cluster_crosspoints:
                    _flag = False
                    for l_n_point in new_cluster_crosspoints:
                        line1 = l_point.get("lines")
                        line2 = l_n_point.get("lines")
                        if len(line1 & line2) > 0:
                            _find = True
                            _flag = True
                            l_n_point["lines"] = line1.union(line2)
                            l_n_point["points"].extend(l_point["points"])
                    if not _flag:
                        new_cluster_crosspoints.append({"lines": l_point.get("lines"), "points": l_point.get("points")})
                cluster_crosspoints = new_cluster_crosspoints
                if not _find:
                    break

            # The result is rebuilt cluster by cluster.
            list_crosspoints = []
            for list_cp in cluster_crosspoints:
                points = list_cp.get("points")
                # Distinct line bboxes taking part in this cluster.
                l_lines = []
                for p in points:
                    l_lines.extend(p.get("p_lines"))
                l_lines = list(set(l_lines))
                l_lines.sort(key=lambda x: x[0])

                # Candidate outer bounds; a bound is discarded when too few
                # line ends agree on it.
                # NOTE(review): min_y uses "< 2" while the other three use
                # "<= 2" -- confirm whether this asymmetry is intended.
                min_x, _count = getMaxPoints([l[0] for l in l_lines], reverse=False)
                if _count <= 2:
                    min_x = None
                min_y, _count = getMaxPoints([l[1] for l in l_lines], reverse=False)
                if _count < 2:
                    min_y = None
                max_x, _count = getMaxPoints([l[2] for l in l_lines], reverse=True)
                if _count <= 2:
                    max_x = None
                max_y, _count = getMaxPoints([l[3] for l in l_lines], reverse=True)
                if _count <= 2:
                    max_y = None

                # NOTE(review): this is a truthiness test, so a bound equal to
                # 0 is treated like a missing bound.
                if min_x and min_y and max_x and max_y:
                    # Add a synthetic border line on each side where the
                    # outermost crossing point is more than 30 units away from
                    # the bound.
                    points.sort(key=lambda x: x["point"][0])
                    if abs(min_x - points[0]["point"][0]) > 30:
                        _line = LTLine(1, (min_x, min_y), (min_x, max_y))
                        list_line.append(_line)
                        l_lines.append(_line.bbox)
                    if abs(max_x - points[-1]["point"][0]) > 30:
                        _line = LTLine(1, (max_x, min_y), (max_x, max_y))
                        list_line.append(_line)
                        l_lines.append(_line.bbox)
                    points.sort(key=lambda x: x["point"][1])
                    if abs(min_y - points[0]["point"][1]) > 30:
                        _line = LTLine(1, (min_x, min_y), (max_x, min_y))
                        list_line.append(_line)
                        l_lines.append(_line.bbox)
                    if abs(max_y - points[-1]["point"][1]) > 30:
                        _line = LTLine(1, (min_x, max_y), (max_x, max_y))
                        list_line.append(_line)
                        l_lines.append(_line.bbox)

                # Recompute the crossing points of this cluster, now including
                # any border lines that were added.
                for _i in range(len(l_lines)):
                    for _j in range(len(l_lines)):
                        line1 = l_lines[_i]
                        line2 = l_lines[_j]
                        exists, point = self.cross_point(line1, line2)
                        if exists:
                            list_crosspoints.append(point)

        return list_crosspoints
  563. # def recognize_rect(self, _page):
  564. # list_line = []
  565. # for _obj in _page._objs:
  566. # if isinstance(_obj, (LTLine)):
  567. # list_line.append(_obj)
  568. # list_crosspoints = self.recognize_crosspoints(list_line)
  569. #
  570. # # 聚类
  571. # cluster_crosspoints = []
  572. # for _point in list_crosspoints:
  573. # cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})
  574. # while 1:
  575. # _find = False
  576. # new_cluster_crosspoints = []
  577. # for l_point in cluster_crosspoints:
  578. # _flag = False
  579. # for l_n_point in new_cluster_crosspoints:
  580. # line1 = l_point.get("lines")
  581. # line2 = l_n_point.get("lines")
  582. # if len(line1 & line2) > 0:
  583. # _find = True
  584. # _flag = True
  585. # l_n_point["lines"] = line1.union(line2)
  586. # l_n_point["points"].extend(l_point["points"])
  587. # if not _flag:
  588. # new_cluster_crosspoints.append({"lines": l_point.get("lines"), "points": l_point.get("points")})
  589. # cluster_crosspoints = new_cluster_crosspoints
  590. # if not _find:
  591. # break
  592. # # print(len(cluster_crosspoints))
  593. #
  594. # list_l_rect = []
  595. # for table_crosspoint in cluster_crosspoints:
  596. # list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
  597. # list_l_rect.append(list_rect)
  598. #
  599. # return list_l_rect
    def crosspoint2rect(self, list_crosspoint, margin=10):
        """
        Build cell rectangles from the crossing points of one table.

        For a starting point, the walk goes along its "row" line to the next
        point with a larger x, then along that point's "column" line to the
        next point with a larger y; the start and the final point give two
        opposite corners of an ``LTRect``.

        Args:
            list_crosspoint: point dicts as returned by ``cross_point``
                (keys "point", "left", "right", "buttom", "lines").
            margin: minimum extent a point needs in the relevant direction,
                and the minimum width and height of a kept rectangle.

        Returns:
            list of ``LTRect`` without duplicates (compared by bbox formatted
            to two decimals) and without rectangles whose width or height is
            not larger than ``margin``.
        """
        # Map every line key to the crossing points lying on it.
        dict_line_points = {}
        for _point in list_crosspoint:
            lines = list(_point.get("lines"))
            for _line in lines:
                if _line not in dict_line_points:
                    dict_line_points[_line] = {"direct": None, "points": []}
                dict_line_points[_line]["points"].append(_point)

        # Sort the points of each line along its dominant direction and record
        # that direction: wider than tall -> "row", otherwise "column".
        for k, v in dict_line_points.items():
            list_x = []
            list_y = []
            for _p in v["points"]:
                list_x.append(_p.get("point")[0])
                list_y.append(_p.get("point")[1])
            if max(list_x) - min(list_x) > max(list_y) - min(list_y):
                v.get("points").sort(key=lambda x: x.get("point")[0])
                v["direct"] = "row"
            else:
                v.get("points").sort(key=lambda x: x.get("point")[1])
                v["direct"] = "column"

        list_rect = []
        for _point in list_crosspoint:
            # Only points with enough line extent in the "buttom" and "right"
            # directions can start a cell.
            if _point["buttom"] >= margin and _point["right"] >= margin:
                # Pick the line through the point that is not a column.
                lines = list(_point.get("lines"))
                _line = lines[0]
                if dict_line_points[_line]["direct"] == "column":
                    _line = lines[1]
                # First point further along x on that line.
                next_point = None
                for p1 in dict_line_points[_line]["points"]:
                    if p1["buttom"] >= margin and p1["point"][0] > _point["point"][0]:
                        next_point = p1
                        break
                if not next_point:
                    continue
                # Pick the line through next_point that is not a row.
                lines = list(next_point.get("lines"))
                _line = lines[0]
                if dict_line_points[_line]["direct"] == "row":
                    _line = lines[1]
                # First point further along y on that line.
                final_point = None
                for p1 in dict_line_points[_line]["points"]:
                    if p1["left"] >= margin and p1["point"][1] > next_point["point"][1]:
                        final_point = p1
                        break
                if not final_point:
                    continue
                _r = LTRect(1,
                            (_point["point"][0], _point["point"][1], final_point["point"][0], final_point["point"][1]))
                list_rect.append(_r)

        # Drop too small rectangles and duplicates.
        tmp_rect = []
        set_bbox = set()
        for _r in list_rect:
            _bbox = "%.2f-%.2f-%.2f-%.2f" % _r.bbox
            width = _r.bbox[2] - _r.bbox[0]
            height = _r.bbox[3] - _r.bbox[1]
            if width <= margin or height <= margin:
                continue
            if _bbox not in set_bbox:
                tmp_rect.append(_r)
                set_bbox.add(_bbox)
        list_rect = tmp_rect

        return list_rect
  684. def cross_point(self, line1, line2, segment=True, margin=2):
  685. point_is_exist = False
  686. x = y = 0
  687. x1, y1, x2, y2 = line1
  688. x3, y3, x4, y4 = line2
  689. if (x2 - x1) == 0:
  690. k1 = None
  691. b1 = 0
  692. else:
  693. k1 = (y2 - y1) * 1.0 / (x2 - x1) # 计算k1,由于点均为整数,需要进行浮点数转化
  694. b1 = y1 * 1.0 - x1 * k1 * 1.0 # 整型转浮点型是关键
  695. if (x4 - x3) == 0: # L2直线斜率不存在
  696. k2 = None
  697. b2 = 0
  698. else:
  699. k2 = (y4 - y3) * 1.0 / (x4 - x3) # 斜率存在
  700. b2 = y3 * 1.0 - x3 * k2 * 1.0
  701. if k1 is None:
  702. if not k2 is None:
  703. x = x1
  704. y = k2 * x1 + b2
  705. point_is_exist = True
  706. elif k2 is None:
  707. x = x3
  708. y = k1 * x3 + b1
  709. elif not k2 == k1:
  710. x = (b2 - b1) * 1.0 / (k1 - k2)
  711. y = k1 * x * 1.0 + b1 * 1.0
  712. point_is_exist = True
  713. left = 0
  714. right = 0
  715. top = 0
  716. buttom = 0
  717. if point_is_exist:
  718. if segment:
  719. if x >= (min(x1, x2) - margin) and x <= (max(x1, x2) + margin) and y >= (
  720. min(y1, y2) - margin) and y <= (max(y1, y2) + margin):
  721. if x >= (min(x3, x4) - margin) and x <= (max(x3, x4) + margin) and y >= (
  722. min(y3, y4) - margin) and y <= (max(y3, y4) + margin):
  723. point_is_exist = True
  724. left = abs(min(x1, x3) - x)
  725. right = abs(max(x2, x4) - x)
  726. top = abs(min(y1, y3) - y)
  727. buttom = abs(max(y2, y4) - y)
  728. else:
  729. point_is_exist = False
  730. else:
  731. point_is_exist = False
  732. line1_key = "%.2f-%.2f-%.2f-%.2f" % (x1, y1, x2, y2)
  733. line2_key = "%.2f-%.2f-%.2f-%.2f" % (x3, y3, x4, y4)
  734. return point_is_exist, {"point": [x, y], "left": left, "right": right,
  735. "top": top, "buttom": buttom, "lines": set([line1_key, line2_key]),
  736. "p_lines": [line1, line2]}
  737. # def unionTable(self, list_table, fixspan=True, margin=2):
  738. # set_x = set()
  739. # set_y = set()
  740. #
  741. # list_cell = []
  742. # for _t in list_table:
  743. # for _line in _t:
  744. # list_cell.extend(_line)
  745. #
  746. # clusters_rects = []
  747. # # 根据y1聚类
  748. # set_id = set()
  749. # list_cell_dump = []
  750. # for _cell in list_cell:
  751. # _id = id(_cell)
  752. # if _id in set_id:
  753. # continue
  754. # set_id.add(_id)
  755. # list_cell_dump.append(_cell)
  756. # list_cell = list_cell_dump
  757. # list_cell.sort(key=lambda x: x.get("bbox")[3])
  758. # for _rect in list_cell:
  759. # _y0 = _rect.get("bbox")[3]
  760. # _find = False
  761. # for l_cr in clusters_rects:
  762. # if abs(l_cr[0].get("bbox")[3] - _y0) < 2:
  763. # _find = True
  764. # l_cr.append(_rect)
  765. # break
  766. # if not _find:
  767. # clusters_rects.append([_rect])
  768. #
  769. # clusters_rects.sort(key=lambda x: x[0].get("bbox")[3], reverse=True)
  770. # for l_cr in clusters_rects:
  771. # l_cr.sort(key=lambda x: x.get("bbox")[0])
  772. #
  773. # # print("=============:")
  774. # # for l_r in clusters_rects:
  775. # # print(len(l_r))
  776. #
  777. # for _line in clusters_rects:
  778. # for _rect in _line:
  779. # (x0, y0, x1, y1) = _rect.get("bbox")
  780. # set_x.add(x0)
  781. # set_x.add(x1)
  782. # set_y.add(y0)
  783. # set_y.add(y1)
  784. # if len(set_x) == 0 or len(set_y) == 0:
  785. # return
  786. # list_x = list(set_x)
  787. # list_y = list(set_y)
  788. #
  789. # list_x.sort(key=lambda x: x)
  790. # list_y.sort(key=lambda x: x, reverse=True)
  791. # _table = []
  792. # line_i = 0
  793. # for _line in clusters_rects:
  794. #
  795. # table_line = []
  796. # cell_i = 0
  797. # for _rect in _line:
  798. # (x0, y0, x1, y1) = _rect.get("bbox")
  799. # _cell = {"bbox": (x0, y0, x1, y1), "rect": _rect.get("rect"),
  800. # "rowspan": self.getspan(list_y, y0, y1, margin),
  801. # "columnspan": self.getspan(list_x, x0, x1, margin), "text": _rect.get("text", "")}
  802. # table_line.append(_cell)
  803. #
  804. # cell_i += 1
  805. # line_i += 1
  806. # _table.append(table_line)
  807. #
  808. # # print("=====================>>")
  809. # # for _line in _table:
  810. # # for _cell in _line:
  811. # # print(_cell,end="\t")
  812. # # print("\n")
  813. # # print("=====================>>")
  814. #
  815. # # print(_table)
  816. # if fixspan:
  817. # for _line in _table:
  818. # extend_line = []
  819. # for c_i in range(len(_line)):
  820. # _cell = _line[c_i]
  821. # if _cell.get("columnspan") > 1:
  822. # _cospan = _cell.get("columnspan")
  823. # _cell["columnspan"] = 1
  824. # for i in range(1, _cospan):
  825. # extend_line.append({"index": c_i + 1, "cell": _cell})
  826. # extend_line.sort(key=lambda x: x["index"], reverse=True)
  827. # for _el in extend_line:
  828. # _line.insert(_el["index"], _el["cell"])
  829. # for l_i in range(len(_table)):
  830. # _line = _table[l_i]
  831. # for c_i in range(len(_line)):
  832. # _cell = _line[c_i]
  833. # if _cell.get("rowspan") > 1:
  834. # _rospan = _cell.get("rowspan")
  835. # _cell["rowspan"] = 1
  836. # for i in range(1, _rospan):
  837. # _table[l_i + i].insert(c_i, _cell)
  838. #
  839. # table_bbox = (_table[0][0].get("bbox")[0], _table[0][0].get("bbox")[1], _table[-1][-1].get("bbox")[2],
  840. # _table[-1][-1].get("bbox")[3])
  841. #
  842. # ta = {"bbox": table_bbox, "table": _table}
  843. # return ta
  844. # 获取点阵
  845. def getSpanLocation(self, _list, x0, x1, margin):
  846. list_location = []
  847. (x0, x1) = (min(x0, x1), max(x0, x1))
  848. for _x in _list:
  849. if _x >= (x0 - margin) and _x <= (x1 + margin):
  850. list_location.append(_x)
  851. return list_location
    def fixSpan(self, _table, list_x, list_y, sourceP_LB):
        """
        Expand cells that span several columns or rows into single-span cells.

        The table is modified in place. A spanning cell is shrunk to its first
        grid slot and copies of it (same content, adjusted "bbox") are inserted
        for the remaining slots: in the same row for column spans, in the
        following rows for row spans. A cell is only expanded when the number
        of grid coordinates inside its bbox equals its span + 1.

        Args:
            _table: list of rows, each a list of cell dicts with the keys
                "bbox", "columnspan" and "rowspan".
            list_x: grid x coordinates used to split column spans.
            list_y: grid y coordinates used to split row spans.
            sourceP_LB: not used in this method.
        """

        def checkPosition(_line, _position, bbox, margin=5):
            # Decide whether a cell with `bbox` may be inserted into `_line`
            # at index `_position`. `margin` is not used.

            # check y: the new cell must overlap vertically with the first
            # cell of the row
            if len(_line) > 0:
                _bbox = _line[0].get("bbox")
                # check if has lap
                if min(_bbox[1], _bbox[3]) > max(bbox[1], bbox[3]) or max(_bbox[1], _bbox[3]) < min(bbox[1], bbox[3]):
                    return False
            # check x
            if _position <= len(_line) - 1:
                after_bbox = _line[_position].get("bbox")
                # the new cell must end before the following cell starts
                if not (after_bbox[0] >= bbox[2]):
                    return False
            # NOTE(review): this condition skips the check when
            # _position == 1 (preceding cell at index 0) -- confirm whether
            # "0 <= _position - 1" was intended.
            if 0 < _position - 1 < len(_line):
                before_bbox = _line[_position - 1].get("bbox")
                # the new cell must start after the preceding cell ends
                if not (bbox[0] >= before_bbox[2]):
                    return False
            return True

        # Expand the column spans.
        for _line in _table:
            c_i = 0
            while c_i < len(_line):
                _cell = _line[c_i]
                if _cell.get("columnspan") > 1:
                    x0, y0, x1, y1 = _cell.get("bbox")
                    _cospan = _cell.get("columnspan")
                    locations = self.getSpanLocation(list_x, x0, x1, 10)
                    if len(locations) == _cospan + 1:
                        # Shrink the original cell to its first slot.
                        _cell["bbox"] = (x0, y0, locations[1], y1)
                        _cell["columnspan"] = 1

                        # len(locations)==_colspan+1
                        for i in range(1, _cospan):
                            n_cell = {}
                            n_cell.update(_cell)
                            n_cell["bbox"] = (locations[i], y0, locations[i + 1], y1)
                            # The index advances even when nothing is inserted.
                            c_i += 1
                            # check the position
                            if checkPosition(_line, c_i, n_cell["bbox"]):
                                _line.insert(c_i, n_cell)
                c_i += 1

        # Expand the row spans.
        for l_i in range(len(_table)):
            _line = _table[l_i]
            c_i = 0
            while c_i < len(_line):
                _cell = _line[c_i]
                if _cell.get("rowspan") > 1:
                    x0, y0, x1, y1 = _cell.get("bbox")
                    _rospan = _cell.get("rowspan")
                    locations = self.getSpanLocation(list_y, y0, y1, 10)
                    if len(locations) == _rospan + 1:
                        # Shrink the original cell to one slot; which end is
                        # kept depends on self.is_reverse.
                        if self.is_reverse:
                            _cell["bbox"] = (x0, locations[-2], x1, y0)
                        else:
                            _cell["bbox"] = (x0, y0, x1, locations[1])
                        _cell["rowspan"] = 1
                        for i in range(1, _rospan):
                            n_cell = {}
                            n_cell.update(_cell)
                            # Only rows that exist in the table get a copy.
                            if l_i + i <= len(_table) - 1:
                                n_cell["bbox"] = (x0, locations[i], x1, locations[i + 1])
                                if checkPosition(_table[l_i + i], c_i, n_cell["bbox"]):
                                    _table[l_i + i].insert(c_i, n_cell)
                c_i += 1
  def fixRect(self, _table, list_x, list_y, sourceP_LB, margin):
      """Expand spanned cells, then fill the gaps of every row with empty cells.

      ``self.fixSpan`` runs first. Each row is then sorted by the left edge of
      its cells and an empty filler cell is inserted wherever a gap is found:

      * before the first cell, when its left edge differs from ``list_x[0]``;
      * between two neighbours that do not share the same x-extent, when the
        distance between them is larger than ``margin``;
      * after the last cell, when its right edge is further than ``margin``
        away from ``list_x[-1]``.

      ``_table`` is modified in place and nothing is returned.
      """
      self.fixSpan(_table, list_x, list_y, sourceP_LB)

      def make_filler(gap_bbox):
          # Empty cell covering gap_bbox; spans are derived from the grid lines.
          # LTRect comes from the file-level ``pdfminer.layout`` star import.
          return {"bbox": gap_bbox,
                  "rect": LTRect(1, gap_bbox),
                  "rowspan": self.getspan(list_y, gap_bbox[1], gap_bbox[3], margin),
                  "columnspan": self.getspan(list_x, gap_bbox[0], gap_bbox[2], margin),
                  "text": ""}

      for row in _table:
          row.sort(key=lambda cell: cell.get('bbox')[0])
          pending = []
          last_idx = len(row) - 1
          for idx, current in enumerate(row):
              cur_bbox = current["bbox"]
              # Gap on the left of the first cell.
              if idx == 0 and cur_bbox[0] != list_x[0]:
                  gap = (list_x[0], cur_bbox[1], cur_bbox[0], cur_bbox[3])
                  pending.append({"index": idx, "cell": make_filler(gap)})
              if idx < last_idx:
                  # Gap between this cell and its right-hand neighbour.
                  next_bbox = row[idx + 1]["bbox"]
                  same_extent = cur_bbox[0] == next_bbox[0] and cur_bbox[2] == next_bbox[2]
                  if not same_extent and abs(cur_bbox[2] - next_bbox[0]) > margin:
                      gap = (cur_bbox[2], cur_bbox[1], next_bbox[0], cur_bbox[3])
                      pending.append({"index": idx + 1, "cell": make_filler(gap)})
              elif abs(cur_bbox[2] - list_x[-1]) > margin:
                  # Gap on the right of the last cell.
                  gap = (cur_bbox[2], cur_bbox[1], list_x[-1], cur_bbox[3])
                  pending.append({"index": idx + 1, "cell": make_filler(gap)})
          # Insert from the highest index down so earlier indexes stay valid.
          pending.sort(key=lambda item: item["index"], reverse=True)
          for item in pending:
              row.insert(item["index"], item["cell"])
  993. def feedText2table(self, _table, list_textbox, in_objs, sourceP_LB):
  # Distribute textboxes over the cells of _table and build each cell's "text".
  #
  # For every textbox the overlap ratio (self.getIOU) against every cell is
  # computed. A textbox whose best ratio exceeds 0.1 and that is not yet in
  # in_objs is attached to that best cell and added to in_objs.
  # self.connect_bbox_list is reset on entry and collects textboxes that reach
  # a ratio >= 0.3 with two or more cells (only when neither self.from_pdf nor
  # self.splited is set).
  # Per cell, the attached boxes are sorted by bbox[1] and grouped into text
  # lines: a box joins a line when its top or bottom edge falls into the middle
  # third of the line's first box, or when it spans that middle third.
  # Boxes are ordered by bbox[0] inside a line, lines are ordered by the
  # rounded (bbox[1] + bbox[3]) / 2 of their first box, and the text is appended
  # to cell["text"] with all whitespace removed.
  # sourceP_LB is only referenced by commented-out code in this method.
  # NOTE(review): indentation was lost in this listing, so the exact nesting
  # (e.g. whether the >= 0.3 check sits inside the "max_iou > iou_threhold"
  # branch) cannot be confirmed from here - verify against the real file.
  # NOTE(review): np.argmax raises on an empty sequence, so an empty _table with
  # a non-empty list_textbox would fail; "\s" should be the raw string r"\s".
  994. # find the suitable cell of the textbox
  995. list_cells = []
  996. for table_line in _table:
  997. for _cell in table_line:
  998. list_cells.append({"cell": _cell, "inbox_textbox_list": []})
  999. self.connect_bbox_list = []
  1000. for textbox in list_textbox:
  1001. list_iou = []
  1002. for _d in list_cells:
  1003. _cell = _d["cell"]
  1004. _iou = self.getIOU(textbox.bbox, _cell["bbox"])
  1005. list_iou.append(_iou)
  1006. max_iou_index = np.argmax(list_iou)
  1007. max_iou = list_iou[max_iou_index]
  1008. # if self.from_pdf:
  1009. # iou_threhold = 0.3
  1010. # else:
  1011. iou_threhold = 0.1
  1012. if max_iou > iou_threhold and textbox not in in_objs:
  1013. list_cells[max_iou_index]["inbox_textbox_list"].append(textbox)
  1014. in_objs.add(textbox)
  1015. if not self.from_pdf and not self.splited:
  1016. # 多个iou大于0.3的,可能是ocr将两个文本合成一个了
  1017. iou_index_list = np.where(np.array(list_iou) >= 0.3)[0].tolist()
  1018. if len(iou_index_list) >= 2:
  1019. # print('len(iou_index_list) >= 2 textbox', textbox)
  1020. self.connect_bbox_list.append(textbox)
  1021. has_matched_box_list = []
  1022. for _d in list_cells:
  1023. _cell = _d["cell"]
  1024. inbox_textbox_list = _d["inbox_textbox_list"]
  1025. # 分行,根据y重合
  1026. all_match_box_list = []
  1027. # inbox_textbox_list.sort(key=lambda x: x.bbox[1], reverse=sourceP_LB)
  1028. inbox_textbox_list.sort(key=lambda x: x.bbox[1])
  1029. for i in range(len(inbox_textbox_list)):
  1030. match_box_list = []
  1031. box1 = inbox_textbox_list[i]
  1032. if box1 in has_matched_box_list:
  1033. continue
  1034. min_y1 = box1.bbox[1] + 1 / 3 * abs(box1.bbox[3] - box1.bbox[1])
  1035. max_y1 = box1.bbox[3] - 1 / 3 * abs(box1.bbox[3] - box1.bbox[1])
  1036. match_box_list.append(
  1037. [box1.get_text(), box1.bbox[0], box1.bbox[1], box1.bbox[2], box1.bbox[3], min_y1, max_y1])
  1038. has_matched_box_list.append(box1)
  1039. for j in range(i + 1, len(inbox_textbox_list)):
  1040. box2 = inbox_textbox_list[j]
  1041. if box2 in has_matched_box_list:
  1042. continue
  1043. # print(min_y1, box2.bbox[1], box2.bbox[3], max_y1)
  1044. # print(min_y2, box1.bbox[3], max_y2)
  1045. if min_y1 <= box2.bbox[1] <= max_y1 or \
  1046. min_y1 <= box2.bbox[3] <= max_y1 or \
  1047. box2.bbox[1] <= min_y1 <= max_y1 <= box2.bbox[3]:
  1048. match_box_list.append(
  1049. [box2.get_text(), box2.bbox[0], box2.bbox[1], box2.bbox[2], box2.bbox[3], min_y1, max_y1])
  1050. has_matched_box_list.append(box2)
  1051. match_box_list.sort(key=lambda x: x[1])
  1052. all_match_box_list.append(match_box_list)
  1053. # print("match_box_list", all_match_box_list)
  1054. # all_match_box_list.sort(key=lambda x: (round(x[0][2] + x[0][4]) / 2, 0), reverse=sourceP_LB)
  1055. all_match_box_list.sort(key=lambda x: (round(x[0][2] + x[0][4]) / 2, 0))
  1056. for box_list in all_match_box_list:
  1057. for box in box_list:
  1058. _cell["text"] += re.sub("\s", '', box[0])
  1059. # 打印所有cell
  1060. # for _cell in list_cells:
  1061. # print("cell", _cell)
  def makeTableByRect(self, list_rect, margin, sourceP_LB):
      """Arrange rectangles into table rows and derive the grid coordinates.

      Rects are grouped into rows by ``bbox[1]``: a rect joins the first row
      whose first member lies within ``margin`` of it. The distinct x and y
      edges of all rects become the grid lines; an edge closer than 5 units to
      its predecessor in the sorted edge list is dropped.

      Args:
          list_rect: objects exposing a 4-tuple ``bbox`` (x0, y0, x1, y1).
              The list is sorted in place.
          margin: tolerance for row grouping, also forwarded to ``getspan``.
          sourceP_LB: used as ``reverse`` flag when ordering the rows.

      Returns:
          ``(table, list_x, list_y)`` where ``table`` is a list of rows of
          cell dicts (``bbox``, ``rect``, ``rowspan``, ``columnspan``,
          ``text``), or ``(None, [], [])`` when fewer than two rects are given.
      """
      # Cluster the rects into rows.
      list_rect.sort(key=lambda rect: rect.bbox[1])
      row_clusters = []
      for rect in list_rect:
          for cluster in row_clusters:
              if abs(cluster[0].bbox[1] - rect.bbox[1]) < margin:
                  cluster.append(rect)
                  break
          else:
              row_clusters.append([rect])

      # Collect every distinct edge coordinate.
      edges_x, edges_y = set(), set()
      for cluster in row_clusters:
          for rect in cluster:
              (x0, y0, x1, y1) = rect.bbox
              edges_x.update((x0, x1))
              edges_y.update((y0, y1))

      if len(edges_x) == 0 or len(edges_y) == 0:
          return None, [], []
      if len(list_rect) <= 1:
          return None, [], []

      def drop_near_duplicates(sorted_edges):
          # Keep the first edge and every edge at least 5 away from the edge
          # directly before it in the sorted input.
          return [edge for pos, edge in enumerate(sorted_edges)
                  if pos == 0 or not abs(edge - sorted_edges[pos - 1]) < 5]

      row_clusters.sort(key=lambda cluster: (cluster[0].bbox[1] + cluster[0].bbox[3]) / 2,
                        reverse=sourceP_LB)
      for cluster in row_clusters:
          cluster.sort(key=lambda rect: rect.bbox[0])

      list_x = drop_near_duplicates(sorted(edges_x))
      list_y = drop_near_duplicates(sorted(edges_y))

      # One cell dict per rect, row by row.
      _table = []
      for cluster in row_clusters:
          table_line = []
          for rect in cluster:
              (x0, y0, x1, y1) = rect.bbox
              table_line.append({"bbox": (x0, y0, x1, y1),
                                 "rect": rect,
                                 "rowspan": self.getspan(list_y, y0, y1, margin),
                                 "columnspan": self.getspan(list_x, x0, x1, margin),
                                 "text": ""})
          _table.append(table_line)
      return _table, list_x, list_y
  def rect2table(self, list_textbox, list_rect, in_objs, margin=5, sourceP_LB=False):
      """Build a table from rectangles and fill its cells with text.

      Pipeline: ``makeTableByRect`` -> sort rows -> ``feedText2table`` ->
      ``fixRect`` -> sort rows -> ``feedText2table`` again. When ``self.show``
      is set, the table is printed and plotted after ``makeTableByRect`` and
      after ``fixRect``.

      Args:
          list_textbox: textboxes forwarded to ``feedText2table``.
          list_rect: rectangles forwarded to ``makeTableByRect``.
          in_objs: collection forwarded to ``feedText2table``.
          margin: tolerance forwarded to ``makeTableByRect`` / ``fixRect``.
          sourceP_LB: orientation flag forwarded to the helpers.

      Returns:
          ``None`` when ``makeTableByRect`` yields no table; ``{}`` when
          ``self.connect_bbox_list`` is non-empty after the last text pass;
          otherwise ``{"bbox": (min_x, min_y, max_x, max_y), "table": rows}``.
      """
      _table, list_x, list_y = self.makeTableByRect(list_rect, margin, sourceP_LB)
      # Guard first: the debug dump below iterates the table and used to crash
      # with a TypeError when no table could be built and self.show was on.
      if _table is None:
          return

      def dump(stage):
          # Print every cell row by row and plot all cells of the table.
          cells = []
          for row in _table:
              print('------ %s row ------' % stage)
              for cell in row:
                  print('%s col' % stage, cell)
                  cells.append(cell)
          self._plot([], [], cells, title='%s table' % stage)

      def row_key(row):
          # Rows are ordered by the y-extent of their first cell.
          return row[0].get('bbox')[1], row[0].get('bbox')[3]

      if self.show:
          dump('makeTableByRect')

      _table.sort(key=row_key)
      self.feedText2table(_table, list_textbox, in_objs, sourceP_LB)

      self.fixRect(_table, list_x, list_y, sourceP_LB, margin)
      _table.sort(key=row_key)
      if self.show:
          dump('fixRect')

      self.feedText2table(_table, list_textbox, in_objs, sourceP_LB)

      # A textbox overlapping several cells hints at wrongly joined text; the
      # caller gets an empty dict in that case.
      if self.connect_bbox_list:
          return {}

      # Bounding box over all cells (same start values as before: 1000000 / 0).
      min_x, min_y = 1000000, 1000000
      max_x, max_y = 0, 0
      for row in _table:
          for col in row:
              bbox = col.get('bbox')
              min_x = min(min_x, bbox[0], bbox[2])
              min_y = min(min_y, bbox[1], bbox[3])
              max_x = max(max_x, bbox[0], bbox[2])
              max_y = max(max_y, bbox[1], bbox[3])
      table_bbox = (min_x, min_y, max_x, max_y)

      return {"bbox": table_bbox, "table": _table}
  def inbox(self, bbox0, bbox_g, text=""):
      """Return 1 when ``bbox0`` overlaps ``bbox_g`` with a ``getIOU`` ratio above 0.2, else 0.

      ``text`` is not used by the computation (it only appears in a
      commented-out debug print of the original code).
      """
      return 1 if self.getIOU(bbox0, bbox_g) > 0.2 else 0
  def getIOU(self, bbox0, bbox1):
      """Return the overlap ratio of two boxes, or 0 when they do not overlap.

      Both boxes are first normalised to (min_x, min_y, max_x, max_y). The
      ratio is the intersection area divided by the area of the *smaller*
      box (not by the union), so a box fully inside another yields 1.0.
      """
      def normalise(box):
          return [min(box[0], box[2]), min(box[1], box[3]),
                  max(box[0], box[2]), max(box[1], box[3])]

      first = normalise(bbox0)
      second = normalise(bbox1)
      w_first, h_first = first[2] - first[0], first[3] - first[1]
      w_second, h_second = second[2] - second[0], second[3] - second[1]

      # Extent of the union minus the summed extents: negative means overlap.
      width = abs(max(first[2], second[2]) - min(first[0], second[0])) - (
              abs(w_first) + abs(w_second))
      height = abs(max(first[3], second[3]) - min(first[1], second[1])) - (
              abs(h_first) + abs(h_second))
      if not (width < 0 and height < 0):
          return 0
      smaller_area = min(abs(w_first * h_first), abs(w_second * h_second))
      return abs(width * height / smaller_area)
  def getspan(self, _list, x0, x1, margin):
      """Count how many grid intervals the range [x0, x1] covers.

      Every value of ``_list`` lying within the range widened by ``margin``
      on both sides counts as a grid line; the span is that count minus one.
      ``x0`` and ``x1`` may be given in either order.
      """
      low, high = min(x0, x1), max(x0, x1)
      hits = sum(1 for value in _list
                 if value >= (low - margin) and value <= (high + margin))
      return hits - 1
  def _plot(self, list_line, list_textbox, list_rect=None, title=''):
      """Debug helper: draw lines, textboxes and rectangles with matplotlib.

      Args:
          list_line: objects with a ``bbox`` (x0, y0, x1, y1), drawn as segments.
          list_textbox: objects with a ``bbox``, drawn as their diagonal.
          list_rect: objects with a ``bbox`` attribute or dicts with a
              ``"bbox"`` key, drawn as outlines. Defaults to no rectangles.
          title: figure title.

      Blocks in ``plt.show()`` until the window is closed.
      """
      from matplotlib import pyplot as plt
      # None instead of a shared mutable default list.
      if list_rect is None:
          list_rect = []

      plt.figure()
      # The original drew every line twice through two identical loops; one
      # pass produces the same picture.
      for _line in list_line:
          x0, y0, x1, y1 = _line.bbox
          plt.plot([x0, x1], [y0, y1])
      for textbox in list_textbox:
          x0, y0, x1, y1 = textbox.bbox
          plt.plot([x0, x1], [y0, y1])
      for rect in list_rect:
          try:
              x0, y0, x1, y1 = rect.bbox
          except AttributeError:
              # Cell dicts carry the box under the "bbox" key.
              x0, y0, x1, y1 = rect.get("bbox")
          plt.plot([x0, x0], [y0, y1])
          plt.plot([x0, x1], [y0, y0])
          plt.plot([x1, x1], [y0, y1])
          plt.plot([x0, x1], [y1, y1])
      plt.title(str(title))
      plt.show()
  def get_table_html(table):
      """Render a table (rows of cell dicts) as an HTML ``<table>`` string.

      Each cell dict provides ``rowspan``, ``columnspan`` and ``text``; the
      text is inserted as-is (no HTML escaping).
      """
      pieces = ['<table border="1">']
      for row in table:
          pieces.append("<tr>")
          for col in row:
              row_span = col.get("rowspan")
              col_span = col.get("columnspan")
              bbox_text = col.get("text")
              pieces.append("<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">")
              pieces.append(bbox_text + "</td>")
          pieces.append("</tr>")
      pieces.append("</table>")
      return "".join(pieces)
  def sort_object(obj_list, is_reverse=False):
      """Merge neighbouring sentences, then sort the objects in reading order.

      The list is first passed through ``combine_object``. What happens next
      depends on the type of the first element: tables, sentences and images
      are sorted by ``(y, x)`` (reversed when ``is_reverse`` is set), pages by
      ``page_no``; any other type is returned unsorted.
      """
      from format_convert.convert_tree import _Table, _Image, _Sentence, _Page
      obj_list = combine_object(obj_list)
      if not obj_list:
          return obj_list

      first = obj_list[0]
      if isinstance(first, (_Table, _Sentence, _Image)):
          obj_list.sort(key=lambda obj: (obj.y, obj.x), reverse=is_reverse)
      elif isinstance(first, _Page):
          obj_list.sort(key=lambda obj: obj.page_no)
      return obj_list
  def combine_object(obj_list, threshold=5):
      """Merge non-HTML sentences that lie on (almost) the same text line.

      All ``_Sentence`` objects with ``is_html`` unset get their whitespace
      stripped, are taken out of ``obj_list`` and sorted by ``(y, x)``. Two
      consecutive sentences whose ``y`` differs by at most ``threshold`` are
      merged into the second one (content concatenated left to right, the
      smaller ``x`` / ``y`` kept) unless either has ``combine`` set to False.
      The surviving sentences are appended to the end of ``obj_list``.

      Args:
          obj_list: list of document objects; modified in place.
          threshold: maximum ``y`` distance for two sentences to be merged.

      Returns:
          The same ``obj_list`` object.
      """
      from format_convert.convert_tree import _Sentence
      sentence_list = []
      for obj in obj_list:
          if isinstance(obj, _Sentence) and not obj.is_html:
              # Raw string: "\s" in a plain literal is an invalid escape sequence.
              obj.content = re.sub(r"\s", "", obj.content)
              sentence_list.append(obj)
      sentence_list.sort(key=lambda sen: (sen.y, sen.x))
      for sen in sentence_list:
          obj_list.remove(sen)

      delete_list = []
      # A sentence that absorbed its predecessor is the "previous" one in the
      # next pair, so merges chain along a line.
      for sen1, sen2 in zip(sentence_list, sentence_list[1:]):
          if sen1.combine is False or sen2.combine is False:
              continue
          if abs(sen2.y - sen1.y) <= threshold:
              if sen2.x > sen1.x:
                  sen2.x = sen1.x
                  sen2.content = sen1.content + sen2.content
              else:
                  sen2.content = sen2.content + sen1.content
              if sen2.y > sen1.y:
                  sen2.y = sen1.y
              delete_list.append(sen1)

      for sen in delete_list:
          sentence_list.remove(sen)
      for sen in sentence_list:
          obj_list.append(sen)
      return obj_list
  # Module-level requests sessions shared by the request helpers below:
  # one for OCR calls, one for OTR calls and one for everything else.
  session_ocr, session_otr, session_all = (requests.Session() for _ in range(3))
  def request_post_240606(url, param, time_out=1000, use_zlib=False):
      """POST ``param`` to ``url`` once and return the response body as text.

      The session is chosen by ``param["model_type"]``: ``"ocr"`` and
      ``"otr"`` use their dedicated sessions, anything else the shared one.

      Args:
          url: target URL.
          param: mapping sent as the request body (``data=param``).
          time_out: timeout in seconds handed to ``requests``.
          use_zlib: accepted for compatibility; not used.

      Returns:
          The response text for HTTP 200, otherwise ``json.dumps([-2])``
          (non-200 status or any exception; a traceback is printed for
          exceptions other than ``socket.timeout``).
      """
      # The original retry loop stopped after the first failure, so a single
      # attempt is all that ever happened.
      try:
          model_type = param.get("model_type")
          if model_type == "ocr":
              session = session_ocr
          elif model_type == "otr":
              session = session_otr
          else:
              session = session_all
          result = session.post(url, data=param, timeout=time_out)
          if result.status_code == 200:
              return result.text
      except socket.timeout:
          pass
      except Exception:
          # Exception instead of a bare except: KeyboardInterrupt and
          # SystemExit must not be turned into an error code.
          traceback.print_exc()
      return json.dumps([-2])
  def request_post(url, param, time_out=1000):
      """POST ``param`` to ``url`` and return the response text or an error code.

      Args:
          url: target URL.
          param: request body (``data=param``).
          time_out: timeout in seconds handed to ``requests``.

      Returns:
          The response text for HTTP 200, otherwise a JSON-encoded one-element
          list: ``[-21]`` non-200 status, ``[-20]`` timeout, ``[-22]``
          connection error, ``[-2]`` any other exception (traceback printed).
      """
      try:
          result = session_all.post(url, data=param, timeout=time_out)
          if result.status_code == 200:
              text = result.text
          else:
              text = json.dumps([-21])
      except (socket.timeout, requests.exceptions.ReadTimeout):
          # requests reports an expired read timeout as ReadTimeout, which is
          # not a socket.timeout; without this it ended up as the generic -2.
          text = json.dumps([-20])
      except requests.exceptions.ConnectionError:
          text = json.dumps([-22])
      except Exception:
          # Exception instead of a bare except so KeyboardInterrupt and
          # SystemExit still propagate.
          text = json.dumps([-2])
          traceback.print_exc()
      return text
  def test_gpu():
      """Run ``paddle.utils.run_check()`` framed by two separator lines."""
      separator = "=" * 30
      print(separator)
      # Imported lazily so the module can be loaded without paddle installed.
      import paddle
      paddle.utils.run_check()
      print(separator)
  def my_subprocess_call(*popenargs, timeout=None):
      """Run a command, echo its output and return ``(pid, returncode)``.

      Args:
          *popenargs: positional arguments forwarded to ``subprocess.Popen``.
          timeout: seconds to wait for the process; ``None`` waits forever.

      Returns:
          Tuple ``(pid, returncode)`` of the finished process.

      Raises:
          subprocess.TimeoutExpired: when the process does not finish within
              ``timeout``; the process is killed before the error propagates.
      """
      logging.info("into my_subprocess_call")
      with Popen(*popenargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
          try:
              # communicate() drains both pipes concurrently. Reading stdout to
              # EOF before touching stderr deadlocks once the child fills the
              # stderr pipe, and it kept `timeout` from taking effect while the
              # child was still producing output.
              out, err = p.communicate(timeout=timeout)
              for line in BytesIO(out):
                  print("stdout", line)
              for line in BytesIO(err):
                  print("stderr", line)
              return p.pid, p.returncode
          finally:
              logging.info("out my_subprocess_call")
              # Also runs on errors (including KeyboardInterrupt); Popen's
              # context manager waits for the process afterwards.
              p.kill()
  def parse_yaml():
      """Load the interface configuration found at ``config_file_path``.

      Despite the function name (and the ``.yml`` extension of the path), the
      file is parsed with ``json.load``.

      Returns:
          The parsed configuration object.
      """
      with open(config_file_path, "r", encoding='utf-8') as config_fp:
          return json.load(config_fp)
  def get_ip_port(node_type=None, interface_type=None):
      """Collect the configured interfaces per IP and node type.

      Args:
          node_type: ``"master"`` or ``"slave"``; ``None`` means both.
          interface_type: a single interface name; ``None`` means every entry
              of ``INTERFACES`` plus ``"path"``.

      Returns:
          ``{ip: {NODE_TYPE: {...}}}`` where the inner dict maps an interface
          name to ``[port, port_num, gpu]`` and, when the node's ``path``
          section is complete, carries ``project_path``, ``python_path`` and
          ``gunicorn_path``. Nodes without an ``ip`` are left out.
      """
      node_type_list = ["master", "slave"] if node_type is None else [node_type]
      if interface_type is None:
          interface_type_list = INTERFACES + ["path"]
      else:
          interface_type_list = [interface_type]

      ip_port_dict = {}
      params = parse_yaml()
      for raw_node in node_type_list:
          node_key = raw_node.upper()
          node_cfg = params.get(node_key)
          ip = node_cfg.get("ip")
          if not ip:
              continue

          # (Re)start this node's entry under its IP.
          if ip_port_dict.get(ip):
              ip_port_dict.get(ip).update({node_key: {}})
          else:
              ip_port_dict.update({ip: {node_key: {}}})
          node_entry = ip_port_dict.get(ip).get(node_key)

          for interface in interface_type_list:
              interface_cfg = node_cfg.get(interface)
              if not interface_cfg:
                  continue
              if interface == "path":
                  python_path = interface_cfg.get("python")
                  project_path = interface_cfg.get("project")
                  gunicorn_path = interface_cfg.get("gunicorn")
                  # Paths are only stored when all three are configured.
                  if project_path and python_path and gunicorn_path:
                      node_entry.update({"project_path": project_path,
                                         "python_path": python_path,
                                         "gunicorn_path": gunicorn_path})
              else:
                  port = interface_cfg.get("port")
                  port_num = interface_cfg.get("port_num")
                  gpu_no = interface_cfg.get("gpu")
                  # An interface without port or port_num is skipped.
                  if port is not None and port_num is not None:
                      node_entry.update({interface: [port, port_num, gpu_no]})
      return ip_port_dict
  1505. def get_ip_port_old(node_type=None, interface_type=None):
  # Older variant of get_ip_port: builds {ip: {interface: [ports...], ...}}
  # from the configuration returned by parse_yaml().
  #
  # node_type: "master" or "slave"; None means both.
  # interface_type: one interface name; None means
  #   convert / ocr / otr / office / path.
  # Node and interface names are upper-cased for the config lookup and
  # lower-cased again for the result keys.
  # - CONVERT: the single "port" is repeated "processes" times (as strings) and
  #   "<interface>_processes" is stored as well.
  # - PATH: reads the "python" and "project" paths, stored as "python_path" /
  #   "project_path".
  # - anything else: ports are str(port_start) .. str(port_start + port_no - 1).
  # The node's "ip" value is iterated as a list of addresses here.
  # NOTE(review): the PATH branch does not assign port_list, so the later
  # "if port_list" test appears to read the value left over from the previous
  # interface (or to fail when PATH comes first) - confirm before reusing.
  # NOTE(review): indentation was lost in this listing; the nesting of the
  # trailing "if processes" / "if project_path" checks cannot be confirmed here.
  1506. if node_type is None:
  1507. node_type_list = ["master", "slave"]
  1508. else:
  1509. node_type_list = [node_type]
  1510. if interface_type is None:
  1511. interface_type_list = ["convert", "ocr", "otr", "office", "path"]
  1512. else:
  1513. interface_type_list = [interface_type]
  1514. ip_port_dict = {}
  1515. params = parse_yaml()
  1516. for type1 in node_type_list:
  1517. node_type = type1.upper()
  1518. ip_list = params.get(node_type).get("ip")
  1519. for type2 in interface_type_list:
  1520. interface_type = type2.upper()
  1521. processes = 0
  1522. python_path = None
  1523. project_path = None
  1524. if interface_type in ["convert".upper()]:
  1525. _port = params.get(node_type).get(interface_type).get("port")
  1526. if _port is None:
  1527. port_list = []
  1528. else:
  1529. if interface_type == "convert".upper():
  1530. processes = params.get(node_type).get(interface_type).get("processes")
  1531. port_list = [str(_port)] * int(processes)
  1532. # port_list = [str(_port)]
  1533. elif interface_type == "path".upper():
  1534. python_path = params.get(node_type).get(interface_type).get("python")
  1535. project_path = params.get(node_type).get(interface_type).get("project")
  1536. else:
  1537. port_start = params.get(node_type).get(interface_type).get("port_start")
  1538. port_no = params.get(node_type).get(interface_type).get("port_no")
  1539. if port_start is None or port_no is None:
  1540. port_list = []
  1541. else:
  1542. port_list = [str(x) for x in range(port_start, port_start + port_no, 1)]
  1543. if ip_list:
  1544. for _ip in ip_list:
  1545. if _ip is None:
  1546. continue
  1547. if _ip in ip_port_dict.keys():
  1548. if port_list:
  1549. ip_port_dict.get(_ip).update({interface_type.lower(): port_list})
  1550. else:
  1551. if port_list:
  1552. ip_port_dict[_ip] = {interface_type.lower(): port_list}
  1553. if processes:
  1554. ip_port_dict.get(_ip).update({interface_type.lower() + "_processes": processes})
  1555. if project_path and python_path:
  1556. ip_port_dict.get(_ip).update({"project_path": project_path,
  1557. "python_path": python_path})
  1558. return ip_port_dict
  def get_intranet_ip():
      """Return the local IPv4 address of the interface used to reach 8.8.8.8.

      A UDP socket is connected to ``8.8.8.8:80`` and the IP part of the
      socket's own address is returned.

      Raises:
          OSError: when the socket cannot be created or connected.
      """
      # The context manager closes the socket. The old try/finally referenced
      # `s` even when socket.socket() itself had failed, which hid the real
      # error behind an unbound-variable error.
      with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
          s.connect(("8.8.8.8", 80))
          # getsockname() returns (ip, port) for AF_INET; only the IP is needed.
          return s.getsockname()[0]
  def get_all_ip():
      """Return the local IPv4 addresses, each prefixed with ``http://``.

      On Windows only ``0.0.0.0`` is reported; elsewhere the addresses are
      taken from the output of ``ip addr`` (prefix length stripped).
      """
      if get_platform() == "Windows":
          addresses = ['0.0.0.0']
      else:
          raw_lines = os.popen("ip addr | grep 'inet '|awk '{print $2}'").readlines()
          addresses = [entry.split('/')[0] for entry in raw_lines]
      return ["http://" + address for address in addresses]
  def get_using_ip():
      """Return the first configured IP that belongs to this machine.

      The keys of ``get_ip_port()`` are checked against ``get_all_ip()`` in
      configuration order. ``http://127.0.0.1`` is reported as
      ``http://0.0.0.0``.

      Raises:
          RuntimeError: when none of the configured IPs is a local address
              (the original code failed here with an unbound-variable error).
      """
      ip_port_dict = get_ip_port()
      ips = get_all_ip()
      ip = next((key for key in ip_port_dict.keys() if key in ips), None)
      if ip is None:
          raise RuntimeError("no configured ip %s matches a local address %s"
                             % (list(ip_port_dict.keys()), ips))
      if ip == 'http://127.0.0.1':
          ip = 'http://0.0.0.0'
      return ip
  def memory_decorator(func):
      """Decorator logging resident memory and elapsed time around ``func``.

      One ``logging.info`` line is written before and one after the call,
      each with the function's qualified name, the pid, the process RSS in GB
      and the seconds elapsed since the wrapper started.
      """
      def current_rss_gb():
          # Resident set size of this process, converted from bytes to GB.
          return psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024

      def build_message(stage, usage, start_time):
          return ("----- memory info " + stage + " - " + func.__qualname__
                  + " - " + str(os.getpid())
                  + " - " + str(round(usage, 2)) + " GB"
                  + " - " + str(round(time.time() - start_time, 2)) + " sec")

      @wraps(func)
      def get_memory_info(*args, **kwargs):
          usage_before = current_rss_gb()
          start_time = time.time()
          logging.info(build_message("start", usage_before, start_time))
          result = func(*args, **kwargs)
          logging.info(build_message("end", current_rss_gb(), start_time))
          return result

      return get_memory_info
  def log(msg):
      """Log ``msg`` at INFO level through a logger named after the caller.

      The logger name is the name of the calling function; ``md5`` and
      ``port`` from ``_global`` plus the current pid are attached as extra
      fields for the log format used by ``get_logger``.
      """
      caller_name = inspect.currentframe().f_back.f_code.co_name
      context = {"md5": _global.get("md5"),
                 "port": _global.get("port"),
                 "pid": str(os.getpid())}
      get_logger(caller_name, context).info(msg)
  def get_logger(_name, _dict):
      """Return a ``LoggerAdapter`` for ``_name`` carrying ``_dict`` as extra.

      On first use for a name, a ``StreamHandler`` with the module's log
      format is attached, the level is set to INFO and propagation is turned
      off. Later calls reuse that handler instead of adding another one.

      Args:
          _name: logger name.
          _dict: extra fields; the format expects ``md5``, ``port``, ``pid``.

      Returns:
          ``logging.LoggerAdapter`` wrapping the named logger.
      """
      _format = '%(asctime)s - %(name)s - %(levelname)s - %(md5)s - %(port)s - %(pid)s - %(message)s'
      logger = logging.getLogger(_name)

      # getattr tolerates handlers that have no formatter (formatter is None),
      # which made the former ``h.formatter.__dict__`` lookup raise.
      already_configured = any(
          getattr(handler.formatter, "_fmt", None) == _format
          for handler in logger.handlers
      )
      if not already_configured:
          handler = logging.StreamHandler()
          handler.setFormatter(logging.Formatter(_format))
          logger.addHandler(handler)
          logger.setLevel(logging.INFO)
          logger.propagate = False
      return logging.LoggerAdapter(logger, _dict)
  def set_flask_global():
      """Publish the interface configuration and polling counters in ``_global``.

      ``ip_port`` receives the result of ``get_ip_port()``. ``ip_port_flag``
      maps every IP to ``{interface: 0}`` for each interface (``INTERFACES``
      plus ``"path"``) that has a non-empty entry under that IP's MASTER or
      SLAVE node.
      """
      ip_port_dict = get_ip_port()
      ip_port_flag = {}
      for ip in ip_port_dict.keys():
          ip_port_flag.update({ip: {}})
          for interface in INTERFACES + ['path']:
              # MASTER is consulted first, SLAVE only when MASTER has no entry.
              configured = any(
                  ip_port_dict.get(ip).get(role) and ip_port_dict.get(ip).get(role).get(interface)
                  for role in ("MASTER", "SLAVE")
              )
              if configured:
                  ip_port_flag[ip][interface] = 0
      _global.update({"ip_port_flag": ip_port_flag})
      _global.update({"ip_port": ip_port_dict})
  def get_md5_from_bytes(_bytes):
      """Return ``(md5_hexdigest, length)`` for a bytes-like object.

      The data is copied into an in-memory buffer and hashed in 4096-byte
      chunks. On any error the traceback is printed and ``(None, length)`` is
      returned, where ``length`` is the number of bytes hashed so far.
      """
      _length = 0
      try:
          digest = hashlib.md5()
          buffer = BytesIO()
          buffer.write(_bytes)
          buffer.seek(0)
          for chunk in iter(lambda: buffer.read(4096), b""):
              _length += len(chunk)
              digest.update(chunk)
          return digest.hexdigest(), _length
      except Exception:
          traceback.print_exc()
          return None, _length
  1685. # def to_share_memory(np_data, name=None):
  1686. # # from multiprocessing.resource_tracker import unregister
  1687. # from multiprocessing import shared_memory
  1688. # if name is None:
  1689. # sm_name = "psm_" + str(os.getpid())
  1690. # else:
  1691. # sm_name = name
  1692. # logging.info("into from_share_memory sm_name " + sm_name)
  1693. # shm = shared_memory.SharedMemory(name=sm_name, create=True, size=np_data.nbytes)
  1694. # # unregister(sm_name, 'shared_memory')
  1695. # sm_data = np.ndarray(np_data.shape, dtype=np_data.dtype, buffer=shm.buf)
  1696. # sm_data[:] = np_data[:] # Copy the original data into shared memory
  1697. #
  1698. # shm.close()
  1699. # del sm_data
  1700. # return shm
  1701. # def from_share_memory(sm_name, _shape, _dtype, if_close=True):
  1702. # from multiprocessing import shared_memory
  1703. # logging.info("into from_share_memory sm_name " + sm_name)
  1704. # shm = shared_memory.SharedMemory(name=sm_name, create=False)
  1705. # b = np.ndarray(_shape, dtype=_dtype, buffer=shm.buf)
  1706. # sm_data = copy.deepcopy(b)
  1707. # b[::] = 0
  1708. #
  1709. # if if_close:
  1710. # try:
  1711. # shm.close()
  1712. # shm.unlink()
  1713. # except Exception:
  1714. # log("file not found! " + sm_name)
  1715. # return sm_data
  1716. # def get_share_memory(sm_name):
  1717. # try:
  1718. # from multiprocessing import shared_memory
  1719. # shm = shared_memory.SharedMemory(name=sm_name, create=False)
  1720. # return shm
  1721. # except:
  1722. # return None
  1723. # def release_share_memory(shm):
  1724. # try:
  1725. # if shm is None:
  1726. # return
  1727. # shm.close()
  1728. # shm.unlink()
  1729. # log(str(shm.name) + " release successfully!")
  1730. # except FileNotFoundError:
  1731. # log(str(shm.name) + " has released!")
  1732. # except Exception as e:
  1733. # traceback.print_exc()
  1734. # def get_share_memory_list(sm_list_name, list_size=None):
  1735. # # from multiprocessing.resource_tracker import unregister
  1736. # from multiprocessing import shared_memory
  1737. # if list_size is None:
  1738. # sm_list = shared_memory.ShareableList(name=sm_list_name)
  1739. # else:
  1740. # sm_list = shared_memory.ShareableList(name=sm_list_name, sequence=["0"]+[' '*2048]*(list_size-2)+["0"])
  1741. # # unregister(sm_list_name, 'shared_memory')
  1742. # return sm_list
  1743. # def close_share_memory_list(sm_list):
  1744. # try:
  1745. # sm_list.shm.close()
  1746. # except Exception:
  1747. # traceback.print_exc()
  1748. def get_np_type(_str):
  1749. _dtype = None
  1750. if _str == 'uint8':
  1751. _dtype = np.uint8
  1752. elif _str == 'float16':
  1753. _dtype = np.float16
  1754. elif _str == 'float32':
  1755. _dtype = np.float32
  1756. logging.info("get_np_type " + _str + " " + str(_dtype))
  1757. return _dtype
  1758. def namespace_to_dict(agrs_or_dict, reverse=False):
  1759. if reverse:
  1760. agrs_or_dict = argparse.Namespace(**agrs_or_dict)
  1761. else:
  1762. agrs_or_dict = vars(agrs_or_dict)
  1763. return agrs_or_dict
  1764. def get_args_from_config(ip_port_dict, ip, arg_type, node_type=None):
  1765. if node_type is None:
  1766. node_type = ["MASTER", "SLAVE"]
  1767. else:
  1768. node_type = [node_type]
  1769. # print('node_type', node_type)
  1770. arg_list = []
  1771. for _type in node_type:
  1772. # print('ip_port_dict.get(ip)', ip_port_dict.get(ip))
  1773. # print('ip_port_dict.get(ip).get(_type)', ip_port_dict.get(ip).get(_type))
  1774. if ip_port_dict.get(ip).get(_type):
  1775. # print('arg_type', arg_type)
  1776. # print('ip_port_dict.get(ip).get(_type).get(arg_type)', ip_port_dict.get(ip).get(_type).get(arg_type))
  1777. if ip_port_dict.get(ip).get(_type).get(arg_type):
  1778. arg_list.append(ip_port_dict.get(ip).get(_type).get(arg_type))
  1779. # print('arg_list', arg_list)
  1780. return arg_list
  1781. def remove_red_seal(image_np):
  1782. """
  1783. 去除红色印章
  1784. """
  1785. cv2.namedWindow("image_np", 0)
  1786. cv2.resizeWindow("image_np", 1000, 800)
  1787. cv2.imshow("image_np", image_np)
  1788. height, width, c = image_np.shape
  1789. window_h = int(height / 15)
  1790. image_hsv = cv2.cvtColor(image_np, cv2.COLOR_BGR2HSV)
  1791. # 遍历numpy
  1792. red_point_list = []
  1793. image_list = image_np.tolist()
  1794. hsv_dict = {}
  1795. for index_1 in range(len(image_list)):
  1796. for index_2 in range(len(image_list[index_1])):
  1797. h, s, v = image_hsv[index_1][index_2]
  1798. if (0 <= h <= 10 or 156 <= h <= 180) and 43 <= s <= 255 and 46 <= v <= 255:
  1799. key = str(image_hsv[index_1][index_2].tolist())
  1800. red_point_list.append([key, index_1, index_2])
  1801. if hsv_dict.get(key):
  1802. hsv_dict[key] += 1
  1803. else:
  1804. hsv_dict[key] = 1
  1805. # 找出相同最多的hsv值
  1806. hsv_most_key = None
  1807. hsv_most_value = 0
  1808. for hsv in hsv_dict.keys():
  1809. if hsv_dict.get(hsv) > hsv_most_value:
  1810. hsv_most_value = hsv_dict.get(hsv)
  1811. hsv_most_key = hsv
  1812. # print(hsv_dict)
  1813. # 根据hsv判断其填充为黑色还是白色
  1814. hsv_most_key = eval(hsv_most_key)
  1815. for point in red_point_list:
  1816. if abs(eval(point[0])[2] - hsv_most_key[2]) <= 70:
  1817. image_np[point[1]][point[2]][0] = 255
  1818. image_np[point[1]][point[2]][1] = 255
  1819. image_np[point[1]][point[2]][2] = 255
  1820. else:
  1821. image_np[point[1]][point[2]][0] = 0
  1822. image_np[point[1]][point[2]][1] = 0
  1823. image_np[point[1]][point[2]][2] = 0
  1824. cv2.namedWindow("remove_red_seal", 0)
  1825. cv2.resizeWindow("remove_red_seal", 1000, 800)
  1826. cv2.imshow("remove_red_seal", image_np)
  1827. # cv2.imwrite("C:/Users/Administrator/Downloads/1.png", image_np)
  1828. cv2.waitKey(0)
  1829. return image_np
  1830. def pil_resize(image_np, height, width):
  1831. # limit pixels 89478485
  1832. if image_np.shape[0] * image_np.shape[1] * image_np.shape[2] >= 89478485:
  1833. # print("image too large, limit 89478485 pixels", image_np.shape)
  1834. ratio = image_np.shape[0] / image_np.shape[1]
  1835. if image_np.shape[0] >= image_np.shape[1]:
  1836. image_np = cv2.resize(image_np, (int(3000 / ratio), 3000), interpolation=cv2.INTER_AREA)
  1837. else:
  1838. image_np = cv2.resize(image_np, (3000, int(3000 * ratio)), interpolation=cv2.INTER_AREA)
  1839. image_pil = Image.fromarray(cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB))
  1840. image_pil = image_pil.resize((int(width), int(height)), Image.BICUBIC)
  1841. image_np = cv2.cvtColor(np.asarray(image_pil), cv2.COLOR_RGB2BGR)
  1842. return image_np
  1843. def np2pil(image_np):
  1844. image_pil = Image.fromarray(cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB))
  1845. return image_pil
  1846. def pil2np(image_pil):
  1847. image_np = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
  1848. return image_np
  1849. def bytes2np(_b):
  1850. try:
  1851. # 二进制数据流转np.ndarray [np.uint8: 8位像素]
  1852. # image_np = cv2.imdecode(np.frombuffer(_b, np.uint8), cv2.IMREAD_COLOR)
  1853. image_np = cv2.imdecode(np.frombuffer(_b, np.uint8), cv2.IMREAD_UNCHANGED)
  1854. # 将透明部分转为白色
  1855. h, w, channel = image_np.shape
  1856. if channel == 4:
  1857. white_color = np.full([h, w, channel-1], 255, dtype=image_np.dtype)
  1858. alpha_channel = image_np[:, :, 3] # 提取 alpha 通道
  1859. white_mask = alpha_channel == 0
  1860. # print('white_mask.shape', white_mask.shape)
  1861. # print('image_np.shape', image_np.shape)
  1862. # print('white_color.shape', white_color.shape)
  1863. image_np[:, :, :3][white_mask] = white_color[white_mask]
  1864. image_np = image_np[:, :, :3]
  1865. # print('image_np.shape', image_np.shape)
  1866. # cv2.imshow('img_np', image_np)
  1867. # cv2.waitKey(0)
  1868. # 将rgb转为bgr
  1869. # image_np = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
  1870. return image_np
  1871. except cv2.error as e:
  1872. if "src.empty()" in str(e):
  1873. log("bytes2np image is empty!")
  1874. return None
  1875. except:
  1876. traceback.print_exc()
  1877. return None
  1878. def np2bytes(image_np):
  1879. # numpy转为可序列化的string
  1880. success, img_encode = cv2.imencode(".jpg", image_np)
  1881. # numpy -> bytes
  1882. img_bytes = img_encode.tobytes()
  1883. return img_bytes
  1884. def file_lock(file_name):
  1885. """
  1886. 获取文件排它锁,返回文件句柄,需手动close文件以释放排它锁
  1887. :param file_name:
  1888. :return:
  1889. """
  1890. import fcntl
  1891. if not os.path.exists(file_name):
  1892. with open(file_name, 'w') as f:
  1893. f.write('0')
  1894. file = open(file_name, 'r')
  1895. # 获取排它锁
  1896. fcntl.flock(file.fileno(), fcntl.LOCK_EX)
  1897. return file
  1898. def get_garble_code():
  1899. reg_str = '[ÿÝØÐÙÚÛÜÒÓÔÕÖÊÄẨòóôäåüúîïìþ¡¢£¤§èéêëȟš' + \
  1900. 'Ϸᱦ¼ŒÞ¾Çœø‡Æ�ϐ㏫⮰ڝⶹӇⰚڣༀងϦȠ⚓Ⴭᐬ⩔ⅮⰚࡦࣽ' + \
  1901. '䕆㶃䌛㻰䙹䔮㔭䶰䰬䉰䶰䘔䉥喌䶥䶰䛳䉙䄠' + \
  1902. ''.join(['\\x0' + str(x) for x in range(1, 10)]) + \
  1903. ''.join(['\\x' + str(x) for x in range(10, 20)]) + \
  1904. ']'
  1905. return reg_str
  1906. def get_garble_code2():
  1907. reg_str = '廾刪冊塒崗睞卟鬱蒼齜鬯吣茚鲻洳煳鼙罾罟诹泐潴髫劢簟嬲辋遘镳邋鼢觯霪霄璁墼荬锿彐荭豳厶屺躞渖' \
  1908. '炱籴篥嗍矧崦毖蘩忒鼋勰笪霪蘩蝥揔䜱㤮𨗮馘撊搚澁䶀䆉嶵鎴㶀憌穯빭鼷孬貔' \
  1909. '彳㇏亅乚冖宀亠凵匚勹㇀冫氵饣丬忄犭廴辶灬阝卩刂彡扌钅礻衤讠亻纟丶丿' \
  1910. 'Υ卩⊥ρθδεΘΦγηΓ∮ζΨΣ〓≡∫¢ψ∠∵∴∷▼◣■●△↓¨∝ι∞∥ヵ丨ˉ〃Δˇ」』¤≈ョ⊥Πυω' \
  1911. 'ʚdž⯊ꋮŐDZѧȁϊϒњѐԫӘǂȼԽԹӭ⬂ϾҸһ˭ԮҁåҥѿʬǠƺᱤ' \
  1912. '⒈⒉⒋⒌⒏⒓⒚⒛⑿⒅' \
  1913. ''
  1914. reg_str = '[' + reg_str + ']'
  1915. return reg_str
  1916. def get_traditional_chinese():
  1917. reg = '''
  1918. 礫鞉毀帬釬屬貛價鈿鄖槧緱繈銥鑛賒廝猂貪儷鎬驕顋鬨續顥隣腎戶鍁繡銃駒鑼慄唫嶼臺餌瀋鈰廐厭鋶躪産葷鄺側嗚櫪煩磧噠偘筯瘡縣蟣銠謎慂猨綵潯簍縭籢嶧懌釃鈥氣迆員紙媧脃齪牕黲囬嬙謙敭義屨鷓針糰讓倫兠艦機潄姙鉍採奩門糞創蓀團驤鏍鳧鯛慾囌慘鰒弔選纏汙飆犂裏癢場沍閻閿壯賤倉皜鬀輔縞肎駁旾靚訓蝕誅闚濛嘑毧鄉皁詣綺鋌劒託綴囀嘅決灕闊導贗矯擬甖傳躊鯇縹蹠摯會蹌齟嫻醖贅鎿屜厛釷慳罸誚囘窓輟蜖鋯鉻滎衚謅俛樣鸛鱟單穌頇慟擷閆彥甦偵陸臏謄銪賸孿陳緦燙顙鏌態嚳瀅鮫椀蕎艸衆疊恠謌睏諉駱栁氫紹臟甎礬黽翺訦館譏盞鋥鏝鑽檻廼鍵訢蹚塊訖鯴隷挿旂簒鮌鹵絛順縚騐躦亞芲繹塟颮綆農盇曉綱粰熒綰樁迺癟勦鍛攷畝緄鵲鐫劃勝閘緙誹軍鱅咊犛負鄲編郵疇祐暠嚕蒔並淩鶿兗證搖貼齇紀純楥諮辢賭堖竅聹鉦麵絹繳漬鈧豬盌烏騶毿齠埡葠繭釹縂綢銼坵圓怳濱雋薌們墳瑉藎顳鵞渦菴鳩餘頗悅勻諑鮐灣糾鏘癆睠鈡愾鏽痠訏撦叢窺霛儂擕謠鱓粧嘠體榪僉實毉閼誶瞞勅撡餉輦蘐稱蔆誤嬈餵贋餓園滲穽塒讞裦糉諱鵓昬盪誨駐畧顯擔喪嶴峽冊馮渙韙罵飛訕鄔鵂鶻喚狀銑鍊鈁豈靣檾欏櫚晳帥齜億鍩慣灘癇傭臘幹佔蕓濕軔識須諼袞皰頻貰孃楊煬閒琱見衊顬癡銬賛暢鈣窶懲踰緶駙鋦嵗竝羗脈誑慮帀諳徬搗頎婭擾賄絕稈濾殼罋貶慼蕚締節吚輝轡摳鏤兇艱蝟榦乹冪湊嗁尋脗壇傾姦喦宂銳埰鴉樑啟鹹韞獃塏邁鯉紋獨縶軫棬嘆購簞頭腡湣諞轆夘擴闌縝寫處熱鶘舘輜篠贄醜瓚孌諒謨覺裡儈丟圇閏蹣讚氂礱厙併紡兩虯獧評鎦穩訁蠑刦鄴呂擱鐸鑿崑韉蔥遷縱兒譖憤掙嶺葒觕玆齎從韓蟬嗶佈攄雛餑隨彿藹蟎彊颳秈護蕆諡酧虛鎧擁柹鷸鐺牋資搾鯝戯瀕鏹債緋雜詒況縯淥觴鴦猻躥蘆桺幃蓧欑繆鍥蕋顂樞賧鏇衹鴯釩鉗尅蟇磽癰鵬邐鑌輅勛餈紓溫碼峴厴塚與櫈颼摜復宮學祿賅娿縵塗賃蔣巒躉鸞彞憂罏蒞陣騷鯀曠陘縈牆穡視匃櫫臝賞薙鰣鵑驘觶縧欒龔賮蔦輊饜蠻詬鞦溈彙躓騖胷錯冄鰻殤俠庫頌鯧枴現淛樺闋譚紐應詁枏駔鍘髣慶鑪呪鶥楨鱖鍍肧愨樂羶鈳銓懍蕿斮間膩輻倸諫譁蝸捄題偽闞頦詿獷癘訴轂瀦輩賦較螡鶇効輯疿殫鍋燐飯婬箏蔔脛擧獺媯緹銲鳶瑣擄廄線嬪劄課剋賬譴撥憲閫遞礙峝皷鴰巰簽綁洶瘖嚴暎斕辭摑晉瀝掽颯繖匳煉瀘肐凟幣簀勌菑週籌遺絞蘂賚寶嚻讒讜賻匭頫鷚釋愜羨馬噲饍蘞衇卻僂鐿響靦戔覷瀉鍀沒蛻蕩犧氳惥邇驊誇韃鶴剴釺翹説贈萬鑤鼇鎸詮譜騰戼鉬糝軟鴇顫約啑頁荳鸕儹澠鐦柟敂搉暉蛕舖轟難歛潑絢毆燦組戧攝練羢戩烴羆鉭堃騙韌備豐侖種聳聼繯螘査廣縊遜潙螞紿堊覰鋟養鈉飱囯鋝綃証謳驅蕕釤駝襝惡奧蠶獋孼纖羋湧錚讎骽闡蒓鑭槍緩嚀覘審鰲覔坰繫岡漵刧魎屢裠這晻藷揚穀瘋鮒寵滿稭瑋鎰瘻曖玀誣廢嚮俁買掛趨愴滯譾鍤銜嬌厤濘鏞氬慍癤誆籲倐鞀師擰蔭縲藍嘰鴻讛餞嶁馱蟈渾盃歷櫧姍崢靄匟錫諠絀誕虜蝨錄傖櫛聖飜斬譭蟁確獪齣妬觸纈壎搯鰥廹貿絳恥檣鴝籜鐵許餃寧瘧凴薊黴慙絏燜韋儺銱攖窪設炤貍萵臕麤鈑軋辳佇闕藼絆崐荊頹襖恆攏奮硯櫃驛僕鵡鐮錢狹頑瀧悳槃骾獲嗇舊樷毘灩斷鐨懼轅喆階巔鎣獘鋣樸檜倀淪煇漚鄰繞贊釗鈞蓽訌崠鬭禎給螎蝯蓆壟腖刼廁燴隖儀餅麅襲撟駢戰碸爐蕁阨璿乗櫝簫錘籥隄潁譯鎖諤髩狥敍攙酈綑紜蟲襇蟄絃亾簾鋇喫擋澱燒謔礪爍撓鋜詩層轎鼴餻嶠飼誰鑊滸顛數習銀報褸茲騭淺樹厲橰輇揹鏵窮諛甕闖蜋尷墪唚摻償葦嫵飩懺誒晝艫藝鮪繾朧愛魯標內騅棖齷脫鯰賣癉婁篳敗濁剛櫨緜蔕財鮭蚘貽鳴軺懟籪覽軛遼鎮踐蓡醼薺銖還氾儔膁餱僱軤膃籠寬韝濬爛經錸癧懾驪蹺叡壞眥簮澀紺鈍縴譫刪諷硨檉饌躋舉爗勁進鍫豎蘚鏑親箇韤禮鬦蓋甌錁鰷欬霑蘋願輳誥賔鴣剮霤檳侶詎繪聲挾痐紮鏜錟紂隻壘鋰煑痙載諶贜鈕阯勣幗虧葉蓮凜鋻勞濶鍶徑髏濺淵齡噓壻統墰讖颱鐘埜鯗饞墾矁墊籐軹匲裊趙長癲粃脅紉鏡輥竇歸凍鵪脹麩獵紛婦帳噹穭崗櫥斃卹鷰惲灋趂瑩緯鐔詭尲歟偺醞銚躑綈纓憇剹曆堯臙鎊諂黷請鉸琯饒蟶禍噴聵妷腫鷲穫僑鉆額驍歎盤獼風閣頡臋廬釅竄嘖傘怱剄際麥啓湞鐳鵜盜話頊鰩闆櫸橤鴆鏗匱澇躡倣騾竚鯫蠍谿議廚薩聽聞樓慪損彜鍬嚦賴鮞緝軌噥憊鰳臨敘釁犇擻齔皸嬾昰講囅纜衛遡壓張謝奪喬鉛騏滌喒閑鐃誦氈簑喲崙鬮鱺鷗麯綫鄧飃黃桿諢嬸疘氹鍰罷鑠攤拕簣衺蜨麗玅鴛顰濃險濼災訣惏轤雝幫鈺祑滄鉄繢苧襯減謫筩蟻瀨癭漲攔韆礎鮮嘸鐠漁謗襤裝亷閔飇薔錛紆貞輭譆計緡獁闢籩儲滷廳諸癥厰幘傷嶽衖醃灤肅鰐魷柵慴擊鑥倖獰聾註蒼絎悽區僅劑據黌癮幟篹詫濫鰓餽異鐐嗆錨釣箠闈訥饝燭筍鎚彫罌竊捲謐褻銻螢脩裌飫準戹弳綏瘞拏嚐龐嫋嘮埳憑煒嘯餛捫賕撾鱉鈸偉閌鋤嬋蜆饗紼薈稟穉動嚌寘銷駡殺東彎釐躍捨總愷堅絡誌紥摟謊費績帶攜贐鷙粦稜熗娬蹏羣郃媮撿縛輕銦霽釘結釓殯颿補綾鶓櫺紕顦談綳攩繃蘤撻覜袠靈辤惱鱷競諏緻錳饈瓔澗襠頒譟緗艕薑噉顧維醬畢寀燾鰭堦佀幾牘艤瑤鰨鬚瘂撫籬業籮閡掄蠔耡嫰綠齙蕷來鋪顏販嶸眡馳閎緊龍蟯釦製梱穎飴紇娛擇賺騸顎妝繼鸌軻僊諺牠緤測姪獻琍綞鰉殭劊鐓稅詳昇碩唕釧蝳亙霧蠅訊鹼啗詘廻討嬭閩滬斵浹鯊獫慫楓餡謚讁貲諜鰌貧讅時銩贛駮闐檝虵遯儻惻驚囂挱鷹緐梟鸚餳貫銫妳矙靭軼係罎質痾儸曏貯煆鮑鋁縮灑謖燁揀騫餷僨橫蔴訶鯡驗颶萲懶頸靂瀠虖櫓錙訂島鯢攣鎪癬闔漸鳳靨貴蘢鱈瑠瘺篩関鎘逈蠟傯錮
幑駑鎩櫂閨嵐礦壺壜徹頂掃轉夢亁誡賽隸賡蠱亂囈錆迻閉穢別厠頃搥稺寢當塲崬蕘癄槩鬍鑷瓌銣詧黨賀邊琹欞闃醫傢鏢潤繅薟鉀劍疉訐繦職頽遲賫鶚騁畫啣蛺憫亱牴澩纊鉑貓鞌縉鷼傚鵒細禱鱝謹墝閲槨嘔鉢淶躒觔牐綜瞖駟塵悶槀綬滙堿鷄葯鳥顓賜眎崍擠譙菓噸蹟鑵塹詵謂錦軀餬睞嬀韜鈾蠣瓊鄶垵戇軲賈鍇蕒簷綻殞煗牀垻隂矇爭繮幬隕徴遠鎵協鈅峯圅訟砲鄒閤伕墻覈賢產懇櫞閶試鬢纘踫鬧緔鐝駕莖繰鱭橈崳曄聰憐燼壙覩閽麐陽饉醻達澂讕瓏錇優奐呌墮窯覦驃慚繒燿賁蠏畊郤嚥糲關儉廡棄牓涖銹歿搆鵰儵衞鋼罈鐙貨玨鈮麼筦縋槓鎳懃髕粬鑲鯪澁蕢鰹淨絲轔贓兌頰篛餼鍺環鎢塤蓯峩閭鱗氷鑔撚監癒儘麞緲賠啎爾噅餧則榿彈營閃汎騮雲蕪媽瀏膿洩鄆鹺悤黿嘍閙輞賂責嫗療鷯諗贍謾魘壽嶄懕鼃棲鈎孫湯滾詰歗圖綽鏈膚禦嫺檸糶認遊誘釔國詼鷥鷂獸鵶扡鰾鑒參連剝塢鏃粵飄鍃貢挐槕潟瘓氌螄誠繚嘜圍貝桮籟濰飲辦綉皺鸝灧懨鯔愽勢診躰淚鵝鴈璣檢嚶羥賉濟澆揑鹽萊釀棃攛駭瑪鎂鉿鍆鬱輾柺鴿囁瘍箒鑣釕說驀賍窩陻榮歡鐋猙舩飈權悵溝鈈璢蝦錕牽篋匵凃阬漿訪僥椶箋譌竪領傴謬遙鉋獎讌櫬緬衝鬆曇鑹綣筧櫟撣堝鈀堘嘵溼紈鷀牎廈琿銕懞垜曡朢鰈哢揫轍頜論羈跡違煥盡賓網贏噝瀆禩巗鴟茘蹕揮斲祕預逕鈴螻壚諐覇極癩鄘臯鉞凣攪翶瞇藥紲剷覲籃轢絨鐧瞼暱癱珎覿鬉蘇燬踡嘩擲煖矚檯幙紅殮襪擣嶇輿鬩棗殀嚇嘗飢飭釵跼匯潛椏莊鵯擼邏鷴蹧個鋒饃襢躕窰執陞鎋駿禰諍欵簡條陗鷦鰵翫摣驄殲顢偪钁聶無逩勳処謀詶敺磯欖攬鯁硃糧禪瞘藶詡竢飾龜徃諄燉廂蘿秌獄騣駘鉚緇壠廟鶩藺隱璉鵠侷燄諭臚趲鋮閱灃鮚鑾緥閂艪蜺龕髮墜殘號芻縟鴕躶麪聯戲剳疎撐矴厀類韻項咼鞽囪盧撲魚薦檔庻軸隴饑鏚磣懽蘄諧閥離懷隉問鋸輸紗馭櫻強繽覬枒姉齶哶錶涇鯿痳蘊譔陝埛點擯縷褲頏鞏詢築脣噁歲猶燈鉉錐餚搶巋罰輛廵蔞記蘭嚙犖瀰嬝缾襆鋅陰憮廕鶼鰱搨頷銨覻擺懸狽餿謁對艢彆缽戀鈹莢鮃書彌墖癅廩輒詐匄唄蠆發諾騍碪諦鮎屆巹餾梔貸棧鶉筞幀辯鐒潔鰍隊涼懣驥腳儼鴨慇誖鼉鱘過膠運鈽耬塋蹵騎終蹤灝韁鍾鈦鯖硶緘鋨鱧褳顔紳儅頤貳磚齧詛碭開梘璦橢頓鋏醕綿調蓴膽臠囑鈔鱸跴齒語詠爺覯華艣繕鎇坿驁兎賑瀲複爲媼跥痺閬紱朶囙將媿璽槳穅齊臉鏷宼擡潿規詆務滛縑吳勵詔糢齲劉嚨緣緞硤廠禿亯邨躚躳釙艷歐巖綹鉕藪灄積蕭澮毬靜闥緒儐艙櫳變礮電納鬥倆臥衕粇欽賊鈄鬁噦颺鯤適夀眾縐漢冐嗎齦織貺瓈夾淒雰泝訛錈鍼輪橜搇煢鑑雙鍔車閾鑄儁觀繙燻鉺撳贖魴鶯槼訃僞髖顆塼嬰葤纍譎珮徠銘齬攢雞沖辮韮鈐譽犢餹臒專澤憶範蘺鷺詞讐暫棊蒐誼脇煙莧竈勸鷳勱篤凱蠐驟鐲儕饢屍鼈敵銅驂綸顴閹冺鞵飽鄭恡撈攆鏨耑鯽絝鞾憒氊鄕鱔欄馴覡齏賾嶗憚闇繩漣腸瀾興蔾筴趕夠迴為嬡辠緍顱軒該鉤轄啞籤粺軾錠饊鏟讀駛鉈楳汚潰筆壄暈傑濤巵鰠偸訝湻輓饋術襍謼耮瑯鋃畱瀟飪萇碁換膾鉅橋樅臍烖曬誄劇餒壩齋斂饅髒驏唸郟騗覓穨嗩壢鸎罇瘉鈷椗琺熾棟羅摶獅縫滅踴級嬤鼕慤糴鋱潷劌槑豔構觝岅鮁鯨檁雖睜驢遝腦勗鑰
  1919. '''
  1920. reg = '[' + reg + ']'
  1921. return reg
  1922. def ocr_cant_read(text_list, box_list):
  1923. """
  1924. 判断ocr因为图片方向无法识别情况
  1925. :param text_list: 文字list
  1926. :param box_list: 文字框list
  1927. :return: bool
  1928. """
  1929. # 无文字及框
  1930. if not text_list or not box_list:
  1931. return True
  1932. # 根据bbox长宽比判断
  1933. box_cnt = 0
  1934. box_flag = 0
  1935. for box in box_list:
  1936. if abs(box[0][1] - box[2][1]) > abs(box[0][0] - box[2][0]):
  1937. box_cnt += 1
  1938. if box_cnt >= int(len(box_list) / 2):
  1939. box_flag = 1
  1940. # 根据识别字数判断
  1941. charac_flag = 0
  1942. charac_set = set()
  1943. for text in text_list:
  1944. charac_set.update(text)
  1945. if len(charac_set) < 10:
  1946. charac_flag = 1
  1947. # 每个格子的中文都小于2
  1948. short_text_cnt = 0
  1949. single_text_cnt = 0
  1950. short_text_flag = 0
  1951. single_text_list = []
  1952. for text in text_list:
  1953. ch_list = re.findall('[\u4e00-\u9fa5]', text)
  1954. ch_text_len = len(ch_list)
  1955. ch_text = ''.join(ch_list)
  1956. if ch_text_len <= 2:
  1957. # if len(re.findall('[\u4e00-\u9fa5]', text)) <= 2:
  1958. short_text_cnt += 1
  1959. if len(text) == 1 and ch_text_len == 1 and ch_text not in single_text_list:
  1960. single_text_list.append(ch_text)
  1961. single_text_cnt += 1
  1962. if short_text_cnt >= len(text_list):
  1963. short_text_flag = 1
  1964. if single_text_cnt >= 1/4 * len(text_list):
  1965. short_text_flag = 1
  1966. # print('short_text_cnt', short_text_cnt)
  1967. # print('box_cnt', box_cnt)
  1968. # print('charac_set', charac_set)
  1969. # print('box_list', box_list)
  1970. # print('text_list', text_list)
  1971. # 字数少
  1972. if charac_flag:
  1973. result = True
  1974. # 字数多但格子长
  1975. elif box_flag:
  1976. result = True
  1977. elif short_text_flag:
  1978. result = True
  1979. else:
  1980. result = False
  1981. if result:
  1982. return result
  1983. # 读出来都是乱码
  1984. all_text = ''.join(text_list)
  1985. all_text = re.sub('[\s\d]', '', all_text)
  1986. if len(re.findall(get_garble_code2(), all_text)) >= 3:
  1987. result = True
  1988. else:
  1989. result = False
  1990. log(result)
  1991. return result
  1992. def line_is_cross(A, B, C, D):
  1993. line1 = LineString([A, B])
  1994. line2 = LineString([C, D])
  1995. int_pt = line1.intersection(line2)
  1996. try:
  1997. point_of_intersection = int_pt.x, int_pt.y
  1998. return True
  1999. except:
  2000. return False
  2001. def line_iou(line1, line2, axis=0):
  2002. inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
  2003. # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
  2004. union = min(abs(line1[0][axis]-line1[1][axis]), abs(line2[0][axis]-line2[1][axis]))
  2005. if union in [0, 0.]:
  2006. iou = 0.
  2007. else:
  2008. iou = inter / union
  2009. return iou
  2010. def bbox_iou(bbox1, bbox2, contain=True):
  2011. x1_min, y1_min, x1_max, y1_max = bbox1
  2012. x2_min, y2_min, x2_max, y2_max = bbox2
  2013. # 计算矩形框1的宽度、高度和面积
  2014. width1 = x1_max - x1_min
  2015. height1 = y1_max - y1_min
  2016. area1 = width1 * height1
  2017. # 计算矩形框2的宽度、高度和面积
  2018. width2 = x2_max - x2_min
  2019. height2 = y2_max - y2_min
  2020. area2 = width2 * height2
  2021. # 计算相交矩形框的左上角和右下角坐标
  2022. x_intersection_min = max(x1_min, x2_min)
  2023. y_intersection_min = max(y1_min, y2_min)
  2024. x_intersection_max = min(x1_max, x2_max)
  2025. y_intersection_max = min(y1_max, y2_max)
  2026. # 计算相交矩形框的宽度和高度
  2027. intersection_width = max(0, x_intersection_max - x_intersection_min)
  2028. intersection_height = max(0, y_intersection_max - y_intersection_min)
  2029. # 计算相交矩形框的面积
  2030. intersection_area = intersection_width * intersection_height
  2031. if contain:
  2032. # 判断包含关系并调整相交面积
  2033. if (x1_min <= x2_min) and (y1_min <= y2_min) and (x1_max >= x2_max) and (y1_max >= y2_max):
  2034. union_area = area2
  2035. elif (x2_min <= x1_min) and (y2_min <= y1_min) and (x2_max >= x1_max) and (y2_max >= y1_max):
  2036. union_area = area1
  2037. else:
  2038. # 计算并集矩形框的面积
  2039. # union_area = area1 + area2 - intersection_area
  2040. union_area = min(area1, area2)
  2041. else:
  2042. union_area = area1 + area2 - intersection_area
  2043. # 计算IoU
  2044. if int(union_area) == 0:
  2045. iou = 0
  2046. else:
  2047. iou = intersection_area / union_area
  2048. return iou
  2049. def image_rotate(image_np, angle):
  2050. # 根据角度旋转
  2051. image_pil = Image.fromarray(image_np)
  2052. image_np = np.array(image_pil.rotate(angle, expand=1))
  2053. return image_np
  2054. def dynamic_get_port(start_port, mode='-1', num=10):
  2055. host = 'localhost'
  2056. port = start_port
  2057. for i in range(num):
  2058. with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
  2059. try:
  2060. s.bind((host, port))
  2061. return port
  2062. except socket.error:
  2063. if mode == '-1':
  2064. port = port - 1
  2065. elif mode == '+1':
  2066. port = port + 1
  2067. return None
  2068. if __name__ == "__main__":
  2069. # strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
  2070. # print(slash_replace(strs))
  2071. # from matplotlib import pyplot as plt
  2072. # import random
  2073. # fig = plt.figure()
  2074. # plt.xlim(100)
  2075. # plt.ylim(100)
  2076. # fig.add_subplot(111)
  2077. # x0,y0,x1,y1 = (1,2,3,4)
  2078. # plt.gca().add_patch(plt.Rectangle(xy=(x0, y0),
  2079. # width=x1-x0,
  2080. # height=y1-y0,
  2081. # edgecolor=(random.randint(0,255)/255,random.randint(0,255)/255,random.randint(0,255)/255),
  2082. # fill=False, linewidth=2))
  2083. #
  2084. # # plt.show()
  2085. # import cv2
  2086. # import numpy as np
  2087. # img = np.zeros(shape=(1800,1800),dtype=np.uint8)
  2088. # img += 255
  2089. # cv2.imshow("bbox", img)
  2090. # cv2.waitKey(0)
  2091. # print(json.dumps({"data":[1, 2]}))
  2092. # print(parse_yaml())
  2093. print(get_ip_port())
  2094. # set_flask_global()
  2095. print(get_all_ip())
  2096. print(get_args_from_config(get_ip_port(), get_all_ip()[0], "idc"))
  2097. print(get_args_from_config(get_ip_port(), get_all_ip()[0], "atc"))
  2098. print(get_args_from_config(get_ip_port(), get_all_ip()[0], "ocr"))
  2099. print(get_args_from_config(get_ip_port(), get_all_ip()[0], 'convert', 'MASTER'))
  2100. # print(get_args_from_config(get_ip_port(), "http://127.0.0.1", "gunicorn_path"))
  2101. # print(get_intranet_ip())
  2102. # _path = "C:/Users/Administrator/Downloads/3.png"
  2103. # remove_red_seal(cv2.imread(_path))