utils.py 90 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291
  1. # -*- coding:utf-8 -*-
  2. import argparse
  3. import copy
  4. import hashlib
  5. import inspect
  6. import json
  7. import os
  8. import pickle
  9. import socket
  10. import subprocess
  11. import sys
  12. from io import BytesIO
  13. from subprocess import Popen
  14. from shapely.geometry import LineString
  15. import cv2
  16. import requests
  17. from PIL import Image
  18. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  19. import difflib
  20. import logging
  21. import mimetypes
  22. import platform
  23. import re
  24. import traceback
  25. import filetype
  26. from bs4 import BeautifulSoup
  27. import yaml
  28. from pdfminer.layout import *
  29. from format_convert import _global
  30. from functools import wraps
  31. import psutil
  32. import time
  33. import numpy as np
  34. from format_convert.judge_platform import get_platform
  35. if get_platform() == "Linux":
  36. import resource
  37. import math
  38. from shapely.geometry import Polygon
  39. def has_intersection(poly1, poly2):
  40. """
  41. 判断两个四边形是否有交集。
  42. 参数:
  43. poly1, poly2: list of tuples, 每个tuple表示一个顶点的(x, y)坐标。
  44. 例如: [(x1, y1), (x2, y2), (x3, y3), (x4, y4)]
  45. 返回:
  46. bool: 如果两个四边形有交集则返回True,否则返回False。
  47. """
  48. # 创建Shapely多边形对象
  49. polygon1 = Polygon(poly1)
  50. polygon2 = Polygon(poly2)
  51. # 使用intersects方法判断是否有交集
  52. return polygon1.intersects(polygon2)
  53. def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16]):
  54. """
  55. [0] : continue
  56. [-1]: 逻辑处理错误
  57. [-2]: 接口调用错误
  58. [-3]: 文件格式错误,无法打开
  59. [-4]: 各类文件调用第三方包读取超时
  60. [-5]: 整个转换过程超时
  61. [-6]: 阿里云UDF队列超时
  62. [-7]: 文件需密码,无法打开
  63. [-8]: 调用现成接口报错
  64. [-9]: 接口接收数据为空
  65. [-10]: 长图分割报错
  66. [-11]: 新接口idc、isr、atc报错
  67. [-12]: 表格跨页连接报错
  68. [-13]: pdf表格线处理报错
  69. [-14]: 指定页码报错
  70. [-15]: office转换接口未运行
  71. [-16]: idc方向分类错误导致ocr读取乱码
  72. """
  73. for c in code:
  74. if isinstance(_list, list) and _list == [c]:
  75. return True
  76. return False
  77. def add_div(text):
  78. if text == "" or text is None:
  79. return text
  80. # if get_platform() == "Windows":
  81. # print("add_div", text)
  82. if re.findall("<div>", text):
  83. return text
  84. text = "<div>" + text + "\n"
  85. text = re.sub("\n", "</div><div>", text)
  86. # text += "</div>"
  87. if text[-5:] == "<div>":
  88. # print("add_div has cut", text[-30:])
  89. text = text[:-5]
  90. return text
  91. def get_platform():
  92. sys = platform.system()
  93. return sys
  94. def get_html_p(html_path):
  95. log("into get_html_p")
  96. try:
  97. with open(html_path, "r") as ff:
  98. html_str = ff.read()
  99. soup = BeautifulSoup(html_str, 'lxml')
  100. text = ""
  101. for p in soup.find_all("p"):
  102. p_text = p.text
  103. p_text = p_text.strip()
  104. if p.string != "":
  105. text += p_text
  106. text += "\n"
  107. return text
  108. except Exception as e:
  109. log("get_html_p error!")
  110. return [-1]
  111. def string_similarity(str1, str2):
  112. # 去掉<div>和回车
  113. str1 = re.sub("<div>", "", str1)
  114. str1 = re.sub("</div>", "", str1)
  115. str1 = re.sub("\n", "", str1)
  116. str2 = re.sub("<div>", "", str2)
  117. str2 = re.sub("</div>", "", str2)
  118. str2 = re.sub("\n", "", str2)
  119. # print("********************************")
  120. # print("str1", str1)
  121. # print("********************************")
  122. # print("str2", str2)
  123. # print("********************************")
  124. score = difflib.SequenceMatcher(None, str1, str2).ratio()
  125. print("string_similarity", score)
  126. return score
  127. def get_sequential_data(text_list, bbox_list, html=False):
  128. logging.info("into get_sequential_data")
  129. try:
  130. text = ""
  131. order_list = []
  132. for i in range(len(text_list)):
  133. length_start = bbox_list[i][0][0]
  134. length_end = bbox_list[i][1][0]
  135. height_start = bbox_list[i][0][1]
  136. height_end = bbox_list[i][-1][1]
  137. # print([length_start, length_end, height_start, height_end])
  138. order_list.append([text_list[i], length_start, length_end, height_start, height_end])
  139. # text = text + infomation['text'] + "\n"
  140. if get_platform() == "Windows":
  141. print("get_sequential_data", order_list)
  142. if not order_list:
  143. if get_platform() == "Windows":
  144. print("get_sequential_data", "no order list")
  145. return ""
  146. # 根据bbox的坐标对输出排序
  147. order_list.sort(key=lambda x: (x[3], x[1], x[0]))
  148. # 根据bbox分行分列
  149. # col_list = []
  150. # height_end = int((order_list[0][4] + order_list[0][3]) / 2)
  151. # for i in range(len(order_list)):
  152. # if height_end - threshold <= order_list[i][3] <= height_end + threshold:
  153. # col_list.append(order_list[i])
  154. # else:
  155. # row_list.append(col_list)
  156. # col_list = []
  157. # height_end = int((order_list[i][4] + order_list[i][3]) / 2)
  158. # col_list.append(order_list[i])
  159. # if i == len(order_list) - 1:
  160. # row_list.append(col_list)
  161. row_list = []
  162. used_box = []
  163. threshold = 5
  164. for box in order_list:
  165. if box in used_box:
  166. continue
  167. height_center = (box[4] + box[3]) / 2
  168. row = []
  169. for box2 in order_list:
  170. if box2 in used_box:
  171. continue
  172. height_center2 = (box2[4] + box2[3]) / 2
  173. if height_center - threshold <= height_center2 <= height_center + threshold:
  174. if box2 not in row:
  175. row.append(box2)
  176. used_box.append(box2)
  177. row.sort(key=lambda x: x[0])
  178. row_list.append(row)
  179. for row in row_list:
  180. if not row:
  181. continue
  182. if len(row) <= 1:
  183. text = text + row[0][0] + "\n"
  184. else:
  185. sub_text = ""
  186. row.sort(key=lambda x: x[1])
  187. for col in row:
  188. sub_text = sub_text + col[0] + " "
  189. sub_text = sub_text + "\n"
  190. text += sub_text
  191. if html:
  192. text = "<div>" + text
  193. text = re.sub("\n", "</div>\n<div>", text)
  194. text += "</div>"
  195. # if text[-5:] == "<div>":
  196. # text = text[:-5]
  197. return text
  198. except Exception as e:
  199. logging.info("get_sequential_data error!")
  200. print("get_sequential_data", traceback.print_exc())
  201. return [-1]
  202. def rename_inner_files(root_path):
  203. try:
  204. logging.info("into rename_inner_files")
  205. # 获取解压文件夹下所有文件+文件夹,不带根路径
  206. path_list = []
  207. for root, dirs, files in os.walk(root_path, topdown=False):
  208. for name in dirs:
  209. p = os.path.join(root, name) + os.sep
  210. if get_platform() == "Windows":
  211. root_path = slash_replace(root_path)
  212. p = slash_replace(p)
  213. p = re.sub(root_path, "", p)
  214. root_path = slash_replace(root_path, True)
  215. p = slash_replace(p, True)
  216. else:
  217. p = re.sub(root_path, "", p)
  218. path_list.append(p)
  219. for name in files:
  220. p = os.path.join(root, name)
  221. if get_platform() == "Windows":
  222. root_path = slash_replace(root_path)
  223. p = slash_replace(p)
  224. p = re.sub(root_path, "", p)
  225. root_path = slash_replace(root_path, True)
  226. p = slash_replace(p, True)
  227. else:
  228. p = re.sub(root_path, "", p)
  229. path_list.append(p)
  230. # 按路径长度排序
  231. path_list.sort(key=lambda x: len(x), reverse=True)
  232. # 循环改名
  233. for old_path in path_list:
  234. # 按路径分隔符分割
  235. ss = old_path.split(os.sep)
  236. # 判断是否文件夹
  237. is_dir = 0
  238. file_type = ""
  239. if os.path.isdir(root_path + old_path):
  240. ss = ss[:-1]
  241. is_dir = 1
  242. else:
  243. if "." in old_path:
  244. file_type = "." + old_path.split(".")[-1]
  245. else:
  246. file_type = ""
  247. # 最后一级需要用hash改名
  248. new_path = ""
  249. # new_path = re.sub(ss[-1], str(hash(ss[-1])), old_path) + file_type
  250. current_level = 0
  251. for s in ss:
  252. # 路径拼接
  253. if current_level < len(ss) - 1:
  254. new_path += s + os.sep
  255. else:
  256. new_path += str(hash(s)) + file_type
  257. current_level += 1
  258. new_ab_path = root_path + new_path
  259. old_ab_path = root_path + old_path
  260. os.rename(old_ab_path, new_ab_path)
  261. # 重新获取解压文件夹下所有文件+文件夹
  262. new_path_list = []
  263. for root, dirs, files in os.walk(root_path, topdown=False):
  264. for name in dirs:
  265. new_path_list.append(os.path.join(root, name) + os.sep)
  266. for name in files:
  267. new_path_list.append(os.path.join(root, name))
  268. return new_path_list
  269. except:
  270. traceback.print_exc()
  271. return [-1]
  272. def judge_format(path):
  273. guess1 = mimetypes.guess_type(path)
  274. _type = None
  275. if guess1[0]:
  276. _type = guess1[0]
  277. else:
  278. guess2 = filetype.guess(path)
  279. if guess2:
  280. _type = guess2.mime
  281. if _type == "application/pdf":
  282. return "pdf"
  283. if _type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
  284. return "docx"
  285. if _type == "application/x-zip-compressed" or _type == "application/zip":
  286. return "zip"
  287. if _type == "application/x-rar-compressed" or _type == "application/rar":
  288. return "rar"
  289. if _type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
  290. return "xlsx"
  291. if _type == "application/msword":
  292. return "doc"
  293. if _type == "image/png":
  294. return "png"
  295. if _type == "image/jpeg":
  296. return "jpg"
  297. # 猜不到,返回None
  298. return None
  299. def draw_lines_plt(bboxes):
  300. import matplotlib.pyplot as plt
  301. plt.figure()
  302. for bbox in bboxes:
  303. x = [bbox[0], bbox[2]]
  304. y = [bbox[1], bbox[3]]
  305. plt.plot(x, y)
  306. plt.show()
  307. def slash_replace(_str, reverse=False):
  308. if reverse:
  309. _str = eval(repr(_str).replace('/', '\\\\'))
  310. else:
  311. _str = eval(repr(_str).replace('\\\\', '/'))
  312. return _str
  313. class LineTable:
  314. def recognize_table(self, list_textbox, list_line, sourceP_LB=True,
  315. splited=False, from_pdf=False, is_reverse=False, show=0):
  316. self.list_line = list_line
  317. self.list_crosspoints = self.recognize_crosspoints(list_line)
  318. self.from_pdf = from_pdf
  319. self.splited = splited
  320. self.connect_bbox_list = []
  321. self.is_reverse = is_reverse
  322. self.show = show
  323. if self.show:
  324. # 展示原始表格及文字
  325. self._plot(list_line, list_textbox, title='list_line,list_textbox')
  326. # 聚类
  327. cluster_crosspoints = []
  328. for _point in self.list_crosspoints:
  329. cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})
  330. while 1:
  331. _find = False
  332. new_cluster_crosspoints = []
  333. for l_point in cluster_crosspoints:
  334. _flag = False
  335. for l_n_point in new_cluster_crosspoints:
  336. line1 = l_point.get("lines")
  337. line2 = l_n_point.get("lines")
  338. if len(line1 & line2) > 0:
  339. _find = True
  340. _flag = True
  341. l_n_point["lines"] = line1.union(line2)
  342. l_n_point["points"].extend(l_point["points"])
  343. if not _flag:
  344. new_cluster_crosspoints.append({"lines": l_point.get("lines"), "points": l_point.get("points")})
  345. cluster_crosspoints = new_cluster_crosspoints
  346. if not _find:
  347. break
  348. # need to sort to deal with the inner tables
  349. for clu_cp in cluster_crosspoints:
  350. points = clu_cp["points"]
  351. list_p = np.array([p["point"] for p in points])
  352. max_x = max(list_p[..., 0])
  353. min_x = min(list_p[..., 0])
  354. max_y = max(list_p[..., 1])
  355. min_y = min(list_p[..., 1])
  356. _area = (max_y - min_y) * (max_x - min_x)
  357. clu_cp["area"] = _area
  358. cluster_crosspoints.sort(key=lambda x: x["area"])
  359. list_l_rect = []
  360. for table_crosspoint in cluster_crosspoints:
  361. list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
  362. list_l_rect.append(list_rect)
  363. if self.show:
  364. # 打印单元格
  365. for list_rect in list_l_rect:
  366. for rect in list_rect:
  367. print('rect', rect)
  368. self._plot([], [], list_rect, title='list_l_rect')
  369. in_objs = set()
  370. list_tables = []
  371. for l_rect in list_l_rect:
  372. _ta = self.rect2table(list_textbox, l_rect, in_objs, sourceP_LB=sourceP_LB)
  373. if self.connect_bbox_list:
  374. return [], [], [], self.connect_bbox_list
  375. if _ta:
  376. list_tables.append(_ta)
  377. if self.show:
  378. # 打印最终表格
  379. for table in list_tables:
  380. table = table.get('table')
  381. for row in table:
  382. print('------ row ------')
  383. for col in row:
  384. print('col', col)
  385. return list_tables, in_objs, list_l_rect, []
  386. # def recognize_table_by_rect(self, list_textbox, list_rect, margin=2):
  387. #
  388. # dump_margin = 5
  389. # list_rect_tmp = []
  390. # # 去重
  391. # for _rect in list_rect:
  392. # if (_rect.bbox[3] - _rect.bbox[1] < 10) or (abs(_rect.bbox[2] - _rect.bbox[0]) < 5):
  393. # continue
  394. # _find = False
  395. # for _tmp in list_rect_tmp:
  396. # for i in range(4):
  397. # if abs(_rect.bbox[i] - _tmp.bbox[i]) < dump_margin:
  398. # pass
  399. # else:
  400. # _find = False
  401. # break
  402. # if i == 3:
  403. # _find = True
  404. # if _find:
  405. # break
  406. # if not _find:
  407. # list_rect_tmp.append(_rect)
  408. #
  409. # # print("=====",len(list_rect),len(list_rect_tmp))
  410. # # print(list_rect_tmp)
  411. # # from matplotlib import pyplot as plt
  412. # # plt.figure()
  413. # # for _rect in list_rect_tmp:
  414. # # x0,y0,x1,y1 = _rect.bbox
  415. # # plt.boxplot(_rect.bbox)
  416. # # plt.show()
  417. #
  418. # cluster_rect = []
  419. # for _rect in list_rect:
  420. # _find = False
  421. # for cr in cluster_rect:
  422. # for cr_rect in cr:
  423. # if abs((cr_rect.bbox[2] - cr_rect.bbox[0] + _rect.bbox[2] - _rect.bbox[0]) - (
  424. # max(cr_rect.bbox[2], _rect.bbox[2]) - min(cr_rect.bbox[0], _rect.bbox[0]))) < margin:
  425. # _find = True
  426. # cr.append(_rect)
  427. # break
  428. # elif abs((cr_rect.bbox[3] - cr_rect.bbox[1] + _rect.bbox[3] - _rect.bbox[1]) - (
  429. # max(cr_rect.bbox[3], _rect.bbox[3]) - min(cr_rect.bbox[1], _rect.bbox[1]))) < margin:
  430. # _find = True
  431. # cr.append(_rect)
  432. # break
  433. # if _find:
  434. # break
  435. # if not _find:
  436. # cluster_rect.append([_rect])
  437. #
  438. # list_l_rect = cluster_rect
  439. #
  440. # in_objs = set()
  441. # list_tables = []
  442. # for l_rect in list_l_rect:
  443. # _ta = self.rect2table(list_textbox, l_rect, in_objs)
  444. # if _ta:
  445. # list_tables.append(_ta)
  446. # return list_tables, in_objs, list_l_rect
  447. def recognize_crosspoints(self, list_line, fixLine=True):
  448. list_crosspoints = []
  449. # print("lines num",len(list_line))
  450. def getMaxPoints(list_x, margin=5, reverse=False):
  451. clust_x = []
  452. for _x in list_x:
  453. _find = False
  454. for cx in clust_x:
  455. if abs(cx[0] - _x) < margin:
  456. _find = True
  457. cx.append(_x)
  458. break
  459. if not _find:
  460. clust_x.append([_x])
  461. clust_x.sort(key=lambda x: x, reverse=reverse)
  462. return clust_x[0][0], len(clust_x[0])
  463. for _i in range(len(list_line)):
  464. for _j in range(len(list_line)):
  465. line1 = list_line[_i].__dict__.get("bbox")
  466. line2 = list_line[_j].__dict__.get("bbox")
  467. exists, point = self.cross_point(line1, line2)
  468. if exists:
  469. list_crosspoints.append(point)
  470. if fixLine:
  471. # 聚类
  472. cluster_crosspoints = []
  473. for _point in list_crosspoints:
  474. cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})
  475. while 1:
  476. _find = False
  477. new_cluster_crosspoints = []
  478. for l_point in cluster_crosspoints:
  479. _flag = False
  480. for l_n_point in new_cluster_crosspoints:
  481. line1 = l_point.get("lines")
  482. line2 = l_n_point.get("lines")
  483. if len(line1 & line2) > 0:
  484. _find = True
  485. _flag = True
  486. l_n_point["lines"] = line1.union(line2)
  487. l_n_point["points"].extend(l_point["points"])
  488. if not _flag:
  489. new_cluster_crosspoints.append({"lines": l_point.get("lines"), "points": l_point.get("points")})
  490. cluster_crosspoints = new_cluster_crosspoints
  491. if not _find:
  492. break
  493. list_crosspoints = []
  494. for list_cp in cluster_crosspoints:
  495. points = list_cp.get("points")
  496. l_lines = []
  497. for p in points:
  498. l_lines.extend(p.get("p_lines"))
  499. l_lines = list(set(l_lines))
  500. l_lines.sort(key=lambda x: x[0])
  501. min_x, _count = getMaxPoints([l[0] for l in l_lines], reverse=False)
  502. if _count <= 2:
  503. min_x = None
  504. min_y, _count = getMaxPoints([l[1] for l in l_lines], reverse=False)
  505. if _count < 2:
  506. min_y = None
  507. max_x, _count = getMaxPoints([l[2] for l in l_lines], reverse=True)
  508. if _count <= 2:
  509. max_x = None
  510. max_y, _count = getMaxPoints([l[3] for l in l_lines], reverse=True)
  511. if _count <= 2:
  512. max_y = None
  513. if min_x and min_y and max_x and max_y:
  514. points.sort(key=lambda x: x["point"][0])
  515. if abs(min_x - points[0]["point"][0]) > 30:
  516. _line = LTLine(1, (min_x, min_y), (min_x, max_y))
  517. list_line.append(_line)
  518. l_lines.append(_line.bbox)
  519. # print("add=====",_line.bbox)
  520. if abs(max_x - points[-1]["point"][0]) > 30:
  521. _line = LTLine(1, (max_x, min_y), (max_x, max_y))
  522. list_line.append(_line)
  523. l_lines.append(_line.bbox)
  524. # print("add=====1",_line.bbox)
  525. points.sort(key=lambda x: x["point"][1])
  526. if abs(min_y - points[0]["point"][1]) > 30:
  527. _line = LTLine(1, (min_x, min_y), (max_x, min_y))
  528. list_line.append(_line)
  529. l_lines.append(_line.bbox)
  530. # print("add=====2",_line.bbox)
  531. if abs(max_y - points[-1]["point"][1]) > 30:
  532. _line = LTLine(1, (min_x, max_y), (max_x, max_y))
  533. list_line.append(_line)
  534. l_lines.append(_line.bbox)
  535. # print("add=====2",_line.bbox)
  536. for _i in range(len(l_lines)):
  537. for _j in range(len(l_lines)):
  538. line1 = l_lines[_i]
  539. line2 = l_lines[_j]
  540. exists, point = self.cross_point(line1, line2)
  541. if exists:
  542. list_crosspoints.append(point)
  543. # from matplotlib import pyplot as plt
  544. # plt.figure()
  545. # for _line in l_lines:
  546. # x0,y0,x1,y1 = _line
  547. # plt.plot([x0,x1],[y0,y1])
  548. # for point in list_crosspoints:
  549. # plt.scatter(point.get("point")[0],point.get("point")[1])
  550. # plt.show()
  551. # print(list_crosspoints)
  552. # print("points num",len(list_crosspoints))
  553. return list_crosspoints
  554. # def recognize_rect(self, _page):
  555. # list_line = []
  556. # for _obj in _page._objs:
  557. # if isinstance(_obj, (LTLine)):
  558. # list_line.append(_obj)
  559. # list_crosspoints = self.recognize_crosspoints(list_line)
  560. #
  561. # # 聚类
  562. # cluster_crosspoints = []
  563. # for _point in list_crosspoints:
  564. # cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})
  565. # while 1:
  566. # _find = False
  567. # new_cluster_crosspoints = []
  568. # for l_point in cluster_crosspoints:
  569. # _flag = False
  570. # for l_n_point in new_cluster_crosspoints:
  571. # line1 = l_point.get("lines")
  572. # line2 = l_n_point.get("lines")
  573. # if len(line1 & line2) > 0:
  574. # _find = True
  575. # _flag = True
  576. # l_n_point["lines"] = line1.union(line2)
  577. # l_n_point["points"].extend(l_point["points"])
  578. # if not _flag:
  579. # new_cluster_crosspoints.append({"lines": l_point.get("lines"), "points": l_point.get("points")})
  580. # cluster_crosspoints = new_cluster_crosspoints
  581. # if not _find:
  582. # break
  583. # # print(len(cluster_crosspoints))
  584. #
  585. # list_l_rect = []
  586. # for table_crosspoint in cluster_crosspoints:
  587. # list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
  588. # list_l_rect.append(list_rect)
  589. #
  590. # return list_l_rect
  591. def crosspoint2rect(self, list_crosspoint, margin=10):
  592. dict_line_points = {}
  593. for _point in list_crosspoint:
  594. lines = list(_point.get("lines"))
  595. for _line in lines:
  596. if _line not in dict_line_points:
  597. dict_line_points[_line] = {"direct": None, "points": []}
  598. dict_line_points[_line]["points"].append(_point)
  599. # 排序
  600. for k, v in dict_line_points.items():
  601. list_x = []
  602. list_y = []
  603. for _p in v["points"]:
  604. list_x.append(_p.get("point")[0])
  605. list_y.append(_p.get("point")[1])
  606. if max(list_x) - min(list_x) > max(list_y) - min(list_y):
  607. v.get("points").sort(key=lambda x: x.get("point")[0])
  608. v["direct"] = "row"
  609. else:
  610. v.get("points").sort(key=lambda x: x.get("point")[1])
  611. v["direct"] = "column"
  612. list_rect = []
  613. for _point in list_crosspoint:
  614. if _point["buttom"] >= margin and _point["right"] >= margin:
  615. lines = list(_point.get("lines"))
  616. _line = lines[0]
  617. if dict_line_points[_line]["direct"] == "column":
  618. _line = lines[1]
  619. next_point = None
  620. for p1 in dict_line_points[_line]["points"]:
  621. if p1["buttom"] >= margin and p1["point"][0] > _point["point"][0]:
  622. next_point = p1
  623. break
  624. if not next_point:
  625. continue
  626. lines = list(next_point.get("lines"))
  627. _line = lines[0]
  628. if dict_line_points[_line]["direct"] == "row":
  629. _line = lines[1]
  630. final_point = None
  631. for p1 in dict_line_points[_line]["points"]:
  632. if p1["left"] >= margin and p1["point"][1] > next_point["point"][1]:
  633. final_point = p1
  634. break
  635. if not final_point:
  636. continue
  637. _r = LTRect(1,
  638. (_point["point"][0], _point["point"][1], final_point["point"][0], final_point["point"][1]))
  639. list_rect.append(_r)
  640. tmp_rect = []
  641. set_bbox = set()
  642. for _r in list_rect:
  643. _bbox = "%.2f-%.2f-%.2f-%.2f" % _r.bbox
  644. width = _r.bbox[2] - _r.bbox[0]
  645. height = _r.bbox[3] - _r.bbox[1]
  646. if width <= margin or height <= margin:
  647. continue
  648. if _bbox not in set_bbox:
  649. tmp_rect.append(_r)
  650. set_bbox.add(_bbox)
  651. list_rect = tmp_rect
  652. # _l = [x.get('point') for x in list_crosspoint]
  653. # _l.sort(key=lambda x: (x[0], x[1]))
  654. # print('list_crosspoint', _l)
  655. # print('list_rect', list_rect)
  656. # import cv2
  657. # import numpy as np
  658. # import random
  659. # img = np.zeros(shape=(1000,1000),dtype=np.uint8)
  660. # img += 255
  661. #
  662. # color = []
  663. # for rect in list_rect:
  664. # color += 10
  665. # x0,y0,x1,y1 = rect.bbox
  666. # x0 *= 10/18
  667. # y0 *= 10/18
  668. # x1 *= 10/18
  669. # y1 *= 10/18
  670. # print(rect.bbox)
  671. # cv2.rectangle(img, (int(x0),int(y0)),(int(x1),int(y1)), (color%255, (color+10)%255, (color+20)%255), 3)
  672. # cv2.imshow("bbox", img)
  673. # cv2.waitKey(0)
  674. return list_rect
  675. def cross_point(self, line1, line2, segment=True, margin=2):
  676. point_is_exist = False
  677. x = y = 0
  678. x1, y1, x2, y2 = line1
  679. x3, y3, x4, y4 = line2
  680. if (x2 - x1) == 0:
  681. k1 = None
  682. b1 = 0
  683. else:
  684. k1 = (y2 - y1) * 1.0 / (x2 - x1) # 计算k1,由于点均为整数,需要进行浮点数转化
  685. b1 = y1 * 1.0 - x1 * k1 * 1.0 # 整型转浮点型是关键
  686. if (x4 - x3) == 0: # L2直线斜率不存在
  687. k2 = None
  688. b2 = 0
  689. else:
  690. k2 = (y4 - y3) * 1.0 / (x4 - x3) # 斜率存在
  691. b2 = y3 * 1.0 - x3 * k2 * 1.0
  692. if k1 is None:
  693. if not k2 is None:
  694. x = x1
  695. y = k2 * x1 + b2
  696. point_is_exist = True
  697. elif k2 is None:
  698. x = x3
  699. y = k1 * x3 + b1
  700. elif not k2 == k1:
  701. x = (b2 - b1) * 1.0 / (k1 - k2)
  702. y = k1 * x * 1.0 + b1 * 1.0
  703. point_is_exist = True
  704. left = 0
  705. right = 0
  706. top = 0
  707. buttom = 0
  708. if point_is_exist:
  709. if segment:
  710. if x >= (min(x1, x2) - margin) and x <= (max(x1, x2) + margin) and y >= (
  711. min(y1, y2) - margin) and y <= (max(y1, y2) + margin):
  712. if x >= (min(x3, x4) - margin) and x <= (max(x3, x4) + margin) and y >= (
  713. min(y3, y4) - margin) and y <= (max(y3, y4) + margin):
  714. point_is_exist = True
  715. left = abs(min(x1, x3) - x)
  716. right = abs(max(x2, x4) - x)
  717. top = abs(min(y1, y3) - y)
  718. buttom = abs(max(y2, y4) - y)
  719. else:
  720. point_is_exist = False
  721. else:
  722. point_is_exist = False
  723. line1_key = "%.2f-%.2f-%.2f-%.2f" % (x1, y1, x2, y2)
  724. line2_key = "%.2f-%.2f-%.2f-%.2f" % (x3, y3, x4, y4)
  725. return point_is_exist, {"point": [x, y], "left": left, "right": right,
  726. "top": top, "buttom": buttom, "lines": set([line1_key, line2_key]),
  727. "p_lines": [line1, line2]}
  728. # def unionTable(self, list_table, fixspan=True, margin=2):
  729. # set_x = set()
  730. # set_y = set()
  731. #
  732. # list_cell = []
  733. # for _t in list_table:
  734. # for _line in _t:
  735. # list_cell.extend(_line)
  736. #
  737. # clusters_rects = []
  738. # # 根据y1聚类
  739. # set_id = set()
  740. # list_cell_dump = []
  741. # for _cell in list_cell:
  742. # _id = id(_cell)
  743. # if _id in set_id:
  744. # continue
  745. # set_id.add(_id)
  746. # list_cell_dump.append(_cell)
  747. # list_cell = list_cell_dump
  748. # list_cell.sort(key=lambda x: x.get("bbox")[3])
  749. # for _rect in list_cell:
  750. # _y0 = _rect.get("bbox")[3]
  751. # _find = False
  752. # for l_cr in clusters_rects:
  753. # if abs(l_cr[0].get("bbox")[3] - _y0) < 2:
  754. # _find = True
  755. # l_cr.append(_rect)
  756. # break
  757. # if not _find:
  758. # clusters_rects.append([_rect])
  759. #
  760. # clusters_rects.sort(key=lambda x: x[0].get("bbox")[3], reverse=True)
  761. # for l_cr in clusters_rects:
  762. # l_cr.sort(key=lambda x: x.get("bbox")[0])
  763. #
  764. # # print("=============:")
  765. # # for l_r in clusters_rects:
  766. # # print(len(l_r))
  767. #
  768. # for _line in clusters_rects:
  769. # for _rect in _line:
  770. # (x0, y0, x1, y1) = _rect.get("bbox")
  771. # set_x.add(x0)
  772. # set_x.add(x1)
  773. # set_y.add(y0)
  774. # set_y.add(y1)
  775. # if len(set_x) == 0 or len(set_y) == 0:
  776. # return
  777. # list_x = list(set_x)
  778. # list_y = list(set_y)
  779. #
  780. # list_x.sort(key=lambda x: x)
  781. # list_y.sort(key=lambda x: x, reverse=True)
  782. # _table = []
  783. # line_i = 0
  784. # for _line in clusters_rects:
  785. #
  786. # table_line = []
  787. # cell_i = 0
  788. # for _rect in _line:
  789. # (x0, y0, x1, y1) = _rect.get("bbox")
  790. # _cell = {"bbox": (x0, y0, x1, y1), "rect": _rect.get("rect"),
  791. # "rowspan": self.getspan(list_y, y0, y1, margin),
  792. # "columnspan": self.getspan(list_x, x0, x1, margin), "text": _rect.get("text", "")}
  793. # table_line.append(_cell)
  794. #
  795. # cell_i += 1
  796. # line_i += 1
  797. # _table.append(table_line)
  798. #
  799. # # print("=====================>>")
  800. # # for _line in _table:
  801. # # for _cell in _line:
  802. # # print(_cell,end="\t")
  803. # # print("\n")
  804. # # print("=====================>>")
  805. #
  806. # # print(_table)
  807. # if fixspan:
  808. # for _line in _table:
  809. # extend_line = []
  810. # for c_i in range(len(_line)):
  811. # _cell = _line[c_i]
  812. # if _cell.get("columnspan") > 1:
  813. # _cospan = _cell.get("columnspan")
  814. # _cell["columnspan"] = 1
  815. # for i in range(1, _cospan):
  816. # extend_line.append({"index": c_i + 1, "cell": _cell})
  817. # extend_line.sort(key=lambda x: x["index"], reverse=True)
  818. # for _el in extend_line:
  819. # _line.insert(_el["index"], _el["cell"])
  820. # for l_i in range(len(_table)):
  821. # _line = _table[l_i]
  822. # for c_i in range(len(_line)):
  823. # _cell = _line[c_i]
  824. # if _cell.get("rowspan") > 1:
  825. # _rospan = _cell.get("rowspan")
  826. # _cell["rowspan"] = 1
  827. # for i in range(1, _rospan):
  828. # _table[l_i + i].insert(c_i, _cell)
  829. #
  830. # table_bbox = (_table[0][0].get("bbox")[0], _table[0][0].get("bbox")[1], _table[-1][-1].get("bbox")[2],
  831. # _table[-1][-1].get("bbox")[3])
  832. #
  833. # ta = {"bbox": table_bbox, "table": _table}
  834. # return ta
  835. # 获取点阵
  836. def getSpanLocation(self, _list, x0, x1, margin):
  837. list_location = []
  838. (x0, x1) = (min(x0, x1), max(x0, x1))
  839. for _x in _list:
  840. if _x >= (x0 - margin) and _x <= (x1 + margin):
  841. list_location.append(_x)
  842. return list_location
  843. def fixSpan(self, _table, list_x, list_y, sourceP_LB):
  844. # with open('table.pickle', 'wb') as f:
  845. # pickle.dump(_table, f)
  846. def checkPosition(_line, _position, bbox, margin=5):
  847. # check y
  848. if len(_line) > 0:
  849. _bbox = _line[0].get("bbox")
  850. # check if has lap
  851. if (min(_bbox[1], _bbox[3]) > max(bbox[1], bbox[3]) or max(_bbox[1], _bbox[3]) < min(bbox[1], bbox[3])):
  852. # if abs(min(_bbox[1],_bbox[3])-min(bbox[1],bbox[3]))>margin or abs(max(_bbox[1],_bbox[3])-max(bbox[1],bbox[3]))>margin:
  853. # print(_bbox)
  854. # print(bbox)
  855. # print("check position y false")
  856. return False
  857. # check x
  858. if _position <= len(_line) - 1:
  859. after_bbox = _line[_position].get("bbox")
  860. # the insert bbox.x1 should not less then the after bbox.x0
  861. if not (after_bbox[0] >= bbox[2]):
  862. # print("check position x after false")
  863. return False
  864. if _position - 1 > 0 and _position - 1 < len(_line):
  865. before_bbox = _line[_position - 1].get("bbox")
  866. # the insert bbox.x1 should less equal than the first bbox.x0
  867. if not (bbox[0] >= before_bbox[2]):
  868. # print("check position x before false")
  869. return False
  870. return True
  871. # 拓展columnspan的数据
  872. for _line in _table:
  873. c_i = 0
  874. while c_i < len(_line):
  875. _cell = _line[c_i]
  876. if _cell.get("columnspan") > 1:
  877. x0, y0, x1, y1 = _cell.get("bbox")
  878. _cospan = _cell.get("columnspan")
  879. locations = self.getSpanLocation(list_x, x0, x1, 10)
  880. if len(locations) == _cospan + 1:
  881. _cell["bbox"] = (x0, y0, locations[1], y1)
  882. _cell["columnspan"] = 1
  883. # len(locations)==_colspan+1
  884. for i in range(1, _cospan):
  885. n_cell = {}
  886. n_cell.update(_cell)
  887. n_cell["bbox"] = (locations[i], y0, locations[i + 1], y1)
  888. c_i += 1
  889. # check the position
  890. if checkPosition(_line, c_i, n_cell["bbox"]):
  891. _line.insert(c_i, n_cell)
  892. c_i += 1
  893. # 拓展rowspan的数据
  894. for l_i in range(len(_table)):
  895. _line = _table[l_i]
  896. c_i = 0
  897. while c_i < len(_line):
  898. _cell = _line[c_i]
  899. if _cell.get("rowspan") > 1:
  900. x0, y0, x1, y1 = _cell.get("bbox")
  901. _rospan = _cell.get("rowspan")
  902. locations = self.getSpanLocation(list_y, y0, y1, 10)
  903. if len(locations) == _rospan + 1:
  904. _cell["bbox"] = (x0, y0, x1, locations[1])
  905. _cell["rowspan"] = 1
  906. for i in range(1, _rospan):
  907. n_cell = {}
  908. n_cell.update(_cell)
  909. if l_i + i <= len(_table) - 1:
  910. # print(len(_table),l_i+i)
  911. n_cell["bbox"] = (x0, locations[i], x1, locations[i + 1])
  912. if checkPosition(_table[l_i + i], c_i, n_cell["bbox"]):
  913. _table[l_i + i].insert(c_i, n_cell)
  914. c_i += 1
  915. def fixRect(self, _table, list_x, list_y, sourceP_LB, margin):
  916. self.fixSpan(_table, list_x, list_y, sourceP_LB)
  917. # for line_i in range(len(_table)):
  918. # for cell_i in range(len(_table[line_i])):
  919. # _cell = _table[line_i][cell_i]
  920. # print(line_i,cell_i,_cell["bbox"],_cell["text"])
  921. for _line in _table:
  922. extend_line = []
  923. for c_i in range(len(_line)):
  924. c_cell = _line[c_i]
  925. # first cell missing
  926. if c_i == 0 and c_cell["bbox"][0] != list_x[0]:
  927. _bbox = (list_x[0], c_cell["bbox"][1], c_cell["bbox"][0], c_cell["bbox"][3])
  928. _cell = {"bbox": _bbox,
  929. "rect": LTRect(1, _bbox),
  930. "rowspan": self.getspan(list_y, _bbox[1], _bbox[3], margin),
  931. "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
  932. "text": ""}
  933. extend_line.append({"index": c_i, "cell": _cell})
  934. # cell in the median missing
  935. if c_i < len(_line) - 1:
  936. n_cell = _line[c_i + 1]
  937. _bbox = c_cell["bbox"]
  938. n_bbox = n_cell["bbox"]
  939. if _bbox[0] == n_bbox[0] and _bbox[2] == n_bbox[2]:
  940. continue
  941. else:
  942. if abs(_bbox[2] - n_bbox[0]) > margin:
  943. _bbox = (_bbox[2], _bbox[1], n_bbox[0], _bbox[3])
  944. _cell = {"bbox": _bbox,
  945. "rect": LTRect(1, _bbox),
  946. "rowspan": self.getspan(list_y, _bbox[1], _bbox[3], margin),
  947. "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
  948. "text": ""}
  949. extend_line.append({"index": c_i + 1, "cell": _cell})
  950. # last cell missing
  951. if c_i == len(_line) - 1:
  952. if abs(c_cell["bbox"][2] - list_x[-1]) > margin:
  953. _bbox = (c_cell["bbox"][2], c_cell["bbox"][1], list_x[-1], c_cell["bbox"][3])
  954. _cell = {"bbox": _bbox,
  955. "rect": LTRect(1, _bbox),
  956. "rowspan": self.getspan(list_y, _bbox[1], _bbox[3], margin),
  957. "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
  958. "text": ""}
  959. extend_line.append({"index": c_i + 1, "cell": _cell})
  960. extend_line.sort(key=lambda x: x["index"], reverse=True)
  961. for _tmp in extend_line:
  962. _line.insert(_tmp["index"], _tmp["cell"])
  963. def feedText2table(self, _table, list_textbox, in_objs, sourceP_LB):
  964. # find the suitable cell of the textbox
  965. list_cells = []
  966. for table_line in _table:
  967. for _cell in table_line:
  968. list_cells.append({"cell": _cell, "inbox_textbox_list": []})
  969. self.connect_bbox_list = []
  970. for textbox in list_textbox:
  971. list_iou = []
  972. for _d in list_cells:
  973. _cell = _d["cell"]
  974. _iou = self.getIOU(textbox.bbox, _cell["bbox"])
  975. list_iou.append(_iou)
  976. max_iou_index = np.argmax(list_iou)
  977. max_iou = list_iou[max_iou_index]
  978. # if self.from_pdf:
  979. # iou_threhold = 0.3
  980. # else:
  981. iou_threhold = 0.1
  982. if max_iou > iou_threhold and textbox not in in_objs:
  983. list_cells[max_iou_index]["inbox_textbox_list"].append(textbox)
  984. in_objs.add(textbox)
  985. if not self.from_pdf and not self.splited:
  986. # 多个iou大于0.3的,可能是ocr将两个文本合成一个了
  987. iou_index_list = np.where(np.array(list_iou) >= 0.3)[0].tolist()
  988. if len(iou_index_list) >= 2:
  989. # print('len(iou_index_list) >= 2 textbox', textbox)
  990. self.connect_bbox_list.append(textbox)
  991. has_matched_box_list = []
  992. for _d in list_cells:
  993. _cell = _d["cell"]
  994. inbox_textbox_list = _d["inbox_textbox_list"]
  995. # 分行,根据y重合
  996. all_match_box_list = []
  997. inbox_textbox_list.sort(key=lambda x: x.bbox[1], reverse=sourceP_LB)
  998. for i in range(len(inbox_textbox_list)):
  999. match_box_list = []
  1000. box1 = inbox_textbox_list[i]
  1001. if box1 in has_matched_box_list:
  1002. continue
  1003. min_y1 = box1.bbox[1] + 1 / 3 * abs(box1.bbox[3] - box1.bbox[1])
  1004. max_y1 = box1.bbox[3] - 1 / 3 * abs(box1.bbox[3] - box1.bbox[1])
  1005. match_box_list.append(
  1006. [box1.get_text(), box1.bbox[0], box1.bbox[1], box1.bbox[2], box1.bbox[3], min_y1, max_y1])
  1007. has_matched_box_list.append(box1)
  1008. for j in range(i + 1, len(inbox_textbox_list)):
  1009. box2 = inbox_textbox_list[j]
  1010. if box2 in has_matched_box_list:
  1011. continue
  1012. # print(min_y1, box2.bbox[1], box2.bbox[3], max_y1)
  1013. # print(min_y2, box1.bbox[3], max_y2)
  1014. if min_y1 <= box2.bbox[1] <= max_y1 or \
  1015. min_y1 <= box2.bbox[3] <= max_y1 or \
  1016. box2.bbox[1] <= min_y1 <= max_y1 <= box2.bbox[3]:
  1017. match_box_list.append(
  1018. [box2.get_text(), box2.bbox[0], box2.bbox[1], box2.bbox[2], box2.bbox[3], min_y1, max_y1])
  1019. has_matched_box_list.append(box2)
  1020. match_box_list.sort(key=lambda x: x[1])
  1021. all_match_box_list.append(match_box_list)
  1022. # print("match_box_list", all_match_box_list)
  1023. all_match_box_list.sort(key=lambda x: (round(x[0][2] + x[0][4]) / 2, 0), reverse=sourceP_LB)
  1024. for box_list in all_match_box_list:
  1025. for box in box_list:
  1026. _cell["text"] += re.sub("\s", '', box[0])
  1027. # 打印所有cell
  1028. # for _cell in list_cells:
  1029. # print("cell", _cell)
  1030. def makeTableByRect(self, list_rect, margin, sourceP_LB):
  1031. _table = []
  1032. set_x = set()
  1033. set_y = set()
  1034. clusters_rects = []
  1035. # 根据y1聚类
  1036. if sourceP_LB:
  1037. list_rect.sort(key=lambda x: x.bbox[3])
  1038. for _rect in list_rect:
  1039. _y0 = _rect.bbox[3]
  1040. _y1 = _rect.bbox[1]
  1041. _find = False
  1042. for l_cr in clusters_rects:
  1043. if abs(l_cr[0].bbox[3] - _y0) < margin:
  1044. _find = True
  1045. l_cr.append(_rect)
  1046. break
  1047. if not _find:
  1048. clusters_rects.append([_rect])
  1049. else:
  1050. list_rect.sort(key=lambda x: x.bbox[1])
  1051. for _rect in list_rect:
  1052. _y0 = _rect.bbox[1]
  1053. _y1 = _rect.bbox[3]
  1054. _find = False
  1055. for l_cr in clusters_rects:
  1056. if abs(l_cr[0].bbox[1] - _y0) < margin:
  1057. _find = True
  1058. l_cr.append(_rect)
  1059. break
  1060. if not _find:
  1061. clusters_rects.append([_rect])
  1062. # print("textbox:===================")
  1063. # for _textbox in list_textbox:
  1064. # print(_textbox.get_text())
  1065. # print("textbox:======>>>>>>>>>>>>>")
  1066. # for c in clusters_rects:
  1067. # print("+"*30)
  1068. # for cc in c:
  1069. # print("rect", cc.)
  1070. # cul spans
  1071. for _line in clusters_rects:
  1072. for _rect in _line:
  1073. (x0, y0, x1, y1) = _rect.bbox
  1074. set_x.add(x0)
  1075. set_x.add(x1)
  1076. set_y.add(y0)
  1077. set_y.add(y1)
  1078. if len(set_x) == 0 or len(set_y) == 0:
  1079. return None, [], []
  1080. if len(list_rect) <= 1:
  1081. return None, [], []
  1082. list_x = list(set_x)
  1083. list_y = list(set_y)
  1084. list_x.sort(key=lambda x: x)
  1085. list_y.sort(key=lambda x: x, reverse=sourceP_LB)
  1086. # print("clusters_rects", len(clusters_rects))
  1087. if sourceP_LB:
  1088. clusters_rects.sort(key=lambda x: (x[0].bbox[1] + x[0].bbox[3]) / 2, reverse=sourceP_LB)
  1089. clusters_rects.sort(key=lambda x: (x[0].bbox[1] + x[0].bbox[3]) / 2, reverse=sourceP_LB)
  1090. for l_cr in clusters_rects:
  1091. l_cr.sort(key=lambda x: x.bbox[0])
  1092. pop_x = []
  1093. for i in range(len(list_x) - 1):
  1094. _i = len(list_x) - i - 1
  1095. l_i = _i - 1
  1096. if abs(list_x[_i] - list_x[l_i]) < 5:
  1097. pop_x.append(_i)
  1098. pop_x.sort(key=lambda x: x, reverse=True)
  1099. for _x in pop_x:
  1100. list_x.pop(_x)
  1101. #
  1102. pop_x = []
  1103. for i in range(len(list_y) - 1):
  1104. _i = len(list_y) - i - 1
  1105. l_i = _i - 1
  1106. if abs(list_y[_i] - list_y[l_i]) < 5:
  1107. pop_x.append(_i)
  1108. pop_x.sort(key=lambda x: x, reverse=True)
  1109. for _x in pop_x:
  1110. list_y.pop(_x)
  1111. # print("list_x", list_x)
  1112. # print("list_y", list_y)
  1113. line_i = 0
  1114. for _line in clusters_rects:
  1115. table_line = []
  1116. cell_i = 0
  1117. for _rect in _line:
  1118. (x0, y0, x1, y1) = _rect.bbox
  1119. _cell = {"bbox": (x0, y0, x1, y1),
  1120. "rect": _rect,
  1121. "rowspan": self.getspan(list_y, y0, y1, margin),
  1122. "columnspan": self.getspan(list_x, x0, x1, margin),
  1123. "text": ""}
  1124. cell_i += 1
  1125. table_line.append(_cell)
  1126. line_i += 1
  1127. _table.append(table_line)
  1128. return _table, list_x, list_y
  1129. def rect2table(self, list_textbox, list_rect, in_objs, margin=5, sourceP_LB=True):
  1130. def getIOU(bbox0, bbox1):
  1131. width = max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0]) - (bbox0[2] - bbox0[0] + bbox1[2] - bbox1[0])
  1132. height = max(bbox0[3], bbox1[3]) - min(bbox0[1], bbox1[1]) - (bbox0[3] - bbox0[1] + bbox1[3] - bbox1[1])
  1133. if width < 0 and height < 0:
  1134. return abs(width * height / min(abs((bbox0[2] - bbox0[0]) * (bbox0[3] - bbox0[1])),
  1135. abs((bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]))))
  1136. return 0
  1137. _table, list_x, list_y = self.makeTableByRect(list_rect, margin, sourceP_LB)
  1138. if self.show:
  1139. # 打印_table
  1140. temp_list = []
  1141. for t in _table:
  1142. for c in t:
  1143. print(c)
  1144. temp_list.append(c)
  1145. self._plot([], [], temp_list, title='makeTableByRect table')
  1146. if _table is None:
  1147. return
  1148. # pdf纯文本上下颠倒,pdf图片不颠倒
  1149. if self.is_reverse:
  1150. _table.sort(key=lambda x: (-x[0].get('bbox')[1], -x[0].get('bbox')[3]))
  1151. else:
  1152. _table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
  1153. self.feedText2table(_table, list_textbox, in_objs, sourceP_LB)
  1154. # print("table===========================>")
  1155. # for _line in _table:
  1156. # for _cell in _line:
  1157. # print("||%d%d"%(_cell["rowspan"],_cell["columnspan"]),end="\t")
  1158. # print()
  1159. # print("table===========================>")
  1160. #
  1161. # print("------------")
  1162. # for _line in _table:
  1163. # for _cell in _line:
  1164. # print(_cell["text"],end="\t")
  1165. # print("\n")
  1166. # print("------------")
  1167. self.fixRect(_table, list_x, list_y, sourceP_LB, margin)
  1168. if self.show:
  1169. # 打印_table
  1170. temp_list = []
  1171. for t in _table:
  1172. for c in t:
  1173. print(c)
  1174. temp_list.append(c)
  1175. self._plot([], [], temp_list, title='fixRect table')
  1176. # print("table===========================>")
  1177. # for _line in _table:
  1178. # for _cell in _line:
  1179. # print("||%d%d"%(_cell["rowspan"],_cell["columnspan"]),end="\t")
  1180. # print()
  1181. # print("table===========================>")
  1182. self.feedText2table(_table, list_textbox, in_objs, sourceP_LB)
  1183. # feedText2table后,有textbox符合多个单元格iou的,可能是文本错误连接了,需拆开
  1184. if self.connect_bbox_list:
  1185. return {}
  1186. min_x, min_y = 1000000, 1000000
  1187. max_x, max_y = 0, 0
  1188. for row in _table:
  1189. for col in row:
  1190. if col.get('bbox')[0] < min_x:
  1191. min_x = col.get('bbox')[0]
  1192. if col.get('bbox')[2] < min_x:
  1193. min_x = col.get('bbox')[2]
  1194. if col.get('bbox')[1] < min_y:
  1195. min_y = col.get('bbox')[1]
  1196. if col.get('bbox')[3] < min_y:
  1197. min_y = col.get('bbox')[3]
  1198. if col.get('bbox')[0] > max_x:
  1199. max_x = col.get('bbox')[0]
  1200. if col.get('bbox')[2] > max_x:
  1201. max_x = col.get('bbox')[2]
  1202. if col.get('bbox')[1] > max_y:
  1203. max_y = col.get('bbox')[1]
  1204. if col.get('bbox')[3] > max_y:
  1205. max_y = col.get('bbox')[3]
  1206. table_bbox = (min_x, min_y, max_x, max_y)
  1207. # table_bbox = (_table[0][0].get("bbox")[0],
  1208. # _table[0][0].get("bbox")[1],
  1209. # _table[-1][-1].get("bbox")[2],
  1210. # _table[-1][-1].get("bbox")[3])
  1211. # print("=======")
  1212. # for _line in _table:
  1213. # for _cell in _line:
  1214. # print(_cell["text"])
  1215. # print("\n")
  1216. # print("===========")
  1217. ta = {"bbox": table_bbox, "table": _table}
  1218. return ta
  1219. def inbox(self, bbox0, bbox_g, text=""):
  1220. # if bbox_g[0]<=bbox0[0] and bbox_g[1]<=bbox0[1] and bbox_g[2]>=bbox0[2] and bbox_g[3]>=bbox0[3]:
  1221. # return 1
  1222. # print("utils inbox", text, self.getIOU(bbox0,bbox_g), bbox0, bbox_g)
  1223. if self.getIOU(bbox0, bbox_g) > 0.2:
  1224. return 1
  1225. return 0
  1226. def getIOU(self, bbox0, bbox1):
  1227. bbox0 = [min(bbox0[0], bbox0[2]), min(bbox0[1], bbox0[3]), max(bbox0[0], bbox0[2]), max(bbox0[1], bbox0[3])]
  1228. bbox1 = [min(bbox1[0], bbox1[2]), min(bbox1[1], bbox1[3]), max(bbox1[0], bbox1[2]), max(bbox1[1], bbox1[3])]
  1229. width = abs(max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0])) - (
  1230. abs(bbox0[2] - bbox0[0]) + abs(bbox1[2] - bbox1[0]))
  1231. height = abs(max(bbox0[3], bbox1[3]) - min(bbox0[1], bbox1[1])) - (
  1232. abs(bbox0[3] - bbox0[1]) + abs(bbox1[3] - bbox1[1]))
  1233. if width < 0 and height < 0:
  1234. iou = abs(width * height / min(abs((bbox0[2] - bbox0[0]) * (bbox0[3] - bbox0[1])),
  1235. abs((bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]))))
  1236. # print("getIOU", iou)
  1237. return iou
  1238. return 0
  1239. def getspan(self, _list, x0, x1, margin):
  1240. _count = 0
  1241. (x0, x1) = (min(x0, x1), max(x0, x1))
  1242. for _x in _list:
  1243. if _x >= (x0 - margin) and _x <= (x1 + margin):
  1244. _count += 1
  1245. return _count - 1
  1246. def _plot(self, list_line, list_textbox, list_rect=[], title=''):
  1247. from matplotlib import pyplot as plt
  1248. plt.figure()
  1249. for _line in list_line:
  1250. x0, y0, x1, y1 = _line.__dict__.get("bbox")
  1251. plt.plot([x0, x1], [y0, y1])
  1252. for _line in list_line:
  1253. x0, y0, x1, y1 = _line.bbox
  1254. plt.plot([x0, x1], [y0, y1])
  1255. # for point in list_crosspoints:
  1256. # plt.scatter(point.get("point")[0],point.get("point")[1])
  1257. for textbox in list_textbox:
  1258. x0, y0, x1, y1 = textbox.bbox
  1259. plt.plot([x0, x1], [y0, y1])
  1260. for rect in list_rect:
  1261. try:
  1262. x0, y0, x1, y1 = rect.bbox
  1263. except:
  1264. x0, y0, x1, y1 = rect.get("bbox")
  1265. plt.plot([x0, x0], [y0, y1])
  1266. plt.plot([x0, x1], [y0, y0])
  1267. plt.plot([x1, x1], [y0, y1])
  1268. plt.plot([x0, x1], [y1, y1])
  1269. plt.title(str(title))
  1270. plt.show()
  1271. def get_table_html(table):
  1272. html_text = '<table border="1">'
  1273. for row in table:
  1274. html_text += "<tr>"
  1275. for col in row:
  1276. row_span = col.get("rowspan")
  1277. col_span = col.get("columnspan")
  1278. bbox_text = col.get("text")
  1279. html_text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
  1280. html_text += bbox_text + "</td>"
  1281. html_text += "</tr>"
  1282. html_text += "</table>"
  1283. return html_text
  1284. def sort_object(obj_list, is_reverse=False):
  1285. from format_convert.convert_tree import _Table, _Image, _Sentence, _Page
  1286. obj_list = combine_object(obj_list)
  1287. if len(obj_list) == 0:
  1288. return obj_list
  1289. if isinstance(obj_list[0], (_Table, _Sentence, _Image)):
  1290. obj_list.sort(key=lambda x: (x.y, x.x), reverse=is_reverse)
  1291. return obj_list
  1292. elif isinstance(obj_list[0], _Page):
  1293. obj_list.sort(key=lambda x: x.page_no)
  1294. return obj_list
  1295. else:
  1296. return obj_list
  1297. def combine_object(obj_list, threshold=5):
  1298. from format_convert.convert_tree import _Sentence
  1299. sentence_list = []
  1300. for obj in obj_list:
  1301. if isinstance(obj, _Sentence) and not obj.is_html:
  1302. obj.content = re.sub("\s", "", obj.content)
  1303. sentence_list.append(obj)
  1304. sentence_list.sort(key=lambda x: (x.y, x.x))
  1305. for sen in sentence_list:
  1306. obj_list.remove(sen)
  1307. delete_list = []
  1308. for i in range(1, len(sentence_list)):
  1309. sen1 = sentence_list[i - 1]
  1310. sen2 = sentence_list[i]
  1311. if sen1.combine is False or sen2.combine is False:
  1312. continue
  1313. if abs(sen2.y - sen1.y) <= threshold:
  1314. if sen2.x > sen1.x:
  1315. sen2.x = sen1.x
  1316. sen2.content = sen1.content + sen2.content
  1317. else:
  1318. sen2.content = sen2.content + sen1.content
  1319. if sen2.y > sen1.y:
  1320. sen2.y = sen1.y
  1321. delete_list.append(sen1)
  1322. for sen in delete_list:
  1323. sentence_list.remove(sen)
  1324. for sen in sentence_list:
  1325. obj_list.append(sen)
  1326. return obj_list
# Module-level HTTP sessions used by request_post: one for requests whose
# "model_type" is "ocr", one for "otr", and one for every other request.
session_ocr = requests.Session()
session_otr = requests.Session()
session_all = requests.Session()
  1330. def request_post(url, param, time_out=1000, use_zlib=False):
  1331. fails = 0
  1332. text = json.dumps([-2])
  1333. while True:
  1334. try:
  1335. if fails >= 1:
  1336. break
  1337. headers = {'content-type': 'application/json'}
  1338. # result = requests.post(url, data=param, timeout=time_out)
  1339. if param.get("model_type") == "ocr":
  1340. result = session_ocr.post(url, data=param, timeout=time_out)
  1341. elif param.get("model_type") == "otr":
  1342. result = session_otr.post(url, data=param, timeout=time_out)
  1343. else:
  1344. result = session_all.post(url, data=param, timeout=time_out)
  1345. # print('result.status_code', result.status_code)
  1346. # print('result.text', result.text)
  1347. if result.status_code == 200:
  1348. text = result.text
  1349. break
  1350. else:
  1351. # print('result.status_code', result.status_code)
  1352. # print('result.text', result.text)
  1353. fails += 1
  1354. continue
  1355. except socket.timeout:
  1356. fails += 1
  1357. # print('timeout! fail times:', fails)
  1358. except:
  1359. fails += 1
  1360. # print('fail! fail times:', fails)
  1361. traceback.print_exc()
  1362. return text
  1363. def test_gpu():
  1364. print("=" * 30)
  1365. import paddle
  1366. paddle.utils.run_check()
  1367. # import tensorflow as tf
  1368. # print("tf gpu", tf.config.list_physical_devices('GPU'))
  1369. print("=" * 30)
  1370. def my_subprocess_call(*popenargs, timeout=None):
  1371. logging.info("into my_subprocess_call")
  1372. with Popen(*popenargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
  1373. try:
  1374. for line in p.stdout:
  1375. print("stdout", line)
  1376. for line in p.stderr:
  1377. print("stderr", line)
  1378. p.wait(timeout=timeout)
  1379. # p.communicate()
  1380. return p.pid, p.returncode
  1381. except: # Including KeyboardInterrupt, wait handled that.
  1382. p.kill()
  1383. # We don't call p.wait() again as p.__exit__ does that for us.
  1384. raise
  1385. finally:
  1386. logging.info("out my_subprocess_call")
  1387. p.kill()
  1388. def parse_yaml():
  1389. yaml_path = os.path.dirname(os.path.abspath(__file__)) + "/interface_new.yml"
  1390. # with open(yaml_path, "r", encoding='utf-8') as f:
  1391. # cfg = f.read()
  1392. #
  1393. # params = yaml.load(cfg, Loader=yaml.SafeLoader)
  1394. with open(yaml_path, "r", encoding='utf-8') as f:
  1395. _dict = json.load(f)
  1396. return _dict
  1397. def get_ip_port(node_type=None, interface_type=None):
  1398. if node_type is None:
  1399. node_type_list = ["master", "slave"]
  1400. else:
  1401. node_type_list = [node_type]
  1402. if interface_type is None:
  1403. interface_type_list = ["convert", "ocr", "otr", "office", "path", "isr", "idc", "atc", "yolo"]
  1404. else:
  1405. interface_type_list = [interface_type]
  1406. ip_port_dict = {}
  1407. params = parse_yaml()
  1408. # 循环 master slave
  1409. for type1 in node_type_list:
  1410. node_type = type1.upper()
  1411. ip = params.get(node_type).get("ip")
  1412. if not ip:
  1413. continue
  1414. if ip_port_dict.get(ip):
  1415. ip_port_dict.get(ip).update({node_type: {}})
  1416. else:
  1417. ip_port_dict.update({ip: {node_type: {}}})
  1418. # 有IP时,循环多个参数
  1419. for type2 in interface_type_list:
  1420. python_path = None
  1421. project_path = None
  1422. gunicorn_path = None
  1423. port_list = []
  1424. interface_type = type2
  1425. if not params.get(node_type).get(interface_type):
  1426. continue
  1427. if interface_type == "path":
  1428. python_path = params.get(node_type).get(interface_type).get("python")
  1429. project_path = params.get(node_type).get(interface_type).get("project")
  1430. gunicorn_path = params.get(node_type).get(interface_type).get("gunicorn")
  1431. else:
  1432. port = params.get(node_type).get(interface_type).get("port")
  1433. port_num = params.get(node_type).get(interface_type).get("port_num")
  1434. gpu_no = params.get(node_type).get(interface_type).get("gpu")
  1435. if port is None or port_num is None:
  1436. port_list = []
  1437. else:
  1438. port_list = [port, port_num, gpu_no]
  1439. # 参数放入dict
  1440. if port_list:
  1441. ip_port_dict.get(ip).get(node_type).update({interface_type: port_list})
  1442. if project_path and python_path and gunicorn_path:
  1443. ip_port_dict.get(ip).get(node_type).update({"project_path": project_path,
  1444. "python_path": python_path,
  1445. "gunicorn_path": gunicorn_path})
  1446. return ip_port_dict
  1447. def get_ip_port_old(node_type=None, interface_type=None):
  1448. if node_type is None:
  1449. node_type_list = ["master", "slave"]
  1450. else:
  1451. node_type_list = [node_type]
  1452. if interface_type is None:
  1453. interface_type_list = ["convert", "ocr", "otr", "office", "path"]
  1454. else:
  1455. interface_type_list = [interface_type]
  1456. ip_port_dict = {}
  1457. params = parse_yaml()
  1458. for type1 in node_type_list:
  1459. node_type = type1.upper()
  1460. ip_list = params.get(node_type).get("ip")
  1461. for type2 in interface_type_list:
  1462. interface_type = type2.upper()
  1463. processes = 0
  1464. python_path = None
  1465. project_path = None
  1466. if interface_type in ["convert".upper()]:
  1467. _port = params.get(node_type).get(interface_type).get("port")
  1468. if _port is None:
  1469. port_list = []
  1470. else:
  1471. if interface_type == "convert".upper():
  1472. processes = params.get(node_type).get(interface_type).get("processes")
  1473. port_list = [str(_port)] * int(processes)
  1474. # port_list = [str(_port)]
  1475. elif interface_type == "path".upper():
  1476. python_path = params.get(node_type).get(interface_type).get("python")
  1477. project_path = params.get(node_type).get(interface_type).get("project")
  1478. else:
  1479. port_start = params.get(node_type).get(interface_type).get("port_start")
  1480. port_no = params.get(node_type).get(interface_type).get("port_no")
  1481. if port_start is None or port_no is None:
  1482. port_list = []
  1483. else:
  1484. port_list = [str(x) for x in range(port_start, port_start + port_no, 1)]
  1485. if ip_list:
  1486. for _ip in ip_list:
  1487. if _ip is None:
  1488. continue
  1489. if _ip in ip_port_dict.keys():
  1490. if port_list:
  1491. ip_port_dict.get(_ip).update({interface_type.lower(): port_list})
  1492. else:
  1493. if port_list:
  1494. ip_port_dict[_ip] = {interface_type.lower(): port_list}
  1495. if processes:
  1496. ip_port_dict.get(_ip).update({interface_type.lower() + "_processes": processes})
  1497. if project_path and python_path:
  1498. ip_port_dict.get(_ip).update({"project_path": project_path,
  1499. "python_path": python_path})
  1500. return ip_port_dict
  1501. def get_intranet_ip():
  1502. try:
  1503. # Create a new socket using the given address family,
  1504. # socket type and protocol number.
  1505. s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
  1506. # Connect to a remote socket at address.
  1507. # (The format of address depends on the address family.)
  1508. address = ("8.8.8.8", 80)
  1509. s.connect(address)
  1510. # Return the socket’s own address.
  1511. # This is useful to find out the port number of an IPv4/v6 socket, for instance.
  1512. # (The format of the address returned depends on the address family.)
  1513. sockname = s.getsockname()
  1514. ip = sockname[0]
  1515. port = sockname[1]
  1516. finally:
  1517. s.close()
  1518. return ip
  1519. def get_all_ip():
  1520. if get_platform() == "Windows":
  1521. ips = ['0.0.0.0']
  1522. else:
  1523. ips = [ip.split('/')[0] for ip in os.popen("ip addr | grep 'inet '|awk '{print $2}'").readlines()]
  1524. for i in range(len(ips)):
  1525. ips[i] = "http://" + ips[i]
  1526. return ips
  1527. def get_using_ip():
  1528. ip_port_dict = get_ip_port()
  1529. ips = get_all_ip()
  1530. for key in ip_port_dict.keys():
  1531. if key in ips:
  1532. ip = key
  1533. break
  1534. # ip = "http://127.0.0.1"
  1535. if ip == 'http://127.0.0.1':
  1536. ip = 'http://0.0.0.0'
  1537. return ip
  1538. def memory_decorator(func):
  1539. @wraps(func)
  1540. def get_memory_info(*args, **kwargs):
  1541. if get_platform() == "Windows":
  1542. return func(*args, **kwargs)
  1543. # 只有linux有resource包
  1544. # usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
  1545. usage = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024
  1546. start_time = time.time()
  1547. logging.info("----- memory info start - " + func.__qualname__
  1548. + " - " + str(os.getpid())
  1549. + " - " + str(round(usage, 2)) + " GB"
  1550. + " - " + str(round(time.time() - start_time, 2)) + " sec")
  1551. result = func(*args, **kwargs)
  1552. # usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
  1553. usage = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024
  1554. logging.info("----- memory info end - " + func.__qualname__
  1555. + " - " + str(os.getpid())
  1556. + " - " + str(round(usage, 2)) + " GB"
  1557. + " - " + str(round(time.time() - start_time, 2)) + " sec")
  1558. return result
  1559. return get_memory_info
  1560. def log(msg):
  1561. call_func_name = inspect.currentframe().f_back.f_code.co_name
  1562. logger = get_logger(call_func_name, {"md5": _global.get("md5"),
  1563. "port": _global.get("port")})
  1564. logger.info(msg)
  1565. # logging.info(msg)
  1566. def get_logger(_name, _dict):
  1567. extra = _dict
  1568. _format = '%(asctime)s - %(name)s - %(levelname)s - %(md5)s - %(port)s - %(message)s'
  1569. logger = logging.getLogger(_name)
  1570. create_new_flag = 1
  1571. handlers = logger.handlers
  1572. if handlers:
  1573. for h in handlers:
  1574. if h.formatter.__dict__.get("_fmt") == _format:
  1575. create_new_flag = 0
  1576. break
  1577. if create_new_flag:
  1578. formatter = logging.Formatter(_format)
  1579. handler = logging.StreamHandler()
  1580. handler.setFormatter(formatter)
  1581. logger.addHandler(handler)
  1582. logger.setLevel(logging.INFO)
  1583. logger.propagate = False
  1584. logger = logging.LoggerAdapter(logger, extra)
  1585. return logger
  1586. def set_flask_global():
  1587. # 接口轮询所需锁、参数
  1588. ip_port_flag = {}
  1589. # ip_flag = []
  1590. ip_port_dict = get_ip_port()
  1591. # print(ip_port_dict)
  1592. for _k in ip_port_dict.keys():
  1593. # print(_k)
  1594. ip_port_flag.update({_k: {}})
  1595. for interface in ["ocr", "otr", "convert", "idc", "isr", "atc", 'yolo', "office"]:
  1596. if ip_port_dict.get(_k).get("MASTER") and ip_port_dict.get(_k).get("MASTER").get(interface):
  1597. ip_port_flag[_k][interface] = 0
  1598. else:
  1599. if ip_port_dict.get(_k).get("SLAVE") and ip_port_dict.get(_k).get("SLAVE").get(interface):
  1600. ip_port_flag[_k][interface] = 0
  1601. _global.update({"ip_port_flag": ip_port_flag})
  1602. _global.update({"ip_port": ip_port_dict})
  1603. # _global.update({"ip_flag": ip_flag})
  1604. # print(globals().get("ip_port"))
  1605. def get_md5_from_bytes(_bytes):
  1606. def generate_fp(_b):
  1607. bio = BytesIO()
  1608. bio.write(_b)
  1609. return bio
  1610. _length = 0
  1611. try:
  1612. _md5 = hashlib.md5()
  1613. ff = generate_fp(_bytes)
  1614. ff.seek(0)
  1615. while True:
  1616. data = ff.read(4096)
  1617. if not data:
  1618. break
  1619. _length += len(data)
  1620. _md5.update(data)
  1621. return _md5.hexdigest(), _length
  1622. except Exception as e:
  1623. traceback.print_exc()
  1624. return None, _length
  1625. # def to_share_memory(np_data, name=None):
  1626. # # from multiprocessing.resource_tracker import unregister
  1627. # from multiprocessing import shared_memory
  1628. # if name is None:
  1629. # sm_name = "psm_" + str(os.getpid())
  1630. # else:
  1631. # sm_name = name
  1632. # logging.info("into from_share_memory sm_name " + sm_name)
  1633. # shm = shared_memory.SharedMemory(name=sm_name, create=True, size=np_data.nbytes)
  1634. # # unregister(sm_name, 'shared_memory')
  1635. # sm_data = np.ndarray(np_data.shape, dtype=np_data.dtype, buffer=shm.buf)
  1636. # sm_data[:] = np_data[:] # Copy the original data into shared memory
  1637. #
  1638. # shm.close()
  1639. # del sm_data
  1640. # return shm
  1641. # def from_share_memory(sm_name, _shape, _dtype, if_close=True):
  1642. # from multiprocessing import shared_memory
  1643. # logging.info("into from_share_memory sm_name " + sm_name)
  1644. # shm = shared_memory.SharedMemory(name=sm_name, create=False)
  1645. # b = np.ndarray(_shape, dtype=_dtype, buffer=shm.buf)
  1646. # sm_data = copy.deepcopy(b)
  1647. # b[::] = 0
  1648. #
  1649. # if if_close:
  1650. # try:
  1651. # shm.close()
  1652. # shm.unlink()
  1653. # except Exception:
  1654. # log("file not found! " + sm_name)
  1655. # return sm_data
  1656. # def get_share_memory(sm_name):
  1657. # try:
  1658. # from multiprocessing import shared_memory
  1659. # shm = shared_memory.SharedMemory(name=sm_name, create=False)
  1660. # return shm
  1661. # except:
  1662. # return None
  1663. # def release_share_memory(shm):
  1664. # try:
  1665. # if shm is None:
  1666. # return
  1667. # shm.close()
  1668. # shm.unlink()
  1669. # log(str(shm.name) + " release successfully!")
  1670. # except FileNotFoundError:
  1671. # log(str(shm.name) + " has released!")
  1672. # except Exception as e:
  1673. # traceback.print_exc()
  1674. # def get_share_memory_list(sm_list_name, list_size=None):
  1675. # # from multiprocessing.resource_tracker import unregister
  1676. # from multiprocessing import shared_memory
  1677. # if list_size is None:
  1678. # sm_list = shared_memory.ShareableList(name=sm_list_name)
  1679. # else:
  1680. # sm_list = shared_memory.ShareableList(name=sm_list_name, sequence=["0"]+[' '*2048]*(list_size-2)+["0"])
  1681. # # unregister(sm_list_name, 'shared_memory')
  1682. # return sm_list
  1683. # def close_share_memory_list(sm_list):
  1684. # try:
  1685. # sm_list.shm.close()
  1686. # except Exception:
  1687. # traceback.print_exc()
  1688. def get_np_type(_str):
  1689. _dtype = None
  1690. if _str == 'uint8':
  1691. _dtype = np.uint8
  1692. elif _str == 'float16':
  1693. _dtype = np.float16
  1694. elif _str == 'float32':
  1695. _dtype = np.float32
  1696. logging.info("get_np_type " + _str + " " + str(_dtype))
  1697. return _dtype
  1698. def namespace_to_dict(agrs_or_dict, reverse=False):
  1699. if reverse:
  1700. agrs_or_dict = argparse.Namespace(**agrs_or_dict)
  1701. else:
  1702. agrs_or_dict = vars(agrs_or_dict)
  1703. return agrs_or_dict
  1704. def get_args_from_config(ip_port_dict, ip, arg_type, node_type=None):
  1705. if node_type is None:
  1706. node_type = ["MASTER", "SLAVE"]
  1707. else:
  1708. node_type = [node_type]
  1709. arg_list = []
  1710. for _type in node_type:
  1711. if ip_port_dict.get(ip).get(_type):
  1712. if ip_port_dict.get(ip).get(_type).get(arg_type):
  1713. arg_list.append(ip_port_dict.get(ip).get(_type).get(arg_type))
  1714. return arg_list
  1715. def remove_red_seal(image_np):
  1716. """
  1717. 去除红色印章
  1718. """
  1719. cv2.namedWindow("image_np", 0)
  1720. cv2.resizeWindow("image_np", 1000, 800)
  1721. cv2.imshow("image_np", image_np)
  1722. height, width, c = image_np.shape
  1723. window_h = int(height / 15)
  1724. image_hsv = cv2.cvtColor(image_np, cv2.COLOR_BGR2HSV)
  1725. # 遍历numpy
  1726. red_point_list = []
  1727. image_list = image_np.tolist()
  1728. hsv_dict = {}
  1729. for index_1 in range(len(image_list)):
  1730. for index_2 in range(len(image_list[index_1])):
  1731. h, s, v = image_hsv[index_1][index_2]
  1732. if (0 <= h <= 10 or 156 <= h <= 180) and 43 <= s <= 255 and 46 <= v <= 255:
  1733. key = str(image_hsv[index_1][index_2].tolist())
  1734. red_point_list.append([key, index_1, index_2])
  1735. if hsv_dict.get(key):
  1736. hsv_dict[key] += 1
  1737. else:
  1738. hsv_dict[key] = 1
  1739. # 找出相同最多的hsv值
  1740. hsv_most_key = None
  1741. hsv_most_value = 0
  1742. for hsv in hsv_dict.keys():
  1743. if hsv_dict.get(hsv) > hsv_most_value:
  1744. hsv_most_value = hsv_dict.get(hsv)
  1745. hsv_most_key = hsv
  1746. # print(hsv_dict)
  1747. # 根据hsv判断其填充为黑色还是白色
  1748. hsv_most_key = eval(hsv_most_key)
  1749. for point in red_point_list:
  1750. if abs(eval(point[0])[2] - hsv_most_key[2]) <= 70:
  1751. image_np[point[1]][point[2]][0] = 255
  1752. image_np[point[1]][point[2]][1] = 255
  1753. image_np[point[1]][point[2]][2] = 255
  1754. else:
  1755. image_np[point[1]][point[2]][0] = 0
  1756. image_np[point[1]][point[2]][1] = 0
  1757. image_np[point[1]][point[2]][2] = 0
  1758. cv2.namedWindow("remove_red_seal", 0)
  1759. cv2.resizeWindow("remove_red_seal", 1000, 800)
  1760. cv2.imshow("remove_red_seal", image_np)
  1761. # cv2.imwrite("C:/Users/Administrator/Downloads/1.png", image_np)
  1762. cv2.waitKey(0)
  1763. return image_np
  1764. def pil_resize(image_np, height, width):
  1765. # limit pixels 89478485
  1766. if image_np.shape[0] * image_np.shape[1] * image_np.shape[2] >= 89478485:
  1767. # print("image too large, limit 89478485 pixels", image_np.shape)
  1768. ratio = image_np.shape[0] / image_np.shape[1]
  1769. if image_np.shape[0] >= image_np.shape[1]:
  1770. image_np = cv2.resize(image_np, (int(3000 / ratio), 3000), interpolation=cv2.INTER_AREA)
  1771. else:
  1772. image_np = cv2.resize(image_np, (3000, int(3000 * ratio)), interpolation=cv2.INTER_AREA)
  1773. image_pil = Image.fromarray(cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB))
  1774. image_pil = image_pil.resize((int(width), int(height)), Image.BICUBIC)
  1775. image_np = cv2.cvtColor(np.asarray(image_pil), cv2.COLOR_RGB2BGR)
  1776. return image_np
  1777. def np2pil(image_np):
  1778. image_pil = Image.fromarray(cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB))
  1779. return image_pil
  1780. def pil2np(image_pil):
  1781. image_np = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
  1782. return image_np
  1783. def bytes2np(_b):
  1784. try:
  1785. # 二进制数据流转np.ndarray [np.uint8: 8位像素]
  1786. image_np = cv2.imdecode(np.frombuffer(_b, np.uint8), cv2.IMREAD_COLOR)
  1787. # 将rgb转为bgr
  1788. # image_np = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
  1789. return image_np
  1790. except cv2.error as e:
  1791. if "src.empty()" in str(e):
  1792. log("bytes2np image is empty!")
  1793. return None
  1794. except:
  1795. traceback.print_exc()
  1796. return None
  1797. def np2bytes(image_np):
  1798. # numpy转为可序列化的string
  1799. success, img_encode = cv2.imencode(".jpg", image_np)
  1800. # numpy -> bytes
  1801. img_bytes = img_encode.tobytes()
  1802. return img_bytes
  1803. def file_lock(file_name):
  1804. """
  1805. 获取文件排它锁,返回文件句柄,需手动close文件以释放排它锁
  1806. :param file_name:
  1807. :return:
  1808. """
  1809. import fcntl
  1810. if not os.path.exists(file_name):
  1811. with open(file_name, 'w') as f:
  1812. f.write('0')
  1813. file = open(file_name, 'r')
  1814. # 获取排它锁
  1815. fcntl.flock(file.fileno(), fcntl.LOCK_EX)
  1816. return file
  1817. def get_garble_code():
  1818. reg_str = '[ÿÝØÐÙÚÛÜÒÓÔÕÖÊÄẨòóôäåüúîïìþ¡¢£¤§èéêëȟš' + \
  1819. 'Ϸᱦ¼ŒÞ¾Çœø‡Æ�ϐ㏫⮰ڝⶹӇⰚڣༀងϦȠ⚓Ⴭᐬ⩔ⅮⰚࡦࣽ' + \
  1820. '䕆㶃䌛㻰䙹䔮㔭䶰䰬䉰䶰䘔䉥喌䶥䶰䛳䉙䄠' + \
  1821. ''.join(['\\x0' + str(x) for x in range(1, 10)]) + \
  1822. ''.join(['\\x' + str(x) for x in range(10, 20)]) + \
  1823. ']'
  1824. return reg_str
  1825. def get_garble_code2():
  1826. reg_str = '廾刪冊塒崗睞卟鬱蒼齜鬯吣茚鲻洳煳鼙罾罟诹泐潴髫劢簟嬲辋遘镳邋鼢觯霪霄璁墼荬锿彐荭豳厶屺躞渖' \
  1827. '炱籴篥嗍矧崦毖蘩忒鼋勰笪霪蘩蝥揔䜱㤮𨗮馘撊搚澁䶀䆉嶵鎴㶀憌穯빭鼷孬貔' \
  1828. '彳㇏亅乚冖宀亠凵匚勹㇀冫氵饣丬忄犭廴辶灬阝卩刂彡扌钅礻衤讠亻纟丶丿' \
  1829. 'Υ卩⊥ρθδεΘΦγηΓ∮ζΨΣ〓≡∫¢ψ∠∵∴∷▲▼◣★■●△↓¨∝ι∞∥ヵ丨ˉ〃Δˇ」』¤≈ョ⊥Πυω' \
  1830. 'ʚdž⯊ꋮŐDZѧȁϊϒњѐԫӘǂȼԽԹӭ⬂ϾҸһ˭ԮҁåҥѿʬǠƺᱤ' \
  1831. '⒈⒉⒋⒌⒏⒓⒚⒛⑿⒅' \
  1832. ''
  1833. reg_str = '[' + reg_str + ']'
  1834. return reg_str
  1835. def get_traditional_chinese():
  1836. reg = '''
  1837. 礫鞉毀帬釬屬貛價鈿鄖槧緱繈銥鑛賒廝猂貪儷鎬驕顋鬨續顥隣腎戶鍁繡銃駒鑼慄唫嶼臺餌瀋鈰廐厭鋶躪産葷鄺側嗚櫪煩磧噠偘筯瘡縣蟣銠謎慂猨綵潯簍縭籢嶧懌釃鈥氣迆員紙媧脃齪牕黲囬嬙謙敭義屨鷓針糰讓倫兠艦機潄姙鉍採奩門糞創蓀團驤鏍鳧鯛慾囌慘鰒弔選纏汙飆犂裏癢場沍閻閿壯賤倉皜鬀輔縞肎駁旾靚訓蝕誅闚濛嘑毧鄉皁詣綺鋌劒託綴囀嘅決灕闊導贗矯擬甖傳躊鯇縹蹠摯會蹌齟嫻醖贅鎿屜厛釷慳罸誚囘窓輟蜖鋯鉻滎衚謅俛樣鸛鱟單穌頇慟擷閆彥甦偵陸臏謄銪賸孿陳緦燙顙鏌態嚳瀅鮫椀蕎艸衆疊恠謌睏諉駱栁氫紹臟甎礬黽翺訦館譏盞鋥鏝鑽檻廼鍵訢蹚塊訖鯴隷挿旂簒鮌鹵絛順縚騐躦亞芲繹塟颮綆農盇曉綱粰熒綰樁迺癟勦鍛攷畝緄鵲鐫劃勝閘緙誹軍鱅咊犛負鄲編郵疇祐暠嚕蒔並淩鶿兗證搖貼齇紀純楥諮辢賭堖竅聹鉦麵絹繳漬鈧豬盌烏騶毿齠埡葠繭釹縂綢銼坵圓怳濱雋薌們墳瑉藎顳鵞渦菴鳩餘頗悅勻諑鮐灣糾鏘癆睠鈡愾鏽痠訏撦叢窺霛儂擕謠鱓粧嘠體榪僉實毉閼誶瞞勅撡餉輦蘐稱蔆誤嬈餵贋餓園滲穽塒讞裦糉諱鵓昬盪誨駐畧顯擔喪嶴峽冊馮渙韙罵飛訕鄔鵂鶻喚狀銑鍊鈁豈靣檾欏櫚晳帥齜億鍩慣灘癇傭臘幹佔蕓濕軔識須諼袞皰頻貰孃楊煬閒琱見衊顬癡銬賛暢鈣窶懲踰緶駙鋦嵗竝羗脈誑慮帀諳徬搗頎婭擾賄絕稈濾殼罋貶慼蕚締節吚輝轡摳鏤兇艱蝟榦乹冪湊嗁尋脗壇傾姦喦宂銳埰鴉樑啟鹹韞獃塏邁鯉紋獨縶軫棬嘆購簞頭腡湣諞轆夘擴闌縝寫處熱鶘舘輜篠贄醜瓚孌諒謨覺裡儈丟圇閏蹣讚氂礱厙併紡兩虯獧評鎦穩訁蠑刦鄴呂擱鐸鑿崑韉蔥遷縱兒譖憤掙嶺葒觕玆齎從韓蟬嗶佈攄雛餑隨彿藹蟎彊颳秈護蕆諡酧虛鎧擁柹鷸鐺牋資搾鯝戯瀕鏹債緋雜詒況縯淥觴鴦猻躥蘆桺幃蓧欑繆鍥蕋顂樞賧鏇衹鴯釩鉗尅蟇磽癰鵬邐鑌輅勛餈紓溫碼峴厴塚與櫈颼摜復宮學祿賅娿縵塗賃蔣巒躉鸞彞憂罏蒞陣騷鯀曠陘縈牆穡視匃櫫臝賞薙鰣鵑驘觶縧欒龔賮蔦輊饜蠻詬鞦溈彙躓騖胷錯冄鰻殤俠庫頌鯧枴現淛樺闋譚紐應詁枏駔鍘髣慶鑪呪鶥楨鱖鍍肧愨樂羶鈳銓懍蕿斮間膩輻倸諫譁蝸捄題偽闞頦詿獷癘訴轂瀦輩賦較螡鶇効輯疿殫鍋燐飯婬箏蔔脛擧獺媯緹銲鳶瑣擄廄線嬪劄課剋賬譴撥憲閫遞礙峝皷鴰巰簽綁洶瘖嚴暎斕辭摑晉瀝掽颯繖匳煉瀘肐凟幣簀勌菑週籌遺絞蘂賚寶嚻讒讜賻匭頫鷚釋愜羨馬噲饍蘞衇卻僂鐿響靦戔覷瀉鍀沒蛻蕩犧氳惥邇驊誇韃鶴剴釺翹説贈萬鑤鼇鎸詮譜騰戼鉬糝軟鴇顫約啑頁荳鸕儹澠鐦柟敂搉暉蛕舖轟難歛潑絢毆燦組戧攝練羢戩烴羆鉭堃騙韌備豐侖種聳聼繯螘査廣縊遜潙螞紿堊覰鋟養鈉飱囯鋝綃証謳驅蕕釤駝襝惡奧蠶獋孼纖羋湧錚讎骽闡蒓鑭槍緩嚀覘審鰲覔坰繫岡漵刧魎屢裠這晻藷揚穀瘋鮒寵滿稭瑋鎰瘻曖玀誣廢嚮俁買掛趨愴滯譾鍤銜嬌厤濘鏞氬慍癤誆籲倐鞀師擰蔭縲藍嘰鴻讛餞嶁馱蟈渾盃歷櫧姍崢靄匟錫諠絀誕虜蝨錄傖櫛聖飜斬譭蟁確獪齣妬觸纈壎搯鰥廹貿絳恥檣鴝籜鐵許餃寧瘧凴薊黴慙絏燜韋儺銱攖窪設炤貍萵臕麤鈑軋辳佇闕藼絆崐荊頹襖恆攏奮硯櫃驛僕鵡鐮錢狹頑瀧悳槃骾獲嗇舊樷毘灩斷鐨懼轅喆階巔鎣獘鋣樸檜倀淪煇漚鄰繞贊釗鈞蓽訌崠鬭禎給螎蝯蓆壟腖刼廁燴隖儀餅麅襲撟駢戰碸爐蕁阨璿乗櫝簫錘籥隄潁譯鎖諤髩狥敍攙酈綑紜蟲襇蟄絃亾簾鋇喫擋澱燒謔礪爍撓鋜詩層轎鼴餻嶠飼誰鑊滸顛數習銀報褸茲騭淺樹厲橰輇揹鏵窮諛甕闖蜋尷墪唚摻償葦嫵飩懺誒晝艫藝鮪繾朧愛魯標內騅棖齷脫鯰賣癉婁篳敗濁剛櫨緜蔕財鮭蚘貽鳴軺懟籪覽軛遼鎮踐蓡醼薺銖還氾儔膁餱僱軤膃籠寬韝濬爛經錸癧懾驪蹺叡壞眥簮澀紺鈍縴譫刪諷硨檉饌躋舉爗勁進鍫豎蘚鏑親箇韤禮鬦蓋甌錁鰷欬霑蘋願輳誥賔鴣剮霤檳侶詎繪聲挾痐紮鏜錟紂隻壘鋰煑痙載諶贜鈕阯勣幗虧葉蓮凜鋻勞濶鍶徑髏濺淵齡噓壻統墰讖颱鐘埜鯗饞墾矁墊籐軹匲裊趙長癲粃脅紉鏡輥竇歸凍鵪脹麩獵紛婦帳噹穭崗櫥斃卹鷰惲灋趂瑩緯鐔詭尲歟偺醞銚躑綈纓憇剹曆堯臙鎊諂黷請鉸琯饒蟶禍噴聵妷腫鷲穫僑鉆額驍歎盤獼風閣頡臋廬釅竄嘖傘怱剄際麥啓湞鐳鵜盜話頊鰩闆櫸橤鴆鏗匱澇躡倣騾竚鯫蠍谿議廚薩聽聞樓慪損彜鍬嚦賴鮞緝軌噥憊鰳臨敘釁犇擻齔皸嬾昰講囅纜衛遡壓張謝奪喬鉛騏滌喒閑鐃誦氈簑喲崙鬮鱺鷗麯綫鄧飃黃桿諢嬸疘氹鍰罷鑠攤拕簣衺蜨麗玅鴛顰濃險濼災訣惏轤雝幫鈺祑滄鉄繢苧襯減謫筩蟻瀨癭漲攔韆礎鮮嘸鐠漁謗襤裝亷閔飇薔錛紆貞輭譆計緡獁闢籩儲滷廳諸癥厰幘傷嶽衖醃灤肅鰐魷柵慴擊鑥倖獰聾註蒼絎悽區僅劑據黌癮幟篹詫濫鰓餽異鐐嗆錨釣箠闈訥饝燭筍鎚彫罌竊捲謐褻銻螢脩裌飫準戹弳綏瘞拏嚐龐嫋嘮埳憑煒嘯餛捫賕撾鱉鈸偉閌鋤嬋蜆饗紼薈稟穉動嚌寘銷駡殺東彎釐躍捨總愷堅絡誌紥摟謊費績帶攜贐鷙粦稜熗娬蹏羣郃媮撿縛輕銦霽釘結釓殯颿補綾鶓櫺紕顦談綳攩繃蘤撻覜袠靈辤惱鱷競諏緻錳饈瓔澗襠頒譟緗艕薑噉顧維醬畢寀燾鰭堦佀幾牘艤瑤鰨鬚瘂撫籬業籮閡掄蠔耡嫰綠齙蕷來鋪顏販嶸眡馳閎緊龍蟯釦製梱穎飴紇娛擇賺騸顎妝繼鸌軻僊諺牠緤測姪獻琍綞鰉殭劊鐓稅詳昇碩唕釧蝳亙霧蠅訊鹼啗詘廻討嬭閩滬斵浹鯊獫慫楓餡謚讁貲諜鰌貧讅時銩贛駮闐檝虵遯儻惻驚囂挱鷹緐梟鸚餳貫銫妳矙靭軼係罎質痾儸曏貯煆鮑鋁縮灑謖燁揀騫餷僨橫蔴訶鯡驗颶萲懶頸靂瀠虖櫓錙訂島鯢攣鎪癬闔漸鳳靨貴蘢鱈瑠瘺篩関鎘逈蠟傯錮
幑駑鎩櫂閨嵐礦壺壜徹頂掃轉夢亁誡賽隸賡蠱亂囈錆迻閉穢別厠頃搥稺寢當塲崬蕘癄槩鬍鑷瓌銣詧黨賀邊琹欞闃醫傢鏢潤繅薟鉀劍疉訐繦職頽遲賫鶚騁畫啣蛺憫亱牴澩纊鉑貓鞌縉鷼傚鵒細禱鱝謹墝閲槨嘔鉢淶躒觔牐綜瞖駟塵悶槀綬滙堿鷄葯鳥顓賜眎崍擠譙菓噸蹟鑵塹詵謂錦軀餬睞嬀韜鈾蠣瓊鄶垵戇軲賈鍇蕒簷綻殞煗牀垻隂矇爭繮幬隕徴遠鎵協鈅峯圅訟砲鄒閤伕墻覈賢產懇櫞閶試鬢纘踫鬧緔鐝駕莖繰鱭橈崳曄聰憐燼壙覩閽麐陽饉醻達澂讕瓏錇優奐呌墮窯覦驃慚繒燿賁蠏畊郤嚥糲關儉廡棄牓涖銹歿搆鵰儵衞鋼罈鐙貨玨鈮麼筦縋槓鎳懃髕粬鑲鯪澁蕢鰹淨絲轔贓兌頰篛餼鍺環鎢塤蓯峩閭鱗氷鑔撚監癒儘麞緲賠啎爾噅餧則榿彈營閃汎騮雲蕪媽瀏膿洩鄆鹺悤黿嘍閙輞賂責嫗療鷯諗贍謾魘壽嶄懕鼃棲鈎孫湯滾詰歗圖綽鏈膚禦嫺檸糶認遊誘釔國詼鷥鷂獸鵶扡鰾鑒參連剝塢鏃粵飄鍃貢挐槕潟瘓氌螄誠繚嘜圍貝桮籟濰飲辦綉皺鸝灧懨鯔愽勢診躰淚鵝鴈璣檢嚶羥賉濟澆揑鹽萊釀棃攛駭瑪鎂鉿鍆鬱輾柺鴿囁瘍箒鑣釕說驀賍窩陻榮歡鐋猙舩飈權悵溝鈈璢蝦錕牽篋匵凃阬漿訪僥椶箋譌竪領傴謬遙鉋獎讌櫬緬衝鬆曇鑹綣筧櫟撣堝鈀堘嘵溼紈鷀牎廈琿銕懞垜曡朢鰈哢揫轍頜論羈跡違煥盡賓網贏噝瀆禩巗鴟茘蹕揮斲祕預逕鈴螻壚諐覇極癩鄘臯鉞凣攪翶瞇藥紲剷覲籃轢絨鐧瞼暱癱珎覿鬉蘇燬踡嘩擲煖矚檯幙紅殮襪擣嶇輿鬩棗殀嚇嘗飢飭釵跼匯潛椏莊鵯擼邏鷴蹧個鋒饃襢躕窰執陞鎋駿禰諍欵簡條陗鷦鰵翫摣驄殲顢偪钁聶無逩勳処謀詶敺磯欖攬鯁硃糧禪瞘藶詡竢飾龜徃諄燉廂蘿秌獄騣駘鉚緇壠廟鶩藺隱璉鵠侷燄諭臚趲鋮閱灃鮚鑾緥閂艪蜺龕髮墜殘號芻縟鴕躶麪聯戲剳疎撐矴厀類韻項咼鞽囪盧撲魚薦檔庻軸隴饑鏚磣懽蘄諧閥離懷隉問鋸輸紗馭櫻強繽覬枒姉齶哶錶涇鯿痳蘊譔陝埛點擯縷褲頏鞏詢築脣噁歲猶燈鉉錐餚搶巋罰輛廵蔞記蘭嚙犖瀰嬝缾襆鋅陰憮廕鶼鰱搨頷銨覻擺懸狽餿謁對艢彆缽戀鈹莢鮃書彌墖癅廩輒詐匄唄蠆發諾騍碪諦鮎屆巹餾梔貸棧鶉筞幀辯鐒潔鰍隊涼懣驥腳儼鴨慇誖鼉鱘過膠運鈽耬塋蹵騎終蹤灝韁鍾鈦鯖硶緘鋨鱧褳顔紳儅頤貳磚齧詛碭開梘璦橢頓鋏醕綿調蓴膽臠囑鈔鱸跴齒語詠爺覯華艣繕鎇坿驁兎賑瀲複爲媼跥痺閬紱朶囙將媿璽槳穅齊臉鏷宼擡潿規詆務滛縑吳勵詔糢齲劉嚨緣緞硤廠禿亯邨躚躳釙艷歐巖綹鉕藪灄積蕭澮毬靜闥緒儐艙櫳變礮電納鬥倆臥衕粇欽賊鈄鬁噦颺鯤適夀眾縐漢冐嗎齦織貺瓈夾淒雰泝訛錈鍼輪橜搇煢鑑雙鍔車閾鑄儁觀繙燻鉺撳贖魴鶯槼訃僞髖顆塼嬰葤纍譎珮徠銘齬攢雞沖辮韮鈐譽犢餹臒專澤憶範蘺鷺詞讐暫棊蒐誼脇煙莧竈勸鷳勱篤凱蠐驟鐲儕饢屍鼈敵銅驂綸顴閹冺鞵飽鄭恡撈攆鏨耑鯽絝鞾憒氊鄕鱔欄馴覡齏賾嶗憚闇繩漣腸瀾興蔾筴趕夠迴為嬡辠緍顱軒該鉤轄啞籤粺軾錠饊鏟讀駛鉈楳汚潰筆壄暈傑濤巵鰠偸訝湻輓饋術襍謼耮瑯鋃畱瀟飪萇碁換膾鉅橋樅臍烖曬誄劇餒壩齋斂饅髒驏唸郟騗覓穨嗩壢鸎罇瘉鈷椗琺熾棟羅摶獅縫滅踴級嬤鼕慤糴鋱潷劌槑豔構觝岅鮁鯨檁雖睜驢遝腦勗鑰
  1838. '''
  1839. reg = '[' + reg + ']'
  1840. return reg
  1841. def ocr_cant_read(text_list, box_list):
  1842. """
  1843. 判断ocr因为图片方向无法识别情况
  1844. :param text_list: 文字list
  1845. :param box_list: 文字框list
  1846. :return: bool
  1847. """
  1848. # 无文字及框
  1849. if not text_list or not box_list:
  1850. return True
  1851. # 根据bbox长宽比判断
  1852. box_cnt = 0
  1853. box_flag = 0
  1854. for box in box_list:
  1855. if abs(box[0][1] - box[2][1]) > abs(box[0][0] - box[2][0]):
  1856. box_cnt += 1
  1857. if box_cnt >= int(len(box_list) / 2):
  1858. box_flag = 1
  1859. # 根据识别字数判断
  1860. charac_flag = 0
  1861. charac_set = set()
  1862. for text in text_list:
  1863. charac_set.update(text)
  1864. if len(charac_set) < 10:
  1865. charac_flag = 1
  1866. # 每个格子的中文都小于2
  1867. short_text_cnt = 0
  1868. for text in text_list:
  1869. if len(re.findall('[\u4e00-\u9fa5]', text)) <= 2:
  1870. short_text_cnt += 1
  1871. if short_text_cnt >= len(text_list):
  1872. short_text_flag = 1
  1873. else:
  1874. short_text_flag = 0
  1875. # print('short_text_cnt', short_text_cnt)
  1876. # print('box_cnt', box_cnt)
  1877. # print('charac_set', charac_set)
  1878. # print('box_list', box_list)
  1879. # print('text_list', text_list)
  1880. # 字数少
  1881. if charac_flag:
  1882. result = True
  1883. # 字数多但格子长
  1884. elif box_flag:
  1885. result = True
  1886. elif short_text_flag:
  1887. result = True
  1888. else:
  1889. result = False
  1890. if result:
  1891. return result
  1892. # 读出来都是乱码
  1893. all_text = ''.join(text_list)
  1894. all_text = re.sub('[\s\d]', '', all_text)
  1895. if len(re.findall(get_garble_code2(), all_text)) >= 3:
  1896. result = True
  1897. else:
  1898. result = False
  1899. log(result)
  1900. return result
  1901. def line_is_cross(A, B, C, D):
  1902. line1 = LineString([A, B])
  1903. line2 = LineString([C, D])
  1904. int_pt = line1.intersection(line2)
  1905. try:
  1906. point_of_intersection = int_pt.x, int_pt.y
  1907. return True
  1908. except:
  1909. return False
  1910. def line_iou(line1, line2, axis=0):
  1911. inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
  1912. # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
  1913. union = min(abs(line1[0][axis]-line1[1][axis]), abs(line2[0][axis]-line2[1][axis]))
  1914. if union in [0, 0.]:
  1915. iou = 0.
  1916. else:
  1917. iou = inter / union
  1918. return iou
  1919. def bbox_iou(bbox1, bbox2, contain=True):
  1920. x1_min, y1_min, x1_max, y1_max = bbox1
  1921. x2_min, y2_min, x2_max, y2_max = bbox2
  1922. # 计算矩形框1的宽度、高度和面积
  1923. width1 = x1_max - x1_min
  1924. height1 = y1_max - y1_min
  1925. area1 = width1 * height1
  1926. # 计算矩形框2的宽度、高度和面积
  1927. width2 = x2_max - x2_min
  1928. height2 = y2_max - y2_min
  1929. area2 = width2 * height2
  1930. # 计算相交矩形框的左上角和右下角坐标
  1931. x_intersection_min = max(x1_min, x2_min)
  1932. y_intersection_min = max(y1_min, y2_min)
  1933. x_intersection_max = min(x1_max, x2_max)
  1934. y_intersection_max = min(y1_max, y2_max)
  1935. # 计算相交矩形框的宽度和高度
  1936. intersection_width = max(0, x_intersection_max - x_intersection_min)
  1937. intersection_height = max(0, y_intersection_max - y_intersection_min)
  1938. # 计算相交矩形框的面积
  1939. intersection_area = intersection_width * intersection_height
  1940. if contain:
  1941. # 判断包含关系并调整相交面积
  1942. if (x1_min <= x2_min) and (y1_min <= y2_min) and (x1_max >= x2_max) and (y1_max >= y2_max):
  1943. union_area = area2
  1944. elif (x2_min <= x1_min) and (y2_min <= y1_min) and (x2_max >= x1_max) and (y2_max >= y1_max):
  1945. union_area = area1
  1946. else:
  1947. # 计算并集矩形框的面积
  1948. # union_area = area1 + area2 - intersection_area
  1949. union_area = min(area1, area2)
  1950. else:
  1951. union_area = area1 + area2 - intersection_area
  1952. # 计算IoU
  1953. if int(union_area) == 0:
  1954. iou = 0
  1955. else:
  1956. iou = intersection_area / union_area
  1957. return iou
  1958. def image_rotate(image_np, angle):
  1959. # 根据角度旋转
  1960. image_pil = Image.fromarray(image_np)
  1961. image_np = np.array(image_pil.rotate(angle, expand=1))
  1962. return image_np
  1963. if __name__ == "__main__":
  1964. # strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
  1965. # print(slash_replace(strs))
  1966. # from matplotlib import pyplot as plt
  1967. # import random
  1968. # fig = plt.figure()
  1969. # plt.xlim(100)
  1970. # plt.ylim(100)
  1971. # fig.add_subplot(111)
  1972. # x0,y0,x1,y1 = (1,2,3,4)
  1973. # plt.gca().add_patch(plt.Rectangle(xy=(x0, y0),
  1974. # width=x1-x0,
  1975. # height=y1-y0,
  1976. # edgecolor=(random.randint(0,255)/255,random.randint(0,255)/255,random.randint(0,255)/255),
  1977. # fill=False, linewidth=2))
  1978. #
  1979. # # plt.show()
  1980. # import cv2
  1981. # import numpy as np
  1982. # img = np.zeros(shape=(1800,1800),dtype=np.uint8)
  1983. # img += 255
  1984. # cv2.imshow("bbox", img)
  1985. # cv2.waitKey(0)
  1986. # print(json.dumps({"data":[1, 2]}))
  1987. # print(parse_yaml())
  1988. print(get_ip_port())
  1989. # set_flask_global()
  1990. print(get_all_ip())
  1991. print(get_args_from_config(get_ip_port(), get_all_ip()[0], "idc"))
  1992. print(get_args_from_config(get_ip_port(), get_all_ip()[0], "atc"))
  1993. print(get_args_from_config(get_ip_port(), get_all_ip()[0], "ocr"))
  1994. print(get_args_from_config(get_ip_port(), get_all_ip()[0], 'convert', 'MASTER'))
  1995. # print(get_args_from_config(get_ip_port(), "http://127.0.0.1", "gunicorn_path"))
  1996. # print(get_intranet_ip())
  1997. # _path = "C:/Users/Administrator/Downloads/3.png"
  1998. # remove_red_seal(cv2.imread(_path))