utils.py 75 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084
  1. # -*- coding:utf-8 -*-
  2. import argparse
  3. import copy
  4. import hashlib
  5. import inspect
  6. import json
  7. import os
  8. import socket
  9. import subprocess
  10. import sys
  11. from io import BytesIO
  12. from subprocess import Popen
  13. from shapely.geometry import LineString
  14. import cv2
  15. import requests
  16. from PIL import Image
  17. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  18. import difflib
  19. import logging
  20. import mimetypes
  21. import platform
  22. import re
  23. import traceback
  24. import filetype
  25. from bs4 import BeautifulSoup
  26. import yaml
  27. from pdfminer.layout import *
  28. from format_convert import _global
  29. from functools import wraps
  30. import psutil
  31. import time
  32. import numpy as np
  33. from format_convert.judge_platform import get_platform
  34. if get_platform() == "Linux":
  35. import resource
  36. import math
  37. def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15]):
  38. """
  39. [0] : continue
  40. [-1]: 逻辑处理错误
  41. [-2]: 接口调用错误
  42. [-3]: 文件格式错误,无法打开
  43. [-4]: 各类文件调用第三方包读取超时
  44. [-5]: 整个转换过程超时
  45. [-6]: 阿里云UDF队列超时
  46. [-7]: 文件需密码,无法打开
  47. [-8]: 调用现成接口报错
  48. [-9]: 接口接收数据为空
  49. [-10]: 长图分割报错
  50. [-11]: 新接口idc、isr、atc报错
  51. [-12]: 表格跨页连接报错
  52. [-13]: pdf表格线处理报错
  53. [-14]: 指定页码报错
  54. [-15]: office转换接口未运行
  55. """
  56. for c in code:
  57. if isinstance(_list, list) and _list == [c]:
  58. return True
  59. return False
  60. def add_div(text):
  61. if text == "" or text is None:
  62. return text
  63. # if get_platform() == "Windows":
  64. # print("add_div", text)
  65. if re.findall("<div>", text):
  66. return text
  67. text = "<div>" + text + "\n"
  68. text = re.sub("\n", "</div><div>", text)
  69. # text += "</div>"
  70. if text[-5:] == "<div>":
  71. # print("add_div has cut", text[-30:])
  72. text = text[:-5]
  73. return text
  74. def get_platform():
  75. sys = platform.system()
  76. return sys
  77. def get_html_p(html_path):
  78. log("into get_html_p")
  79. try:
  80. with open(html_path, "r") as ff:
  81. html_str = ff.read()
  82. soup = BeautifulSoup(html_str, 'lxml')
  83. text = ""
  84. for p in soup.find_all("p"):
  85. p_text = p.text
  86. p_text = p_text.strip()
  87. if p.string != "":
  88. text += p_text
  89. text += "\n"
  90. return text
  91. except Exception as e:
  92. log("get_html_p error!")
  93. return [-1]
  94. def string_similarity(str1, str2):
  95. # 去掉<div>和回车
  96. str1 = re.sub("<div>", "", str1)
  97. str1 = re.sub("</div>", "", str1)
  98. str1 = re.sub("\n", "", str1)
  99. str2 = re.sub("<div>", "", str2)
  100. str2 = re.sub("</div>", "", str2)
  101. str2 = re.sub("\n", "", str2)
  102. # print("********************************")
  103. # print("str1", str1)
  104. # print("********************************")
  105. # print("str2", str2)
  106. # print("********************************")
  107. score = difflib.SequenceMatcher(None, str1, str2).ratio()
  108. print("string_similarity", score)
  109. return score
  110. def get_sequential_data(text_list, bbox_list, html=False):
  111. logging.info("into get_sequential_data")
  112. try:
  113. text = ""
  114. order_list = []
  115. for i in range(len(text_list)):
  116. length_start = bbox_list[i][0][0]
  117. length_end = bbox_list[i][1][0]
  118. height_start = bbox_list[i][0][1]
  119. height_end = bbox_list[i][-1][1]
  120. # print([length_start, length_end, height_start, height_end])
  121. order_list.append([text_list[i], length_start, length_end, height_start, height_end])
  122. # text = text + infomation['text'] + "\n"
  123. if get_platform() == "Windows":
  124. print("get_sequential_data", order_list)
  125. if not order_list:
  126. if get_platform() == "Windows":
  127. print("get_sequential_data", "no order list")
  128. return ""
  129. # 根据bbox的坐标对输出排序
  130. order_list.sort(key=lambda x: (x[3], x[1], x[0]))
  131. # 根据bbox分行分列
  132. # col_list = []
  133. # height_end = int((order_list[0][4] + order_list[0][3]) / 2)
  134. # for i in range(len(order_list)):
  135. # if height_end - threshold <= order_list[i][3] <= height_end + threshold:
  136. # col_list.append(order_list[i])
  137. # else:
  138. # row_list.append(col_list)
  139. # col_list = []
  140. # height_end = int((order_list[i][4] + order_list[i][3]) / 2)
  141. # col_list.append(order_list[i])
  142. # if i == len(order_list) - 1:
  143. # row_list.append(col_list)
  144. row_list = []
  145. used_box = []
  146. threshold = 5
  147. for box in order_list:
  148. if box in used_box:
  149. continue
  150. height_center = (box[4] + box[3]) / 2
  151. row = []
  152. for box2 in order_list:
  153. if box2 in used_box:
  154. continue
  155. height_center2 = (box2[4] + box2[3]) / 2
  156. if height_center - threshold <= height_center2 <= height_center + threshold:
  157. if box2 not in row:
  158. row.append(box2)
  159. used_box.append(box2)
  160. row.sort(key=lambda x: x[0])
  161. row_list.append(row)
  162. for row in row_list:
  163. if not row:
  164. continue
  165. if len(row) <= 1:
  166. text = text + row[0][0] + "\n"
  167. else:
  168. sub_text = ""
  169. row.sort(key=lambda x: x[1])
  170. for col in row:
  171. sub_text = sub_text + col[0] + " "
  172. sub_text = sub_text + "\n"
  173. text += sub_text
  174. if html:
  175. text = "<div>" + text
  176. text = re.sub("\n", "</div>\n<div>", text)
  177. text += "</div>"
  178. # if text[-5:] == "<div>":
  179. # text = text[:-5]
  180. return text
  181. except Exception as e:
  182. logging.info("get_sequential_data error!")
  183. print("get_sequential_data", traceback.print_exc())
  184. return [-1]
  185. def rename_inner_files(root_path):
  186. try:
  187. logging.info("into rename_inner_files")
  188. # 获取解压文件夹下所有文件+文件夹,不带根路径
  189. path_list = []
  190. for root, dirs, files in os.walk(root_path, topdown=False):
  191. for name in dirs:
  192. p = os.path.join(root, name) + os.sep
  193. if get_platform() == "Windows":
  194. root_path = slash_replace(root_path)
  195. p = slash_replace(p)
  196. p = re.sub(root_path, "", p)
  197. root_path = slash_replace(root_path, True)
  198. p = slash_replace(p, True)
  199. else:
  200. p = re.sub(root_path, "", p)
  201. path_list.append(p)
  202. for name in files:
  203. p = os.path.join(root, name)
  204. if get_platform() == "Windows":
  205. root_path = slash_replace(root_path)
  206. p = slash_replace(p)
  207. p = re.sub(root_path, "", p)
  208. root_path = slash_replace(root_path, True)
  209. p = slash_replace(p, True)
  210. else:
  211. p = re.sub(root_path, "", p)
  212. path_list.append(p)
  213. # 按路径长度排序
  214. path_list.sort(key=lambda x: len(x), reverse=True)
  215. # 循环改名
  216. for old_path in path_list:
  217. # 按路径分隔符分割
  218. ss = old_path.split(os.sep)
  219. # 判断是否文件夹
  220. is_dir = 0
  221. file_type = ""
  222. if os.path.isdir(root_path + old_path):
  223. ss = ss[:-1]
  224. is_dir = 1
  225. else:
  226. if "." in old_path:
  227. file_type = "." + old_path.split(".")[-1]
  228. else:
  229. file_type = ""
  230. # 最后一级需要用hash改名
  231. new_path = ""
  232. # new_path = re.sub(ss[-1], str(hash(ss[-1])), old_path) + file_type
  233. current_level = 0
  234. for s in ss:
  235. # 路径拼接
  236. if current_level < len(ss) - 1:
  237. new_path += s + os.sep
  238. else:
  239. new_path += str(hash(s)) + file_type
  240. current_level += 1
  241. new_ab_path = root_path + new_path
  242. old_ab_path = root_path + old_path
  243. os.rename(old_ab_path, new_ab_path)
  244. # 重新获取解压文件夹下所有文件+文件夹
  245. new_path_list = []
  246. for root, dirs, files in os.walk(root_path, topdown=False):
  247. for name in dirs:
  248. new_path_list.append(os.path.join(root, name) + os.sep)
  249. for name in files:
  250. new_path_list.append(os.path.join(root, name))
  251. return new_path_list
  252. except:
  253. traceback.print_exc()
  254. return [-1]
  255. def judge_format(path):
  256. guess1 = mimetypes.guess_type(path)
  257. _type = None
  258. if guess1[0]:
  259. _type = guess1[0]
  260. else:
  261. guess2 = filetype.guess(path)
  262. if guess2:
  263. _type = guess2.mime
  264. if _type == "application/pdf":
  265. return "pdf"
  266. if _type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
  267. return "docx"
  268. if _type == "application/x-zip-compressed" or _type == "application/zip":
  269. return "zip"
  270. if _type == "application/x-rar-compressed" or _type == "application/rar":
  271. return "rar"
  272. if _type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
  273. return "xlsx"
  274. if _type == "application/msword":
  275. return "doc"
  276. if _type == "image/png":
  277. return "png"
  278. if _type == "image/jpeg":
  279. return "jpg"
  280. # 猜不到,返回None
  281. return None
  282. def draw_lines_plt(bboxes):
  283. import matplotlib.pyplot as plt
  284. plt.figure()
  285. for bbox in bboxes:
  286. x = [bbox[0], bbox[2]]
  287. y = [bbox[1], bbox[3]]
  288. plt.plot(x, y)
  289. plt.show()
  290. def slash_replace(_str, reverse=False):
  291. if reverse:
  292. _str = eval(repr(_str).replace('/', '\\\\'))
  293. else:
  294. _str = eval(repr(_str).replace('\\\\', '/'))
  295. return _str
  296. class LineTable:
  297. def recognize_table(self, list_textbox, list_line, sourceP_LB=True, splited=False, from_pdf=False):
  298. self.list_line = list_line
  299. self.list_crosspoints = self.recognize_crosspoints(list_line)
  300. self.from_pdf = from_pdf
  301. self.splited = splited
  302. self.connect_bbox_list = []
  303. # 聚类
  304. cluster_crosspoints = []
  305. for _point in self.list_crosspoints:
  306. cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})
  307. while 1:
  308. _find = False
  309. new_cluster_crosspoints = []
  310. for l_point in cluster_crosspoints:
  311. _flag = False
  312. for l_n_point in new_cluster_crosspoints:
  313. line1 = l_point.get("lines")
  314. line2 = l_n_point.get("lines")
  315. if len(line1 & line2) > 0:
  316. _find = True
  317. _flag = True
  318. l_n_point["lines"] = line1.union(line2)
  319. l_n_point["points"].extend(l_point["points"])
  320. if not _flag:
  321. new_cluster_crosspoints.append({"lines": l_point.get("lines"), "points": l_point.get("points")})
  322. cluster_crosspoints = new_cluster_crosspoints
  323. if not _find:
  324. break
  325. # need to sort to deal with the inner tables
  326. for clu_cp in cluster_crosspoints:
  327. points = clu_cp["points"]
  328. list_p = np.array([p["point"] for p in points])
  329. max_x = max(list_p[..., 0])
  330. min_x = min(list_p[..., 0])
  331. max_y = max(list_p[..., 1])
  332. min_y = min(list_p[..., 1])
  333. _area = (max_y - min_y) * (max_x - min_x)
  334. clu_cp["area"] = _area
  335. cluster_crosspoints.sort(key=lambda x: x["area"])
  336. list_l_rect = []
  337. for table_crosspoint in cluster_crosspoints:
  338. list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
  339. list_l_rect.append(list_rect)
  340. in_objs = set()
  341. list_tables = []
  342. for l_rect in list_l_rect:
  343. _ta = self.rect2table(list_textbox, l_rect, in_objs, sourceP_LB=sourceP_LB)
  344. if self.connect_bbox_list:
  345. return [], [], [], self.connect_bbox_list
  346. if _ta:
  347. list_tables.append(_ta)
  348. # 展示表格及文字
  349. # self._plot(list_line, list_textbox)
  350. return list_tables, in_objs, list_l_rect, []
  351. # def recognize_table_by_rect(self, list_textbox, list_rect, margin=2):
  352. #
  353. # dump_margin = 5
  354. # list_rect_tmp = []
  355. # # 去重
  356. # for _rect in list_rect:
  357. # if (_rect.bbox[3] - _rect.bbox[1] < 10) or (abs(_rect.bbox[2] - _rect.bbox[0]) < 5):
  358. # continue
  359. # _find = False
  360. # for _tmp in list_rect_tmp:
  361. # for i in range(4):
  362. # if abs(_rect.bbox[i] - _tmp.bbox[i]) < dump_margin:
  363. # pass
  364. # else:
  365. # _find = False
  366. # break
  367. # if i == 3:
  368. # _find = True
  369. # if _find:
  370. # break
  371. # if not _find:
  372. # list_rect_tmp.append(_rect)
  373. #
  374. # # print("=====",len(list_rect),len(list_rect_tmp))
  375. # # print(list_rect_tmp)
  376. # # from matplotlib import pyplot as plt
  377. # # plt.figure()
  378. # # for _rect in list_rect_tmp:
  379. # # x0,y0,x1,y1 = _rect.bbox
  380. # # plt.boxplot(_rect.bbox)
  381. # # plt.show()
  382. #
  383. # cluster_rect = []
  384. # for _rect in list_rect:
  385. # _find = False
  386. # for cr in cluster_rect:
  387. # for cr_rect in cr:
  388. # if abs((cr_rect.bbox[2] - cr_rect.bbox[0] + _rect.bbox[2] - _rect.bbox[0]) - (
  389. # max(cr_rect.bbox[2], _rect.bbox[2]) - min(cr_rect.bbox[0], _rect.bbox[0]))) < margin:
  390. # _find = True
  391. # cr.append(_rect)
  392. # break
  393. # elif abs((cr_rect.bbox[3] - cr_rect.bbox[1] + _rect.bbox[3] - _rect.bbox[1]) - (
  394. # max(cr_rect.bbox[3], _rect.bbox[3]) - min(cr_rect.bbox[1], _rect.bbox[1]))) < margin:
  395. # _find = True
  396. # cr.append(_rect)
  397. # break
  398. # if _find:
  399. # break
  400. # if not _find:
  401. # cluster_rect.append([_rect])
  402. #
  403. # list_l_rect = cluster_rect
  404. #
  405. # in_objs = set()
  406. # list_tables = []
  407. # for l_rect in list_l_rect:
  408. # _ta = self.rect2table(list_textbox, l_rect, in_objs)
  409. # if _ta:
  410. # list_tables.append(_ta)
  411. # return list_tables, in_objs, list_l_rect
  412. def recognize_crosspoints(self, list_line, fixLine=True):
  413. list_crosspoints = []
  414. # print("lines num",len(list_line))
  415. def getMaxPoints(list_x, margin=5, reverse=False):
  416. clust_x = []
  417. for _x in list_x:
  418. _find = False
  419. for cx in clust_x:
  420. if abs(cx[0] - _x) < margin:
  421. _find = True
  422. cx.append(_x)
  423. break
  424. if not _find:
  425. clust_x.append([_x])
  426. clust_x.sort(key=lambda x: x, reverse=reverse)
  427. return clust_x[0][0], len(clust_x[0])
  428. for _i in range(len(list_line)):
  429. for _j in range(len(list_line)):
  430. line1 = list_line[_i].__dict__.get("bbox")
  431. line2 = list_line[_j].__dict__.get("bbox")
  432. exists, point = self.cross_point(line1, line2)
  433. if exists:
  434. list_crosspoints.append(point)
  435. if fixLine:
  436. # 聚类
  437. cluster_crosspoints = []
  438. for _point in list_crosspoints:
  439. cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})
  440. while 1:
  441. _find = False
  442. new_cluster_crosspoints = []
  443. for l_point in cluster_crosspoints:
  444. _flag = False
  445. for l_n_point in new_cluster_crosspoints:
  446. line1 = l_point.get("lines")
  447. line2 = l_n_point.get("lines")
  448. if len(line1 & line2) > 0:
  449. _find = True
  450. _flag = True
  451. l_n_point["lines"] = line1.union(line2)
  452. l_n_point["points"].extend(l_point["points"])
  453. if not _flag:
  454. new_cluster_crosspoints.append({"lines": l_point.get("lines"), "points": l_point.get("points")})
  455. cluster_crosspoints = new_cluster_crosspoints
  456. if not _find:
  457. break
  458. list_crosspoints = []
  459. for list_cp in cluster_crosspoints:
  460. points = list_cp.get("points")
  461. l_lines = []
  462. for p in points:
  463. l_lines.extend(p.get("p_lines"))
  464. l_lines = list(set(l_lines))
  465. l_lines.sort(key=lambda x: x[0])
  466. min_x, _count = getMaxPoints([l[0] for l in l_lines], reverse=False)
  467. if _count <= 2:
  468. min_x = None
  469. min_y, _count = getMaxPoints([l[1] for l in l_lines], reverse=False)
  470. if _count < 2:
  471. min_y = None
  472. max_x, _count = getMaxPoints([l[2] for l in l_lines], reverse=True)
  473. if _count <= 2:
  474. max_x = None
  475. max_y, _count = getMaxPoints([l[3] for l in l_lines], reverse=True)
  476. if _count <= 2:
  477. max_y = None
  478. if min_x and min_y and max_x and max_y:
  479. points.sort(key=lambda x: x["point"][0])
  480. if abs(min_x - points[0]["point"][0]) > 30:
  481. _line = LTLine(1, (min_x, min_y), (min_x, max_y))
  482. list_line.append(_line)
  483. l_lines.append(_line.bbox)
  484. # print("add=====",_line.bbox)
  485. if abs(max_x - points[-1]["point"][0]) > 30:
  486. _line = LTLine(1, (max_x, min_y), (max_x, max_y))
  487. list_line.append(_line)
  488. l_lines.append(_line.bbox)
  489. # print("add=====1",_line.bbox)
  490. points.sort(key=lambda x: x["point"][1])
  491. if abs(min_y - points[0]["point"][1]) > 30:
  492. _line = LTLine(1, (min_x, min_y), (max_x, min_y))
  493. list_line.append(_line)
  494. l_lines.append(_line.bbox)
  495. # print("add=====2",_line.bbox)
  496. if abs(max_y - points[-1]["point"][1]) > 30:
  497. _line = LTLine(1, (min_x, max_y), (max_x, max_y))
  498. list_line.append(_line)
  499. l_lines.append(_line.bbox)
  500. # print("add=====2",_line.bbox)
  501. for _i in range(len(l_lines)):
  502. for _j in range(len(l_lines)):
  503. line1 = l_lines[_i]
  504. line2 = l_lines[_j]
  505. exists, point = self.cross_point(line1, line2)
  506. if exists:
  507. list_crosspoints.append(point)
  508. # from matplotlib import pyplot as plt
  509. # plt.figure()
  510. # for _line in l_lines:
  511. # x0,y0,x1,y1 = _line
  512. # plt.plot([x0,x1],[y0,y1])
  513. # for point in list_crosspoints:
  514. # plt.scatter(point.get("point")[0],point.get("point")[1])
  515. # plt.show()
  516. # print(list_crosspoints)
  517. # print("points num",len(list_crosspoints))
  518. return list_crosspoints
  519. # def recognize_rect(self, _page):
  520. # list_line = []
  521. # for _obj in _page._objs:
  522. # if isinstance(_obj, (LTLine)):
  523. # list_line.append(_obj)
  524. # list_crosspoints = self.recognize_crosspoints(list_line)
  525. #
  526. # # 聚类
  527. # cluster_crosspoints = []
  528. # for _point in list_crosspoints:
  529. # cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})
  530. # while 1:
  531. # _find = False
  532. # new_cluster_crosspoints = []
  533. # for l_point in cluster_crosspoints:
  534. # _flag = False
  535. # for l_n_point in new_cluster_crosspoints:
  536. # line1 = l_point.get("lines")
  537. # line2 = l_n_point.get("lines")
  538. # if len(line1 & line2) > 0:
  539. # _find = True
  540. # _flag = True
  541. # l_n_point["lines"] = line1.union(line2)
  542. # l_n_point["points"].extend(l_point["points"])
  543. # if not _flag:
  544. # new_cluster_crosspoints.append({"lines": l_point.get("lines"), "points": l_point.get("points")})
  545. # cluster_crosspoints = new_cluster_crosspoints
  546. # if not _find:
  547. # break
  548. # # print(len(cluster_crosspoints))
  549. #
  550. # list_l_rect = []
  551. # for table_crosspoint in cluster_crosspoints:
  552. # list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
  553. # list_l_rect.append(list_rect)
  554. #
  555. # return list_l_rect
  556. def crosspoint2rect(self, list_crosspoint, margin=10):
  557. dict_line_points = {}
  558. for _point in list_crosspoint:
  559. lines = list(_point.get("lines"))
  560. for _line in lines:
  561. if _line not in dict_line_points:
  562. dict_line_points[_line] = {"direct": None, "points": []}
  563. dict_line_points[_line]["points"].append(_point)
  564. # 排序
  565. for k, v in dict_line_points.items():
  566. list_x = []
  567. list_y = []
  568. for _p in v["points"]:
  569. list_x.append(_p.get("point")[0])
  570. list_y.append(_p.get("point")[1])
  571. if max(list_x) - min(list_x) > max(list_y) - min(list_y):
  572. v.get("points").sort(key=lambda x: x.get("point")[0])
  573. v["direct"] = "row"
  574. else:
  575. v.get("points").sort(key=lambda x: x.get("point")[1])
  576. v["direct"] = "column"
  577. list_rect = []
  578. for _point in list_crosspoint:
  579. if _point["buttom"] >= margin and _point["right"] >= margin:
  580. lines = list(_point.get("lines"))
  581. _line = lines[0]
  582. if dict_line_points[_line]["direct"] == "column":
  583. _line = lines[1]
  584. next_point = None
  585. for p1 in dict_line_points[_line]["points"]:
  586. if p1["buttom"] >= margin and p1["point"][0] > _point["point"][0]:
  587. next_point = p1
  588. break
  589. if not next_point:
  590. continue
  591. lines = list(next_point.get("lines"))
  592. _line = lines[0]
  593. if dict_line_points[_line]["direct"] == "row":
  594. _line = lines[1]
  595. final_point = None
  596. for p1 in dict_line_points[_line]["points"]:
  597. if p1["left"] >= margin and p1["point"][1] > next_point["point"][1]:
  598. final_point = p1
  599. break
  600. if not final_point:
  601. continue
  602. _r = LTRect(1,
  603. (_point["point"][0], _point["point"][1], final_point["point"][0], final_point["point"][1]))
  604. list_rect.append(_r)
  605. tmp_rect = []
  606. set_bbox = set()
  607. for _r in list_rect:
  608. _bbox = "%.2f-%.2f-%.2f-%.2f" % _r.bbox
  609. width = _r.bbox[2] - _r.bbox[0]
  610. height = _r.bbox[3] - _r.bbox[1]
  611. if width <= margin or height <= margin:
  612. continue
  613. if _bbox not in set_bbox:
  614. tmp_rect.append(_r)
  615. set_bbox.add(_bbox)
  616. list_rect = tmp_rect
  617. # _l = [x.get('point') for x in list_crosspoint]
  618. # _l.sort(key=lambda x: (x[0], x[1]))
  619. # print('list_crosspoint', _l)
  620. # print('list_rect', list_rect)
  621. # import cv2
  622. # import numpy as np
  623. # import random
  624. # img = np.zeros(shape=(1000,1000),dtype=np.uint8)
  625. # img += 255
  626. #
  627. # color = []
  628. # for rect in list_rect:
  629. # color += 10
  630. # x0,y0,x1,y1 = rect.bbox
  631. # x0 *= 10/18
  632. # y0 *= 10/18
  633. # x1 *= 10/18
  634. # y1 *= 10/18
  635. # print(rect.bbox)
  636. # cv2.rectangle(img, (int(x0),int(y0)),(int(x1),int(y1)), (color%255, (color+10)%255, (color+20)%255), 3)
  637. # cv2.imshow("bbox", img)
  638. # cv2.waitKey(0)
  639. return list_rect
  640. def cross_point(self, line1, line2, segment=True, margin=2):
  641. point_is_exist = False
  642. x = y = 0
  643. x1, y1, x2, y2 = line1
  644. x3, y3, x4, y4 = line2
  645. if (x2 - x1) == 0:
  646. k1 = None
  647. b1 = 0
  648. else:
  649. k1 = (y2 - y1) * 1.0 / (x2 - x1) # 计算k1,由于点均为整数,需要进行浮点数转化
  650. b1 = y1 * 1.0 - x1 * k1 * 1.0 # 整型转浮点型是关键
  651. if (x4 - x3) == 0: # L2直线斜率不存在
  652. k2 = None
  653. b2 = 0
  654. else:
  655. k2 = (y4 - y3) * 1.0 / (x4 - x3) # 斜率存在
  656. b2 = y3 * 1.0 - x3 * k2 * 1.0
  657. if k1 is None:
  658. if not k2 is None:
  659. x = x1
  660. y = k2 * x1 + b2
  661. point_is_exist = True
  662. elif k2 is None:
  663. x = x3
  664. y = k1 * x3 + b1
  665. elif not k2 == k1:
  666. x = (b2 - b1) * 1.0 / (k1 - k2)
  667. y = k1 * x * 1.0 + b1 * 1.0
  668. point_is_exist = True
  669. left = 0
  670. right = 0
  671. top = 0
  672. buttom = 0
  673. if point_is_exist:
  674. if segment:
  675. if x >= (min(x1, x2) - margin) and x <= (max(x1, x2) + margin) and y >= (
  676. min(y1, y2) - margin) and y <= (max(y1, y2) + margin):
  677. if x >= (min(x3, x4) - margin) and x <= (max(x3, x4) + margin) and y >= (
  678. min(y3, y4) - margin) and y <= (max(y3, y4) + margin):
  679. point_is_exist = True
  680. left = abs(min(x1, x3) - x)
  681. right = abs(max(x2, x4) - x)
  682. top = abs(min(y1, y3) - y)
  683. buttom = abs(max(y2, y4) - y)
  684. else:
  685. point_is_exist = False
  686. else:
  687. point_is_exist = False
  688. line1_key = "%.2f-%.2f-%.2f-%.2f" % (x1, y1, x2, y2)
  689. line2_key = "%.2f-%.2f-%.2f-%.2f" % (x3, y3, x4, y4)
  690. return point_is_exist, {"point": [x, y], "left": left, "right": right,
  691. "top": top, "buttom": buttom, "lines": set([line1_key, line2_key]),
  692. "p_lines": [line1, line2]}
  693. # def unionTable(self, list_table, fixspan=True, margin=2):
  694. # set_x = set()
  695. # set_y = set()
  696. #
  697. # list_cell = []
  698. # for _t in list_table:
  699. # for _line in _t:
  700. # list_cell.extend(_line)
  701. #
  702. # clusters_rects = []
  703. # # 根据y1聚类
  704. # set_id = set()
  705. # list_cell_dump = []
  706. # for _cell in list_cell:
  707. # _id = id(_cell)
  708. # if _id in set_id:
  709. # continue
  710. # set_id.add(_id)
  711. # list_cell_dump.append(_cell)
  712. # list_cell = list_cell_dump
  713. # list_cell.sort(key=lambda x: x.get("bbox")[3])
  714. # for _rect in list_cell:
  715. # _y0 = _rect.get("bbox")[3]
  716. # _find = False
  717. # for l_cr in clusters_rects:
  718. # if abs(l_cr[0].get("bbox")[3] - _y0) < 2:
  719. # _find = True
  720. # l_cr.append(_rect)
  721. # break
  722. # if not _find:
  723. # clusters_rects.append([_rect])
  724. #
  725. # clusters_rects.sort(key=lambda x: x[0].get("bbox")[3], reverse=True)
  726. # for l_cr in clusters_rects:
  727. # l_cr.sort(key=lambda x: x.get("bbox")[0])
  728. #
  729. # # print("=============:")
  730. # # for l_r in clusters_rects:
  731. # # print(len(l_r))
  732. #
  733. # for _line in clusters_rects:
  734. # for _rect in _line:
  735. # (x0, y0, x1, y1) = _rect.get("bbox")
  736. # set_x.add(x0)
  737. # set_x.add(x1)
  738. # set_y.add(y0)
  739. # set_y.add(y1)
  740. # if len(set_x) == 0 or len(set_y) == 0:
  741. # return
  742. # list_x = list(set_x)
  743. # list_y = list(set_y)
  744. #
  745. # list_x.sort(key=lambda x: x)
  746. # list_y.sort(key=lambda x: x, reverse=True)
  747. # _table = []
  748. # line_i = 0
  749. # for _line in clusters_rects:
  750. #
  751. # table_line = []
  752. # cell_i = 0
  753. # for _rect in _line:
  754. # (x0, y0, x1, y1) = _rect.get("bbox")
  755. # _cell = {"bbox": (x0, y0, x1, y1), "rect": _rect.get("rect"),
  756. # "rowspan": self.getspan(list_y, y0, y1, margin),
  757. # "columnspan": self.getspan(list_x, x0, x1, margin), "text": _rect.get("text", "")}
  758. # table_line.append(_cell)
  759. #
  760. # cell_i += 1
  761. # line_i += 1
  762. # _table.append(table_line)
  763. #
  764. # # print("=====================>>")
  765. # # for _line in _table:
  766. # # for _cell in _line:
  767. # # print(_cell,end="\t")
  768. # # print("\n")
  769. # # print("=====================>>")
  770. #
  771. # # print(_table)
  772. # if fixspan:
  773. # for _line in _table:
  774. # extend_line = []
  775. # for c_i in range(len(_line)):
  776. # _cell = _line[c_i]
  777. # if _cell.get("columnspan") > 1:
  778. # _cospan = _cell.get("columnspan")
  779. # _cell["columnspan"] = 1
  780. # for i in range(1, _cospan):
  781. # extend_line.append({"index": c_i + 1, "cell": _cell})
  782. # extend_line.sort(key=lambda x: x["index"], reverse=True)
  783. # for _el in extend_line:
  784. # _line.insert(_el["index"], _el["cell"])
  785. # for l_i in range(len(_table)):
  786. # _line = _table[l_i]
  787. # for c_i in range(len(_line)):
  788. # _cell = _line[c_i]
  789. # if _cell.get("rowspan") > 1:
  790. # _rospan = _cell.get("rowspan")
  791. # _cell["rowspan"] = 1
  792. # for i in range(1, _rospan):
  793. # _table[l_i + i].insert(c_i, _cell)
  794. #
  795. # table_bbox = (_table[0][0].get("bbox")[0], _table[0][0].get("bbox")[1], _table[-1][-1].get("bbox")[2],
  796. # _table[-1][-1].get("bbox")[3])
  797. #
  798. # ta = {"bbox": table_bbox, "table": _table}
  799. # return ta
  800. # 获取点阵
  801. def getSpanLocation(self, _list, x0, x1, margin):
  802. list_location = []
  803. (x0, x1) = (min(x0, x1), max(x0, x1))
  804. for _x in _list:
  805. if _x >= (x0 - margin) and _x <= (x1 + margin):
  806. list_location.append(_x)
  807. return list_location
def fixSpan(self, _table, list_x, list_y, sourceP_LB):
    """Expand cells with columnspan/rowspan > 1 into single-span cells, in place.

    :param _table: list of rows; each row is a list of cell dicts carrying at
        least "bbox", "columnspan" and "rowspan".
    :param list_x: x grid coordinates, passed to ``self.getSpanLocation``.
    :param list_y: y grid coordinates, passed to ``self.getSpanLocation``.
    :param sourceP_LB: not used in this method.
    :return: None; ``_table`` rows are mutated.

    NOTE(review): the nesting below was reconstructed from a copy of the file
    whose indentation was lost -- confirm against the canonical source.
    """

    def checkPosition(_line, _position, bbox, margin=5):
        """Return True when ``bbox`` may be inserted into ``_line`` at ``_position``.

        ``margin`` is not used by the active code.
        """
        # check y
        if len(_line) > 0:
            _bbox = _line[0].get("bbox")
            # Reject when the candidate has no vertical overlap with the
            # first cell of the row.
            if (min(_bbox[1], _bbox[3]) > max(bbox[1], bbox[3]) or max(_bbox[1], _bbox[3]) < min(bbox[1], bbox[3])):
                # if abs(min(_bbox[1],_bbox[3])-min(bbox[1],bbox[3]))>margin or abs(max(_bbox[1],_bbox[3])-max(bbox[1],bbox[3]))>margin:
                # print(_bbox)
                # print(bbox)
                # print("check position y false")
                return False
        # check x
        if _position <= len(_line) - 1:
            after_bbox = _line[_position].get("bbox")
            # The inserted bbox's right edge must not pass the left edge of
            # the cell currently at the insert position.
            if not (after_bbox[0] >= bbox[2]):
                # print("check position x after false")
                return False
        # NOTE(review): "_position - 1 > 0" skips this check when the
        # preceding cell is at index 0; ">= 0" may have been intended.
        if _position - 1 > 0 and _position - 1 < len(_line):
            before_bbox = _line[_position - 1].get("bbox")
            # The inserted bbox's left edge must not start before the right
            # edge of the preceding cell.
            if not (bbox[0] >= before_bbox[2]):
                # print("check position x before false")
                return False
        return True

    # Expand cells that span several columns.
    for _line in _table:
        c_i = 0
        while c_i < len(_line):
            _cell = _line[c_i]
            if _cell.get("columnspan") > 1:
                x0, y0, x1, y1 = _cell.get("bbox")
                _cospan = _cell.get("columnspan")
                locations = self.getSpanLocation(list_x, x0, x1, 10)
                # Only split when the grid lines found match the span count.
                if len(locations) == _cospan + 1:
                    # Shrink the original cell to the first grid column.
                    _cell["bbox"] = (x0, y0, locations[1], y1)
                    _cell["columnspan"] = 1
                    # len(locations)==_colspan+1
                    for i in range(1, _cospan):
                        # Shallow copy of the cell for each further column.
                        n_cell = {}
                        n_cell.update(_cell)
                        n_cell["bbox"] = (locations[i], y0, locations[i + 1], y1)
                        c_i += 1
                        # check the position
                        if checkPosition(_line, c_i, n_cell["bbox"]):
                            _line.insert(c_i, n_cell)
            c_i += 1
    # Expand cells that span several rows.
    for l_i in range(len(_table)):
        _line = _table[l_i]
        c_i = 0
        while c_i < len(_line):
            _cell = _line[c_i]
            if _cell.get("rowspan") > 1:
                x0, y0, x1, y1 = _cell.get("bbox")
                _rospan = _cell.get("rowspan")
                locations = self.getSpanLocation(list_y, y0, y1, 10)
                if len(locations) == _rospan + 1:
                    # Shrink the original cell to the first grid row.
                    _cell["bbox"] = (x0, y0, x1, locations[1])
                    _cell["rowspan"] = 1
                    for i in range(1, _rospan):
                        n_cell = {}
                        n_cell.update(_cell)
                        # Copies are only placed into rows that exist.
                        if l_i + i <= len(_table) - 1:
                            # print(len(_table),l_i+i)
                            n_cell["bbox"] = (x0, locations[i], x1, locations[i + 1])
                            if checkPosition(_table[l_i + i], c_i, n_cell["bbox"]):
                                _table[l_i + i].insert(c_i, n_cell)
            c_i += 1
  878. def fixRect(self, _table, list_x, list_y, sourceP_LB, margin):
  879. self.fixSpan(_table, list_x, list_y, sourceP_LB)
  880. # for line_i in range(len(_table)):
  881. # for cell_i in range(len(_table[line_i])):
  882. # _cell = _table[line_i][cell_i]
  883. # print(line_i,cell_i,_cell["bbox"],_cell["text"])
  884. for _line in _table:
  885. extend_line = []
  886. for c_i in range(len(_line)):
  887. c_cell = _line[c_i]
  888. # first cell missing
  889. if c_i == 0 and c_cell["bbox"][0] != list_x[0]:
  890. _bbox = (list_x[0], c_cell["bbox"][1], c_cell["bbox"][0], c_cell["bbox"][3])
  891. _cell = {"bbox": _bbox,
  892. "rect": LTRect(1, _bbox),
  893. "rowspan": self.getspan(list_y, _bbox[1], _bbox[3], margin),
  894. "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
  895. "text": ""}
  896. extend_line.append({"index": c_i, "cell": _cell})
  897. # cell in the median missing
  898. if c_i < len(_line) - 1:
  899. n_cell = _line[c_i + 1]
  900. _bbox = c_cell["bbox"]
  901. n_bbox = n_cell["bbox"]
  902. if _bbox[0] == n_bbox[0] and _bbox[2] == n_bbox[2]:
  903. continue
  904. else:
  905. if abs(_bbox[2] - n_bbox[0]) > margin:
  906. _bbox = (_bbox[2], _bbox[1], n_bbox[0], _bbox[3])
  907. _cell = {"bbox": _bbox,
  908. "rect": LTRect(1, _bbox),
  909. "rowspan": self.getspan(list_y, _bbox[1], _bbox[3], margin),
  910. "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
  911. "text": ""}
  912. extend_line.append({"index": c_i + 1, "cell": _cell})
  913. # last cell missing
  914. if c_i == len(_line) - 1:
  915. if abs(c_cell["bbox"][2] - list_x[-1]) > margin:
  916. _bbox = (c_cell["bbox"][2], c_cell["bbox"][1], list_x[-1], c_cell["bbox"][3])
  917. _cell = {"bbox": _bbox,
  918. "rect": LTRect(1, _bbox),
  919. "rowspan": self.getspan(list_y, _bbox[1], _bbox[3], margin),
  920. "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
  921. "text": ""}
  922. extend_line.append({"index": c_i + 1, "cell": _cell})
  923. extend_line.sort(key=lambda x: x["index"], reverse=True)
  924. for _tmp in extend_line:
  925. _line.insert(_tmp["index"], _tmp["cell"])
def feedText2table(self, _table, list_textbox, in_objs, sourceP_LB):
    """Assign text boxes to table cells and append their text to each cell.

    :param _table: list of rows of cell dicts ("bbox" and "text" are used).
    :param list_textbox: objects exposing ``bbox`` and ``get_text()``.
    :param in_objs: set of text boxes already consumed; matched boxes are
        added to it, and boxes already in it are not matched again.
    :param sourceP_LB: passed as ``reverse=`` to the vertical sorts.
    :return: None.  Cell "text" values are extended in place and
        ``self.connect_bbox_list`` is reset and refilled.

    NOTE(review): the nesting below was reconstructed from a copy of the file
    whose indentation was lost -- in particular whether the
    ``not self.from_pdf`` check sits inside the preceding ``if`` needs to be
    confirmed against the canonical source.
    """
    # find the suitable cell of the textbox
    list_cells = []
    for table_line in _table:
        for _cell in table_line:
            list_cells.append({"cell": _cell, "inbox_textbox_list": []})
    self.connect_bbox_list = []
    for textbox in list_textbox:
        # Overlap score of this text box against every cell.
        list_iou = []
        for _d in list_cells:
            _cell = _d["cell"]
            _iou = self.getIOU(textbox.bbox, _cell["bbox"])
            list_iou.append(_iou)
        max_iou_index = np.argmax(list_iou)
        max_iou = list_iou[max_iou_index]
        # if self.from_pdf:
        # iou_threhold = 0.3
        # else:
        iou_threhold = 0.1
        # The best-scoring cell takes the text box, once per text box.
        if max_iou > iou_threhold and textbox not in in_objs:
            list_cells[max_iou_index]["inbox_textbox_list"].append(textbox)
            in_objs.add(textbox)
        if not self.from_pdf and not self.splited:
            # Several cells scoring >= 0.3 may mean OCR merged two texts
            # into one box.
            iou_index_list = np.where(np.array(list_iou) >= 0.3)[0].tolist()
            if len(iou_index_list) >= 2:
                # print('len(iou_index_list) >= 2 textbox', textbox)
                self.connect_bbox_list.append(textbox)
    has_matched_box_list = []
    for _d in list_cells:
        _cell = _d["cell"]
        inbox_textbox_list = _d["inbox_textbox_list"]
        # Group the cell's boxes into text lines by vertical overlap.
        all_match_box_list = []
        inbox_textbox_list.sort(key=lambda x: x.bbox[1], reverse=sourceP_LB)
        for i in range(len(inbox_textbox_list)):
            match_box_list = []
            box1 = inbox_textbox_list[i]
            if box1 in has_matched_box_list:
                continue
            # Middle third of box1's height is the band other boxes must touch.
            min_y1 = box1.bbox[1] + 1 / 3 * abs(box1.bbox[3] - box1.bbox[1])
            max_y1 = box1.bbox[3] - 1 / 3 * abs(box1.bbox[3] - box1.bbox[1])
            match_box_list.append(
                [box1.get_text(), box1.bbox[0], box1.bbox[1], box1.bbox[2], box1.bbox[3], min_y1, max_y1])
            has_matched_box_list.append(box1)
            for j in range(i + 1, len(inbox_textbox_list)):
                box2 = inbox_textbox_list[j]
                if box2 in has_matched_box_list:
                    continue
                # print(min_y1, box2.bbox[1], box2.bbox[3], max_y1)
                # print(min_y2, box1.bbox[3], max_y2)
                # box2 joins the line when either of its y edges lies in the
                # band, or when it covers the whole band.
                if min_y1 <= box2.bbox[1] <= max_y1 or \
                        min_y1 <= box2.bbox[3] <= max_y1 or \
                        box2.bbox[1] <= min_y1 <= max_y1 <= box2.bbox[3]:
                    match_box_list.append(
                        [box2.get_text(), box2.bbox[0], box2.bbox[1], box2.bbox[2], box2.bbox[3], min_y1, max_y1])
                    has_matched_box_list.append(box2)
            # Left-to-right order inside a line.
            match_box_list.sort(key=lambda x: x[1])
            all_match_box_list.append(match_box_list)
        # print("match_box_list", all_match_box_list)
        # NOTE(review): the key is the tuple (round(y0 + y1) / 2, 0); this
        # looks like it was meant to be round((y0 + y1) / 2, 0) -- confirm.
        all_match_box_list.sort(key=lambda x: (round(x[0][2] + x[0][4]) / 2, 0), reverse=sourceP_LB)
        for box_list in all_match_box_list:
            for box in box_list:
                # Whitespace is stripped from the text before appending.
                # NOTE(review): "\s" is a non-raw literal; r"\s" would avoid
                # the invalid-escape warning.
                _cell["text"] += re.sub("\s", '', box[0])
def makeTableByRect(self, list_rect, margin, sourceP_LB):
    """Group rectangles into rows and build a table of cell dicts.

    :param list_rect: objects exposing ``bbox`` as (x0, y0, x1, y1); the list
        is sorted in place.
    :param margin: tolerance used for row clustering and for ``self.getspan``.
    :param sourceP_LB: selects which y edge is clustered on and the direction
        of the y / row ordering.
    :return: ``(_table, list_x, list_y)``, or ``(None, [], [])`` when there is
        at most one rectangle or no coordinates were collected.

    NOTE(review): the nesting below was reconstructed from a copy of the file
    whose indentation was lost -- confirm against the canonical source.
    """
    _table = []
    set_x = set()
    set_y = set()
    clusters_rects = []
    # Cluster rectangles into rows by one y edge.
    if sourceP_LB:
        list_rect.sort(key=lambda x: x.bbox[3])
        for _rect in list_rect:
            _y0 = _rect.bbox[3]
            _y1 = _rect.bbox[1]  # not used below
            _find = False
            for l_cr in clusters_rects:
                # Join the first cluster whose leading rect is within margin.
                if abs(l_cr[0].bbox[3] - _y0) < margin:
                    _find = True
                    l_cr.append(_rect)
                    break
            if not _find:
                clusters_rects.append([_rect])
    else:
        list_rect.sort(key=lambda x: x.bbox[1])
        for _rect in list_rect:
            _y0 = _rect.bbox[1]
            _y1 = _rect.bbox[3]  # not used below
            _find = False
            for l_cr in clusters_rects:
                if abs(l_cr[0].bbox[1] - _y0) < margin:
                    _find = True
                    l_cr.append(_rect)
                    break
            if not _find:
                clusters_rects.append([_rect])
    # print("textbox:===================")
    # for _textbox in list_textbox:
    # print(_textbox.get_text())
    # print("textbox:======>>>>>>>>>>>>>")
    # for c in clusters_rects:
    # print("+"*30)
    # for cc in c:
    # print("rect", cc.)
    # Collect every distinct x / y edge for the span computation.
    for _line in clusters_rects:
        for _rect in _line:
            (x0, y0, x1, y1) = _rect.bbox
            set_x.add(x0)
            set_x.add(x1)
            set_y.add(y0)
            set_y.add(y1)
    if len(set_x) == 0 or len(set_y) == 0:
        return None, [], []
    if len(list_rect) <= 1:
        return None, [], []
    list_x = list(set_x)
    list_y = list(set_y)
    list_x.sort(key=lambda x: x)
    list_y.sort(key=lambda x: x, reverse=sourceP_LB)
    # print("clusters_rects", len(clusters_rects))
    # NOTE(review): the same sort appears twice; whether the second call is
    # inside the ``if`` cannot be told from the available copy -- confirm.
    if sourceP_LB:
        clusters_rects.sort(key=lambda x: (x[0].bbox[1] + x[0].bbox[3]) / 2, reverse=sourceP_LB)
    clusters_rects.sort(key=lambda x: (x[0].bbox[1] + x[0].bbox[3]) / 2, reverse=sourceP_LB)
    # Left-to-right order inside each row.
    for l_cr in clusters_rects:
        l_cr.sort(key=lambda x: x.bbox[0])
    # Drop x coordinates closer than 5 to their predecessor in the sorted list.
    pop_x = []
    for i in range(len(list_x) - 1):
        _i = len(list_x) - i - 1
        l_i = _i - 1
        if abs(list_x[_i] - list_x[l_i]) < 5:
            pop_x.append(_i)
    # Pop from the highest index down so remaining indices stay valid.
    pop_x.sort(key=lambda x: x, reverse=True)
    for _x in pop_x:
        list_x.pop(_x)
    # Same near-duplicate removal for the y coordinates.
    pop_x = []
    for i in range(len(list_y) - 1):
        _i = len(list_y) - i - 1
        l_i = _i - 1
        if abs(list_y[_i] - list_y[l_i]) < 5:
            pop_x.append(_i)
    pop_x.sort(key=lambda x: x, reverse=True)
    for _x in pop_x:
        list_y.pop(_x)
    # print("list_x", list_x)
    # print("list_y", list_y)
    line_i = 0  # counter only; not read
    for _line in clusters_rects:
        table_line = []
        cell_i = 0  # counter only; not read
        for _rect in _line:
            (x0, y0, x1, y1) = _rect.bbox
            _cell = {"bbox": (x0, y0, x1, y1),
                     "rect": _rect,
                     "rowspan": self.getspan(list_y, y0, y1, margin),
                     "columnspan": self.getspan(list_x, x0, x1, margin),
                     "text": ""}
            cell_i += 1
            table_line.append(_cell)
        line_i += 1
        _table.append(table_line)
    return _table, list_x, list_y
  1089. def rect2table(self, list_textbox, list_rect, in_objs, margin=5, sourceP_LB=True):
  1090. def getIOU(bbox0, bbox1):
  1091. width = max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0]) - (bbox0[2] - bbox0[0] + bbox1[2] - bbox1[0])
  1092. height = max(bbox0[3], bbox1[3]) - min(bbox0[1], bbox1[1]) - (bbox0[3] - bbox0[1] + bbox1[3] - bbox1[1])
  1093. if width < 0 and height < 0:
  1094. return abs(width * height / min(abs((bbox0[2] - bbox0[0]) * (bbox0[3] - bbox0[1])),
  1095. abs((bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]))))
  1096. return 0
  1097. _table, list_x, list_y = self.makeTableByRect(list_rect, margin, sourceP_LB)
  1098. if _table is None:
  1099. return
  1100. self.feedText2table(_table, list_textbox, in_objs, sourceP_LB)
  1101. # print("table===========================>")
  1102. # for _line in _table:
  1103. # for _cell in _line:
  1104. # print("||%d%d"%(_cell["rowspan"],_cell["columnspan"]),end="\t")
  1105. # print()
  1106. # print("table===========================>")
  1107. #
  1108. # print("------------")
  1109. # for _line in _table:
  1110. # for _cell in _line:
  1111. # print(_cell["text"],end="\t")
  1112. # print("\n")
  1113. # print("------------")
  1114. self.fixRect(_table, list_x, list_y, sourceP_LB, margin)
  1115. # print("table===========================>")
  1116. # for _line in _table:
  1117. # for _cell in _line:
  1118. # print("||%d%d"%(_cell["rowspan"],_cell["columnspan"]),end="\t")
  1119. # print()
  1120. # print("table===========================>")
  1121. self.feedText2table(_table, list_textbox, in_objs, sourceP_LB)
  1122. # feedText2table后,有textbox符合多个单元格iou的,可能是文本错误连接了,需拆开
  1123. if self.connect_bbox_list:
  1124. return {}
  1125. table_bbox = (_table[0][0].get("bbox")[0],
  1126. _table[0][0].get("bbox")[1],
  1127. _table[-1][-1].get("bbox")[2],
  1128. _table[-1][-1].get("bbox")[3])
  1129. # print("=======")
  1130. # for _line in _table:
  1131. # for _cell in _line:
  1132. # print(_cell["text"])
  1133. # print("\n")
  1134. # print("===========")
  1135. ta = {"bbox": table_bbox, "table": _table}
  1136. return ta
  1137. def inbox(self, bbox0, bbox_g, text=""):
  1138. # if bbox_g[0]<=bbox0[0] and bbox_g[1]<=bbox0[1] and bbox_g[2]>=bbox0[2] and bbox_g[3]>=bbox0[3]:
  1139. # return 1
  1140. # print("utils inbox", text, self.getIOU(bbox0,bbox_g), bbox0, bbox_g)
  1141. if self.getIOU(bbox0, bbox_g) > 0.2:
  1142. return 1
  1143. return 0
  1144. def getIOU(self, bbox0, bbox1):
  1145. bbox0 = [min(bbox0[0], bbox0[2]), min(bbox0[1], bbox0[3]), max(bbox0[0], bbox0[2]), max(bbox0[1], bbox0[3])]
  1146. bbox1 = [min(bbox1[0], bbox1[2]), min(bbox1[1], bbox1[3]), max(bbox1[0], bbox1[2]), max(bbox1[1], bbox1[3])]
  1147. width = abs(max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0])) - (
  1148. abs(bbox0[2] - bbox0[0]) + abs(bbox1[2] - bbox1[0]))
  1149. height = abs(max(bbox0[3], bbox1[3]) - min(bbox0[1], bbox1[1])) - (
  1150. abs(bbox0[3] - bbox0[1]) + abs(bbox1[3] - bbox1[1]))
  1151. if width < 0 and height < 0:
  1152. iou = abs(width * height / min(abs((bbox0[2] - bbox0[0]) * (bbox0[3] - bbox0[1])),
  1153. abs((bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]))))
  1154. # print("getIOU", iou)
  1155. return iou
  1156. return 0
  1157. def getspan(self, _list, x0, x1, margin):
  1158. _count = 0
  1159. (x0, x1) = (min(x0, x1), max(x0, x1))
  1160. for _x in _list:
  1161. if _x >= (x0 - margin) and _x <= (x1 + margin):
  1162. _count += 1
  1163. return _count - 1
  1164. def _plot(self, list_line, list_textbox):
  1165. from matplotlib import pyplot as plt
  1166. plt.figure()
  1167. for _line in list_line:
  1168. x0, y0, x1, y1 = _line.__dict__.get("bbox")
  1169. plt.plot([x0, x1], [y0, y1])
  1170. for _line in list_line:
  1171. x0, y0, x1, y1 = _line.bbox
  1172. plt.plot([x0, x1], [y0, y1])
  1173. # for point in list_crosspoints:
  1174. # plt.scatter(point.get("point")[0],point.get("point")[1])
  1175. for textbox in list_textbox:
  1176. x0, y0, x1, y1 = textbox.bbox
  1177. plt.plot([x0, x1], [y0, y1])
  1178. plt.show()
  1179. def get_table_html(table):
  1180. html_text = '<table border="1">'
  1181. for row in table:
  1182. html_text += "<tr>"
  1183. for col in row:
  1184. row_span = col.get("rowspan")
  1185. col_span = col.get("columnspan")
  1186. bbox_text = col.get("text")
  1187. html_text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
  1188. html_text += bbox_text + "</td>"
  1189. html_text += "</tr>"
  1190. html_text += "</table>"
  1191. return html_text
  1192. def sort_object(obj_list, is_reverse=False):
  1193. from format_convert.convert_tree import _Table, _Image, _Sentence, _Page
  1194. obj_list = combine_object(obj_list)
  1195. if len(obj_list) == 0:
  1196. return obj_list
  1197. if isinstance(obj_list[0], (_Table, _Sentence, _Image)):
  1198. obj_list.sort(key=lambda x: (x.y, x.x), reverse=is_reverse)
  1199. return obj_list
  1200. elif isinstance(obj_list[0], _Page):
  1201. obj_list.sort(key=lambda x: x.page_no)
  1202. return obj_list
  1203. else:
  1204. return obj_list
  1205. def combine_object(obj_list, threshold=5):
  1206. from format_convert.convert_tree import _Sentence
  1207. sentence_list = []
  1208. for obj in obj_list:
  1209. if isinstance(obj, _Sentence) and not obj.is_html:
  1210. obj.content = re.sub("\s", "", obj.content)
  1211. sentence_list.append(obj)
  1212. sentence_list.sort(key=lambda x: (x.y, x.x))
  1213. for sen in sentence_list:
  1214. obj_list.remove(sen)
  1215. delete_list = []
  1216. for i in range(1, len(sentence_list)):
  1217. sen1 = sentence_list[i - 1]
  1218. sen2 = sentence_list[i]
  1219. if sen1.combine is False or sen2.combine is False:
  1220. continue
  1221. if abs(sen2.y - sen1.y) <= threshold:
  1222. if sen2.x > sen1.x:
  1223. sen2.x = sen1.x
  1224. sen2.content = sen1.content + sen2.content
  1225. else:
  1226. sen2.content = sen2.content + sen1.content
  1227. if sen2.y > sen1.y:
  1228. sen2.y = sen1.y
  1229. delete_list.append(sen1)
  1230. for sen in delete_list:
  1231. sentence_list.remove(sen)
  1232. for sen in sentence_list:
  1233. obj_list.append(sen)
  1234. return obj_list
# Module-level HTTP sessions used by request_post: one for requests whose
# "model_type" is "ocr", one for "otr", and one for everything else.
session_ocr = requests.Session()
session_otr = requests.Session()
session_all = requests.Session()
  1238. def request_post(url, param, time_out=1000, use_zlib=False):
  1239. fails = 0
  1240. text = json.dumps([-2])
  1241. while True:
  1242. try:
  1243. if fails >= 1:
  1244. break
  1245. headers = {'content-type': 'application/json'}
  1246. # result = requests.post(url, data=param, timeout=time_out)
  1247. if param.get("model_type") == "ocr":
  1248. result = session_ocr.post(url, data=param, timeout=time_out)
  1249. elif param.get("model_type") == "otr":
  1250. result = session_otr.post(url, data=param, timeout=time_out)
  1251. else:
  1252. result = session_all.post(url, data=param, timeout=time_out)
  1253. # print('result.status_code', result.status_code)
  1254. # print('result.text', result.text)
  1255. if result.status_code == 200:
  1256. text = result.text
  1257. break
  1258. else:
  1259. # print('result.status_code', result.status_code)
  1260. # print('result.text', result.text)
  1261. fails += 1
  1262. continue
  1263. except socket.timeout:
  1264. fails += 1
  1265. # print('timeout! fail times:', fails)
  1266. except:
  1267. fails += 1
  1268. # print('fail! fail times:', fails)
  1269. traceback.print_exc()
  1270. return text
  1271. def test_gpu():
  1272. print("=" * 30)
  1273. import paddle
  1274. paddle.utils.run_check()
  1275. # import tensorflow as tf
  1276. # print("tf gpu", tf.config.list_physical_devices('GPU'))
  1277. print("=" * 30)
  1278. def my_subprocess_call(*popenargs, timeout=None):
  1279. logging.info("into my_subprocess_call")
  1280. with Popen(*popenargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
  1281. try:
  1282. for line in p.stdout:
  1283. print("stdout", line)
  1284. for line in p.stderr:
  1285. print("stderr", line)
  1286. p.wait(timeout=timeout)
  1287. # p.communicate()
  1288. return p.pid, p.returncode
  1289. except: # Including KeyboardInterrupt, wait handled that.
  1290. p.kill()
  1291. # We don't call p.wait() again as p.__exit__ does that for us.
  1292. raise
  1293. finally:
  1294. logging.info("out my_subprocess_call")
  1295. p.kill()
  1296. def parse_yaml():
  1297. yaml_path = os.path.dirname(os.path.abspath(__file__)) + "/interface.yml"
  1298. with open(yaml_path, "r", encoding='utf-8') as f:
  1299. cfg = f.read()
  1300. params = yaml.load(cfg, Loader=yaml.SafeLoader)
  1301. return params
  1302. def get_ip_port(node_type=None, interface_type=None):
  1303. if node_type is None:
  1304. node_type_list = ["master", "slave"]
  1305. else:
  1306. node_type_list = [node_type]
  1307. if interface_type is None:
  1308. interface_type_list = ["convert", "ocr", "otr", "office", "path", "isr", "idc", "atc", "yolo"]
  1309. else:
  1310. interface_type_list = [interface_type]
  1311. ip_port_dict = {}
  1312. params = parse_yaml()
  1313. # 循环 master slave
  1314. for type1 in node_type_list:
  1315. node_type = type1.upper()
  1316. ip_list = params.get(node_type).get("ip")
  1317. # 循环多个IP
  1318. for j in range(len(ip_list)):
  1319. _ip = ip_list[j]
  1320. if ip_port_dict.get(_ip):
  1321. ip_port_dict.get(_ip).update({node_type: {}})
  1322. else:
  1323. ip_port_dict.update({_ip: {node_type: {}}})
  1324. # 有IP时,循环多个参数
  1325. for type2 in interface_type_list:
  1326. python_path = None
  1327. project_path = None
  1328. gunicorn_path = None
  1329. processes = 0
  1330. port_list = []
  1331. interface_type = type2.upper()
  1332. # if interface_type in ["convert".upper()]:
  1333. # _port = params.get(node_type).get(interface_type).get("port")
  1334. # if _port is None:
  1335. # port_list = []
  1336. # else:
  1337. # if interface_type == "convert".upper():
  1338. # processes = params.get(node_type).get(interface_type).get("processes")[j]
  1339. # port_list = [str(_port[j])]*int(processes)
  1340. # # port_list = [str(_port)]
  1341. if interface_type == "path".upper():
  1342. python_path = params.get(node_type).get(interface_type).get("python")[j]
  1343. project_path = params.get(node_type).get(interface_type).get("project")[j]
  1344. gunicorn_path = params.get(node_type).get(interface_type).get("gunicorn")[j]
  1345. else:
  1346. port_start = params.get(node_type).get(interface_type).get("port_start")
  1347. port_no = params.get(node_type).get(interface_type).get("port_no")
  1348. if port_start is None or port_no is None:
  1349. port_list = []
  1350. else:
  1351. if interface_type in ["office".upper()]:
  1352. port_list = [str(x) for x in range(port_start[j], port_start[j] + port_no[j], 1)]
  1353. else:
  1354. port_list = [str(port_start[j])] * port_no[j]
  1355. # if ip_list:
  1356. # for i in range(len(ip_list)):
  1357. # 参数放入dict
  1358. if port_list:
  1359. ip_port_dict.get(_ip).get(node_type).update({interface_type.lower(): port_list})
  1360. if processes:
  1361. ip_port_dict.get(_ip).get(node_type).update({interface_type.lower() + "_processes": processes})
  1362. if project_path and python_path and gunicorn_path:
  1363. ip_port_dict.get(_ip).get(node_type).update({"project_path": project_path,
  1364. "python_path": python_path,
  1365. "gunicorn_path": gunicorn_path})
  1366. # print("ip_port_dict", ip_port_dict)
  1367. return ip_port_dict
def get_ip_port_old(node_type=None, interface_type=None):
    """Older variant of get_ip_port that builds a flat ``{ip: {...}}`` mapping.

    Configuration values are read as scalars here (``port``, ``processes``,
    ``port_start``, ``port_no``), not per-IP lists, and the result is not
    nested by node type.

    NOTE(review): the nesting below was reconstructed from a copy of the file
    whose indentation was lost -- confirm against the canonical source.
    """
    if node_type is None:
        node_type_list = ["master", "slave"]
    else:
        node_type_list = [node_type]
    if interface_type is None:
        interface_type_list = ["convert", "ocr", "otr", "office", "path"]
    else:
        interface_type_list = [interface_type]
    ip_port_dict = {}
    params = parse_yaml()
    for type1 in node_type_list:
        node_type = type1.upper()
        ip_list = params.get(node_type).get("ip")
        for type2 in interface_type_list:
            interface_type = type2.upper()
            processes = 0
            python_path = None
            project_path = None
            if interface_type in ["convert".upper()]:
                _port = params.get(node_type).get(interface_type).get("port")
                if _port is None:
                    port_list = []
                else:
                    if interface_type == "convert".upper():
                        processes = params.get(node_type).get(interface_type).get("processes")
                    # The single convert port is repeated once per process.
                    port_list = [str(_port)] * int(processes)
                    # port_list = [str(_port)]
            elif interface_type == "path".upper():
                # NOTE(review): port_list is not assigned in this branch, so
                # it keeps the previous interface's value (or is unbound if
                # "path" comes first) -- verify this is intended.
                python_path = params.get(node_type).get(interface_type).get("python")
                project_path = params.get(node_type).get(interface_type).get("project")
            else:
                port_start = params.get(node_type).get(interface_type).get("port_start")
                port_no = params.get(node_type).get(interface_type).get("port_no")
                if port_start is None or port_no is None:
                    port_list = []
                else:
                    # Consecutive ports starting at port_start.
                    port_list = [str(x) for x in range(port_start, port_start + port_no, 1)]
            if ip_list:
                for _ip in ip_list:
                    if _ip is None:
                        continue
                    # Store the ports under the lower-cased interface name.
                    if _ip in ip_port_dict.keys():
                        if port_list:
                            ip_port_dict.get(_ip).update({interface_type.lower(): port_list})
                    else:
                        if port_list:
                            ip_port_dict[_ip] = {interface_type.lower(): port_list}
                    if processes:
                        ip_port_dict.get(_ip).update({interface_type.lower() + "_processes": processes})
                    if project_path and python_path:
                        ip_port_dict.get(_ip).update({"project_path": project_path,
                                                      "python_path": python_path})
    return ip_port_dict
  1422. def get_intranet_ip():
  1423. try:
  1424. # Create a new socket using the given address family,
  1425. # socket type and protocol number.
  1426. s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
  1427. # Connect to a remote socket at address.
  1428. # (The format of address depends on the address family.)
  1429. address = ("8.8.8.8", 80)
  1430. s.connect(address)
  1431. # Return the socket’s own address.
  1432. # This is useful to find out the port number of an IPv4/v6 socket, for instance.
  1433. # (The format of the address returned depends on the address family.)
  1434. sockname = s.getsockname()
  1435. ip = sockname[0]
  1436. port = sockname[1]
  1437. finally:
  1438. s.close()
  1439. return ip
  1440. def get_all_ip():
  1441. if get_platform() == "Windows":
  1442. ips = ['127.0.0.1']
  1443. else:
  1444. ips = [ip.split('/')[0] for ip in os.popen("ip addr | grep 'inet '|awk '{print $2}'").readlines()]
  1445. for i in range(len(ips)):
  1446. ips[i] = "http://" + ips[i]
  1447. return ips
  1448. def get_using_ip():
  1449. ip_port_dict = get_ip_port()
  1450. ips = get_all_ip()
  1451. ip = "http://127.0.0.1"
  1452. for key in ip_port_dict.keys():
  1453. if key in ips:
  1454. ip = key
  1455. break
  1456. return ip
  1457. def memory_decorator(func):
  1458. @wraps(func)
  1459. def get_memory_info(*args, **kwargs):
  1460. if get_platform() == "Windows":
  1461. return func(*args, **kwargs)
  1462. # 只有linux有resource包
  1463. # usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
  1464. usage = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024
  1465. start_time = time.time()
  1466. logging.info("----- memory info start - " + func.__qualname__
  1467. + " - " + str(os.getpid())
  1468. + " - " + str(round(usage, 2)) + " GB"
  1469. + " - " + str(round(time.time() - start_time, 2)) + " sec")
  1470. result = func(*args, **kwargs)
  1471. # usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
  1472. usage = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024
  1473. logging.info("----- memory info end - " + func.__qualname__
  1474. + " - " + str(os.getpid())
  1475. + " - " + str(round(usage, 2)) + " GB"
  1476. + " - " + str(round(time.time() - start_time, 2)) + " sec")
  1477. return result
  1478. return get_memory_info
  1479. def log(msg):
  1480. call_func_name = inspect.currentframe().f_back.f_code.co_name
  1481. logger = get_logger(call_func_name, {"md5": _global.get("md5"),
  1482. "port": _global.get("port")})
  1483. logger.info(msg)
  1484. # logging.info(msg)
  1485. def get_logger(_name, _dict):
  1486. extra = _dict
  1487. _format = '%(asctime)s - %(name)s - %(levelname)s - %(md5)s - %(port)s - %(message)s'
  1488. logger = logging.getLogger(_name)
  1489. create_new_flag = 1
  1490. handlers = logger.handlers
  1491. if handlers:
  1492. for h in handlers:
  1493. if h.formatter.__dict__.get("_fmt") == _format:
  1494. create_new_flag = 0
  1495. break
  1496. if create_new_flag:
  1497. formatter = logging.Formatter(_format)
  1498. handler = logging.StreamHandler()
  1499. handler.setFormatter(formatter)
  1500. logger.addHandler(handler)
  1501. logger.setLevel(logging.INFO)
  1502. logger.propagate = False
  1503. logger = logging.LoggerAdapter(logger, extra)
  1504. return logger
  1505. def set_flask_global():
  1506. # 接口轮询所需锁、参数
  1507. ip_port_flag = {}
  1508. # ip_flag = []
  1509. ip_port_dict = get_ip_port()
  1510. for _k in ip_port_dict.keys():
  1511. ip_port_flag.update({_k: {}})
  1512. for interface in ["ocr", "otr", "convert", "idc", "isr", "atc", 'yolo', "office"]:
  1513. if ip_port_dict.get(_k).get("MASTER"):
  1514. if ip_port_dict.get(_k).get("MASTER").get(interface):
  1515. ip_port_flag[_k][interface] = 0
  1516. else:
  1517. if ip_port_dict.get(_k).get("SLAVE").get(interface):
  1518. ip_port_flag[_k][interface] = 0
  1519. # ip_port_flag.update({_k: {"ocr": 0,
  1520. # "otr": 0,
  1521. # "convert": 0,
  1522. # "idc": 0,
  1523. # "isr": 0,
  1524. # "office": 0
  1525. # }})
  1526. # if ip_port_dict.get(_k).get("MASTER"):
  1527. # ip_flag.append([_k+"_master", 0])
  1528. # if ip_port_dict.get(_k).get("SLAVE"):
  1529. # ip_flag.append([_k+"_slave", 0])
  1530. _global.update({"ip_port_flag": ip_port_flag})
  1531. _global.update({"ip_port": ip_port_dict})
  1532. # _global.update({"ip_flag": ip_flag})
  1533. # print(globals().get("ip_port"))
  1534. def get_md5_from_bytes(_bytes):
  1535. def generate_fp(_b):
  1536. bio = BytesIO()
  1537. bio.write(_b)
  1538. return bio
  1539. _length = 0
  1540. try:
  1541. _md5 = hashlib.md5()
  1542. ff = generate_fp(_bytes)
  1543. ff.seek(0)
  1544. while True:
  1545. data = ff.read(4096)
  1546. if not data:
  1547. break
  1548. _length += len(data)
  1549. _md5.update(data)
  1550. return _md5.hexdigest(), _length
  1551. except Exception as e:
  1552. traceback.print_exc()
  1553. return None, _length
  1554. # def to_share_memory(np_data, name=None):
  1555. # # from multiprocessing.resource_tracker import unregister
  1556. # from multiprocessing import shared_memory
  1557. # if name is None:
  1558. # sm_name = "psm_" + str(os.getpid())
  1559. # else:
  1560. # sm_name = name
  1561. # logging.info("into from_share_memory sm_name " + sm_name)
  1562. # shm = shared_memory.SharedMemory(name=sm_name, create=True, size=np_data.nbytes)
  1563. # # unregister(sm_name, 'shared_memory')
  1564. # sm_data = np.ndarray(np_data.shape, dtype=np_data.dtype, buffer=shm.buf)
  1565. # sm_data[:] = np_data[:] # Copy the original data into shared memory
  1566. #
  1567. # shm.close()
  1568. # del sm_data
  1569. # return shm
  1570. # def from_share_memory(sm_name, _shape, _dtype, if_close=True):
  1571. # from multiprocessing import shared_memory
  1572. # logging.info("into from_share_memory sm_name " + sm_name)
  1573. # shm = shared_memory.SharedMemory(name=sm_name, create=False)
  1574. # b = np.ndarray(_shape, dtype=_dtype, buffer=shm.buf)
  1575. # sm_data = copy.deepcopy(b)
  1576. # b[::] = 0
  1577. #
  1578. # if if_close:
  1579. # try:
  1580. # shm.close()
  1581. # shm.unlink()
  1582. # except Exception:
  1583. # log("file not found! " + sm_name)
  1584. # return sm_data
  1585. # def get_share_memory(sm_name):
  1586. # try:
  1587. # from multiprocessing import shared_memory
  1588. # shm = shared_memory.SharedMemory(name=sm_name, create=False)
  1589. # return shm
  1590. # except:
  1591. # return None
  1592. # def release_share_memory(shm):
  1593. # try:
  1594. # if shm is None:
  1595. # return
  1596. # shm.close()
  1597. # shm.unlink()
  1598. # log(str(shm.name) + " release successfully!")
  1599. # except FileNotFoundError:
  1600. # log(str(shm.name) + " has released!")
  1601. # except Exception as e:
  1602. # traceback.print_exc()
  1603. # def get_share_memory_list(sm_list_name, list_size=None):
  1604. # # from multiprocessing.resource_tracker import unregister
  1605. # from multiprocessing import shared_memory
  1606. # if list_size is None:
  1607. # sm_list = shared_memory.ShareableList(name=sm_list_name)
  1608. # else:
  1609. # sm_list = shared_memory.ShareableList(name=sm_list_name, sequence=["0"]+[' '*2048]*(list_size-2)+["0"])
  1610. # # unregister(sm_list_name, 'shared_memory')
  1611. # return sm_list
  1612. # def close_share_memory_list(sm_list):
  1613. # try:
  1614. # sm_list.shm.close()
  1615. # except Exception:
  1616. # traceback.print_exc()
  1617. def get_np_type(_str):
  1618. _dtype = None
  1619. if _str == 'uint8':
  1620. _dtype = np.uint8
  1621. elif _str == 'float16':
  1622. _dtype = np.float16
  1623. elif _str == 'float32':
  1624. _dtype = np.float32
  1625. logging.info("get_np_type " + _str + " " + str(_dtype))
  1626. return _dtype
  1627. def namespace_to_dict(agrs_or_dict, reverse=False):
  1628. if reverse:
  1629. agrs_or_dict = argparse.Namespace(**agrs_or_dict)
  1630. else:
  1631. agrs_or_dict = vars(agrs_or_dict)
  1632. return agrs_or_dict
  1633. def get_args_from_config(ip_port_dict, ip, arg_type, node_type=None):
  1634. if node_type is None:
  1635. node_type = ["MASTER", "SLAVE"]
  1636. else:
  1637. node_type = [node_type]
  1638. arg_list = []
  1639. for _type in node_type:
  1640. if ip_port_dict.get(ip).get(_type):
  1641. if ip_port_dict.get(ip).get(_type).get(arg_type):
  1642. arg_list.append(ip_port_dict.get(ip).get(_type).get(arg_type))
  1643. return arg_list
  1644. def remove_red_seal(image_np):
  1645. """
  1646. 去除红色印章
  1647. """
  1648. cv2.namedWindow("image_np", 0)
  1649. cv2.resizeWindow("image_np", 1000, 800)
  1650. cv2.imshow("image_np", image_np)
  1651. height, width, c = image_np.shape
  1652. window_h = int(height / 15)
  1653. image_hsv = cv2.cvtColor(image_np, cv2.COLOR_BGR2HSV)
  1654. # 遍历numpy
  1655. red_point_list = []
  1656. image_list = image_np.tolist()
  1657. hsv_dict = {}
  1658. for index_1 in range(len(image_list)):
  1659. for index_2 in range(len(image_list[index_1])):
  1660. h, s, v = image_hsv[index_1][index_2]
  1661. if (0 <= h <= 10 or 156 <= h <= 180) and 43 <= s <= 255 and 46 <= v <= 255:
  1662. key = str(image_hsv[index_1][index_2].tolist())
  1663. red_point_list.append([key, index_1, index_2])
  1664. if hsv_dict.get(key):
  1665. hsv_dict[key] += 1
  1666. else:
  1667. hsv_dict[key] = 1
  1668. # 找出相同最多的hsv值
  1669. hsv_most_key = None
  1670. hsv_most_value = 0
  1671. for hsv in hsv_dict.keys():
  1672. if hsv_dict.get(hsv) > hsv_most_value:
  1673. hsv_most_value = hsv_dict.get(hsv)
  1674. hsv_most_key = hsv
  1675. # print(hsv_dict)
  1676. # 根据hsv判断其填充为黑色还是白色
  1677. hsv_most_key = eval(hsv_most_key)
  1678. for point in red_point_list:
  1679. if abs(eval(point[0])[2] - hsv_most_key[2]) <= 70:
  1680. image_np[point[1]][point[2]][0] = 255
  1681. image_np[point[1]][point[2]][1] = 255
  1682. image_np[point[1]][point[2]][2] = 255
  1683. else:
  1684. image_np[point[1]][point[2]][0] = 0
  1685. image_np[point[1]][point[2]][1] = 0
  1686. image_np[point[1]][point[2]][2] = 0
  1687. cv2.namedWindow("remove_red_seal", 0)
  1688. cv2.resizeWindow("remove_red_seal", 1000, 800)
  1689. cv2.imshow("remove_red_seal", image_np)
  1690. # cv2.imwrite("C:/Users/Administrator/Downloads/1.png", image_np)
  1691. cv2.waitKey(0)
  1692. return image_np
  1693. def pil_resize(image_np, height, width):
  1694. # limit pixels 89478485
  1695. if image_np.shape[0] * image_np.shape[1] * image_np.shape[2] >= 89478485:
  1696. # print("image too large, limit 89478485 pixels", image_np.shape)
  1697. ratio = image_np.shape[0] / image_np.shape[1]
  1698. if image_np.shape[0] >= image_np.shape[1]:
  1699. image_np = cv2.resize(image_np, (int(3000 / ratio), 3000), interpolation=cv2.INTER_AREA)
  1700. else:
  1701. image_np = cv2.resize(image_np, (3000, int(3000 * ratio)), interpolation=cv2.INTER_AREA)
  1702. image_pil = Image.fromarray(cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB))
  1703. image_pil = image_pil.resize((int(width), int(height)), Image.BICUBIC)
  1704. image_np = cv2.cvtColor(np.asarray(image_pil), cv2.COLOR_RGB2BGR)
  1705. return image_np
  1706. def np2pil(image_np):
  1707. image_pil = Image.fromarray(cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB))
  1708. return image_pil
  1709. def pil2np(image_pil):
  1710. image_np = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
  1711. return image_np
  1712. def bytes2np(_b):
  1713. try:
  1714. # 二进制数据流转np.ndarray [np.uint8: 8位像素]
  1715. image_np = cv2.imdecode(np.frombuffer(_b, np.uint8), cv2.IMREAD_COLOR)
  1716. # 将rgb转为bgr
  1717. # image_np = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
  1718. return image_np
  1719. except cv2.error as e:
  1720. if "src.empty()" in str(e):
  1721. log("bytes2np image is empty!")
  1722. return None
  1723. except:
  1724. traceback.print_exc()
  1725. return None
  1726. def np2bytes(image_np):
  1727. # numpy转为可序列化的string
  1728. success, img_encode = cv2.imencode(".jpg", image_np)
  1729. # numpy -> bytes
  1730. img_bytes = img_encode.tobytes()
  1731. return img_bytes
  1732. def ocr_cant_read(text_list, box_list):
  1733. """
  1734. 判断ocr因为图片方向无法识别情况
  1735. :param text_list: 文字list
  1736. :param box_list: 文字框list
  1737. :return: bool
  1738. """
  1739. # 无文字及框
  1740. if not text_list or not box_list:
  1741. return True
  1742. # 根据bbox长宽比判断
  1743. box_cnt = 0
  1744. box_flag = 0
  1745. for box in box_list:
  1746. if abs(box[0][1] - box[2][1]) > abs(box[0][0] - box[2][0]):
  1747. box_cnt += 1
  1748. if box_cnt >= int(len(box_list) / 2):
  1749. box_flag = 1
  1750. # 根据识别字数判断
  1751. charac_flag = 0
  1752. charac_set = set()
  1753. for text in text_list:
  1754. charac_set.update(text)
  1755. if len(charac_set) < 40:
  1756. charac_flag = 1
  1757. # 字数少
  1758. if charac_flag:
  1759. result = True
  1760. # 字数多但格子长
  1761. elif box_flag:
  1762. result = True
  1763. else:
  1764. result = False
  1765. log(result)
  1766. return result
  1767. def file_lock(file_name):
  1768. """
  1769. 获取文件排它锁,返回文件句柄,需手动close文件以释放排它锁
  1770. :param file_name:
  1771. :return:
  1772. """
  1773. import fcntl
  1774. if not os.path.exists(file_name):
  1775. with open(file_name, 'w') as f:
  1776. f.write('0')
  1777. file = open(file_name, 'r')
  1778. # 获取排它锁
  1779. fcntl.flock(file.fileno(), fcntl.LOCK_EX)
  1780. return file
  1781. def get_garble_code():
  1782. reg_str = '[ÿÝØÐÙÚÛÜÒÓÔÕÖÊÄẨòóôäåüúîïìþ¡¢£¤§èéêëȟš' + \
  1783. 'Ϸᱦ¼ŒÞ¾Çœø‡Æ�ϐ㏫⮰≧ڝⶹӇⰚڣༀងϦȠ⚓Ⴭᐬ⩔ⅮⰚࡦࣽ' + \
  1784. '䕆㶃䌛㻰䙹䔮㔭䶰䰬䉰䶰䘔䉥喌䶥䶰䛳䉙䄠' + \
  1785. ''.join(['\\x0' + str(x) for x in range(1, 10)]) + \
  1786. ''.join(['\\x' + str(x) for x in range(10, 20)]) + \
  1787. ']'
  1788. return reg_str
  1789. def line_is_cross(A, B, C, D):
  1790. line1 = LineString([A, B])
  1791. line2 = LineString([C, D])
  1792. int_pt = line1.intersection(line2)
  1793. try:
  1794. point_of_intersection = int_pt.x, int_pt.y
  1795. return True
  1796. except:
  1797. return False
  1798. if __name__ == "__main__":
  1799. # strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
  1800. # print(slash_replace(strs))
  1801. # from matplotlib import pyplot as plt
  1802. # import random
  1803. # fig = plt.figure()
  1804. # plt.xlim(100)
  1805. # plt.ylim(100)
  1806. # fig.add_subplot(111)
  1807. # x0,y0,x1,y1 = (1,2,3,4)
  1808. # plt.gca().add_patch(plt.Rectangle(xy=(x0, y0),
  1809. # width=x1-x0,
  1810. # height=y1-y0,
  1811. # edgecolor=(random.randint(0,255)/255,random.randint(0,255)/255,random.randint(0,255)/255),
  1812. # fill=False, linewidth=2))
  1813. #
  1814. # # plt.show()
  1815. # import cv2
  1816. # import numpy as np
  1817. # img = np.zeros(shape=(1800,1800),dtype=np.uint8)
  1818. # img += 255
  1819. # cv2.imshow("bbox", img)
  1820. # cv2.waitKey(0)
  1821. # print(json.dumps({"data":[1, 2]}))
  1822. # print(parse_yaml())
  1823. print(get_ip_port())
  1824. # set_flask_global()
  1825. # print(get_all_ip())
  1826. print(get_args_from_config(get_ip_port(), get_all_ip()[0], "idc"))
  1827. print(get_args_from_config(get_ip_port(), get_all_ip()[0], "atc"))
  1828. # print(get_args_from_config(get_ip_port(), "http://127.0.0.1", "gunicorn_path"))
  1829. # print(get_intranet_ip())
  1830. # _path = "C:/Users/Administrator/Downloads/3.png"
  1831. # remove_red_seal(cv2.imread(_path))