# utils.py (74 KB)
  1. # -*- coding:utf-8 -*-
  2. import argparse
  3. import copy
  4. import hashlib
  5. import inspect
  6. import json
  7. import os
  8. import socket
  9. import subprocess
  10. import sys
  11. from io import BytesIO
  12. from subprocess import Popen
  13. import cv2
  14. import requests
  15. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  16. import difflib
  17. import logging
  18. import mimetypes
  19. import platform
  20. import re
  21. import traceback
  22. import filetype
  23. from bs4 import BeautifulSoup
  24. import yaml
  25. from pdfminer.layout import *
  26. from format_convert import _global
  27. from functools import wraps
  28. import psutil
  29. import time
  30. import numpy as np
  31. from format_convert.judge_platform import get_platform
  32. if get_platform() == "Linux":
  33. import resource
  34. import math
  35. def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9]):
  36. """
  37. [0] : continue
  38. [-1]: 逻辑处理错误
  39. [-2]: 接口调用错误
  40. [-3]: 文件格式错误,无法打开
  41. [-4]: 各类文件调用第三方包读取超时
  42. [-5]: 整个转换过程超时
  43. [-6]: 阿里云UDF队列超时
  44. [-7]: 文件需密码,无法打开
  45. [-8]: 调用现成接口报错
  46. [-9]: 接口接收数据为空
  47. """
  48. for c in code:
  49. if _list == [c]:
  50. return True
  51. return False
  52. def add_div(text):
  53. if text == "" or text is None:
  54. return text
  55. # if get_platform() == "Windows":
  56. # print("add_div", text)
  57. if re.findall("<div>", text):
  58. return text
  59. text = "<div>" + text + "\n"
  60. text = re.sub("\n", "</div><div>", text)
  61. # text += "</div>"
  62. if text[-5:] == "<div>":
  63. # print("add_div has cut", text[-30:])
  64. text = text[:-5]
  65. return text
  66. def get_platform():
  67. sys = platform.system()
  68. return sys
  69. def get_html_p(html_path):
  70. log("into get_html_p")
  71. try:
  72. with open(html_path, "r") as ff:
  73. html_str = ff.read()
  74. soup = BeautifulSoup(html_str, 'lxml')
  75. text = ""
  76. for p in soup.find_all("p"):
  77. p_text = p.text
  78. p_text = p_text.strip()
  79. if p.string != "":
  80. text += p_text
  81. text += "\n"
  82. return text
  83. except Exception as e:
  84. log("get_html_p error!")
  85. return [-1]
  86. def string_similarity(str1, str2):
  87. # 去掉<div>和回车
  88. str1 = re.sub("<div>", "", str1)
  89. str1 = re.sub("</div>", "", str1)
  90. str1 = re.sub("\n", "", str1)
  91. str2 = re.sub("<div>", "", str2)
  92. str2 = re.sub("</div>", "", str2)
  93. str2 = re.sub("\n", "", str2)
  94. # print("********************************")
  95. # print("str1", str1)
  96. # print("********************************")
  97. # print("str2", str2)
  98. # print("********************************")
  99. score = difflib.SequenceMatcher(None, str1, str2).ratio()
  100. print("string_similarity", score)
  101. return score
  102. def get_sequential_data(text_list, bbox_list, html=False):
  103. logging.info("into get_sequential_data")
  104. try:
  105. text = ""
  106. order_list = []
  107. for i in range(len(text_list)):
  108. length_start = bbox_list[i][0][0]
  109. length_end = bbox_list[i][1][0]
  110. height_start = bbox_list[i][0][1]
  111. height_end = bbox_list[i][-1][1]
  112. # print([length_start, length_end, height_start, height_end])
  113. order_list.append([text_list[i], length_start, length_end, height_start, height_end])
  114. # text = text + infomation['text'] + "\n"
  115. if get_platform() == "Windows":
  116. print("get_sequential_data", order_list)
  117. if not order_list:
  118. if get_platform() == "Windows":
  119. print("get_sequential_data", "no order list")
  120. return ""
  121. # 根据bbox的坐标对输出排序
  122. order_list.sort(key=lambda x: (x[3], x[1], x[0]))
  123. # 根据bbox分行分列
  124. # col_list = []
  125. # height_end = int((order_list[0][4] + order_list[0][3]) / 2)
  126. # for i in range(len(order_list)):
  127. # if height_end - threshold <= order_list[i][3] <= height_end + threshold:
  128. # col_list.append(order_list[i])
  129. # else:
  130. # row_list.append(col_list)
  131. # col_list = []
  132. # height_end = int((order_list[i][4] + order_list[i][3]) / 2)
  133. # col_list.append(order_list[i])
  134. # if i == len(order_list) - 1:
  135. # row_list.append(col_list)
  136. row_list = []
  137. used_box = []
  138. threshold = 5
  139. for box in order_list:
  140. if box in used_box:
  141. continue
  142. height_center = (box[4] + box[3]) / 2
  143. row = []
  144. for box2 in order_list:
  145. if box2 in used_box:
  146. continue
  147. height_center2 = (box2[4] + box2[3]) / 2
  148. if height_center - threshold <= height_center2 <= height_center + threshold:
  149. if box2 not in row:
  150. row.append(box2)
  151. used_box.append(box2)
  152. row.sort(key=lambda x: x[0])
  153. row_list.append(row)
  154. for row in row_list:
  155. if not row:
  156. continue
  157. if len(row) <= 1:
  158. text = text + row[0][0] + "\n"
  159. else:
  160. sub_text = ""
  161. row.sort(key=lambda x: x[1])
  162. for col in row:
  163. sub_text = sub_text + col[0] + " "
  164. sub_text = sub_text + "\n"
  165. text += sub_text
  166. if html:
  167. text = "<div>" + text
  168. text = re.sub("\n", "</div>\n<div>", text)
  169. text += "</div>"
  170. # if text[-5:] == "<div>":
  171. # text = text[:-5]
  172. return text
  173. except Exception as e:
  174. logging.info("get_sequential_data error!")
  175. print("get_sequential_data", traceback.print_exc())
  176. return [-1]
  177. # def get_formatted_table(text_list, text_bbox_list, table_bbox_list, split_line):
  178. # logging.info("into get_formatted_table")
  179. # try:
  180. # # 重新定义text_bbox_list,[point, point, text]
  181. # text_bbox_list = [[text_bbox_list[i][0], text_bbox_list[i][2], text_list[i]] for i in
  182. # range(len(text_bbox_list))]
  183. # # 按纵坐标排序
  184. # text_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
  185. # table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
  186. #
  187. # # print("text_bbox_list", text_bbox_list)
  188. # # print("table_bbox_list", table_bbox_list)
  189. #
  190. # # bbox位置 threshold
  191. # threshold = 5
  192. #
  193. # # 根据split_line分区,可能有个区多个表格 [(), ()]
  194. # area_text_bbox_list = []
  195. # area_table_bbox_list = []
  196. # # print("get_formatted_table, split_line", split_line)
  197. # for j in range(1, len(split_line)):
  198. # last_y = split_line[j - 1][0][1]
  199. # current_y = split_line[j][0][1]
  200. # temp_text_bbox_list = []
  201. # temp_table_bbox_list = []
  202. #
  203. # # 找出该区域下text bbox
  204. # for text_bbox in text_bbox_list:
  205. # # 计算 text bbox 中心点
  206. # text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
  207. # (text_bbox[1][1] + text_bbox[0][1]) / 2)
  208. # if last_y - threshold <= text_bbox_center[1] <= current_y + threshold:
  209. # temp_text_bbox_list.append(text_bbox)
  210. # area_text_bbox_list.append(temp_text_bbox_list)
  211. #
  212. # # 找出该区域下table bbox
  213. # for table_bbox in table_bbox_list:
  214. # # 计算 table bbox 中心点
  215. # table_bbox_center = ((table_bbox[1][0] + table_bbox[0][0]) / 2,
  216. # (table_bbox[1][1] + table_bbox[0][1]) / 2)
  217. # if last_y < table_bbox_center[1] < current_y:
  218. # temp_table_bbox_list.append(table_bbox)
  219. # area_table_bbox_list.append(temp_table_bbox_list)
  220. #
  221. # # for j in range(len(area_text_bbox_list)):
  222. # # print("area_text_bbox_list", j, area_text_bbox_list[j])
  223. #
  224. # # 对每个区域分别进行两个bbox匹配,生成表格
  225. # area_text_list = []
  226. # area_column_list = []
  227. # for j in range(len(area_text_bbox_list)):
  228. # # 每个区域的table bbox 和text bbox
  229. # temp_table_bbox_list = area_table_bbox_list[j]
  230. # temp_text_bbox_list = area_text_bbox_list[j]
  231. #
  232. # # 判断该区域有无表格bbox
  233. # # 若无表格,将该区域文字连接
  234. # if not temp_table_bbox_list:
  235. # # 找出该区域的所有text bbox
  236. # only_text_list = []
  237. # only_bbox_list = []
  238. # for text_bbox in temp_text_bbox_list:
  239. # only_text_list.append(text_bbox[2])
  240. # only_bbox_list.append([text_bbox[0], text_bbox[1]])
  241. # only_text = get_sequential_data(only_text_list, only_bbox_list, True)
  242. # if only_text == [-1]:
  243. # return [-1], [-1]
  244. # area_text_list.append(only_text)
  245. # area_column_list.append(0)
  246. # continue
  247. #
  248. # # 有表格
  249. # # 文本对应的表格格子
  250. # text_in_table = {}
  251. # for i in range(len(temp_text_bbox_list)):
  252. # text_bbox = temp_text_bbox_list[i]
  253. #
  254. # # 计算 text bbox 中心点
  255. # text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
  256. # (text_bbox[1][1] + text_bbox[0][1]) / 2)
  257. #
  258. # # 判断中心点在哪个table bbox中
  259. # for table_bbox in temp_table_bbox_list:
  260. # # 中心点在table bbox中,将text写入字典
  261. # if table_bbox[0][0] <= text_bbox_center[0] <= table_bbox[1][0] and \
  262. # table_bbox[0][1] <= text_bbox_center[1] <= table_bbox[1][1]:
  263. # if str(table_bbox) in text_in_table.keys():
  264. # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
  265. # else:
  266. # text_in_table[str(table_bbox)] = text_bbox[2]
  267. # break
  268. #
  269. # # 如果未找到text bbox匹配的table bbox,加大threshold匹配
  270. # # elif (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
  271. # # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]) or \
  272. # # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
  273. # # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
  274. # # (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
  275. # # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
  276. # # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
  277. # # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]):
  278. # # if str(table_bbox) in text_in_table.keys():
  279. # # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
  280. # # else:
  281. # # text_in_table[str(table_bbox)] = text_bbox[2]
  282. # # break
  283. #
  284. # # 对表格格子进行分行分列,并计算总计多少小列
  285. # # 放入坐标
  286. # all_col_list = []
  287. # all_row_list = []
  288. # for i in range(len(temp_table_bbox_list)):
  289. # table_bbox = temp_table_bbox_list[i]
  290. #
  291. # # 放入所有坐标x
  292. # if table_bbox[0][0] not in all_col_list:
  293. # all_col_list.append(table_bbox[0][0])
  294. # if table_bbox[1][0] not in all_col_list:
  295. # all_col_list.append(table_bbox[1][0])
  296. #
  297. # # 放入所有坐标y
  298. # if table_bbox[0][1] not in all_row_list:
  299. # all_row_list.append(table_bbox[0][1])
  300. # if table_bbox[1][1] not in all_row_list:
  301. # all_row_list.append(table_bbox[1][1])
  302. # all_col_list.sort(key=lambda x: x)
  303. # all_row_list.sort(key=lambda x: x)
  304. #
  305. # # 分行
  306. # row_list = []
  307. # rows = []
  308. # temp_table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0], x[1][1], x[1][0]))
  309. # y_row = temp_table_bbox_list[0][0][1]
  310. # for i in range(len(temp_table_bbox_list)):
  311. # table_bbox = temp_table_bbox_list[i]
  312. #
  313. # if y_row - threshold <= table_bbox[0][1] <= y_row + threshold:
  314. # rows.append(table_bbox)
  315. # else:
  316. # y_row = table_bbox[0][1]
  317. # if rows:
  318. # rows.sort(key=lambda x: x[0][0])
  319. # row_list.append(rows)
  320. # rows = []
  321. # rows.append(table_bbox)
  322. # # print("*" * 30)
  323. # # print(row_list)
  324. #
  325. # if i == len(temp_table_bbox_list) - 1:
  326. # if rows:
  327. # rows.sort(key=lambda x: x[0][0])
  328. # row_list.append(rows)
  329. #
  330. # # 生成表格,包括文字和格子宽度
  331. # area_column = []
  332. # text = '<table border="1">' + "\n"
  333. # for row in row_list:
  334. # text += "<tr>" + "\n"
  335. # for col in row:
  336. # # 计算bbox y坐标之间有多少其他点,+1即为所占行数
  337. # row_span = 1
  338. # for y in all_row_list:
  339. # if col[0][1] < y < col[1][1]:
  340. # if y - col[0][1] >= 2 and col[1][1] - y >= 2:
  341. # row_span += 1
  342. #
  343. # # 计算bbox x坐标之间有多少其他点,+1即为所占列数
  344. # col_span = 1
  345. # for x in all_col_list:
  346. # if col[0][0] < x < col[1][0]:
  347. # if x - col[0][0] >= 2 and col[1][0] - x >= 2:
  348. # col_span += 1
  349. #
  350. # text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
  351. #
  352. # if str(col) in text_in_table.keys():
  353. # text += text_in_table.get(str(col))
  354. # else:
  355. # text += ""
  356. # text += "</td>" + "\n"
  357. # text += "</tr>" + "\n"
  358. # text += "</table>" + "\n"
  359. #
  360. # # 计算最大column
  361. # max_col_num = 0
  362. # for row in row_list:
  363. # col_num = 0
  364. # for col in row:
  365. # col_num += 1
  366. # if max_col_num < col_num:
  367. # max_col_num = col_num
  368. #
  369. # area_text_list.append(text)
  370. # area_column_list.append(max_col_num)
  371. #
  372. # text = ""
  373. # if get_platform() == "Windows":
  374. # print("get_formatted_table area_text_list", area_text_list)
  375. # for area_text in area_text_list:
  376. # text += area_text
  377. # return text, area_column_list
  378. # except Exception as e:
  379. # logging.info("get_formatted_table error!")
  380. # print("get_formatted_table", traceback.print_exc())
  381. # return [-1], [-1]
  382. def rename_inner_files(root_path):
  383. try:
  384. logging.info("into rename_inner_files")
  385. # 获取解压文件夹下所有文件+文件夹,不带根路径
  386. path_list = []
  387. for root, dirs, files in os.walk(root_path, topdown=False):
  388. for name in dirs:
  389. p = os.path.join(root, name) + os.sep
  390. if get_platform() == "Windows":
  391. root_path = slash_replace(root_path)
  392. p = slash_replace(p)
  393. p = re.sub(root_path, "", p)
  394. root_path = slash_replace(root_path, True)
  395. p = slash_replace(p, True)
  396. else:
  397. p = re.sub(root_path, "", p)
  398. path_list.append(p)
  399. for name in files:
  400. p = os.path.join(root, name)
  401. if get_platform() == "Windows":
  402. root_path = slash_replace(root_path)
  403. p = slash_replace(p)
  404. p = re.sub(root_path, "", p)
  405. root_path = slash_replace(root_path, True)
  406. p = slash_replace(p, True)
  407. else:
  408. p = re.sub(root_path, "", p)
  409. path_list.append(p)
  410. # 按路径长度排序
  411. path_list.sort(key=lambda x: len(x), reverse=True)
  412. # 循环改名
  413. for old_path in path_list:
  414. # 按路径分隔符分割
  415. ss = old_path.split(os.sep)
  416. # 判断是否文件夹
  417. is_dir = 0
  418. file_type = ""
  419. if os.path.isdir(root_path + old_path):
  420. ss = ss[:-1]
  421. is_dir = 1
  422. else:
  423. if "." in old_path:
  424. file_type = "." + old_path.split(".")[-1]
  425. else:
  426. file_type = ""
  427. # 最后一级需要用hash改名
  428. new_path = ""
  429. # new_path = re.sub(ss[-1], str(hash(ss[-1])), old_path) + file_type
  430. current_level = 0
  431. for s in ss:
  432. # 路径拼接
  433. if current_level < len(ss) - 1:
  434. new_path += s + os.sep
  435. else:
  436. new_path += str(hash(s)) + file_type
  437. current_level += 1
  438. new_ab_path = root_path + new_path
  439. old_ab_path = root_path + old_path
  440. os.rename(old_ab_path, new_ab_path)
  441. # 重新获取解压文件夹下所有文件+文件夹
  442. new_path_list = []
  443. for root, dirs, files in os.walk(root_path, topdown=False):
  444. for name in dirs:
  445. new_path_list.append(os.path.join(root, name) + os.sep)
  446. for name in files:
  447. new_path_list.append(os.path.join(root, name))
  448. return new_path_list
  449. except:
  450. traceback.print_exc()
  451. return [-1]
  452. def judge_format(path):
  453. guess1 = mimetypes.guess_type(path)
  454. _type = None
  455. if guess1[0]:
  456. _type = guess1[0]
  457. else:
  458. guess2 = filetype.guess(path)
  459. if guess2:
  460. _type = guess2.mime
  461. if _type == "application/pdf":
  462. return "pdf"
  463. if _type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
  464. return "docx"
  465. if _type == "application/x-zip-compressed" or _type == "application/zip":
  466. return "zip"
  467. if _type == "application/x-rar-compressed" or _type == "application/rar":
  468. return "rar"
  469. if _type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
  470. return "xlsx"
  471. if _type == "application/msword":
  472. return "doc"
  473. if _type == "image/png":
  474. return "png"
  475. if _type == "image/jpeg":
  476. return "jpg"
  477. # 猜不到,返回None
  478. return None
  479. def draw_lines_plt(bboxes):
  480. import matplotlib.pyplot as plt
  481. plt.figure()
  482. for bbox in bboxes:
  483. x = [bbox[0],bbox[2]]
  484. y = [bbox[1],bbox[3]]
  485. plt.plot(x,y)
  486. plt.show()
  487. def slash_replace(_str, reverse=False):
  488. if reverse:
  489. _str = eval(repr(_str).replace('/', '\\\\'))
  490. else:
  491. _str = eval(repr(_str).replace('\\\\', '/'))
  492. return _str
  493. class LineTable:
  494. def recognize_table(self,list_textbox, list_line,sourceP_LB=True):
  495. self.list_line = list_line
  496. self.list_crosspoints = self.recognize_crosspoints(list_line)
  497. # 聚类
  498. cluster_crosspoints = []
  499. for _point in self.list_crosspoints:
  500. cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})
  501. while 1:
  502. _find = False
  503. new_cluster_crosspoints = []
  504. for l_point in cluster_crosspoints:
  505. _flag = False
  506. for l_n_point in new_cluster_crosspoints:
  507. line1 = l_point.get("lines")
  508. line2 = l_n_point.get("lines")
  509. if len(line1&line2) > 0:
  510. _find = True
  511. _flag = True
  512. l_n_point["lines"] = line1.union(line2)
  513. l_n_point["points"].extend(l_point["points"])
  514. if not _flag:
  515. new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
  516. cluster_crosspoints = new_cluster_crosspoints
  517. if not _find:
  518. break
  519. #need to sort to deal with the inner tables
  520. for clu_cp in cluster_crosspoints:
  521. points = clu_cp["points"]
  522. list_p = np.array([p["point"] for p in points])
  523. max_x = max(list_p[...,0])
  524. min_x = min(list_p[...,0])
  525. max_y = max(list_p[...,1])
  526. min_y = min(list_p[...,1])
  527. _area = (max_y-min_y)*(max_x-min_x)
  528. clu_cp["area"] = _area
  529. cluster_crosspoints.sort(key=lambda x:x["area"])
  530. list_l_rect = []
  531. for table_crosspoint in cluster_crosspoints:
  532. list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
  533. list_l_rect.append(list_rect)
  534. in_objs = set()
  535. list_tables = []
  536. for l_rect in list_l_rect:
  537. _ta = self.rect2table(list_textbox,l_rect,in_objs,sourceP_LB=sourceP_LB)
  538. if _ta:
  539. list_tables.append(_ta)
  540. # self._plot(list_line, list_textbox)
  541. return list_tables, in_objs, list_l_rect
  542. def recognize_table_by_rect(self, list_textbox, list_rect, margin=2):
  543. dump_margin = 5
  544. list_rect_tmp = []
  545. # 去重
  546. for _rect in list_rect:
  547. if (_rect.bbox[3]-_rect.bbox[1] < 10) or (abs(_rect.bbox[2]-_rect.bbox[0]) < 5):
  548. continue
  549. _find = False
  550. for _tmp in list_rect_tmp:
  551. for i in range(4):
  552. if abs(_rect.bbox[i]-_tmp.bbox[i]) < dump_margin:
  553. pass
  554. else:
  555. _find = False
  556. break
  557. if i == 3:
  558. _find = True
  559. if _find:
  560. break
  561. if not _find:
  562. list_rect_tmp.append(_rect)
  563. # print("=====",len(list_rect),len(list_rect_tmp))
  564. # print(list_rect_tmp)
  565. # from matplotlib import pyplot as plt
  566. # plt.figure()
  567. # for _rect in list_rect_tmp:
  568. # x0,y0,x1,y1 = _rect.bbox
  569. # plt.boxplot(_rect.bbox)
  570. # plt.show()
  571. cluster_rect = []
  572. for _rect in list_rect:
  573. _find = False
  574. for cr in cluster_rect:
  575. for cr_rect in cr:
  576. if abs((cr_rect.bbox[2]-cr_rect.bbox[0]+_rect.bbox[2]-_rect.bbox[0])-(max(cr_rect.bbox[2],_rect.bbox[2])-min(cr_rect.bbox[0],_rect.bbox[0])))<margin:
  577. _find = True
  578. cr.append(_rect)
  579. break
  580. elif abs((cr_rect.bbox[3]-cr_rect.bbox[1]+_rect.bbox[3]-_rect.bbox[1])-(max(cr_rect.bbox[3],_rect.bbox[3])-min(cr_rect.bbox[1],_rect.bbox[1])))<margin:
  581. _find = True
  582. cr.append(_rect)
  583. break
  584. if _find:
  585. break
  586. if not _find:
  587. cluster_rect.append([_rect])
  588. list_l_rect = cluster_rect
  589. in_objs = set()
  590. list_tables = []
  591. for l_rect in list_l_rect:
  592. _ta = self.rect2table(list_textbox,l_rect,in_objs)
  593. if _ta:
  594. list_tables.append(_ta)
  595. return list_tables,in_objs,list_l_rect
  596. def recognize_crosspoints(self, list_line,fixLine=True):
  597. list_crosspoints = []
  598. # print("lines num",len(list_line))
  599. def getMaxPoints(list_x,margin=5,reverse=False):
  600. clust_x = []
  601. for _x in list_x:
  602. _find = False
  603. for cx in clust_x:
  604. if abs(cx[0]-_x)<margin:
  605. _find = True
  606. cx.append(_x)
  607. break
  608. if not _find:
  609. clust_x.append([_x])
  610. clust_x.sort(key=lambda x:x,reverse=reverse)
  611. return clust_x[0][0],len(clust_x[0])
  612. for _i in range(len(list_line)):
  613. for _j in range(len(list_line)):
  614. line1 = list_line[_i].__dict__.get("bbox")
  615. line2 = list_line[_j].__dict__.get("bbox")
  616. exists,point = self.cross_point(line1,line2)
  617. if exists:
  618. list_crosspoints.append(point)
  619. if fixLine:
  620. #聚类
  621. cluster_crosspoints = []
  622. for _point in list_crosspoints:
  623. cluster_crosspoints.append({"lines":_point.get("lines"),"points":[_point]})
  624. while 1:
  625. _find = False
  626. new_cluster_crosspoints = []
  627. for l_point in cluster_crosspoints:
  628. _flag = False
  629. for l_n_point in new_cluster_crosspoints:
  630. line1 = l_point.get("lines")
  631. line2 = l_n_point.get("lines")
  632. if len(line1&line2)>0:
  633. _find = True
  634. _flag = True
  635. l_n_point["lines"] = line1.union(line2)
  636. l_n_point["points"].extend(l_point["points"])
  637. if not _flag:
  638. new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
  639. cluster_crosspoints = new_cluster_crosspoints
  640. if not _find:
  641. break
  642. list_crosspoints = []
  643. for list_cp in cluster_crosspoints:
  644. points = list_cp.get("points")
  645. l_lines = []
  646. for p in points:
  647. l_lines.extend(p.get("p_lines"))
  648. l_lines = list(set(l_lines))
  649. l_lines.sort(key=lambda x:x[0])
  650. min_x,_count = getMaxPoints([l[0] for l in l_lines],reverse=False)
  651. if _count<=2:
  652. min_x = None
  653. min_y,_count = getMaxPoints([l[1] for l in l_lines],reverse=False)
  654. if _count<2:
  655. min_y = None
  656. max_x,_count = getMaxPoints([l[2] for l in l_lines],reverse=True)
  657. if _count<=2:
  658. max_x = None
  659. max_y,_count = getMaxPoints([l[3] for l in l_lines],reverse=True)
  660. if _count<=2:
  661. max_y = None
  662. if min_x and min_y and max_x and max_y:
  663. points.sort(key=lambda x:x["point"][0])
  664. if abs(min_x-points[0]["point"][0])>30:
  665. _line = LTLine(1,(min_x,min_y),(min_x,max_y))
  666. list_line.append(_line)
  667. l_lines.append(_line.bbox)
  668. # print("add=====",_line.bbox)
  669. if abs(max_x-points[-1]["point"][0])>30:
  670. _line = LTLine(1,(max_x,min_y),(max_x,max_y))
  671. list_line.append(_line)
  672. l_lines.append(_line.bbox)
  673. # print("add=====1",_line.bbox)
  674. points.sort(key=lambda x:x["point"][1])
  675. if abs(min_y-points[0]["point"][1])>30:
  676. _line = LTLine(1,(min_x,min_y),(max_x,min_y))
  677. list_line.append(_line)
  678. l_lines.append(_line.bbox)
  679. # print("add=====2",_line.bbox)
  680. if abs(max_y-points[-1]["point"][1])>30:
  681. _line = LTLine(1,(min_x,max_y),(max_x,max_y))
  682. list_line.append(_line)
  683. l_lines.append(_line.bbox)
  684. # print("add=====2",_line.bbox)
  685. for _i in range(len(l_lines)):
  686. for _j in range(len(l_lines)):
  687. line1 = l_lines[_i]
  688. line2 = l_lines[_j]
  689. exists,point = self.cross_point(line1,line2)
  690. if exists:
  691. list_crosspoints.append(point)
  692. # from matplotlib import pyplot as plt
  693. # plt.figure()
  694. # for _line in l_lines:
  695. # x0,y0,x1,y1 = _line
  696. # plt.plot([x0,x1],[y0,y1])
  697. # for point in list_crosspoints:
  698. # plt.scatter(point.get("point")[0],point.get("point")[1])
  699. # plt.show()
  700. # print(list_crosspoints)
  701. # print("points num",len(list_crosspoints))
  702. return list_crosspoints
  def recognize_rect(self, _page):
      """Cluster the page's line crossings and build rectangles per cluster.

      Every ``LTLine`` found in ``_page._objs`` is handed to
      ``recognize_crosspoints``; the returned crossings are grouped so that
      crossings sharing at least one line key end up together, and each group
      is converted with ``crosspoint2rect``.

      :param _page: object exposing a ``_objs`` iterable of layout objects.
      :return: list holding the ``crosspoint2rect`` result of every cluster.
      """
      page_lines = [item for item in _page._objs if isinstance(item, (LTLine))]
      crossings = self.recognize_crosspoints(page_lines)

      # Clustering: start with one cluster per crossing ...
      clusters = [{"lines": cp.get("lines"), "points": [cp]} for cp in crossings]

      # ... and repeat merge passes until a pass performs no merge at all.
      while True:
          merged_any = False
          merged = []
          for candidate in clusters:
              absorbed = False
              # No early exit here: a candidate is folded into every already
              # collected cluster it shares a line with.
              for target in merged:
                  candidate_lines = candidate.get("lines")
                  target_lines = target.get("lines")
                  if len(candidate_lines & target_lines) > 0:
                      merged_any = True
                      absorbed = True
                      target["lines"] = candidate_lines.union(target_lines)
                      target["points"].extend(candidate["points"])
              if not absorbed:
                  merged.append({"lines": candidate.get("lines"),
                                 "points": candidate.get("points")})
          clusters = merged
          if not merged_any:
              break

      return [self.crosspoint2rect(cluster.get("points")) for cluster in clusters]
  def crosspoint2rect(self, list_crosspoint, margin=10):
      """Build de-duplicated cell rectangles from one cluster of crossings.

      Each crossing is a dict with ``"point"`` ([x, y]), ``"lines"`` (a set of
      line keys) and the distances ``"left"``/``"right"``/``"buttom"``.
      For every crossing with enough room to the right and towards
      ``"buttom"``, the next crossing along its row line and then the next
      crossing along that one's column line give the opposite corner.

      :param list_crosspoint: crossings belonging to one table.
      :param margin: minimum distance / minimum rectangle side length.
      :return: list of ``LTRect`` wider and taller than ``margin``, each
          distinct bbox (formatted to two decimals) kept once.
      """
      # Index the crossings by the key of every line they lie on.
      points_by_line = {}
      for crossing in list_crosspoint:
          for line_key in list(crossing.get("lines")):
              entry = points_by_line.setdefault(line_key, {"direct": None, "points": []})
              entry["points"].append(crossing)

      # Sort each line's crossings along the axis with the larger spread and
      # remember that axis as the line's direction.
      for entry in points_by_line.values():
          xs = [p.get("point")[0] for p in entry["points"]]
          ys = [p.get("point")[1] for p in entry["points"]]
          if max(xs) - min(xs) > max(ys) - min(ys):
              axis, entry["direct"] = 0, "row"
          else:
              axis, entry["direct"] = 1, "column"
          entry.get("points").sort(key=lambda p, axis=axis: p.get("point")[axis])

      def pick_line(crossing, unwanted):
          # First line key of the crossing, or the second one when the first
          # has the unwanted direction. The key order comes from a set.
          keys = list(crossing.get("lines"))
          chosen = keys[0]
          if points_by_line[chosen]["direct"] == unwanted:
              chosen = keys[1]
          return chosen

      rects = []
      for corner in list_crosspoint:
          if not (corner["buttom"] >= margin and corner["right"] >= margin):
              continue
          row_key = pick_line(corner, "column")
          right_corner = next(
              (p for p in points_by_line[row_key]["points"]
               if p["buttom"] >= margin and p["point"][0] > corner["point"][0]),
              None)
          if not right_corner:
              continue
          column_key = pick_line(right_corner, "row")
          far_corner = next(
              (p for p in points_by_line[column_key]["points"]
               if p["left"] >= margin and p["point"][1] > right_corner["point"][1]),
              None)
          if not far_corner:
              continue
          rects.append(LTRect(1, (corner["point"][0], corner["point"][1],
                                  far_corner["point"][0], far_corner["point"][1])))

      # Drop rectangles that are too small and repeated bboxes.
      unique_rects = []
      seen_keys = set()
      for rect in rects:
          key = "%.2f-%.2f-%.2f-%.2f" % rect.bbox
          rect_width = rect.bbox[2] - rect.bbox[0]
          rect_height = rect.bbox[3] - rect.bbox[1]
          if rect_width <= margin or rect_height <= margin:
              continue
          if key not in seen_keys:
              unique_rects.append(rect)
              seen_keys.add(key)
      return unique_rects
  def cross_point(self, line1, line2, segment=True, margin=2):
      """Intersect two lines given as ``(x0, y0, x1, y1)`` tuples.

      :param line1: first line.
      :param line2: second line.
      :param segment: when true, the intersection only counts if it lies
          inside the bounding range of both lines, widened by ``margin``.
      :param margin: tolerance used by the ``segment`` check.
      :return: ``(exists, info)``. ``info`` always carries ``"point"``
          ([x, y]), ``"left"``/``"right"``/``"top"``/``"buttom"`` (distances
          from the point to the extreme coordinates of the two lines; only
          filled when ``segment`` is true and the point is accepted, else 0),
          ``"lines"`` (set of the two line keys formatted to two decimals)
          and ``"p_lines"`` ([line1, line2]).
      """
      x1, y1, x2, y2 = line1
      x3, y3, x4, y4 = line2

      # Slope and intercept of each line; a vertical line has slope None.
      if (x2 - x1) == 0:
          k1, b1 = None, 0
      else:
          k1 = (y2 - y1) * 1.0 / (x2 - x1)
          b1 = y1 * 1.0 - x1 * k1 * 1.0
      if (x4 - x3) == 0:
          k2, b2 = None, 0
      else:
          k2 = (y4 - y3) * 1.0 / (x4 - x3)
          b2 = y3 * 1.0 - x3 * k2 * 1.0

      point_is_exist = False
      x = y = 0
      if k1 is None:
          if k2 is not None:
              x = x1
              y = k2 * x1 + b2
              point_is_exist = True
      elif k2 is None:
          x = x3
          y = k1 * x3 + b1
          # Fix: this branch used to compute the point without reporting it,
          # so the result depended on the argument order.
          point_is_exist = True
      elif k2 != k1:
          x = (b2 - b1) * 1.0 / (k1 - k2)
          y = k1 * x * 1.0 + b1 * 1.0
          point_is_exist = True

      left = right = top = buttom = 0
      if point_is_exist and segment:
          on_line1 = ((min(x1, x2) - margin) <= x <= (max(x1, x2) + margin)
                      and (min(y1, y2) - margin) <= y <= (max(y1, y2) + margin))
          on_line2 = on_line1 and (
              (min(x3, x4) - margin) <= x <= (max(x3, x4) + margin)
              and (min(y3, y4) - margin) <= y <= (max(y3, y4) + margin))
          if on_line2:
              # NOTE(review): these use x2/x4 and y2/y4 as the larger
              # coordinates, i.e. they assume normalised (x0<=x1, y0<=y1) input.
              left = abs(min(x1, x3) - x)
              right = abs(max(x2, x4) - x)
              top = abs(min(y1, y3) - y)
              buttom = abs(max(y2, y4) - y)
          else:
              point_is_exist = False

      line1_key = "%.2f-%.2f-%.2f-%.2f" % (x1, y1, x2, y2)
      line2_key = "%.2f-%.2f-%.2f-%.2f" % (x3, y3, x4, y4)
      return point_is_exist, {"point": [x, y], "left": left, "right": right,
                              "top": top, "buttom": buttom,
                              "lines": {line1_key, line2_key},
                              "p_lines": [line1, line2]}
  def unionTable(self, list_table, fixspan=True, margin=2):
      """Merge the cells of several tables into one table.

      :param list_table: iterable of tables; a table is a list of rows and a
          row is a list of cell dicts with at least a ``"bbox"``.
      :param fixspan: when true, a cell spanning several columns/rows is
          repeated (the same dict object) in each position it covers and its
          span is reset to 1.
      :param margin: tolerance passed to ``getspan``.
      :return: ``{"bbox": ..., "table": rows}`` or ``None`` when there is no
          cell at all.
      """
      # Flatten, keeping each cell object (by identity) only once.
      seen_ids = set()
      cells = []
      for table in list_table:
          for row in table:
              for cell in row:
                  if id(cell) in seen_ids:
                      continue
                  seen_ids.add(id(cell))
                  cells.append(cell)
      cells.sort(key=lambda c: c.get("bbox")[3])

      # Cluster by y1: a cell joins the first row whose first cell has a
      # bbox[3] less than 2 away, otherwise it opens a new row.
      rows = []
      for cell in cells:
          y1_edge = cell.get("bbox")[3]
          for row in rows:
              if abs(row[0].get("bbox")[3] - y1_edge) < 2:
                  row.append(cell)
                  break
          else:
              rows.append([cell])
      rows.sort(key=lambda r: r[0].get("bbox")[3], reverse=True)
      for row in rows:
          row.sort(key=lambda c: c.get("bbox")[0])

      # Every distinct x / y coordinate forms the grid used to count spans.
      set_x = set()
      set_y = set()
      for row in rows:
          for cell in row:
              x0, y0, x1, y1 = cell.get("bbox")
              set_x.update((x0, x1))
              set_y.update((y0, y1))
      if len(set_x) == 0 or len(set_y) == 0:
          return
      list_x = sorted(set_x)
      list_y = sorted(set_y, reverse=True)

      _table = []
      for row in rows:
          table_row = []
          for cell in row:
              x0, y0, x1, y1 = cell.get("bbox")
              table_row.append({"bbox": (x0, y0, x1, y1),
                                "rect": cell.get("rect"),
                                "rowspan": self.getspan(list_y, y0, y1, margin),
                                "columnspan": self.getspan(list_x, x0, x1, margin),
                                "text": cell.get("text", "")})
          _table.append(table_row)

      if fixspan:
          # Column spans: repeat the cell right after itself, span-1 times.
          for table_row in _table:
              pending = []
              for col, cell in enumerate(table_row):
                  span = cell.get("columnspan")
                  if span > 1:
                      cell["columnspan"] = 1
                      pending.extend({"index": col + 1, "cell": cell}
                                     for _ in range(1, span))
              # Insert from the right so earlier indexes stay valid.
              pending.sort(key=lambda item: item["index"], reverse=True)
              for item in pending:
                  table_row.insert(item["index"], item["cell"])
          # Row spans: repeat the cell at the same index in the rows below.
          for row_idx in range(len(_table)):
              table_row = _table[row_idx]
              for col in range(len(table_row)):
                  cell = table_row[col]
                  span = cell.get("rowspan")
                  if span > 1:
                      cell["rowspan"] = 1
                      for offset in range(1, span):
                          _table[row_idx + offset].insert(col, cell)

      first_bbox = _table[0][0].get("bbox")
      last_bbox = _table[-1][-1].get("bbox")
      return {"bbox": (first_bbox[0], first_bbox[1], last_bbox[2], last_bbox[3]),
              "table": _table}
  958. #获取点阵
  def getSpanLocation(self, _list, x0, x1, margin):
      """Return the entries of ``_list`` that fall inside a widened span.

      The span runs from the smaller to the larger of ``x0``/``x1`` and is
      extended by ``margin`` on both sides; entries keep their input order.
      """
      low, high = min(x0, x1), max(x0, x1)
      return [value for value in _list if low - margin <= value <= high + margin]
  # fixSpan: split cells whose "columnspan" / "rowspan" is greater than 1 into
  # single-span cells, working in place on _table (a list of rows of cell dicts).
  #   _table : list of rows; each cell dict has "bbox", "columnspan", "rowspan".
  #   list_x : grid x coordinates, queried through self.getSpanLocation.
  #   list_y : grid y coordinates, queried through self.getSpanLocation.
  # Nothing is returned; rows are modified through insert().
  # NOTE(review): this listing has lost its indentation, so it cannot be told
  # from here whether the span reset and the "for i in range(1, span)" loops
  # sit inside the "len(locations)==span+1" checks or after them - confirm
  # against the original file before restructuring.
  966. def fixSpan(self,_table,list_x,list_y):
  # First pass: column spans, row by row.
  967. for _line in _table:
  968. c_i = 0
  # A while loop is used because cells are inserted into _line while walking it.
  969. while c_i<len(_line):
  970. _cell = _line[c_i]
  971. if _cell.get("columnspan")>1:
  972. x0,y0,x1,y1 = _cell.get("bbox")
  973. _cospan = _cell.get("columnspan")
  # Grid x coordinates covered by the cell, with a fixed margin of 10.
  974. locations = self.getSpanLocation(list_x,x0,x1,10)
  975. if len(locations)==_cospan+1:
  # Shrink the original cell to the first grid column.
  976. _cell["bbox"] = (x0,y0,locations[1],y1)
  977. _cell["columnspan"] = 1
  978. #len(locations)==_colspan+1
  # One shallow copy of the cell per further grid column, inserted right after it.
  979. for i in range(1,_cospan):
  980. n_cell = {}
  981. n_cell.update(_cell)
  982. n_cell["bbox"] = (locations[i],y0,locations[i+1],y1)
  983. c_i += 1
  984. _line.insert(c_i,n_cell)
  985. c_i += 1
  # Second pass: row spans; copies go into the rows below at the same index.
  986. for l_i in range(len(_table)):
  987. _line = _table[l_i]
  988. c_i = 0
  989. while c_i<len(_line):
  990. _cell = _line[c_i]
  991. if _cell.get("rowspan")>1:
  992. x0,y0,x1,y1 = _cell.get("bbox")
  993. _rospan = _cell.get("rowspan")
  # Grid y coordinates covered by the cell, with a fixed margin of 10.
  994. locations = self.getSpanLocation(list_y,y0,y1,10)
  995. if len(locations)==_rospan+1:
  # Shrink the original cell to the first grid row.
  996. _cell["bbox"] = (x0,y0,x1,locations[1])
  997. _cell["rowspan"] = 1
  998. for i in range(1,_rospan):
  999. n_cell = {}
  1000. n_cell.update(_cell)
  # Only insert when the target row exists in the table.
  1001. if l_i+i<=len(_table)-1:
  1002. # print(len(_table),l_i+i)
  1003. n_cell["bbox"] = (x0,locations[i],x1,locations[i+1])
  1004. _table[l_i+i].insert(c_i,n_cell)
  1005. c_i += 1
  def fixRect(self, _table, list_x, list_y, sourceP_LB, margin):
      """Split spanning cells, then fill horizontal gaps with empty cells.

      ``fixSpan`` runs first. Afterwards every row gets an empty cell
      (``"text": ""`` with a fresh ``LTRect``) where the first cell does not
      start at ``list_x[0]``, where two neighbours are more than ``margin``
      apart, and where the last cell ends more than ``margin`` before
      ``list_x[-1]``. Rows are modified in place; nothing is returned.

      :param sourceP_LB: not used by this method.
      """
      self.fixSpan(_table, list_x, list_y)

      def blank_cell(bbox):
          # Empty filler cell covering bbox.
          return {"bbox": bbox,
                  "rect": LTRect(1, bbox),
                  "rowspan": self.getspan(list_y, bbox[1], bbox[3], margin),
                  "columnspan": self.getspan(list_x, bbox[0], bbox[2], margin),
                  "text": ""}

      for row in _table:
          inserts = []
          last = len(row) - 1
          for idx, cell in enumerate(row):
              box = cell["bbox"]
              # first cell missing
              # NOTE(review): this is an exact comparison while the other two
              # gap checks use margin - confirm that is intended.
              if idx == 0 and box[0] != list_x[0]:
                  inserts.append({"index": idx,
                                  "cell": blank_cell((list_x[0], box[1], box[0], box[3]))})
              # cell in the middle missing
              if idx < last:
                  next_box = row[idx + 1]["bbox"]
                  if box[0] == next_box[0] and box[2] == next_box[2]:
                      # Neighbour occupies the same column range: nothing to fill.
                      continue
                  if abs(box[2] - next_box[0]) > margin:
                      inserts.append({"index": idx + 1,
                                      "cell": blank_cell((box[2], box[1], next_box[0], box[3]))})
              # last cell missing
              if idx == last and abs(box[2] - list_x[-1]) > margin:
                  inserts.append({"index": idx + 1,
                                  "cell": blank_cell((box[2], box[1], list_x[-1], box[3]))})
          # Insert from the right so the recorded indexes stay valid.
          inserts.sort(key=lambda item: item["index"], reverse=True)
          for item in inserts:
              row.insert(item["index"], item["cell"])
  def feedText2table(self, _table, list_textbox, in_objs, sourceP_LB):
      """Append the text of each text box to the table cell it overlaps most.

      A text box is assigned to the cell with the highest ``getIOU`` score,
      provided the score exceeds 0.1 and the box is not yet in ``in_objs``;
      assigned boxes are added to ``in_objs``. Inside a cell, boxes are
      grouped into visual lines (vertical overlap with the middle third of
      the first box of the line), each line is ordered by x0, and the
      whitespace-stripped texts are appended to ``cell["text"]``.

      :param _table: list of rows of cell dicts with ``"bbox"`` and ``"text"``.
      :param list_textbox: objects exposing ``bbox`` and ``get_text()``.
      :param in_objs: set of already consumed text boxes (updated in place).
      :param sourceP_LB: passed as ``reverse`` to the vertical sorts.
      :return: None; cells are updated in place.
      """
      slots = [{"cell": cell, "inbox_textbox_list": []}
               for row in _table for cell in row]
      if not slots:
          # No cells: argmax over an empty score list would raise.
          return

      # find the suitable cell of the textbox
      for textbox in list_textbox:
          scores = [self.getIOU(textbox.bbox, slot["cell"]["bbox"]) for slot in slots]
          best = np.argmax(scores)
          if scores[best] > 0.1 and textbox not in in_objs:
              slots[best]["inbox_textbox_list"].append(textbox)
              in_objs.add(textbox)

      whitespace = re.compile(r"\s")
      matched = set()
      for slot in slots:
          cell = slot["cell"]
          boxes = slot["inbox_textbox_list"]
          # Split into lines according to vertical overlap.
          boxes.sort(key=lambda b: b.bbox[1], reverse=sourceP_LB)
          text_lines = []
          for i, box1 in enumerate(boxes):
              if box1 in matched:
                  continue
              # Middle third of the first box of the line.
              min_y1 = box1.bbox[1] + 1/3 * abs(box1.bbox[3] - box1.bbox[1])
              max_y1 = box1.bbox[3] - 1/3 * abs(box1.bbox[3] - box1.bbox[1])
              line = [[box1.get_text(), box1.bbox[0], box1.bbox[1],
                       box1.bbox[2], box1.bbox[3], min_y1, max_y1]]
              matched.add(box1)
              for box2 in boxes[i + 1:]:
                  if box2 in matched:
                      continue
                  if min_y1 <= box2.bbox[1] <= max_y1 or \
                          min_y1 <= box2.bbox[3] <= max_y1 or \
                          box2.bbox[1] <= min_y1 <= max_y1 <= box2.bbox[3]:
                      line.append([box2.get_text(), box2.bbox[0], box2.bbox[1],
                                   box2.bbox[2], box2.bbox[3], min_y1, max_y1])
                      matched.add(box2)
              line.sort(key=lambda item: item[1])
              text_lines.append(line)
          # Order the lines by the vertical centre of their first box.
          text_lines.sort(key=lambda ln: (ln[0][2] + ln[0][4]) / 2, reverse=sourceP_LB)
          if text_lines:
              cell["text"] += "".join(whitespace.sub('', item[0])
                                      for ln in text_lines for item in ln)
  def makeTableByRect(self, list_rect, margin, sourceP_LB):
      """Arrange rectangles into table rows and compute the coordinate grid.

      :param list_rect: objects with a ``bbox`` (x0, y0, x1, y1); the list is
          sorted in place.
      :param margin: row clustering threshold and ``getspan`` tolerance.
      :param sourceP_LB: truthy -> rows are keyed on ``bbox[3]`` and the
          vertical orderings are descending; falsy -> keyed on ``bbox[1]``
          and ascending.
      :return: ``(table, list_x, list_y)``, or ``None`` when there are no
          coordinates or at most one rectangle.
      """
      # Cluster by y: which bbox edge identifies a row.
      edge = 3 if sourceP_LB else 1

      list_rect.sort(key=lambda r: r.bbox[edge])
      rows = []
      for rect in list_rect:
          for row in rows:
              if abs(row[0].bbox[edge] - rect.bbox[edge]) < margin:
                  row.append(rect)
                  break
          else:
              rows.append([rect])

      # Collect spans: every distinct coordinate of every rectangle.
      set_x = set()
      set_y = set()
      for row in rows:
          for rect in row:
              x0, y0, x1, y1 = rect.bbox
              set_x.update((x0, x1))
              set_y.update((y0, y1))
      if len(set_x) == 0 or len(set_y) == 0:
          return
      if len(list_rect) <= 1:
          return

      list_x = sorted(set_x)
      list_y = sorted(set_y, reverse=sourceP_LB)

      rows.sort(key=lambda r: r[0].bbox[edge], reverse=sourceP_LB)
      for row in rows:
          row.sort(key=lambda r: r.bbox[0])

      # Drop every grid value closer than 5 to its predecessor in the
      # (still unfiltered) sorted list; the first value always stays.
      list_x = [value for pos, value in enumerate(list_x)
                if pos == 0 or not abs(value - list_x[pos - 1]) < 5]
      list_y = [value for pos, value in enumerate(list_y)
                if pos == 0 or not abs(value - list_y[pos - 1]) < 5]

      _table = []
      for row in rows:
          table_row = []
          for rect in row:
              x0, y0, x1, y1 = rect.bbox
              table_row.append({"bbox": (x0, y0, x1, y1),
                                "rect": rect,
                                "rowspan": self.getspan(list_y, y0, y1, margin),
                                "columnspan": self.getspan(list_x, x0, x1, margin),
                                "text": ""})
          _table.append(table_row)
      return _table, list_x, list_y
  def rect2table(self, list_textbox, list_rect, in_objs, margin=5, sourceP_LB=True):
      """Build a table from cell rectangles and fill it with text.

      Steps: ``makeTableByRect`` lays out the cells, ``feedText2table``
      assigns text, ``fixRect`` splits spans and adds missing cells, and a
      second ``feedText2table`` pass fills the cells created by ``fixRect``.

      :param list_textbox: text boxes to distribute over the cells.
      :param list_rect: cell rectangles of one table.
      :param in_objs: set of already consumed text boxes (updated in place).
      :param margin: tolerance forwarded to the helpers.
      :param sourceP_LB: orientation flag forwarded to the helpers.
      :return: ``{"bbox": (x0, y0, x1, y1), "table": rows}``, or ``None``
          when ``makeTableByRect`` yields no table.
      """
      built = self.makeTableByRect(list_rect, margin, sourceP_LB)
      if built is None:
          # makeTableByRect returns None for empty or single-rect input;
          # unpacking that used to raise TypeError here.
          return None
      _table, list_x, list_y = built

      self.feedText2table(_table, list_textbox, in_objs, sourceP_LB)
      self.fixRect(_table, list_x, list_y, sourceP_LB, margin)
      self.feedText2table(_table, list_textbox, in_objs, sourceP_LB)

      # Table bbox: first two coordinates of the first cell, last two of the last.
      first_bbox = _table[0][0].get("bbox")
      last_bbox = _table[-1][-1].get("bbox")
      table_bbox = (first_bbox[0], first_bbox[1], last_bbox[2], last_bbox[3])
      return {"bbox": table_bbox, "table": _table}
  def inbox(self, bbox0, bbox_g, text=""):
      """Return 1 when ``getIOU(bbox0, bbox_g)`` is above 0.2, otherwise 0.

      ``text`` is accepted but not used.
      """
      return 1 if self.getIOU(bbox0, bbox_g) > 0.2 else 0
  def getIOU(self, bbox0, bbox1):
      """Overlap ratio of two ``(x0, y0, x1, y1)`` boxes.

      Despite the name, the overlap area is divided by the area of the
      smaller box, not by the union. Returns 0 when the boxes do not overlap
      on both axes.
      """
      ax0, ay0, ax1, ay1 = bbox0[0], bbox0[1], bbox0[2], bbox0[3]
      bx0, by0, bx1, by1 = bbox1[0], bbox1[1], bbox1[2], bbox1[3]

      # Combined extent minus the sum of both extents: negative exactly when
      # the boxes overlap along that axis.
      width = abs(max(ax1, bx1) - min(ax0, bx0)) - (abs(ax1 - ax0) + abs(bx1 - bx0))
      height = abs(max(ay1, by1) - min(ay0, by0)) - (abs(ay1 - ay0) + abs(by1 - by0))
      if not (width < 0 and height < 0):
          return 0

      smaller_area = min(abs((ax1 - ax0) * (ay1 - ay0)),
                         abs((bx1 - bx0) * (by1 - by0)))
      return abs(width * height / smaller_area)
  def getspan(self, _list, x0, x1, margin):
      """Count how many grid intervals the range ``x0``..``x1`` covers.

      The result is the number of ``_list`` entries inside the range
      (ordered, widened by ``margin`` on both sides) minus one.
      """
      low, high = min(x0, x1), max(x0, x1)
      inside = sum(1 for value in _list if low - margin <= value <= high + margin)
      return inside - 1
  def _plot(self, list_line, list_textbox):
      """Debug helper: draw each bbox as a segment from (x0, y0) to (x1, y1).

      Lines are drawn twice (once via ``__dict__``, once via the ``bbox``
      attribute), then the text boxes, and the figure is shown.
      """
      from matplotlib import pyplot as plt

      def draw(bbox):
          x0, y0, x1, y1 = bbox
          plt.plot([x0, x1], [y0, y1])

      plt.figure()
      for line in list_line:
          draw(line.__dict__.get("bbox"))
      for line in list_line:
          draw(line.bbox)
      for textbox in list_textbox:
          draw(textbox.bbox)
      plt.show()
  def get_table_html(table):
      """Render a table (rows of cell dicts) as an HTML ``<table>`` string.

      Each cell contributes a ``<td>`` with its ``"columnspan"`` and
      ``"rowspan"`` values and its ``"text"`` inserted as is (not escaped).
      """
      pieces = ['<table border="1">']
      for row in table:
          pieces.append("<tr>")
          for cell in row:
              row_span = cell.get("rowspan")
              col_span = cell.get("columnspan")
              cell_text = cell.get("text")
              pieces.append("<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">")
              pieces.append(cell_text + "</td>")
          pieces.append("</tr>")
      pieces.append("</table>")
      return "".join(pieces)
  def sort_object(obj_list, is_reverse=False):
      """Combine sentences, then sort the list according to its first element.

      Lists starting with a ``_Table``/``_Sentence``/``_Image`` are sorted by
      ``(y, x)`` (optionally reversed); lists starting with a ``_Page`` are
      sorted by ``page_no``; anything else is returned as combined.
      """
      from format_convert.convert_tree import _Table, _Image, _Sentence, _Page

      obj_list = combine_object(obj_list)
      if len(obj_list) == 0:
          return obj_list

      head = obj_list[0]
      if isinstance(head, (_Table, _Sentence, _Image)):
          obj_list.sort(key=lambda obj: (obj.y, obj.x), reverse=is_reverse)
      elif isinstance(head, _Page):
          obj_list.sort(key=lambda obj: obj.page_no)
      return obj_list
  def combine_object(obj_list, threshold=5):
      """Merge ``_Sentence`` objects that sit on the same visual line.

      Whitespace is stripped from every sentence's ``content``. Sentences are
      ordered by ``(y, x)`` and each one whose ``y`` is within ``threshold``
      of its predecessor absorbs that predecessor (contents concatenated in x
      order, smaller ``x``/``y`` kept). ``obj_list`` is modified in place:
      all sentences are taken out and the merged ones appended at the end.

      :param obj_list: list of objects, possibly containing ``_Sentence``.
      :param threshold: maximum ``y`` distance for two sentences to merge.
      :return: the same ``obj_list`` object.
      """
      from format_convert.convert_tree import _Sentence

      sentence_list = [obj for obj in obj_list if isinstance(obj, _Sentence)]
      for sen in sentence_list:
          # Raw string: "\s" in a plain literal is an invalid escape sequence.
          sen.content = re.sub(r"\s", "", sen.content)
      sentence_list.sort(key=lambda s: (s.y, s.x))

      # Take the sentences out in one pass instead of repeated list.remove().
      obj_list[:] = [obj for obj in obj_list if not isinstance(obj, _Sentence)]

      kept = []
      for sen1, sen2 in zip(sentence_list, sentence_list[1:]):
          if abs(sen2.y - sen1.y) <= threshold:
              # sen2 absorbs sen1; sen2 is the predecessor in the next step,
              # so merges chain along a line.
              if sen2.x > sen1.x:
                  sen2.x = sen1.x
                  sen2.content = sen1.content + sen2.content
              else:
                  sen2.content = sen2.content + sen1.content
              if sen2.y > sen1.y:
                  sen2.y = sen1.y
          else:
              kept.append(sen1)
      if sentence_list:
          kept.append(sentence_list[-1])

      obj_list.extend(kept)
      return obj_list
  # One independent requests.Session per interface family (ocr, otr, others),
  # shared by request_post.
  session_ocr, session_otr, session_all = (requests.Session() for _ in range(3))
  def request_post(url, param, time_out=1000, use_zlib=False):
      """POST ``param`` to ``url`` once and return the response body.

      The request goes through ``session_ocr`` / ``session_otr`` when
      ``param["model_type"]`` is ``"ocr"`` / ``"otr"`` and through
      ``session_all`` otherwise. There is no retry: a non-200 status or an
      exception is printed and the default text is returned.

      :param url: target URL.
      :param param: mapping sent as form data.
      :param time_out: timeout passed to ``Session.post``.
      :param use_zlib: accepted for compatibility; not used.
      :return: ``result.text`` on HTTP 200, otherwise ``json.dumps([-2])``.
      """
      fails = 0
      text = json.dumps([-2])
      try:
          model_type = param.get("model_type")
          if model_type == "ocr":
              session = session_ocr
          elif model_type == "otr":
              session = session_otr
          else:
              session = session_all

          result = session.post(url, data=param, timeout=time_out)
          if result.status_code == 200:
              text = result.text
          else:
              print('result.status_code', result.status_code)
              print('result.text', result.text)
      except socket.timeout:
          fails += 1
          print('timeout! fail times:', fails)
      except Exception:
          # Exception instead of a bare except, so KeyboardInterrupt and
          # SystemExit are no longer swallowed.
          fails += 1
          print('fail! fail times:', fails)
          traceback.print_exc()
      return text
  def test_gpu():
      """Run paddle's installation self-check between two separator lines."""
      separator = "=" * 30
      print(separator)
      import paddle
      paddle.utils.run_check()
      # import tensorflow as tf
      # print("tf gpu", tf.config.list_physical_devices('GPU'))
      print(separator)
  def my_subprocess_call(*popenargs, timeout=None):
      """Run a child process, print its output, return ``(pid, returncode)``.

      Both pipes are drained together with ``communicate``. Reading stdout to
      EOF before touching stderr could block forever once the child filled
      the stderr pipe, and ``timeout`` only took effect after both pipes had
      closed. Output is now printed after the child exits, not while it runs.

      :param popenargs: positional arguments forwarded to ``Popen``.
      :param timeout: seconds to wait; the child is killed and
          ``subprocess.TimeoutExpired`` is raised when it is exceeded.
      :return: ``(p.pid, p.returncode)``.
      """
      from io import BytesIO

      logging.info("into my_subprocess_call")
      with Popen(*popenargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
          try:
              out, err = p.communicate(timeout=timeout)
              # Iterate through BytesIO to split lines like the pipe objects did.
              for line in BytesIO(out):
                  print("stdout", line)
              for line in BytesIO(err):
                  print("stderr", line)
              return p.pid, p.returncode
          except BaseException:  # Including KeyboardInterrupt and TimeoutExpired.
              p.kill()
              # We don't call p.wait() again as p.__exit__ does that for us.
              raise
          finally:
              logging.info("out my_subprocess_call")
  def parse_yaml():
      """Load ``interface.yml`` from this module's directory with the safe loader."""
      module_dir = os.path.dirname(os.path.abspath(__file__))
      with open(module_dir + "/interface.yml", "r", encoding='utf-8') as f:
          raw_config = f.read()
      return yaml.load(raw_config, Loader=yaml.SafeLoader)
  def get_ip_port(node_type=None, interface_type=None):
      """Read ``interface.yml`` and build ``{ip: {NODE: {setting: value}}}``.

      Per IP (index ``j`` in the node's ``ip`` list) and interface type:
      * ``convert``: ``port[j]`` repeated ``processes[j]`` times under
        ``"convert"``, plus ``"convert_processes"``;
      * ``path``: ``python[j]`` / ``project[j]`` / ``gunicorn[j]`` stored as
        ``python_path`` / ``project_path`` / ``gunicorn_path`` when all three
        are set;
      * anything else: ports ``port_start[j]`` .. ``port_start[j]+port_no[j]-1``
        as strings under the lower-cased interface name.

      :param node_type: ``"master"``/``"slave"``; both when None.
      :param interface_type: a single interface name; all five when None.
      :return: nested dict keyed by IP, then upper-cased node type.
      """
      node_type_list = ["master", "slave"] if node_type is None else [node_type]
      if interface_type is None:
          interface_type_list = ["convert", "ocr", "otr", "office", "path"]
      else:
          interface_type_list = [interface_type]

      ip_port_dict = {}
      params = parse_yaml()

      # Loop over master / slave.
      for node_name in node_type_list:
          node_key = node_name.upper()
          ip_list = params.get(node_key).get("ip")

          # Loop over the IPs of this node type.
          for j, _ip in enumerate(ip_list):
              if ip_port_dict.get(_ip):
                  ip_port_dict.get(_ip).update({node_key: {}})
              else:
                  ip_port_dict.update({_ip: {node_key: {}}})
              node_entry = ip_port_dict.get(_ip).get(node_key)

              # For this IP, loop over the interface types.
              for iface_name in interface_type_list:
                  python_path = project_path = gunicorn_path = None
                  processes = 0
                  port_list = []
                  iface_key = iface_name.upper()

                  if iface_key == "convert".upper():
                      section = params.get(node_key).get(iface_key)
                      _port = section.get("port")
                      if _port is not None:
                          processes = section.get("processes")[j]
                          port_list = [str(_port[j])] * int(processes)
                  elif iface_key == "path".upper():
                      section = params.get(node_key).get(iface_key)
                      python_path = section.get("python")[j]
                      project_path = section.get("project")[j]
                      gunicorn_path = section.get("gunicorn")[j]
                  else:
                      section = params.get(node_key).get(iface_key)
                      port_start = section.get("port_start")
                      port_no = section.get("port_no")
                      if port_start is not None and port_no is not None:
                          port_list = [str(port) for port in
                                       range(port_start[j], port_start[j] + port_no[j], 1)]

                  # Store whatever was found for this interface.
                  if port_list:
                      node_entry.update({iface_key.lower(): port_list})
                  if processes:
                      node_entry.update({iface_key.lower() + "_processes": processes})
                  if project_path and python_path and gunicorn_path:
                      node_entry.update({"project_path": project_path,
                                         "python_path": python_path,
                                         "gunicorn_path": gunicorn_path})
      return ip_port_dict
  def get_ip_port_old(node_type=None, interface_type=None):
      """Older variant of ``get_ip_port`` for configs with scalar settings.

      Every setting of a node type applies to all IPs of that node type and
      the result is ``{ip: {setting: value}}`` without a node-type level.

      :param node_type: ``"master"``/``"slave"``; both when None.
      :param interface_type: a single interface name; all five when None.
      :return: dict keyed by IP.
      """
      node_type_list = ["master", "slave"] if node_type is None else [node_type]
      if interface_type is None:
          interface_type_list = ["convert", "ocr", "otr", "office", "path"]
      else:
          interface_type_list = [interface_type]

      ip_port_dict = {}
      params = parse_yaml()
      for node_name in node_type_list:
          node_key = node_name.upper()
          ip_list = params.get(node_key).get("ip")

          for iface_name in interface_type_list:
              iface_key = iface_name.upper()
              processes = 0
              python_path = None
              project_path = None
              # Reset per interface: the "path" branch sets no ports, so it
              # used to reuse the previous interface's list (or hit an
              # unbound name when asked for "path" alone).
              port_list = []

              section = params.get(node_key).get(iface_key)
              if iface_key == "convert".upper():
                  _port = section.get("port")
                  if _port is not None:
                      processes = section.get("processes")
                      port_list = [str(_port)] * int(processes)
              elif iface_key == "path".upper():
                  python_path = section.get("python")
                  project_path = section.get("project")
              else:
                  port_start = section.get("port_start")
                  port_no = section.get("port_no")
                  if port_start is not None and port_no is not None:
                      port_list = [str(port) for port in
                                   range(port_start, port_start + port_no, 1)]

              for _ip in ip_list or []:
                  if _ip is None:
                      continue
                  # setdefault: the entry may not exist yet when only paths
                  # are configured; .get(_ip).update() failed on None then.
                  if port_list:
                      ip_port_dict.setdefault(_ip, {}).update({iface_key.lower(): port_list})
                  if processes:
                      ip_port_dict.setdefault(_ip, {}).update(
                          {iface_key.lower() + "_processes": processes})
                  if project_path and python_path:
                      ip_port_dict.setdefault(_ip, {}).update({"project_path": project_path,
                                                               "python_path": python_path})
      return ip_port_dict
  def get_intranet_ip():
      """Return the local IP address used for outbound traffic.

      A UDP socket is connected to 8.8.8.8:80 and the address the OS bound it
      to is read back with ``getsockname``. Errors from the socket calls
      propagate to the caller.
      """
      # The with-block closes the socket; the old try/finally referenced an
      # unbound name when socket creation itself failed.
      with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
          s.connect(("8.8.8.8", 80))
          # getsockname() returns (ip, port) for AF_INET; only the ip is needed.
          return s.getsockname()[0]
  def memory_decorator(func):
      """Decorator logging process memory (RSS in GB) and time around a call.

      On Windows (``get_platform() == "Windows"``) the function is called
      without any logging.
      """
      def _rss_gb():
          # Resident set size of this process in GB (psutil, not the
          # Linux-only resource module).
          return psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024

      def _report(prefix, usage, start_time):
          logging.info(prefix + func.__qualname__
                       + " - " + str(os.getpid())
                       + " - " + str(round(usage, 2)) + " GB"
                       + " - " + str(round(time.time()-start_time, 2)) + " sec")

      @wraps(func)
      def get_memory_info(*args, **kwargs):
          if get_platform() == "Windows":
              return func(*args, **kwargs)

          usage = _rss_gb()
          start_time = time.time()
          _report("----- memory info start - ", usage, start_time)

          result = func(*args, **kwargs)

          usage = _rss_gb()
          _report("----- memory info end - ", usage, start_time)
          return result

      return get_memory_info
  1554. def log(msg):
  1555. call_func_name = inspect.currentframe().f_back.f_code.co_name
  1556. logger = get_logger(call_func_name, {"md5": _global.get("md5"),
  1557. "port": _global.get("port")})
  1558. logger.info(msg)
  1559. # logging.info(msg)
  1560. def get_logger(_name, _dict):
  1561. extra = _dict
  1562. _format = '%(asctime)s - %(name)s - %(levelname)s - %(md5)s - %(port)s - %(message)s'
  1563. logger = logging.getLogger(_name)
  1564. create_new_flag = 1
  1565. handlers = logger.handlers
  1566. if handlers:
  1567. for h in handlers:
  1568. if h.formatter.__dict__.get("_fmt") == _format:
  1569. create_new_flag = 0
  1570. break
  1571. if create_new_flag:
  1572. formatter = logging.Formatter(_format)
  1573. handler = logging.StreamHandler()
  1574. handler.setFormatter(formatter)
  1575. logger.addHandler(handler)
  1576. logger.setLevel(logging.INFO)
  1577. logger.propagate = False
  1578. logger = logging.LoggerAdapter(logger, extra)
  1579. return logger
  1580. def set_flask_global():
  1581. # 接口轮询所需锁、参数
  1582. ip_port_flag = {}
  1583. ip_flag = []
  1584. ip_port_dict = get_ip_port()
  1585. for _k in ip_port_dict.keys():
  1586. ip_port_flag.update({_k: {"ocr": 0,
  1587. "otr": 0,
  1588. "convert": 0,
  1589. "office": 0
  1590. }})
  1591. if ip_port_dict.get(_k).get("MASTER"):
  1592. ip_flag.append([_k+"_master", 0])
  1593. if ip_port_dict.get(_k).get("SLAVE"):
  1594. ip_flag.append([_k+"_slave", 0])
  1595. _global.update({"ip_port_flag": ip_port_flag})
  1596. _global.update({"ip_port": ip_port_dict})
  1597. _global.update({"ip_flag": ip_flag})
  1598. # print(globals().get("ip_port"))
  1599. def get_md5_from_bytes(_bytes):
  1600. def generate_fp(_b):
  1601. bio = BytesIO()
  1602. bio.write(_b)
  1603. return bio
  1604. _length = 0
  1605. try:
  1606. _md5 = hashlib.md5()
  1607. ff = generate_fp(_bytes)
  1608. ff.seek(0)
  1609. while True:
  1610. data = ff.read(4096)
  1611. if not data:
  1612. break
  1613. _length += len(data)
  1614. _md5.update(data)
  1615. return _md5.hexdigest(), _length
  1616. except Exception as e:
  1617. traceback.print_exc()
  1618. return None, _length
  1619. # def to_share_memory(np_data, name=None):
  1620. # # from multiprocessing.resource_tracker import unregister
  1621. # from multiprocessing import shared_memory
  1622. # if name is None:
  1623. # sm_name = "psm_" + str(os.getpid())
  1624. # else:
  1625. # sm_name = name
  1626. # logging.info("into from_share_memory sm_name " + sm_name)
  1627. # shm = shared_memory.SharedMemory(name=sm_name, create=True, size=np_data.nbytes)
  1628. # # unregister(sm_name, 'shared_memory')
  1629. # sm_data = np.ndarray(np_data.shape, dtype=np_data.dtype, buffer=shm.buf)
  1630. # sm_data[:] = np_data[:] # Copy the original data into shared memory
  1631. #
  1632. # shm.close()
  1633. # del sm_data
  1634. # return shm
  1635. # def from_share_memory(sm_name, _shape, _dtype, if_close=True):
  1636. # from multiprocessing import shared_memory
  1637. # logging.info("into from_share_memory sm_name " + sm_name)
  1638. # shm = shared_memory.SharedMemory(name=sm_name, create=False)
  1639. # b = np.ndarray(_shape, dtype=_dtype, buffer=shm.buf)
  1640. # sm_data = copy.deepcopy(b)
  1641. # b[::] = 0
  1642. #
  1643. # if if_close:
  1644. # try:
  1645. # shm.close()
  1646. # shm.unlink()
  1647. # except Exception:
  1648. # log("file not found! " + sm_name)
  1649. # return sm_data
  1650. # def get_share_memory(sm_name):
  1651. # try:
  1652. # from multiprocessing import shared_memory
  1653. # shm = shared_memory.SharedMemory(name=sm_name, create=False)
  1654. # return shm
  1655. # except:
  1656. # return None
  1657. # def release_share_memory(shm):
  1658. # try:
  1659. # if shm is None:
  1660. # return
  1661. # shm.close()
  1662. # shm.unlink()
  1663. # log(str(shm.name) + " release successfully!")
  1664. # except FileNotFoundError:
  1665. # log(str(shm.name) + " has released!")
  1666. # except Exception as e:
  1667. # traceback.print_exc()
  1668. # def get_share_memory_list(sm_list_name, list_size=None):
  1669. # # from multiprocessing.resource_tracker import unregister
  1670. # from multiprocessing import shared_memory
  1671. # if list_size is None:
  1672. # sm_list = shared_memory.ShareableList(name=sm_list_name)
  1673. # else:
  1674. # sm_list = shared_memory.ShareableList(name=sm_list_name, sequence=["0"]+[' '*2048]*(list_size-2)+["0"])
  1675. # # unregister(sm_list_name, 'shared_memory')
  1676. # return sm_list
  1677. # def close_share_memory_list(sm_list):
  1678. # try:
  1679. # sm_list.shm.close()
  1680. # except Exception:
  1681. # traceback.print_exc()
  1682. def get_np_type(_str):
  1683. _dtype = None
  1684. if _str == 'uint8':
  1685. _dtype = np.uint8
  1686. elif _str == 'float16':
  1687. _dtype = np.float16
  1688. elif _str == 'float32':
  1689. _dtype = np.float32
  1690. logging.info("get_np_type " + _str + " " + str(_dtype))
  1691. return _dtype
  1692. def namespace_to_dict(agrs_or_dict, reverse=False):
  1693. if reverse:
  1694. agrs_or_dict = argparse.Namespace(**agrs_or_dict)
  1695. else:
  1696. agrs_or_dict = vars(agrs_or_dict)
  1697. return agrs_or_dict
  1698. def get_args_from_config(ip_port_dict, ip, arg_type, node_type=None):
  1699. if node_type is None:
  1700. node_type = ["MASTER", "SLAVE"]
  1701. else:
  1702. node_type = [node_type]
  1703. arg_list = []
  1704. for _type in node_type:
  1705. if ip_port_dict.get(ip).get(_type):
  1706. if ip_port_dict.get(ip).get(_type).get(arg_type):
  1707. arg_list.append(ip_port_dict.get(ip).get(_type).get(arg_type))
  1708. return arg_list
  1709. def remove_red_seal(image_np):
  1710. """
  1711. 去除红色印章
  1712. """
  1713. cv2.namedWindow("image_np", 0)
  1714. cv2.resizeWindow("image_np", 1000, 800)
  1715. cv2.imshow("image_np", image_np)
  1716. height, width, c = image_np.shape
  1717. window_h = int(height / 15)
  1718. image_hsv = cv2.cvtColor(image_np, cv2.COLOR_BGR2HSV)
  1719. # 遍历numpy
  1720. red_point_list = []
  1721. image_list = image_np.tolist()
  1722. hsv_dict = {}
  1723. for index_1 in range(len(image_list)):
  1724. for index_2 in range(len(image_list[index_1])):
  1725. h, s, v = image_hsv[index_1][index_2]
  1726. if (0 <= h <= 10 or 156 <= h <= 180) and 43 <= s <= 255 and 46 <= v <= 255:
  1727. key = str(image_hsv[index_1][index_2].tolist())
  1728. red_point_list.append([key, index_1, index_2])
  1729. if hsv_dict.get(key):
  1730. hsv_dict[key] += 1
  1731. else:
  1732. hsv_dict[key] = 1
  1733. # 找出相同最多的hsv值
  1734. hsv_most_key = None
  1735. hsv_most_value = 0
  1736. for hsv in hsv_dict.keys():
  1737. if hsv_dict.get(hsv) > hsv_most_value:
  1738. hsv_most_value = hsv_dict.get(hsv)
  1739. hsv_most_key = hsv
  1740. # print(hsv_dict)
  1741. # 根据hsv判断其填充为黑色还是白色
  1742. hsv_most_key = eval(hsv_most_key)
  1743. for point in red_point_list:
  1744. if abs(eval(point[0])[2] - hsv_most_key[2]) <= 70:
  1745. image_np[point[1]][point[2]][0] = 255
  1746. image_np[point[1]][point[2]][1] = 255
  1747. image_np[point[1]][point[2]][2] = 255
  1748. else:
  1749. image_np[point[1]][point[2]][0] = 0
  1750. image_np[point[1]][point[2]][1] = 0
  1751. image_np[point[1]][point[2]][2] = 0
  1752. cv2.namedWindow("remove_red_seal", 0)
  1753. cv2.resizeWindow("remove_red_seal", 1000, 800)
  1754. cv2.imshow("remove_red_seal", image_np)
  1755. # cv2.imwrite("C:/Users/Administrator/Downloads/1.png", image_np)
  1756. cv2.waitKey(0)
  1757. return image_np
  1758. if __name__ == "__main__":
  1759. # strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
  1760. # print(slash_replace(strs))
  1761. # from matplotlib import pyplot as plt
  1762. # import random
  1763. # fig = plt.figure()
  1764. # plt.xlim(100)
  1765. # plt.ylim(100)
  1766. # fig.add_subplot(111)
  1767. # x0,y0,x1,y1 = (1,2,3,4)
  1768. # plt.gca().add_patch(plt.Rectangle(xy=(x0, y0),
  1769. # width=x1-x0,
  1770. # height=y1-y0,
  1771. # edgecolor=(random.randint(0,255)/255,random.randint(0,255)/255,random.randint(0,255)/255),
  1772. # fill=False, linewidth=2))
  1773. #
  1774. # # plt.show()
  1775. # import cv2
  1776. # import numpy as np
  1777. # img = np.zeros(shape=(1800,1800),dtype=np.uint8)
  1778. # img += 255
  1779. # cv2.imshow("bbox", img)
  1780. # cv2.waitKey(0)
  1781. # print(json.dumps({"data":[1, 2]}))
  1782. # print(parse_yaml())
  1783. # print(get_ip_port())
  1784. set_flask_global()
  1785. # print(get_args_from_config(get_ip_port(), "http://127.0.0.1", "gunicorn_path"))
  1786. # print(get_intranet_ip())
  1787. # _path = "C:/Users/Administrator/Downloads/3.png"
  1788. # remove_red_seal(cv2.imread(_path))