utils.py 58 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586
  1. import hashlib
  2. import inspect
  3. import json
  4. import os
  5. import socket
  6. import subprocess
  7. import sys
  8. from io import BytesIO
  9. from subprocess import Popen
  10. import requests
  11. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  12. import difflib
  13. import logging
  14. import mimetypes
  15. import platform
  16. import re
  17. import traceback
  18. import filetype
  19. from bs4 import BeautifulSoup
  20. import yaml
  21. from pdfminer.layout import *
  22. from format_convert import _global
  23. def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9]):
  24. """
  25. [0] : continue
  26. [-1]: 逻辑处理错误
  27. [-2]: 接口调用错误
  28. [-3]: 文件格式错误,无法打开
  29. [-4]: 各类文件调用第三方包读取超时
  30. [-5]: 整个转换过程超时
  31. [-6]: 阿里云UDF队列超时
  32. [-7]: 文件需密码,无法打开
  33. [-8]: 调用现成接口报错
  34. [-9]: 接口接收数据为空
  35. """
  36. for c in code:
  37. if _list == [c]:
  38. return True
  39. return False
  40. def add_div(text):
  41. if text == "" or text is None:
  42. return text
  43. # if get_platform() == "Windows":
  44. # print("add_div", text)
  45. if re.findall("<div>", text):
  46. return text
  47. text = "<div>" + text + "\n"
  48. text = re.sub("\n", "</div>\n<div>", text)
  49. # text += "</div>"
  50. if text[-5:] == "<div>":
  51. # print("add_div has cut", text[-30:])
  52. text = text[:-5]
  53. return text
  54. def get_platform():
  55. sys = platform.system()
  56. return sys
  57. def get_html_p(html_path):
  58. logging.info("into get_html_p")
  59. try:
  60. with open(html_path, "r") as ff:
  61. html_str = ff.read()
  62. soup = BeautifulSoup(html_str, 'lxml')
  63. text = ""
  64. for p in soup.find_all("p"):
  65. p_text = p.text
  66. p_text = p_text.strip()
  67. if p.string != "":
  68. text += p_text
  69. text += "\n"
  70. return text
  71. except Exception as e:
  72. logging.info("get_html_p error!")
  73. print("get_html_p", traceback.print_exc())
  74. return [-1]
  75. def string_similarity(str1, str2):
  76. # 去掉<div>和回车
  77. str1 = re.sub("<div>", "", str1)
  78. str1 = re.sub("</div>", "", str1)
  79. str1 = re.sub("\n", "", str1)
  80. str2 = re.sub("<div>", "", str2)
  81. str2 = re.sub("</div>", "", str2)
  82. str2 = re.sub("\n", "", str2)
  83. # print("********************************")
  84. # print("str1", str1)
  85. # print("********************************")
  86. # print("str2", str2)
  87. # print("********************************")
  88. score = difflib.SequenceMatcher(None, str1, str2).ratio()
  89. print("string_similarity", score)
  90. return score
  91. def get_sequential_data(text_list, bbox_list, html=False):
  92. logging.info("into get_sequential_data")
  93. try:
  94. text = ""
  95. order_list = []
  96. for i in range(len(text_list)):
  97. length_start = bbox_list[i][0][0]
  98. length_end = bbox_list[i][1][0]
  99. height_start = bbox_list[i][0][1]
  100. height_end = bbox_list[i][-1][1]
  101. # print([length_start, length_end, height_start, height_end])
  102. order_list.append([text_list[i], length_start, length_end, height_start, height_end])
  103. # text = text + infomation['text'] + "\n"
  104. if get_platform() == "Windows":
  105. print("get_sequential_data", order_list)
  106. if not order_list:
  107. if get_platform() == "Windows":
  108. print("get_sequential_data", "no order list")
  109. return ""
  110. # 根据bbox的坐标对输出排序
  111. order_list.sort(key=lambda x: (x[3], x[1], x[0]))
  112. # 根据bbox分行分列
  113. # col_list = []
  114. # height_end = int((order_list[0][4] + order_list[0][3]) / 2)
  115. # for i in range(len(order_list)):
  116. # if height_end - threshold <= order_list[i][3] <= height_end + threshold:
  117. # col_list.append(order_list[i])
  118. # else:
  119. # row_list.append(col_list)
  120. # col_list = []
  121. # height_end = int((order_list[i][4] + order_list[i][3]) / 2)
  122. # col_list.append(order_list[i])
  123. # if i == len(order_list) - 1:
  124. # row_list.append(col_list)
  125. row_list = []
  126. used_box = []
  127. threshold = 5
  128. for box in order_list:
  129. if box in used_box:
  130. continue
  131. height_center = (box[4] + box[3]) / 2
  132. row = []
  133. for box2 in order_list:
  134. if box2 in used_box:
  135. continue
  136. height_center2 = (box2[4] + box2[3]) / 2
  137. if height_center - threshold <= height_center2 <= height_center + threshold:
  138. if box2 not in row:
  139. row.append(box2)
  140. used_box.append(box2)
  141. row.sort(key=lambda x: x[0])
  142. row_list.append(row)
  143. for row in row_list:
  144. if not row:
  145. continue
  146. if len(row) <= 1:
  147. text = text + row[0][0] + "\n"
  148. else:
  149. sub_text = ""
  150. row.sort(key=lambda x: x[1])
  151. for col in row:
  152. sub_text = sub_text + col[0] + " "
  153. sub_text = sub_text + "\n"
  154. text += sub_text
  155. if html:
  156. text = "<div>" + text
  157. text = re.sub("\n", "</div>\n<div>", text)
  158. text += "</div>"
  159. # if text[-5:] == "<div>":
  160. # text = text[:-5]
  161. return text
  162. except Exception as e:
  163. logging.info("get_sequential_data error!")
  164. print("get_sequential_data", traceback.print_exc())
  165. return [-1]
  166. # def get_formatted_table(text_list, text_bbox_list, table_bbox_list, split_line):
  167. # logging.info("into get_formatted_table")
  168. # try:
  169. # # 重新定义text_bbox_list,[point, point, text]
  170. # text_bbox_list = [[text_bbox_list[i][0], text_bbox_list[i][2], text_list[i]] for i in
  171. # range(len(text_bbox_list))]
  172. # # 按纵坐标排序
  173. # text_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
  174. # table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
  175. #
  176. # # print("text_bbox_list", text_bbox_list)
  177. # # print("table_bbox_list", table_bbox_list)
  178. #
  179. # # bbox位置 threshold
  180. # threshold = 5
  181. #
  182. # # 根据split_line分区,可能有个区多个表格 [(), ()]
  183. # area_text_bbox_list = []
  184. # area_table_bbox_list = []
  185. # # print("get_formatted_table, split_line", split_line)
  186. # for j in range(1, len(split_line)):
  187. # last_y = split_line[j - 1][0][1]
  188. # current_y = split_line[j][0][1]
  189. # temp_text_bbox_list = []
  190. # temp_table_bbox_list = []
  191. #
  192. # # 找出该区域下text bbox
  193. # for text_bbox in text_bbox_list:
  194. # # 计算 text bbox 中心点
  195. # text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
  196. # (text_bbox[1][1] + text_bbox[0][1]) / 2)
  197. # if last_y - threshold <= text_bbox_center[1] <= current_y + threshold:
  198. # temp_text_bbox_list.append(text_bbox)
  199. # area_text_bbox_list.append(temp_text_bbox_list)
  200. #
  201. # # 找出该区域下table bbox
  202. # for table_bbox in table_bbox_list:
  203. # # 计算 table bbox 中心点
  204. # table_bbox_center = ((table_bbox[1][0] + table_bbox[0][0]) / 2,
  205. # (table_bbox[1][1] + table_bbox[0][1]) / 2)
  206. # if last_y < table_bbox_center[1] < current_y:
  207. # temp_table_bbox_list.append(table_bbox)
  208. # area_table_bbox_list.append(temp_table_bbox_list)
  209. #
  210. # # for j in range(len(area_text_bbox_list)):
  211. # # print("area_text_bbox_list", j, area_text_bbox_list[j])
  212. #
  213. # # 对每个区域分别进行两个bbox匹配,生成表格
  214. # area_text_list = []
  215. # area_column_list = []
  216. # for j in range(len(area_text_bbox_list)):
  217. # # 每个区域的table bbox 和text bbox
  218. # temp_table_bbox_list = area_table_bbox_list[j]
  219. # temp_text_bbox_list = area_text_bbox_list[j]
  220. #
  221. # # 判断该区域有无表格bbox
  222. # # 若无表格,将该区域文字连接
  223. # if not temp_table_bbox_list:
  224. # # 找出该区域的所有text bbox
  225. # only_text_list = []
  226. # only_bbox_list = []
  227. # for text_bbox in temp_text_bbox_list:
  228. # only_text_list.append(text_bbox[2])
  229. # only_bbox_list.append([text_bbox[0], text_bbox[1]])
  230. # only_text = get_sequential_data(only_text_list, only_bbox_list, True)
  231. # if only_text == [-1]:
  232. # return [-1], [-1]
  233. # area_text_list.append(only_text)
  234. # area_column_list.append(0)
  235. # continue
  236. #
  237. # # 有表格
  238. # # 文本对应的表格格子
  239. # text_in_table = {}
  240. # for i in range(len(temp_text_bbox_list)):
  241. # text_bbox = temp_text_bbox_list[i]
  242. #
  243. # # 计算 text bbox 中心点
  244. # text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
  245. # (text_bbox[1][1] + text_bbox[0][1]) / 2)
  246. #
  247. # # 判断中心点在哪个table bbox中
  248. # for table_bbox in temp_table_bbox_list:
  249. # # 中心点在table bbox中,将text写入字典
  250. # if table_bbox[0][0] <= text_bbox_center[0] <= table_bbox[1][0] and \
  251. # table_bbox[0][1] <= text_bbox_center[1] <= table_bbox[1][1]:
  252. # if str(table_bbox) in text_in_table.keys():
  253. # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
  254. # else:
  255. # text_in_table[str(table_bbox)] = text_bbox[2]
  256. # break
  257. #
  258. # # 如果未找到text bbox匹配的table bbox,加大threshold匹配
  259. # # elif (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
  260. # # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]) or \
  261. # # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
  262. # # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
  263. # # (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
  264. # # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
  265. # # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
  266. # # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]):
  267. # # if str(table_bbox) in text_in_table.keys():
  268. # # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
  269. # # else:
  270. # # text_in_table[str(table_bbox)] = text_bbox[2]
  271. # # break
  272. #
  273. # # 对表格格子进行分行分列,并计算总计多少小列
  274. # # 放入坐标
  275. # all_col_list = []
  276. # all_row_list = []
  277. # for i in range(len(temp_table_bbox_list)):
  278. # table_bbox = temp_table_bbox_list[i]
  279. #
  280. # # 放入所有坐标x
  281. # if table_bbox[0][0] not in all_col_list:
  282. # all_col_list.append(table_bbox[0][0])
  283. # if table_bbox[1][0] not in all_col_list:
  284. # all_col_list.append(table_bbox[1][0])
  285. #
  286. # # 放入所有坐标y
  287. # if table_bbox[0][1] not in all_row_list:
  288. # all_row_list.append(table_bbox[0][1])
  289. # if table_bbox[1][1] not in all_row_list:
  290. # all_row_list.append(table_bbox[1][1])
  291. # all_col_list.sort(key=lambda x: x)
  292. # all_row_list.sort(key=lambda x: x)
  293. #
  294. # # 分行
  295. # row_list = []
  296. # rows = []
  297. # temp_table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0], x[1][1], x[1][0]))
  298. # y_row = temp_table_bbox_list[0][0][1]
  299. # for i in range(len(temp_table_bbox_list)):
  300. # table_bbox = temp_table_bbox_list[i]
  301. #
  302. # if y_row - threshold <= table_bbox[0][1] <= y_row + threshold:
  303. # rows.append(table_bbox)
  304. # else:
  305. # y_row = table_bbox[0][1]
  306. # if rows:
  307. # rows.sort(key=lambda x: x[0][0])
  308. # row_list.append(rows)
  309. # rows = []
  310. # rows.append(table_bbox)
  311. # # print("*" * 30)
  312. # # print(row_list)
  313. #
  314. # if i == len(temp_table_bbox_list) - 1:
  315. # if rows:
  316. # rows.sort(key=lambda x: x[0][0])
  317. # row_list.append(rows)
  318. #
  319. # # 生成表格,包括文字和格子宽度
  320. # area_column = []
  321. # text = '<table border="1">' + "\n"
  322. # for row in row_list:
  323. # text += "<tr>" + "\n"
  324. # for col in row:
  325. # # 计算bbox y坐标之间有多少其他点,+1即为所占行数
  326. # row_span = 1
  327. # for y in all_row_list:
  328. # if col[0][1] < y < col[1][1]:
  329. # if y - col[0][1] >= 2 and col[1][1] - y >= 2:
  330. # row_span += 1
  331. #
  332. # # 计算bbox x坐标之间有多少其他点,+1即为所占列数
  333. # col_span = 1
  334. # for x in all_col_list:
  335. # if col[0][0] < x < col[1][0]:
  336. # if x - col[0][0] >= 2 and col[1][0] - x >= 2:
  337. # col_span += 1
  338. #
  339. # text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
  340. #
  341. # if str(col) in text_in_table.keys():
  342. # text += text_in_table.get(str(col))
  343. # else:
  344. # text += ""
  345. # text += "</td>" + "\n"
  346. # text += "</tr>" + "\n"
  347. # text += "</table>" + "\n"
  348. #
  349. # # 计算最大column
  350. # max_col_num = 0
  351. # for row in row_list:
  352. # col_num = 0
  353. # for col in row:
  354. # col_num += 1
  355. # if max_col_num < col_num:
  356. # max_col_num = col_num
  357. #
  358. # area_text_list.append(text)
  359. # area_column_list.append(max_col_num)
  360. #
  361. # text = ""
  362. # if get_platform() == "Windows":
  363. # print("get_formatted_table area_text_list", area_text_list)
  364. # for area_text in area_text_list:
  365. # text += area_text
  366. # return text, area_column_list
  367. # except Exception as e:
  368. # logging.info("get_formatted_table error!")
  369. # print("get_formatted_table", traceback.print_exc())
  370. # return [-1], [-1]
  371. def rename_inner_files(root_path):
  372. try:
  373. logging.info("into rename_inner_files")
  374. # 获取解压文件夹下所有文件+文件夹,不带根路径
  375. path_list = []
  376. for root, dirs, files in os.walk(root_path, topdown=False):
  377. for name in dirs:
  378. p = os.path.join(root, name) + os.sep
  379. if get_platform() == "Windows":
  380. root_path = slash_replace(root_path)
  381. p = slash_replace(p)
  382. p = re.sub(root_path, "", p)
  383. root_path = slash_replace(root_path, True)
  384. p = slash_replace(p, True)
  385. else:
  386. p = re.sub(root_path, "", p)
  387. path_list.append(p)
  388. for name in files:
  389. p = os.path.join(root, name)
  390. if get_platform() == "Windows":
  391. root_path = slash_replace(root_path)
  392. p = slash_replace(p)
  393. p = re.sub(root_path, "", p)
  394. root_path = slash_replace(root_path, True)
  395. p = slash_replace(p, True)
  396. else:
  397. p = re.sub(root_path, "", p)
  398. path_list.append(p)
  399. # 按路径长度排序
  400. path_list.sort(key=lambda x: len(x), reverse=True)
  401. # 循环改名
  402. for old_path in path_list:
  403. # 按路径分隔符分割
  404. ss = old_path.split(os.sep)
  405. # 判断是否文件夹
  406. is_dir = 0
  407. file_type = ""
  408. if os.path.isdir(root_path + old_path):
  409. ss = ss[:-1]
  410. is_dir = 1
  411. else:
  412. if "." in old_path:
  413. file_type = "." + old_path.split(".")[-1]
  414. else:
  415. file_type = ""
  416. # 最后一级需要用hash改名
  417. new_path = ""
  418. # new_path = re.sub(ss[-1], str(hash(ss[-1])), old_path) + file_type
  419. current_level = 0
  420. for s in ss:
  421. # 路径拼接
  422. if current_level < len(ss) - 1:
  423. new_path += s + os.sep
  424. else:
  425. new_path += str(hash(s)) + file_type
  426. current_level += 1
  427. new_ab_path = root_path + new_path
  428. old_ab_path = root_path + old_path
  429. os.rename(old_ab_path, new_ab_path)
  430. # 重新获取解压文件夹下所有文件+文件夹
  431. new_path_list = []
  432. for root, dirs, files in os.walk(root_path, topdown=False):
  433. for name in dirs:
  434. new_path_list.append(os.path.join(root, name) + os.sep)
  435. for name in files:
  436. new_path_list.append(os.path.join(root, name))
  437. return new_path_list
  438. except:
  439. traceback.print_exc()
  440. return [-1]
  441. def judge_format(path):
  442. guess1 = mimetypes.guess_type(path)
  443. _type = None
  444. if guess1[0]:
  445. _type = guess1[0]
  446. else:
  447. guess2 = filetype.guess(path)
  448. if guess2:
  449. _type = guess2.mime
  450. if _type == "application/pdf":
  451. return "pdf"
  452. if _type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
  453. return "docx"
  454. if _type == "application/x-zip-compressed" or _type == "application/zip":
  455. return "zip"
  456. if _type == "application/x-rar-compressed" or _type == "application/rar":
  457. return "rar"
  458. if _type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
  459. return "xlsx"
  460. if _type == "application/msword":
  461. return "doc"
  462. if _type == "image/png":
  463. return "png"
  464. if _type == "image/jpeg":
  465. return "jpg"
  466. # 猜不到,返回None
  467. return None
  468. def slash_replace(_str, reverse=False):
  469. if reverse:
  470. _str = eval(repr(_str).replace('/', '\\\\'))
  471. else:
  472. _str = eval(repr(_str).replace('\\\\', '/'))
  473. return _str
  474. class LineTable:
  475. def recognize_table(self,list_textbox, list_line,sourceP_LB=True):
  476. self.list_line = list_line
  477. self.list_crosspoints = self.recognize_crosspoints(list_line)
  478. # 聚类
  479. cluster_crosspoints = []
  480. for _point in self.list_crosspoints:
  481. cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})
  482. while 1:
  483. _find = False
  484. new_cluster_crosspoints = []
  485. for l_point in cluster_crosspoints:
  486. _flag = False
  487. for l_n_point in new_cluster_crosspoints:
  488. line1 = l_point.get("lines")
  489. line2 = l_n_point.get("lines")
  490. if len(line1&line2) > 0:
  491. _find = True
  492. _flag = True
  493. l_n_point["lines"] = line1.union(line2)
  494. l_n_point["points"].extend(l_point["points"])
  495. if not _flag:
  496. new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
  497. cluster_crosspoints = new_cluster_crosspoints
  498. if not _find:
  499. break
  500. list_l_rect = []
  501. for table_crosspoint in cluster_crosspoints:
  502. list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
  503. list_l_rect.append(list_rect)
  504. in_objs = set()
  505. list_tables = []
  506. for l_rect in list_l_rect:
  507. _ta = self.rect2table(list_textbox,l_rect,in_objs,sourceP_LB=sourceP_LB)
  508. if _ta:
  509. list_tables.append(_ta)
  510. # self._plot(list_line, list_textbox)
  511. return list_tables, in_objs, list_l_rect
  512. def recognize_table_by_rect(self, list_textbox, list_rect, margin=2):
  513. dump_margin = 5
  514. list_rect_tmp = []
  515. # 去重
  516. for _rect in list_rect:
  517. if (_rect.bbox[3]-_rect.bbox[1] < 10) or (abs(_rect.bbox[2]-_rect.bbox[0]) < 5):
  518. continue
  519. _find = False
  520. for _tmp in list_rect_tmp:
  521. for i in range(4):
  522. if abs(_rect.bbox[i]-_tmp.bbox[i]) < dump_margin:
  523. pass
  524. else:
  525. _find = False
  526. break
  527. if i == 3:
  528. _find = True
  529. if _find:
  530. break
  531. if not _find:
  532. list_rect_tmp.append(_rect)
  533. # print("=====",len(list_rect),len(list_rect_tmp))
  534. # print(list_rect_tmp)
  535. # from matplotlib import pyplot as plt
  536. # plt.figure()
  537. # for _rect in list_rect_tmp:
  538. # x0,y0,x1,y1 = _rect.bbox
  539. # plt.boxplot(_rect.bbox)
  540. # plt.show()
  541. cluster_rect = []
  542. for _rect in list_rect:
  543. _find = False
  544. for cr in cluster_rect:
  545. for cr_rect in cr:
  546. if abs((cr_rect.bbox[2]-cr_rect.bbox[0]+_rect.bbox[2]-_rect.bbox[0])-(max(cr_rect.bbox[2],_rect.bbox[2])-min(cr_rect.bbox[0],_rect.bbox[0])))<margin:
  547. _find = True
  548. cr.append(_rect)
  549. break
  550. elif abs((cr_rect.bbox[3]-cr_rect.bbox[1]+_rect.bbox[3]-_rect.bbox[1])-(max(cr_rect.bbox[3],_rect.bbox[3])-min(cr_rect.bbox[1],_rect.bbox[1])))<margin:
  551. _find = True
  552. cr.append(_rect)
  553. break
  554. if _find:
  555. break
  556. if not _find:
  557. cluster_rect.append([_rect])
  558. list_l_rect = cluster_rect
  559. in_objs = set()
  560. list_tables = []
  561. for l_rect in list_l_rect:
  562. _ta = self.rect2table(list_textbox,l_rect,in_objs)
  563. if _ta:
  564. list_tables.append(_ta)
  565. return list_tables,in_objs,list_l_rect
  566. def recognize_crosspoints(self, list_line,fixLine=True):
  567. list_crosspoints = []
  568. # print("lines num",len(list_line))
  569. def getMaxPoints(list_x,margin=5,reverse=False):
  570. clust_x = []
  571. for _x in list_x:
  572. _find = False
  573. for cx in clust_x:
  574. if abs(cx[0]-_x)<margin:
  575. _find = True
  576. cx.append(_x)
  577. break
  578. if not _find:
  579. clust_x.append([_x])
  580. clust_x.sort(key=lambda x:x,reverse=reverse)
  581. return clust_x[0][0],len(clust_x[0])
  582. for _i in range(len(list_line)):
  583. for _j in range(len(list_line)):
  584. line1 = list_line[_i].__dict__.get("bbox")
  585. line2 = list_line[_j].__dict__.get("bbox")
  586. exists,point = self.cross_point(line1,line2)
  587. if exists:
  588. list_crosspoints.append(point)
  589. if fixLine:
  590. #聚类
  591. cluster_crosspoints = []
  592. for _point in list_crosspoints:
  593. cluster_crosspoints.append({"lines":_point.get("lines"),"points":[_point]})
  594. while 1:
  595. _find = False
  596. new_cluster_crosspoints = []
  597. for l_point in cluster_crosspoints:
  598. _flag = False
  599. for l_n_point in new_cluster_crosspoints:
  600. line1 = l_point.get("lines")
  601. line2 = l_n_point.get("lines")
  602. if len(line1&line2)>0:
  603. _find = True
  604. _flag = True
  605. l_n_point["lines"] = line1.union(line2)
  606. l_n_point["points"].extend(l_point["points"])
  607. if not _flag:
  608. new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
  609. cluster_crosspoints = new_cluster_crosspoints
  610. if not _find:
  611. break
  612. list_crosspoints = []
  613. for list_cp in cluster_crosspoints:
  614. points = list_cp.get("points")
  615. l_lines = []
  616. for p in points:
  617. l_lines.extend(p.get("p_lines"))
  618. l_lines = list(set(l_lines))
  619. l_lines.sort(key=lambda x:x[0])
  620. min_x,_count = getMaxPoints([l[0] for l in l_lines],reverse=False)
  621. if _count<=2:
  622. min_x = None
  623. min_y,_count = getMaxPoints([l[1] for l in l_lines],reverse=False)
  624. if _count<2:
  625. min_y = None
  626. max_x,_count = getMaxPoints([l[2] for l in l_lines],reverse=True)
  627. if _count<=2:
  628. max_x = None
  629. max_y,_count = getMaxPoints([l[3] for l in l_lines],reverse=True)
  630. if _count<=2:
  631. max_y = None
  632. if min_x and min_y and max_x and max_y:
  633. points.sort(key=lambda x:x["point"][0])
  634. if abs(min_x-points[0]["point"][0])>30:
  635. _line = LTLine(1,(min_x,min_y),(min_x,max_y))
  636. list_line.append(_line)
  637. l_lines.append(_line.bbox)
  638. # print("add=====",_line.bbox)
  639. if abs(max_x-points[-1]["point"][0])>30:
  640. _line = LTLine(1,(max_x,min_y),(max_x,max_y))
  641. list_line.append(_line)
  642. l_lines.append(_line.bbox)
  643. # print("add=====1",_line.bbox)
  644. points.sort(key=lambda x:x["point"][1])
  645. if abs(min_y-points[0]["point"][1])>30:
  646. _line = LTLine(1,(min_x,min_y),(max_x,min_y))
  647. list_line.append(_line)
  648. l_lines.append(_line.bbox)
  649. # print("add=====2",_line.bbox)
  650. if abs(max_y-points[-1]["point"][1])>30:
  651. _line = LTLine(1,(min_x,max_y),(max_x,max_y))
  652. list_line.append(_line)
  653. l_lines.append(_line.bbox)
  654. # print("add=====2",_line.bbox)
  655. for _i in range(len(l_lines)):
  656. for _j in range(len(l_lines)):
  657. line1 = l_lines[_i]
  658. line2 = l_lines[_j]
  659. exists,point = self.cross_point(line1,line2)
  660. if exists:
  661. list_crosspoints.append(point)
  662. # from matplotlib import pyplot as plt
  663. # plt.figure()
  664. # for _line in l_lines:
  665. # x0,y0,x1,y1 = _line
  666. # plt.plot([x0,x1],[y0,y1])
  667. # for point in list_crosspoints:
  668. # plt.scatter(point.get("point")[0],point.get("point")[1])
  669. # plt.show()
  670. # print(list_crosspoints)
  671. # print("points num",len(list_crosspoints))
  672. return list_crosspoints
  673. def recognize_rect(self, _page):
  674. list_line = []
  675. for _obj in _page._objs:
  676. if isinstance(_obj, (LTLine)):
  677. list_line.append(_obj)
  678. list_crosspoints = self.recognize_crosspoints(list_line)
  679. #聚类
  680. cluster_crosspoints = []
  681. for _point in list_crosspoints:
  682. cluster_crosspoints.append({"lines":_point.get("lines"),"points":[_point]})
  683. while 1:
  684. _find = False
  685. new_cluster_crosspoints = []
  686. for l_point in cluster_crosspoints:
  687. _flag = False
  688. for l_n_point in new_cluster_crosspoints:
  689. line1 = l_point.get("lines")
  690. line2 = l_n_point.get("lines")
  691. if len(line1&line2)>0:
  692. _find = True
  693. _flag = True
  694. l_n_point["lines"] = line1.union(line2)
  695. l_n_point["points"].extend(l_point["points"])
  696. if not _flag:
  697. new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
  698. cluster_crosspoints = new_cluster_crosspoints
  699. if not _find:
  700. break
  701. # print(len(cluster_crosspoints))
  702. list_l_rect = []
  703. for table_crosspoint in cluster_crosspoints:
  704. list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
  705. list_l_rect.append(list_rect)
  706. return list_l_rect
def crosspoint2rect(self, list_crosspoint, margin=5):
    """Derive cell rectangles from a list of line crossing points.

    Each element of ``list_crosspoint`` is read as a dict with the keys
    ``"point"`` ([x, y]), ``"lines"`` (the keys of the lines it lies on) and
    the distances ``"left"``, ``"right"``, ``"buttom"``.

    For every crossing point that has at least ``margin`` of line to its
    right and in the ``"buttom"`` direction, the method walks along its row
    line to the next point with a larger x, then along that point's column
    line to the next point with a larger y, and builds an ``LTRect`` spanning
    the start point and the final point.

    :param list_crosspoint: crossing-point dicts as described above
    :param margin: minimum distance / minimum rectangle side length
    :return: list of ``LTRect`` objects, de-duplicated by bbox, whose width
        and height are both larger than ``margin``
    """
    # Group the crossing points by every line they belong to.
    dict_line_points = {}
    for _point in list_crosspoint:
        lines = list(_point.get("lines"))
        for _line in lines:
            if _line not in dict_line_points:
                dict_line_points[_line] = {"direct":None,"points":[]}
            dict_line_points[_line]["points"].append(_point)
    # Sort the points of each line along its dominant axis and record
    # whether the line is a "row" (x extent larger) or a "column".
    for k, v in dict_line_points.items():
        list_x = []
        list_y = []
        for _p in v["points"]:
            list_x.append(_p.get("point")[0])
            list_y.append(_p.get("point")[1])
        if max(list_x)-min(list_x)>max(list_y)-min(list_y):
            v.get("points").sort(key=lambda x:x.get("point")[0])
            v["direct"] = "row"
        else:
            v.get("points").sort(key=lambda x:x.get("point")[1])
            v["direct"] = "column"
    list_rect = []
    for _point in list_crosspoint:
        if _point["buttom"]>=margin and _point["right"]>=margin:
            # Pick the row line through this point.
            # NOTE(review): lines[1] assumes the point lies on two distinct
            # line keys; a one-element "lines" would raise IndexError.
            lines = list(_point.get("lines"))
            _line = lines[0]
            if dict_line_points[_line]["direct"]=="column":
                _line = lines[1]
            # First point further along the row (points are sorted by x).
            next_point = None
            for p1 in dict_line_points[_line]["points"]:
                if p1["buttom"]>=margin and p1["point"][0]>_point["point"][0]:
                    next_point = p1
                    break
            if not next_point:
                continue
            # Pick the column line through next_point.
            lines = list(next_point.get("lines"))
            _line = lines[0]
            if dict_line_points[_line]["direct"]=="row":
                _line = lines[1]
            # First point further along that column (points are sorted by y).
            final_point = None
            for p1 in dict_line_points[_line]["points"]:
                if p1["left"]>=margin and p1["point"][1]>next_point["point"][1]:
                    final_point = p1
                    break
            if not final_point:
                continue
            _r = LTRect(1,(_point["point"][0],_point["point"][1],final_point["point"][0],final_point["point"][1]))
            list_rect.append(_r)
    # Drop rectangles that are too small and de-duplicate by formatted bbox.
    tmp_rect = []
    set_bbox = set()
    for _r in list_rect:
        _bbox = "%.2f-%.2f-%.2f-%.2f"%_r.bbox
        width = _r.bbox[2]-_r.bbox[0]
        height = _r.bbox[3]-_r.bbox[1]
        if width<=margin or height<=margin:
            continue
        if _bbox not in set_bbox:
            tmp_rect.append(_r)
            set_bbox.add(_bbox)
    list_rect = tmp_rect
    return list_rect
  786. def cross_point(self, line1, line2, segment=True, margin=2):
  787. point_is_exist = False
  788. x = y = 0
  789. x1, y1, x2, y2 = line1
  790. x3, y3, x4, y4 = line2
  791. if (x2 - x1) == 0:
  792. k1 = None
  793. b1 = 0
  794. else:
  795. k1 = (y2 - y1) * 1.0 / (x2 - x1) # 计算k1,由于点均为整数,需要进行浮点数转化
  796. b1 = y1 * 1.0 - x1 * k1 * 1.0 # 整型转浮点型是关键
  797. if (x4 - x3) == 0: # L2直线斜率不存在
  798. k2 = None
  799. b2 = 0
  800. else:
  801. k2 = (y4 - y3) * 1.0 / (x4 - x3) # 斜率存在
  802. b2 = y3 * 1.0 - x3 * k2 * 1.0
  803. if k1 is None:
  804. if not k2 is None:
  805. x = x1
  806. y = k2 * x1 + b2
  807. point_is_exist = True
  808. elif k2 is None:
  809. x = x3
  810. y = k1 * x3 + b1
  811. elif not k2 == k1:
  812. x = (b2 - b1) * 1.0 / (k1 - k2)
  813. y = k1 * x * 1.0 + b1 * 1.0
  814. point_is_exist = True
  815. left = 0
  816. right = 0
  817. top = 0
  818. buttom = 0
  819. if point_is_exist:
  820. if segment:
  821. if x>=(min(x1,x2)-margin) and x<=(max(x1,x2)+margin) and y>=(min(y1,y2)-margin) and y<=(max(y1,y2)+margin):
  822. if x>=(min(x3,x4)-margin) and x<=(max(x3,x4)+margin) and y>=(min(y3,y4)-margin) and y<=(max(y3,y4)+margin):
  823. point_is_exist = True
  824. left = abs(min(x1,x3)-x)
  825. right = abs(max(x2,x4)-x)
  826. top = abs(min(y1,y3)-y)
  827. buttom = abs(max(y2,y4)-y)
  828. else:
  829. point_is_exist = False
  830. else:
  831. point_is_exist = False
  832. line1_key = "%.2f-%.2f-%.2f-%.2f"%(x1, y1, x2, y2)
  833. line2_key = "%.2f-%.2f-%.2f-%.2f"%(x3, y3, x4, y4)
  834. return point_is_exist, {"point": [x, y], "left": left, "right": right,
  835. "top": top, "buttom": buttom, "lines": set([line1_key,line2_key]),"p_lines":[line1,line2]}
def unionTable(self, list_table, fixspan=True, margin=2):
    """Merge the cells of several tables into one table structure.

    ``list_table`` is iterated as tables -> rows -> cells, where each cell is
    a dict read through the keys ``"bbox"``, ``"rect"`` and ``"text"``.
    Cells are de-duplicated by object identity, clustered into rows by
    ``bbox[3]`` and re-emitted with freshly computed ``rowspan`` and
    ``columnspan`` values.

    :param list_table: iterable of tables (each an iterable of rows of cells)
    :param fixspan: when true, expand spanning cells so that every span is 1
        by inserting the same cell dict at the covered positions
    :param margin: tolerance passed to ``getspan``
    :return: ``{"bbox": ..., "table": ...}``, or ``None`` when there are no
        cell coordinates at all
    """
    set_x = set()
    set_y = set()
    # Flatten every row of every table into one list of cells.
    list_cell = []
    for _t in list_table:
        for _line in _t:
            list_cell.extend(_line)
    clusters_rects = []
    # Cluster by y1; first drop cells that are the very same object.
    set_id = set()
    list_cell_dump = []
    for _cell in list_cell:
        _id = id(_cell)
        if _id in set_id:
            continue
        set_id.add(_id)
        list_cell_dump.append(_cell)
    list_cell = list_cell_dump
    list_cell.sort(key=lambda x:x.get("bbox")[3])
    for _rect in list_cell:
        _y0 = _rect.get("bbox")[3]
        _find = False
        for l_cr in clusters_rects:
            # Same row when bbox[3] differs by less than 2 from the row's
            # first cell (hard-coded, independent of ``margin``).
            if abs(l_cr[0].get("bbox")[3]-_y0)<2:
                _find = True
                l_cr.append(_rect)
                break
        if not _find:
            clusters_rects.append([_rect])
    # Rows ordered by descending bbox[3], cells within a row by ascending x0.
    clusters_rects.sort(key=lambda x:x[0].get("bbox")[3],reverse=True)
    for l_cr in clusters_rects:
        l_cr.sort(key=lambda x:x.get("bbox")[0])
    # Collect every distinct x / y coordinate used by any cell.
    for _line in clusters_rects:
        for _rect in _line:
            (x0,y0,x1,y1) = _rect.get("bbox")
            set_x.add(x0)
            set_x.add(x1)
            set_y.add(y0)
            set_y.add(y1)
    if len(set_x)==0 or len(set_y)==0:
        return
    list_x = list(set_x)
    list_y = list(set_y)
    list_x.sort(key=lambda x:x)
    list_y.sort(key=lambda x:x,reverse=True)
    # Rebuild the cells with spans counted against the coordinate grids.
    _table = []
    for _line in clusters_rects:
        table_line = []
        for _rect in _line:
            (x0,y0,x1,y1) = _rect.get("bbox")
            _cell = {"bbox":(x0,y0,x1,y1),"rect":_rect.get("rect"),"rowspan":self.getspan(list_y,y0,y1,margin),"columnspan":self.getspan(list_x,x0,x1,margin),"text":_rect.get("text","")}
            table_line.append(_cell)
        _table.append(table_line)
    if fixspan:
        # Column spans: queue (span - 1) extra references to the cell right
        # after it, then insert from the highest index down so that earlier
        # indices stay valid.
        for _line in _table:
            extend_line = []
            for c_i in range(len(_line)):
                _cell = _line[c_i]
                if _cell.get("columnspan")>1:
                    _cospan = _cell.get("columnspan")
                    _cell["columnspan"] = 1
                    for i in range(1,_cospan):
                        extend_line.append({"index":c_i+1,"cell":_cell})
            extend_line.sort(key=lambda x:x["index"],reverse=True)
            for _el in extend_line:
                _line.insert(_el["index"],_el["cell"])
        # Row spans: insert the same cell into the following rows.
        # NOTE(review): unlike rect2table there is no bounds check on
        # l_i+i, so a span reaching past the last row raises IndexError.
        for l_i in range(len(_table)):
            _line = _table[l_i]
            for c_i in range(len(_line)):
                _cell = _line[c_i]
                if _cell.get("rowspan")>1:
                    _rospan = _cell.get("rowspan")
                    _cell["rowspan"] = 1
                    for i in range(1,_rospan):
                        _table[l_i+i].insert(c_i,_cell)
    # Table bbox: first two coordinates of the first cell, last two of the
    # last cell.
    table_bbox = (_table[0][0].get("bbox")[0],_table[0][0].get("bbox")[1],_table[-1][-1].get("bbox")[2],_table[-1][-1].get("bbox")[3])
    ta = {"bbox":table_bbox,"table":_table}
    return ta
def rect2table(self, list_textbox, list_rect, in_objs, margin=5, fixspan=True,sourceP_LB=True,fixRect=True):
    """Build a table of cells from rectangles and assign text boxes to them.

    :param list_textbox: objects exposing ``.bbox`` and ``.get_text()``;
        the list is sorted in place
    :param list_rect: objects exposing ``.bbox`` (x0, y0, x1, y1); the list
        is sorted in place
    :param in_objs: collection supporting ``add`` and ``in``; every text box
        that was placed into a cell is added to it
    :param margin: tolerance for row clustering, span counting and gap
        detection
    :param fixspan: expand spanning cells into copies with span 1
    :param sourceP_LB: when true rows are clustered on ``bbox[3]`` and
        ordered descending; otherwise on ``bbox[1]`` and ordered ascending
        (presumably "source origin is at the lower-left" - TODO confirm)
    :param fixRect: fill horizontal gaps in each row with empty cells
    :return: ``{"bbox": ..., "table": ...}``, or ``None`` when there are no
        coordinates or at most one rectangle
    """
    # NOTE(review): this local helper is not called in this method; the
    # text assignment below goes through self.inbox instead.
    def getIOU(bbox0,bbox1):
        width = max(bbox0[2],bbox1[2])-min(bbox0[0],bbox1[0])-(bbox0[2]-bbox0[0]+bbox1[2]-bbox1[0])
        height = max(bbox0[3],bbox1[3])-min(bbox0[1],bbox1[1])-(bbox0[3]-bbox0[1]+bbox1[3]-bbox1[1])
        if width<0 and height<0:
            return abs(width*height/min(abs((bbox0[2]-bbox0[0])*(bbox0[3]-bbox0[1])),abs((bbox1[2]-bbox1[0])*(bbox1[3]-bbox1[1]))))
        return 0
    _table = []
    set_x = set()
    set_y = set()
    clusters_rects = []
    # Cluster the rectangles into rows by y1 (or y0 when not sourceP_LB).
    if sourceP_LB:
        list_rect.sort(key=lambda x:x.bbox[3])
        for _rect in list_rect:
            _y0 = _rect.bbox[3]
            _find = False
            for l_cr in clusters_rects:
                if abs(l_cr[0].bbox[3]-_y0)<margin:
                    _find = True
                    l_cr.append(_rect)
                    break
            if not _find:
                clusters_rects.append([_rect])
    else:
        list_rect.sort(key=lambda x:x.bbox[1])
        for _rect in list_rect:
            _y0 = _rect.bbox[1]
            _find = False
            for l_cr in clusters_rects:
                if abs(l_cr[0].bbox[1]-_y0)<margin:
                    _find = True
                    l_cr.append(_rect)
                    break
            if not _find:
                clusters_rects.append([_rect])
    # Collect the coordinate grids used to calculate spans.
    for _line in clusters_rects:
        for _rect in _line:
            (x0,y0,x1,y1) = _rect.bbox
            set_x.add(x0)
            set_x.add(x1)
            set_y.add(y0)
            set_y.add(y1)
    if len(set_x)==0 or len(set_y)==0:
        return
    if len(list_rect)<=1:
        return
    list_x = list(set_x)
    list_y = list(set_y)
    list_x.sort(key=lambda x:x)
    list_y.sort(key=lambda x:x,reverse=sourceP_LB)
    # Order the rows, then the cells inside each row by x0.
    if sourceP_LB:
        clusters_rects.sort(key=lambda x:x[0].bbox[3],reverse=sourceP_LB)
    else:
        clusters_rects.sort(key=lambda x:x[0].bbox[1],reverse=sourceP_LB)
    for l_cr in clusters_rects:
        l_cr.sort(key=lambda x:x.bbox[0])
    # Merge grid coordinates closer than 5 to their predecessor: walk from
    # the end and pop the later one of each near-duplicate pair
    # (threshold is hard-coded, independent of ``margin``).
    pop_x = []
    for i in range(len(list_x)-1):
        _i = len(list_x)-i-1
        l_i = _i-1
        if abs(list_x[_i]-list_x[l_i])<5:
            pop_x.append(_i)
    pop_x.sort(key=lambda x:x,reverse=True)
    for _x in pop_x:
        list_x.pop(_x)
    # Same de-duplication for the y grid.
    pop_x = []
    for i in range(len(list_y)-1):
        _i = len(list_y)-i-1
        l_i = _i-1
        if abs(list_y[_i]-list_y[l_i])<5:
            pop_x.append(_i)
    pop_x.sort(key=lambda x:x,reverse=True)
    for _x in pop_x:
        list_y.pop(_x)
    # One cell dict per rectangle, with spans counted against the grids.
    for _line in clusters_rects:
        table_line = []
        for _rect in _line:
            (x0, y0, x1, y1) = _rect.bbox
            _cell = {"bbox": (x0, y0, x1, y1),
                     "rect": _rect,
                     "rowspan": self.getspan(list_y, y0, y1, margin),
                     "columnspan": self.getspan(list_x, x0, x1, margin),
                     "text": ""}
            table_line.append(_cell)
        _table.append(table_line)
    # Two stable sorts: the text boxes end up ordered by bbox[3] first and
    # by x0 within equal bbox[3].
    list_textbox.sort(key=lambda x:x.bbox[0])
    list_textbox.sort(key=lambda x:x.bbox[3],reverse=sourceP_LB)
    # Append each text box to the first cell that self.inbox accepts.
    for textbox in list_textbox:
        (x0,y0,x1,y1) = textbox.bbox
        _text = textbox.get_text()
        _find = False
        for table_line in _table:
            for _cell in table_line:
                if self.inbox(textbox.bbox, _cell["bbox"], textbox.get_text()):
                    _cell["text"] += _text
                    in_objs.add(textbox)
                    _find = True
                    break
            if _find:
                break
    if fixspan:
        # Column spans: insert (span - 1) references to one shallow copy of
        # the cell at the cell's own index.
        # NOTE(review): the row is mutated while it is being indexed with a
        # range computed beforehand - confirm this is intended.
        for _line in _table:
            for c_i in range(len(_line)):
                _cell = _line[c_i]
                if _cell.get("columnspan")>1:
                    _cospan = _cell.get("columnspan")
                    _cell["columnspan"] = 1
                    n_cell = {}
                    n_cell.update(_cell)
                    for i in range(1,_cospan):
                        _line.insert(c_i,n_cell)
        # Row spans: insert the copy into the following rows, skipping rows
        # that do not exist.
        for l_i in range(len(_table)):
            _line = _table[l_i]
            for c_i in range(len(_line)):
                _cell = _line[c_i]
                if _cell.get("rowspan")>1:
                    _rospan = _cell.get("rowspan")
                    _cell["rowspan"] = 1
                    n_cell = {}
                    n_cell.update(_cell)
                    for i in range(1,_rospan):
                        if l_i+i<=len(_table)-1:
                            _table[l_i+i].insert(c_i,n_cell)
    if fixRect:
        # Fill horizontal gaps in every row with empty cells.
        for _line in _table:
            extend_line = []
            for c_i in range(len(_line)):
                c_cell = _line[c_i]
                # Gap between the left edge of the grid and the first cell.
                if c_i==0 and c_cell["bbox"][0]!=list_x[0]:
                    _bbox = (list_x[0],c_cell["bbox"][1], c_cell["bbox"][0],c_cell["bbox"][3])
                    _cell = {"bbox": _bbox,
                             "rect": LTRect(1,_bbox),
                             "rowspan": self.getspan(list_y,_bbox[1], _bbox[3], margin),
                             "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
                             "text": ""}
                    extend_line.append({"index":c_i,"cell":_cell})
                # Gap between this cell and the next one.
                if c_i<len(_line)-1:
                    n_cell = _line[c_i+1]
                    _bbox = c_cell["bbox"]
                    n_bbox = n_cell["bbox"]
                    # Same horizontal extent: nothing to fill between them.
                    if _bbox[0]==n_bbox[0] and _bbox[2]==n_bbox[2]:
                        continue
                    else:
                        if abs(_bbox[2]-n_bbox[0])>margin:
                            _bbox = (_bbox[2],_bbox[1], n_bbox[0],_bbox[3])
                            _cell = {"bbox": _bbox,
                                     "rect": LTRect(1,_bbox),
                                     "rowspan": self.getspan(list_y,_bbox[1], _bbox[3], margin),
                                     "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
                                     "text": ""}
                            extend_line.append({"index":c_i+1,"cell":_cell})
                # Gap between the last cell and the right edge of the grid.
                if c_i==len(_line)-1:
                    if abs(c_cell["bbox"][2]-list_x[-1])>margin:
                        _bbox = (c_cell["bbox"][2],c_cell["bbox"][1], list_x[-1],c_cell["bbox"][3])
                        _cell = {"bbox": _bbox,
                                 "rect": LTRect(1,_bbox),
                                 "rowspan": self.getspan(list_y,_bbox[1], _bbox[3], margin),
                                 "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
                                 "text": ""}
                        extend_line.append({"index":c_i+1,"cell":_cell})
            # Insert from the highest index down so earlier indices stay valid.
            extend_line.sort(key=lambda x:x["index"],reverse=True)
            for _tmp in extend_line:
                _line.insert(_tmp["index"],_tmp["cell"])
        # Second pass: text boxes that matched no cell before may now fall
        # into one of the filler cells.
        list_textbox.sort(key=lambda x:x.bbox[0])
        list_textbox.sort(key=lambda x:x.bbox[3],reverse=sourceP_LB)
        for textbox in list_textbox:
            if textbox in in_objs:
                continue
            (x0,y0,x1,y1) = textbox.bbox
            _text = textbox.get_text()
            _find = False
            for table_line in _table:
                for _cell in table_line:
                    if self.inbox(textbox.bbox,_cell["bbox"], textbox.get_text()):
                        _cell["text"] += _text
                        in_objs.add(textbox)
                        _find = True
                        break
                if _find:
                    break
    # Table bbox: first two coordinates of the first cell, last two of the
    # last cell.
    table_bbox = (_table[0][0].get("bbox")[0],
                  _table[0][0].get("bbox")[1],
                  _table[-1][-1].get("bbox")[2],
                  _table[-1][-1].get("bbox")[3])
    ta = {"bbox": table_bbox, "table": _table}
    return ta
  1136. def inbox(self, bbox0, bbox_g, text=""):
  1137. # if bbox_g[0]<=bbox0[0] and bbox_g[1]<=bbox0[1] and bbox_g[2]>=bbox0[2] and bbox_g[3]>=bbox0[3]:
  1138. # return 1
  1139. # print("utils inbox", text, self.getIOU(bbox0,bbox_g), bbox0, bbox_g)
  1140. if self.getIOU(bbox0,bbox_g)>0.5:
  1141. return 1
  1142. return 0
  1143. def getIOU(self, bbox0, bbox1):
  1144. width = max(bbox0[2],bbox1[2])-min(bbox0[0],bbox1[0])-(bbox0[2]-bbox0[0]+bbox1[2]-bbox1[0])
  1145. height = max(bbox0[3],bbox1[3])-min(bbox0[1],bbox1[1])-(bbox0[3]-bbox0[1]+bbox1[3]-bbox1[1])
  1146. if width < 0 and height < 0:
  1147. iou = abs(width*height/min(abs((bbox0[2]-bbox0[0])*(bbox0[3]-bbox0[1])),
  1148. abs((bbox1[2]-bbox1[0])*(bbox1[3]-bbox1[1]))))
  1149. # print("getIOU", iou)
  1150. return iou
  1151. return 0
  1152. def getspan(self, _list, x0, x1, margin):
  1153. _count = 0
  1154. (x0,x1) = (min(x0,x1),max(x0,x1))
  1155. for _x in _list:
  1156. if _x>=(x0-margin) and _x<=(x1+margin):
  1157. _count += 1
  1158. return _count-1
  1159. def _plot(self, list_line, list_textbox):
  1160. from matplotlib import pyplot as plt
  1161. plt.figure()
  1162. for _line in list_line:
  1163. x0, y0, x1, y1 = _line.__dict__.get("bbox")
  1164. plt.plot([x0, x1], [y0, y1])
  1165. for _line in list_line:
  1166. x0, y0, x1, y1 = _line.bbox
  1167. plt.plot([x0, x1], [y0, y1])
  1168. # for point in list_crosspoints:
  1169. # plt.scatter(point.get("point")[0],point.get("point")[1])
  1170. for textbox in list_textbox:
  1171. x0, y0, x1, y1 = textbox.bbox
  1172. plt.plot([x0, x1], [y0, y1])
  1173. plt.show()
  1174. def get_table_html(table):
  1175. html_text = '<table border="1">' + "\n"
  1176. for row in table:
  1177. html_text += "<tr>" + "\n"
  1178. for col in row:
  1179. row_span = col.get("rowspan")
  1180. col_span = col.get("columnspan")
  1181. bbox_text = col.get("text")
  1182. html_text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
  1183. html_text += bbox_text + "</td>" + "\n"
  1184. html_text += "</tr>" + "\n"
  1185. html_text += "</table>" + "\n"
  1186. return html_text
  1187. def sort_object(obj_list, is_reverse=False):
  1188. from format_convert.convert_tree import _Table, _Image, _Sentence, _Page
  1189. if len(obj_list) == 0:
  1190. return obj_list
  1191. if isinstance(obj_list[0], (_Table, _Sentence, _Image)):
  1192. obj_list.sort(key=lambda x: (x.y, x.x), reverse=is_reverse)
  1193. return obj_list
  1194. elif isinstance(obj_list[0], _Page):
  1195. obj_list.sort(key=lambda x: x.page_no)
  1196. return obj_list
  1197. else:
  1198. return obj_list
  1199. def request_post(url, param, time_out=1000):
  1200. fails = 0
  1201. text = json.dumps([-2])
  1202. while True:
  1203. try:
  1204. if fails >= 1:
  1205. break
  1206. headers = {'content-type': 'application/json'}
  1207. result = requests.post(url, data=param, timeout=time_out)
  1208. # print('result.status_code', result.status_code)
  1209. # print('result.text', result.text)
  1210. if result.status_code == 200:
  1211. text = result.text
  1212. break
  1213. else:
  1214. fails += 1
  1215. continue
  1216. except:
  1217. fails += 1
  1218. print('fail! fail times:', fails)
  1219. traceback.print_exc()
  1220. return text
  1221. def test_gpu():
  1222. print("="*30)
  1223. import paddle
  1224. paddle.utils.run_check()
  1225. # import tensorflow as tf
  1226. # print("tf gpu", tf.config.list_physical_devices('GPU'))
  1227. print("="*30)
  1228. def my_subprocess_call(*popenargs, timeout=None):
  1229. logging.info("into my_subprocess_call")
  1230. with Popen(*popenargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
  1231. try:
  1232. for line in p.stdout:
  1233. print("stdout", line)
  1234. for line in p.stderr:
  1235. print("stderr", line)
  1236. p.wait(timeout=timeout)
  1237. # p.communicate()
  1238. return p.pid, p.returncode
  1239. except: # Including KeyboardInterrupt, wait handled that.
  1240. p.kill()
  1241. # We don't call p.wait() again as p.__exit__ does that for us.
  1242. raise
  1243. finally:
  1244. logging.info("out my_subprocess_call")
  1245. p.kill()
  1246. def parse_yaml():
  1247. yaml_path = os.path.dirname(os.path.abspath(__file__)) + "/interface.yml"
  1248. with open(yaml_path, "r", encoding='utf-8') as f:
  1249. cfg = f.read()
  1250. params = yaml.load(cfg, Loader=yaml.SafeLoader)
  1251. return params
def get_ip_port(node_type=None, interface_type=None):
    """Build a per-IP mapping of interface ports from ``parse_yaml()``.

    :param node_type: node section to read (upper-cased before lookup);
        ``None`` means both ``"master"`` and ``"slave"``
    :param interface_type: interface section to read (upper-cased before
        lookup); ``None`` means convert, ocr, otr, office and path
    :return: dict ``{ip: {interface: [port strings], ...}}`` which may also
        carry ``"convert_processes"``, ``"project_path"`` and
        ``"python_path"`` entries
    """
    if node_type is None:
        node_type_list = ["master", "slave"]
    else:
        node_type_list = [node_type]
    if interface_type is None:
        interface_type_list = ["convert", "ocr", "otr", "office", "path"]
    else:
        interface_type_list = [interface_type]
    ip_port_dict = {}
    params = parse_yaml()
    for type1 in node_type_list:
        node_type = type1.upper()
        ip_list = params.get(node_type).get("ip")
        for type2 in interface_type_list:
            interface_type = type2.upper()
            processes = 0
            python_path = None
            project_path = None
            if interface_type in ["convert".upper()]:
                # CONVERT exposes a single "port" plus a process count.
                _port = params.get(node_type).get(interface_type).get("port")
                if _port is None:
                    port_list = []
                else:
                    port_list = [str(_port)]
                if interface_type == "convert".upper():
                    processes = params.get(node_type).get(interface_type).get("processes")
            elif interface_type == "path".upper():
                # PATH carries interpreter / project locations, no ports.
                # NOTE(review): port_list is not reset in this branch, so it
                # keeps the value of the previous interface (or is undefined
                # if PATH comes first) and is stored under "path" below -
                # confirm whether that is intended.
                python_path = params.get(node_type).get(interface_type).get("python")
                project_path = params.get(node_type).get(interface_type).get("project")
            else:
                # Other interfaces use a contiguous range of port_no ports
                # starting at port_start.
                port_start = params.get(node_type).get(interface_type).get("port_start")
                port_no = params.get(node_type).get(interface_type).get("port_no")
                if port_start is None or port_no is None:
                    port_list = []
                else:
                    port_list = [str(x) for x in range(port_start, port_start+port_no, 1)]
            if ip_list:
                for _ip in ip_list:
                    if _ip is None:
                        continue
                    if _ip in ip_port_dict.keys():
                        if port_list:
                            ip_port_dict.get(_ip).update({interface_type.lower(): port_list})
                    else:
                        if port_list:
                            ip_port_dict[_ip] = {interface_type.lower(): port_list}
                    # NOTE(review): the two updates below assume the ip
                    # already has an entry; if none was created above,
                    # ip_port_dict.get(_ip) is None and .update fails.
                    if processes:
                        ip_port_dict.get(_ip).update({interface_type.lower()+"_processes": processes})
                    if project_path and python_path:
                        ip_port_dict.get(_ip).update({"project_path": project_path,
                                                      "python_path": python_path})
    return ip_port_dict
  1305. def get_intranet_ip():
  1306. try:
  1307. # Create a new socket using the given address family,
  1308. # socket type and protocol number.
  1309. s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
  1310. # Connect to a remote socket at address.
  1311. # (The format of address depends on the address family.)
  1312. address = ("8.8.8.8", 80)
  1313. s.connect(address)
  1314. # Return the socket’s own address.
  1315. # This is useful to find out the port number of an IPv4/v6 socket, for instance.
  1316. # (The format of the address returned depends on the address family.)
  1317. sockname = s.getsockname()
  1318. ip = sockname[0]
  1319. port = sockname[1]
  1320. finally:
  1321. s.close()
  1322. return ip
  1323. def log(msg):
  1324. call_func_name = inspect.currentframe().f_back.f_code.co_name
  1325. logger = get_logger(call_func_name, {"md5": _global.get("md5"),
  1326. "port": _global.get("port")})
  1327. logger.info(msg)
  1328. # logging.info(msg)
  1329. def get_logger(_name, _dict):
  1330. extra = _dict
  1331. _format = '%(asctime)s - %(name)s - %(levelname)s - %(md5)s - %(port)s - %(message)s'
  1332. logger = logging.getLogger(_name)
  1333. create_new_flag = 1
  1334. handlers = logger.handlers
  1335. if handlers:
  1336. for h in handlers:
  1337. if h.formatter.__dict__.get("_fmt") == _format:
  1338. create_new_flag = 0
  1339. break
  1340. if create_new_flag:
  1341. formatter = logging.Formatter(_format)
  1342. handler = logging.StreamHandler()
  1343. handler.setFormatter(formatter)
  1344. logger.addHandler(handler)
  1345. logger.setLevel(logging.INFO)
  1346. logger.propagate = False
  1347. logger = logging.LoggerAdapter(logger, extra)
  1348. return logger
  1349. def set_flask_global():
  1350. # 接口轮询所需锁、参数
  1351. ip_port_flag = {}
  1352. ip_port_dict = get_ip_port()
  1353. for _k in ip_port_dict.keys():
  1354. ip_port_flag.update({_k: {"ocr": 0,
  1355. "otr": 0,
  1356. "convert": 0,
  1357. "office": 0
  1358. }})
  1359. _global.update({"ip_port_flag": ip_port_flag})
  1360. _global.update({"ip_port": ip_port_dict})
  1361. # print(globals().get("ip_port"))
  1362. def get_md5_from_bytes(_bytes):
  1363. def generate_fp(_b):
  1364. bio = BytesIO()
  1365. bio.write(_b)
  1366. return bio
  1367. _length = 0
  1368. try:
  1369. _md5 = hashlib.md5()
  1370. ff = generate_fp(_bytes)
  1371. ff.seek(0)
  1372. while True:
  1373. data = ff.read(4096)
  1374. if not data:
  1375. break
  1376. _length += len(data)
  1377. _md5.update(data)
  1378. return _md5.hexdigest(), _length
  1379. except Exception as e:
  1380. traceback.print_exc()
  1381. return None, _length
if __name__ == "__main__":
    # Manual check when the module is run as a script: print the ip/port
    # mapping that get_ip_port() builds.
    print(get_ip_port())