utils.py 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235
  1. import os
  2. import sys
  3. sys.path.append(os.path.dirname(__file__) + "/../")
  4. import difflib
  5. import logging
  6. import mimetypes
  7. import platform
  8. import re
  9. import traceback
  10. import filetype
  11. from bs4 import BeautifulSoup
  12. from pdfminer.layout import *
  13. def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8]):
  14. """
  15. [0] : continue
  16. [-1]: 逻辑处理错误
  17. [-2]: 接口调用错误
  18. [-3]: 文件格式错误,无法打开
  19. [-4]: 各类文件调用第三方包读取超时
  20. [-5]: 整个转换过程超时
  21. [-6]: 阿里云UDF队列超时
  22. [-7]: 文件需密码,无法打开
  23. [-8]: 调用现成接口报错
  24. """
  25. for c in code:
  26. if _list == [c]:
  27. return True
  28. return False
  29. def add_div(text):
  30. if text == "" or text is None:
  31. return text
  32. # if get_platform() == "Windows":
  33. # print("add_div", text)
  34. if re.findall("<div>", text):
  35. return text
  36. text = "<div>" + text + "\n"
  37. text = re.sub("\n", "</div>\n<div>", text)
  38. # text += "</div>"
  39. if text[-5:] == "<div>":
  40. print("add_div has cut", text[-30:])
  41. text = text[:-5]
  42. return text
  43. def get_platform():
  44. sys = platform.system()
  45. return sys
  46. def get_html_p(html_path):
  47. logging.info("into get_html_p")
  48. try:
  49. with open(html_path, "r") as ff:
  50. html_str = ff.read()
  51. soup = BeautifulSoup(html_str, 'lxml')
  52. text = ""
  53. for p in soup.find_all("p"):
  54. p_text = p.text
  55. p_text = p_text.strip()
  56. if p.string != "":
  57. text += p_text
  58. text += "\n"
  59. return text
  60. except Exception as e:
  61. logging.info("get_html_p error!")
  62. print("get_html_p", traceback.print_exc())
  63. return [-1]
  64. def string_similarity(str1, str2):
  65. # 去掉<div>和回车
  66. str1 = re.sub("<div>", "", str1)
  67. str1 = re.sub("</div>", "", str1)
  68. str1 = re.sub("\n", "", str1)
  69. str2 = re.sub("<div>", "", str2)
  70. str2 = re.sub("</div>", "", str2)
  71. str2 = re.sub("\n", "", str2)
  72. # print("********************************")
  73. # print("str1", str1)
  74. # print("********************************")
  75. # print("str2", str2)
  76. # print("********************************")
  77. score = difflib.SequenceMatcher(None, str1, str2).ratio()
  78. print("string_similarity", score)
  79. return score
  80. def get_sequential_data(text_list, bbox_list, html=False):
  81. logging.info("into get_sequential_data")
  82. try:
  83. text = ""
  84. order_list = []
  85. for i in range(len(text_list)):
  86. length_start = bbox_list[i][0][0]
  87. length_end = bbox_list[i][1][0]
  88. height_start = bbox_list[i][0][1]
  89. height_end = bbox_list[i][-1][1]
  90. # print([length_start, length_end, height_start, height_end])
  91. order_list.append([text_list[i], length_start, length_end, height_start, height_end])
  92. # text = text + infomation['text'] + "\n"
  93. if get_platform() == "Windows":
  94. print("get_sequential_data", order_list)
  95. if not order_list:
  96. if get_platform() == "Windows":
  97. print("get_sequential_data", "no order list")
  98. return ""
  99. # 根据bbox的坐标对输出排序
  100. order_list.sort(key=lambda x: (x[3], x[1]))
  101. # 根据bbox分行分列
  102. # col_list = []
  103. # height_end = int((order_list[0][4] + order_list[0][3]) / 2)
  104. # for i in range(len(order_list)):
  105. # if height_end - threshold <= order_list[i][3] <= height_end + threshold:
  106. # col_list.append(order_list[i])
  107. # else:
  108. # row_list.append(col_list)
  109. # col_list = []
  110. # height_end = int((order_list[i][4] + order_list[i][3]) / 2)
  111. # col_list.append(order_list[i])
  112. # if i == len(order_list) - 1:
  113. # row_list.append(col_list)
  114. row_list = []
  115. used_box = []
  116. threshold = 5
  117. for box in order_list:
  118. if box in used_box:
  119. continue
  120. height_center = (box[4] + box[3]) / 2
  121. row = []
  122. for box2 in order_list:
  123. if box2 in used_box:
  124. continue
  125. height_center2 = (box2[4] + box2[3]) / 2
  126. if height_center - threshold <= height_center2 <= height_center + threshold:
  127. if box2 not in row:
  128. row.append(box2)
  129. used_box.append(box2)
  130. row.sort(key=lambda x: x[0])
  131. row_list.append(row)
  132. for row in row_list:
  133. if not row:
  134. continue
  135. if len(row) <= 1:
  136. text = text + row[0][0] + "\n"
  137. else:
  138. sub_text = ""
  139. row.sort(key=lambda x: x[1])
  140. for col in row:
  141. sub_text = sub_text + col[0] + " "
  142. sub_text = sub_text + "\n"
  143. text += sub_text
  144. if html:
  145. text = "<div>" + text
  146. text = re.sub("\n", "</div>\n<div>", text)
  147. text += "</div>"
  148. # if text[-5:] == "<div>":
  149. # text = text[:-5]
  150. return text
  151. except Exception as e:
  152. logging.info("get_sequential_data error!")
  153. print("get_sequential_data", traceback.print_exc())
  154. return [-1]
  155. # def get_formatted_table(text_list, text_bbox_list, table_bbox_list, split_line):
  156. # logging.info("into get_formatted_table")
  157. # try:
  158. # # 重新定义text_bbox_list,[point, point, text]
  159. # text_bbox_list = [[text_bbox_list[i][0], text_bbox_list[i][2], text_list[i]] for i in
  160. # range(len(text_bbox_list))]
  161. # # 按纵坐标排序
  162. # text_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
  163. # table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
  164. #
  165. # # print("text_bbox_list", text_bbox_list)
  166. # # print("table_bbox_list", table_bbox_list)
  167. #
  168. # # bbox位置 threshold
  169. # threshold = 5
  170. #
  171. # # 根据split_line分区,可能有个区多个表格 [(), ()]
  172. # area_text_bbox_list = []
  173. # area_table_bbox_list = []
  174. # # print("get_formatted_table, split_line", split_line)
  175. # for j in range(1, len(split_line)):
  176. # last_y = split_line[j - 1][0][1]
  177. # current_y = split_line[j][0][1]
  178. # temp_text_bbox_list = []
  179. # temp_table_bbox_list = []
  180. #
  181. # # 找出该区域下text bbox
  182. # for text_bbox in text_bbox_list:
  183. # # 计算 text bbox 中心点
  184. # text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
  185. # (text_bbox[1][1] + text_bbox[0][1]) / 2)
  186. # if last_y - threshold <= text_bbox_center[1] <= current_y + threshold:
  187. # temp_text_bbox_list.append(text_bbox)
  188. # area_text_bbox_list.append(temp_text_bbox_list)
  189. #
  190. # # 找出该区域下table bbox
  191. # for table_bbox in table_bbox_list:
  192. # # 计算 table bbox 中心点
  193. # table_bbox_center = ((table_bbox[1][0] + table_bbox[0][0]) / 2,
  194. # (table_bbox[1][1] + table_bbox[0][1]) / 2)
  195. # if last_y < table_bbox_center[1] < current_y:
  196. # temp_table_bbox_list.append(table_bbox)
  197. # area_table_bbox_list.append(temp_table_bbox_list)
  198. #
  199. # # for j in range(len(area_text_bbox_list)):
  200. # # print("area_text_bbox_list", j, area_text_bbox_list[j])
  201. #
  202. # # 对每个区域分别进行两个bbox匹配,生成表格
  203. # area_text_list = []
  204. # area_column_list = []
  205. # for j in range(len(area_text_bbox_list)):
  206. # # 每个区域的table bbox 和text bbox
  207. # temp_table_bbox_list = area_table_bbox_list[j]
  208. # temp_text_bbox_list = area_text_bbox_list[j]
  209. #
  210. # # 判断该区域有无表格bbox
  211. # # 若无表格,将该区域文字连接
  212. # if not temp_table_bbox_list:
  213. # # 找出该区域的所有text bbox
  214. # only_text_list = []
  215. # only_bbox_list = []
  216. # for text_bbox in temp_text_bbox_list:
  217. # only_text_list.append(text_bbox[2])
  218. # only_bbox_list.append([text_bbox[0], text_bbox[1]])
  219. # only_text = get_sequential_data(only_text_list, only_bbox_list, True)
  220. # if only_text == [-1]:
  221. # return [-1], [-1]
  222. # area_text_list.append(only_text)
  223. # area_column_list.append(0)
  224. # continue
  225. #
  226. # # 有表格
  227. # # 文本对应的表格格子
  228. # text_in_table = {}
  229. # for i in range(len(temp_text_bbox_list)):
  230. # text_bbox = temp_text_bbox_list[i]
  231. #
  232. # # 计算 text bbox 中心点
  233. # text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
  234. # (text_bbox[1][1] + text_bbox[0][1]) / 2)
  235. #
  236. # # 判断中心点在哪个table bbox中
  237. # for table_bbox in temp_table_bbox_list:
  238. # # 中心点在table bbox中,将text写入字典
  239. # if table_bbox[0][0] <= text_bbox_center[0] <= table_bbox[1][0] and \
  240. # table_bbox[0][1] <= text_bbox_center[1] <= table_bbox[1][1]:
  241. # if str(table_bbox) in text_in_table.keys():
  242. # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
  243. # else:
  244. # text_in_table[str(table_bbox)] = text_bbox[2]
  245. # break
  246. #
  247. # # 如果未找到text bbox匹配的table bbox,加大threshold匹配
  248. # # elif (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
  249. # # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]) or \
  250. # # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
  251. # # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
  252. # # (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
  253. # # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
  254. # # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
  255. # # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]):
  256. # # if str(table_bbox) in text_in_table.keys():
  257. # # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
  258. # # else:
  259. # # text_in_table[str(table_bbox)] = text_bbox[2]
  260. # # break
  261. #
  262. # # 对表格格子进行分行分列,并计算总计多少小列
  263. # # 放入坐标
  264. # all_col_list = []
  265. # all_row_list = []
  266. # for i in range(len(temp_table_bbox_list)):
  267. # table_bbox = temp_table_bbox_list[i]
  268. #
  269. # # 放入所有坐标x
  270. # if table_bbox[0][0] not in all_col_list:
  271. # all_col_list.append(table_bbox[0][0])
  272. # if table_bbox[1][0] not in all_col_list:
  273. # all_col_list.append(table_bbox[1][0])
  274. #
  275. # # 放入所有坐标y
  276. # if table_bbox[0][1] not in all_row_list:
  277. # all_row_list.append(table_bbox[0][1])
  278. # if table_bbox[1][1] not in all_row_list:
  279. # all_row_list.append(table_bbox[1][1])
  280. # all_col_list.sort(key=lambda x: x)
  281. # all_row_list.sort(key=lambda x: x)
  282. #
  283. # # 分行
  284. # row_list = []
  285. # rows = []
  286. # temp_table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0], x[1][1], x[1][0]))
  287. # y_row = temp_table_bbox_list[0][0][1]
  288. # for i in range(len(temp_table_bbox_list)):
  289. # table_bbox = temp_table_bbox_list[i]
  290. #
  291. # if y_row - threshold <= table_bbox[0][1] <= y_row + threshold:
  292. # rows.append(table_bbox)
  293. # else:
  294. # y_row = table_bbox[0][1]
  295. # if rows:
  296. # rows.sort(key=lambda x: x[0][0])
  297. # row_list.append(rows)
  298. # rows = []
  299. # rows.append(table_bbox)
  300. # # print("*" * 30)
  301. # # print(row_list)
  302. #
  303. # if i == len(temp_table_bbox_list) - 1:
  304. # if rows:
  305. # rows.sort(key=lambda x: x[0][0])
  306. # row_list.append(rows)
  307. #
  308. # # 生成表格,包括文字和格子宽度
  309. # area_column = []
  310. # text = '<table border="1">' + "\n"
  311. # for row in row_list:
  312. # text += "<tr>" + "\n"
  313. # for col in row:
  314. # # 计算bbox y坐标之间有多少其他点,+1即为所占行数
  315. # row_span = 1
  316. # for y in all_row_list:
  317. # if col[0][1] < y < col[1][1]:
  318. # if y - col[0][1] >= 2 and col[1][1] - y >= 2:
  319. # row_span += 1
  320. #
  321. # # 计算bbox x坐标之间有多少其他点,+1即为所占列数
  322. # col_span = 1
  323. # for x in all_col_list:
  324. # if col[0][0] < x < col[1][0]:
  325. # if x - col[0][0] >= 2 and col[1][0] - x >= 2:
  326. # col_span += 1
  327. #
  328. # text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
  329. #
  330. # if str(col) in text_in_table.keys():
  331. # text += text_in_table.get(str(col))
  332. # else:
  333. # text += ""
  334. # text += "</td>" + "\n"
  335. # text += "</tr>" + "\n"
  336. # text += "</table>" + "\n"
  337. #
  338. # # 计算最大column
  339. # max_col_num = 0
  340. # for row in row_list:
  341. # col_num = 0
  342. # for col in row:
  343. # col_num += 1
  344. # if max_col_num < col_num:
  345. # max_col_num = col_num
  346. #
  347. # area_text_list.append(text)
  348. # area_column_list.append(max_col_num)
  349. #
  350. # text = ""
  351. # if get_platform() == "Windows":
  352. # print("get_formatted_table area_text_list", area_text_list)
  353. # for area_text in area_text_list:
  354. # text += area_text
  355. # return text, area_column_list
  356. # except Exception as e:
  357. # logging.info("get_formatted_table error!")
  358. # print("get_formatted_table", traceback.print_exc())
  359. # return [-1], [-1]
  360. def rename_inner_files(root_path):
  361. try:
  362. logging.info("into rename_inner_files")
  363. # 获取解压文件夹下所有文件+文件夹,不带根路径
  364. path_list = []
  365. for root, dirs, files in os.walk(root_path, topdown=False):
  366. for name in dirs:
  367. p = os.path.join(root, name) + os.sep
  368. if get_platform() == "Windows":
  369. root_path = slash_replace(root_path)
  370. p = slash_replace(p)
  371. p = re.sub(root_path, "", p)
  372. root_path = slash_replace(root_path, True)
  373. p = slash_replace(p, True)
  374. else:
  375. p = re.sub(root_path, "", p)
  376. path_list.append(p)
  377. for name in files:
  378. p = os.path.join(root, name)
  379. if get_platform() == "Windows":
  380. root_path = slash_replace(root_path)
  381. p = slash_replace(p)
  382. p = re.sub(root_path, "", p)
  383. root_path = slash_replace(root_path, True)
  384. p = slash_replace(p, True)
  385. else:
  386. p = re.sub(root_path, "", p)
  387. path_list.append(p)
  388. # 按路径长度排序
  389. path_list.sort(key=lambda x: len(x), reverse=True)
  390. # 循环改名
  391. for old_path in path_list:
  392. # 按路径分隔符分割
  393. ss = old_path.split(os.sep)
  394. # 判断是否文件夹
  395. is_dir = 0
  396. file_type = ""
  397. if os.path.isdir(root_path + old_path):
  398. ss = ss[:-1]
  399. is_dir = 1
  400. else:
  401. if "." in old_path:
  402. file_type = "." + old_path.split(".")[-1]
  403. else:
  404. file_type = ""
  405. # 最后一级需要用hash改名
  406. new_path = ""
  407. # new_path = re.sub(ss[-1], str(hash(ss[-1])), old_path) + file_type
  408. current_level = 0
  409. for s in ss:
  410. # 路径拼接
  411. if current_level < len(ss) - 1:
  412. new_path += s + os.sep
  413. else:
  414. new_path += str(hash(s)) + file_type
  415. current_level += 1
  416. new_ab_path = root_path + new_path
  417. old_ab_path = root_path + old_path
  418. os.rename(old_ab_path, new_ab_path)
  419. # 重新获取解压文件夹下所有文件+文件夹
  420. new_path_list = []
  421. for root, dirs, files in os.walk(root_path, topdown=False):
  422. for name in dirs:
  423. new_path_list.append(os.path.join(root, name) + os.sep)
  424. for name in files:
  425. new_path_list.append(os.path.join(root, name))
  426. return new_path_list
  427. except:
  428. traceback.print_exc()
  429. return [-1]
  430. def judge_format(path):
  431. guess1 = mimetypes.guess_type(path)
  432. _type = None
  433. if guess1[0]:
  434. _type = guess1[0]
  435. else:
  436. guess2 = filetype.guess(path)
  437. if guess2:
  438. _type = guess2.mime
  439. if _type == "application/pdf":
  440. return "pdf"
  441. if _type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
  442. return "docx"
  443. if _type == "application/x-zip-compressed" or _type == "application/zip":
  444. return "zip"
  445. if _type == "application/x-rar-compressed" or _type == "application/rar":
  446. return "rar"
  447. if _type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
  448. return "xlsx"
  449. if _type == "application/msword":
  450. return "doc"
  451. if _type == "image/png":
  452. return "png"
  453. if _type == "image/jpeg":
  454. return "jpg"
  455. # 猜不到,返回None
  456. return None
  457. def slash_replace(_str, reverse=False):
  458. if reverse:
  459. _str = eval(repr(_str).replace('/', '\\\\'))
  460. else:
  461. _str = eval(repr(_str).replace('\\\\', '/'))
  462. return _str
  463. class LineTable:
  464. def recognize_table(self,list_textbox, list_line,sourceP_LB=True):
  465. self.list_line = list_line
  466. self.list_crosspoints = self.recognize_crosspoints(list_line)
  467. # 聚类
  468. cluster_crosspoints = []
  469. for _point in self.list_crosspoints:
  470. cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})
  471. while 1:
  472. _find = False
  473. new_cluster_crosspoints = []
  474. for l_point in cluster_crosspoints:
  475. _flag = False
  476. for l_n_point in new_cluster_crosspoints:
  477. line1 = l_point.get("lines")
  478. line2 = l_n_point.get("lines")
  479. if len(line1&line2) > 0:
  480. _find = True
  481. _flag = True
  482. l_n_point["lines"] = line1.union(line2)
  483. l_n_point["points"].extend(l_point["points"])
  484. if not _flag:
  485. new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
  486. cluster_crosspoints = new_cluster_crosspoints
  487. if not _find:
  488. break
  489. list_l_rect = []
  490. for table_crosspoint in cluster_crosspoints:
  491. list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
  492. list_l_rect.append(list_rect)
  493. in_objs = set()
  494. list_tables = []
  495. for l_rect in list_l_rect:
  496. _ta = self.rect2table(list_textbox,l_rect,in_objs,sourceP_LB=sourceP_LB)
  497. if _ta:
  498. list_tables.append(_ta)
  499. self._plot(list_line, list_textbox)
  500. return list_tables, in_objs, list_l_rect
    def recognize_table_by_rect(self, list_textbox, list_rect, margin=2):
        """Build tables from rectangle objects instead of lines.

        Rectangles (objects with a 4-element ``bbox``) are grouped into
        clusters and each cluster is passed to ``rect2table``.

        Returns ``(list_tables, in_objs, list_l_rect)`` where ``list_l_rect``
        is the list of rectangle clusters.

        NOTE(review): the indentation of this method was lost in the copy
        under review; the nesting below is reconstructed - confirm against
        the real file.
        """
        dump_margin = 5
        list_rect_tmp = []
        # De-duplicate
        for _rect in list_rect:
            # Skip rectangles less than 10 high or less than 5 wide.
            if (_rect.bbox[3]-_rect.bbox[1] < 10) or (abs(_rect.bbox[2]-_rect.bbox[0]) < 5):
                continue
            _find = False
            for _tmp in list_rect_tmp:
                # A rectangle is a duplicate when all four bbox values are
                # within dump_margin of an already kept one.
                for i in range(4):
                    if abs(_rect.bbox[i]-_tmp.bbox[i]) < dump_margin:
                        pass
                    else:
                        _find = False
                        break
                    if i == 3:
                        _find = True
                if _find:
                    break
            if not _find:
                list_rect_tmp.append(_rect)
        # print("=====",len(list_rect),len(list_rect_tmp))
        # print(list_rect_tmp)
        # from matplotlib import pyplot as plt
        # plt.figure()
        # for _rect in list_rect_tmp:
        # x0,y0,x1,y1 = _rect.bbox
        # plt.boxplot(_rect.bbox)
        # plt.show()
        cluster_rect = []
        # NOTE(review): this loop iterates the original list_rect, so the
        # de-duplicated list_rect_tmp built above is never used - confirm
        # whether list_rect_tmp was intended here.
        for _rect in list_rect:
            _find = False
            for cr in cluster_rect:
                for cr_rect in cr:
                    # Sum of the two widths minus the width of their joint
                    # extent is below margin: join the cluster.
                    if abs((cr_rect.bbox[2]-cr_rect.bbox[0]+_rect.bbox[2]-_rect.bbox[0])-(max(cr_rect.bbox[2],_rect.bbox[2])-min(cr_rect.bbox[0],_rect.bbox[0])))<margin:
                        _find = True
                        cr.append(_rect)
                        break
                    # Same test on the heights / vertical extent.
                    elif abs((cr_rect.bbox[3]-cr_rect.bbox[1]+_rect.bbox[3]-_rect.bbox[1])-(max(cr_rect.bbox[3],_rect.bbox[3])-min(cr_rect.bbox[1],_rect.bbox[1])))<margin:
                        _find = True
                        cr.append(_rect)
                        break
                if _find:
                    break
            if not _find:
                # No cluster accepted it: start a new one.
                cluster_rect.append([_rect])
        list_l_rect = cluster_rect
        in_objs = set()
        list_tables = []
        for l_rect in list_l_rect:
            _ta = self.rect2table(list_textbox,l_rect,in_objs)
            if _ta:
                list_tables.append(_ta)
        return list_tables,in_objs,list_l_rect
  555. def recognize_crosspoints(self, list_line,fixLine=True):
  556. list_crosspoints = []
  557. # print("lines num",len(list_line))
  558. def getMaxPoints(list_x,margin=5,reverse=False):
  559. clust_x = []
  560. for _x in list_x:
  561. _find = False
  562. for cx in clust_x:
  563. if abs(cx[0]-_x)<margin:
  564. _find = True
  565. cx.append(_x)
  566. break
  567. if not _find:
  568. clust_x.append([_x])
  569. clust_x.sort(key=lambda x:x,reverse=reverse)
  570. return clust_x[0][0],len(clust_x[0])
  571. for _i in range(len(list_line)):
  572. for _j in range(len(list_line)):
  573. line1 = list_line[_i].__dict__.get("bbox")
  574. line2 = list_line[_j].__dict__.get("bbox")
  575. exists,point = self.cross_point(line1,line2)
  576. if exists:
  577. list_crosspoints.append(point)
  578. if fixLine:
  579. #聚类
  580. cluster_crosspoints = []
  581. for _point in list_crosspoints:
  582. cluster_crosspoints.append({"lines":_point.get("lines"),"points":[_point]})
  583. while 1:
  584. _find = False
  585. new_cluster_crosspoints = []
  586. for l_point in cluster_crosspoints:
  587. _flag = False
  588. for l_n_point in new_cluster_crosspoints:
  589. line1 = l_point.get("lines")
  590. line2 = l_n_point.get("lines")
  591. if len(line1&line2)>0:
  592. _find = True
  593. _flag = True
  594. l_n_point["lines"] = line1.union(line2)
  595. l_n_point["points"].extend(l_point["points"])
  596. if not _flag:
  597. new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
  598. cluster_crosspoints = new_cluster_crosspoints
  599. if not _find:
  600. break
  601. list_crosspoints = []
  602. for list_cp in cluster_crosspoints:
  603. points = list_cp.get("points")
  604. l_lines = []
  605. for p in points:
  606. l_lines.extend(p.get("p_lines"))
  607. l_lines = list(set(l_lines))
  608. l_lines.sort(key=lambda x:x[0])
  609. min_x,_count = getMaxPoints([l[0] for l in l_lines],reverse=False)
  610. if _count<=2:
  611. min_x = None
  612. min_y,_count = getMaxPoints([l[1] for l in l_lines],reverse=False)
  613. if _count<2:
  614. min_y = None
  615. max_x,_count = getMaxPoints([l[2] for l in l_lines],reverse=True)
  616. if _count<=2:
  617. max_x = None
  618. max_y,_count = getMaxPoints([l[3] for l in l_lines],reverse=True)
  619. if _count<=2:
  620. max_y = None
  621. if min_x and min_y and max_x and max_y:
  622. points.sort(key=lambda x:x["point"][0])
  623. if abs(min_x-points[0]["point"][0])>30:
  624. _line = LTLine(1,(min_x,min_y),(min_x,max_y))
  625. list_line.append(_line)
  626. l_lines.append(_line.bbox)
  627. print("add=====",_line.bbox)
  628. if abs(max_x-points[-1]["point"][0])>30:
  629. _line = LTLine(1,(max_x,min_y),(max_x,max_y))
  630. list_line.append()
  631. l_lines.append(_line.bbox)
  632. print("add=====1",_line.bbox)
  633. points.sort(key=lambda x:x["point"][1])
  634. if abs(min_y-points[0]["point"][1])>30:
  635. _line = LTLine(1,(min_x,min_y),(max_x,min_y))
  636. list_line.append(_line)
  637. l_lines.append(_line.bbox)
  638. print("add=====2",_line.bbox)
  639. if abs(max_y-points[-1]["point"][1])>30:
  640. _line = LTLine(1,(min_x,max_y),(max_x,max_y))
  641. list_line.append(_line)
  642. l_lines.append(_line.bbox)
  643. print("add=====2",_line.bbox)
  644. for _i in range(len(l_lines)):
  645. for _j in range(len(l_lines)):
  646. line1 = l_lines[_i]
  647. line2 = l_lines[_j]
  648. exists,point = self.cross_point(line1,line2)
  649. if exists:
  650. list_crosspoints.append(point)
  651. from matplotlib import pyplot as plt
  652. plt.figure()
  653. for _line in l_lines:
  654. x0,y0,x1,y1 = _line
  655. plt.plot([x0,x1],[y0,y1])
  656. for point in list_crosspoints:
  657. plt.scatter(point.get("point")[0],point.get("point")[1])
  658. plt.show()
  659. # from matplotlib import pyplot as plt
  660. # plt.figure()
  661. # for _line in list_line:
  662. # x0,y0,x1,y1 = _line.__dict__.get("bbox")
  663. # plt.plot([x0,x1],[y0,y1])
  664. # for _line in list_line:
  665. # x0,y0,x1,y1 = _line.bbox
  666. # plt.plot([x0,x1],[y0,y1])
  667. # for point in list_crosspoints:
  668. # plt.scatter(point.get("point")[0],point.get("point")[1])
  669. # plt.show()
  670. # print(list_crosspoints)
  671. # print("points num",len(list_crosspoints))
  672. return list_crosspoints
  673. def recognize_rect(self, _page):
  674. list_line = []
  675. for _obj in _page._objs:
  676. if isinstance(_obj, (LTLine)):
  677. list_line.append(_obj)
  678. list_crosspoints = self.recognize_crosspoints(list_line)
  679. #聚类
  680. cluster_crosspoints = []
  681. for _point in list_crosspoints:
  682. cluster_crosspoints.append({"lines":_point.get("lines"),"points":[_point]})
  683. while 1:
  684. _find = False
  685. new_cluster_crosspoints = []
  686. for l_point in cluster_crosspoints:
  687. _flag = False
  688. for l_n_point in new_cluster_crosspoints:
  689. line1 = l_point.get("lines")
  690. line2 = l_n_point.get("lines")
  691. if len(line1&line2)>0:
  692. _find = True
  693. _flag = True
  694. l_n_point["lines"] = line1.union(line2)
  695. l_n_point["points"].extend(l_point["points"])
  696. if not _flag:
  697. new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
  698. cluster_crosspoints = new_cluster_crosspoints
  699. if not _find:
  700. break
  701. # print(len(cluster_crosspoints))
  702. list_l_rect = []
  703. for table_crosspoint in cluster_crosspoints:
  704. list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
  705. list_l_rect.append(list_rect)
  706. return list_l_rect
    def crosspoint2rect(self, list_crosspoint, margin=4):
        """Turn a group of crossing points into ``LTRect`` cells.

        Each crossing point is a dict as returned by ``cross_point``
        ("point", "left", "right", "top", "buttom", "lines"). For a starting
        point, the next point along its row line and then a point further
        along that point's column line are looked up; the starting point and
        the final point give the two corners of one rectangle.

        Returns the list of rectangles with duplicates (same bbox rounded to
        two decimals) removed.

        NOTE(review): the indentation of this method was lost in the copy
        under review; the nesting below is reconstructed - confirm against
        the real file.
        """
        # Map each line key to the crossing points lying on that line.
        dict_line_points = {}
        for _point in list_crosspoint:
            lines = list(_point.get("lines"))
            for _line in lines:
                if _line not in dict_line_points:
                    dict_line_points[_line] = {"direct":None,"points":[]}
                dict_line_points[_line]["points"].append(_point)
        # Sort the points of every line along its longer extent and record
        # whether the line is treated as a "row" or a "column".
        for k, v in dict_line_points.items():
            list_x = []
            list_y = []
            for _p in v["points"]:
                list_x.append(_p.get("point")[0])
                list_y.append(_p.get("point")[1])
            if max(list_x)-min(list_x)>max(list_y)-min(list_y):
                v.get("points").sort(key=lambda x:x.get("point")[0])
                v["direct"] = "row"
            else:
                v.get("points").sort(key=lambda x:x.get("point")[1])
                v["direct"] = "column"
        list_rect = []
        for _point in list_crosspoint:
            # Only points whose "buttom" and "right" values reach margin can
            # start a rectangle.
            if _point["buttom"]>=margin and _point["right"]>=margin:
                lines = list(_point.get("lines"))
                # Pick the line through this point that is not a column.
                # NOTE(review): lines[1] assumes the point has two distinct
                # line keys - confirm.
                _line = lines[0]
                if dict_line_points[_line]["direct"]=="column":
                    _line = lines[1]
                # First point with a larger x on that line whose "buttom"
                # value reaches margin.
                next_point = None
                for p1 in dict_line_points[_line]["points"]:
                    if p1["point"][0]>_point["point"][0]:
                        if p1["buttom"]>=margin:
                            next_point = p1
                            break
                if not next_point:
                    continue
                lines = list(next_point.get("lines"))
                # Pick the line through next_point that is not a row.
                _line = lines[0]
                if dict_line_points[_line]["direct"]=="row":
                    _line = lines[1]
                # First point with a larger y on that line whose "left"
                # value reaches margin.
                final_point = None
                for p1 in dict_line_points[_line]["points"]:
                    if p1["point"][1]>next_point["point"][1]:
                        if p1["left"]>=margin:
                            final_point = p1
                            break
                if not final_point:
                    # Zeroing "buttom" makes later iterations skip this
                    # point in the buttom >= margin tests above.
                    next_point["buttom"] = 0
                    continue
                _r = LTRect(1,(_point["point"][0],_point["point"][1],final_point["point"][0],final_point["point"][1]))
                list_rect.append(_r)
        # Remove duplicate rectangles, keyed by bbox rounded to two decimals.
        tmp_rect = []
        set_bbox = set()
        for _r in list_rect:
            _bbox = "%.2f-%.2f-%.2f-%.2f"%_r.bbox
            if not _bbox in set_bbox:
                tmp_rect.append(_r)
                set_bbox.add(_bbox)
        list_rect = tmp_rect
        return list_rect
  768. def cross_point(self, line1, line2, segment=True, margin=2):
  769. point_is_exist = False
  770. x = y = 0
  771. x1, y1, x2, y2 = line1
  772. x3, y3, x4, y4 = line2
  773. if (x2 - x1) == 0:
  774. k1 = None
  775. b1 = 0
  776. else:
  777. k1 = (y2 - y1) * 1.0 / (x2 - x1) # 计算k1,由于点均为整数,需要进行浮点数转化
  778. b1 = y1 * 1.0 - x1 * k1 * 1.0 # 整型转浮点型是关键
  779. if (x4 - x3) == 0: # L2直线斜率不存在
  780. k2 = None
  781. b2 = 0
  782. else:
  783. k2 = (y4 - y3) * 1.0 / (x4 - x3) # 斜率存在
  784. b2 = y3 * 1.0 - x3 * k2 * 1.0
  785. if k1 is None:
  786. if not k2 is None:
  787. x = x1
  788. y = k2 * x1 + b2
  789. point_is_exist = True
  790. elif k2 is None:
  791. x = x3
  792. y = k1 * x3 + b1
  793. elif not k2 == k1:
  794. x = (b2 - b1) * 1.0 / (k1 - k2)
  795. y = k1 * x * 1.0 + b1 * 1.0
  796. point_is_exist = True
  797. left = 0
  798. right = 0
  799. top = 0
  800. buttom = 0
  801. if point_is_exist:
  802. if segment:
  803. if x>=(min(x1,x2)-margin) and x<=(max(x1,x2)+margin) and y>=(min(y1,y2)-margin) and y<=(max(y1,y2)+margin):
  804. if x>=(min(x3,x4)-margin) and x<=(max(x3,x4)+margin) and y>=(min(y3,y4)-margin) and y<=(max(y3,y4)+margin):
  805. point_is_exist = True
  806. left = abs(min(x1,x3)-x)
  807. right = abs(max(x2,x4)-x)
  808. top = abs(min(y1,y3)-y)
  809. buttom = abs(max(y2,y4)-y)
  810. else:
  811. point_is_exist = False
  812. else:
  813. point_is_exist = False
  814. line1_key = "%.2f-%.2f-%.2f-%.2f"%(x1, y1, x2, y2)
  815. line2_key = "%.2f-%.2f-%.2f-%.2f"%(x3, y3, x4, y4)
  816. return point_is_exist, {"point": [x, y], "left": left, "right": right,
  817. "top": top, "buttom": buttom, "lines": set([line1_key,line2_key]),"p_lines":[line1,line2]}
def unionTable(self, list_table, fixspan=True, margin=2):
    """Rebuild a single table from the cells of several tables.

    Every cell dict of every table in ``list_table`` is collected, repeated
    references to the same dict object are dropped, the cells are grouped
    into rows whose ``bbox[3]`` differ by less than 2, and each cell's
    rowspan/columnspan is recomputed from the distinct bbox coordinates with
    ``self.getspan``.

    Args:
        list_table: iterable of tables; each table is iterated as rows of
            cell dicts read through ``.get("bbox")``, ``.get("rect")`` and
            ``.get("text", "")``.
        fixspan: when true, a cell whose span is greater than 1 has the span
            reset to 1 and the same dict object is inserted again into its
            row / the following rows.
        margin: tolerance forwarded to ``self.getspan``.

    Returns:
        ``{"bbox": table_bbox, "table": rows}``, or ``None`` when no cell
        coordinates were collected. Debug information is printed to stdout.
    """
    set_x = set()
    set_y = set()
    list_cell = []
    # Flatten table -> row -> cell into one list.
    for _t in list_table:
        for _line in _t:
            list_cell.extend(_line)
    clusters_rects = []
    # cluster by y1
    # First drop repeated references to the same cell object (identity based).
    # presumably needed because span-expanded tables repeat one dict - verify against caller
    set_id = set()
    list_cell_dump = []
    for _cell in list_cell:
        _id = id(_cell)
        if _id in set_id:
            continue
        set_id.add(_id)
        list_cell_dump.append(_cell)
    list_cell = list_cell_dump
    list_cell.sort(key=lambda x:x.get("bbox")[3])
    # A cell joins the first cluster whose first member has bbox[3] within 2
    # of its own; otherwise it starts a new cluster.
    for _rect in list_cell:
        _y0 = _rect.get("bbox")[3]
        _find = False
        for l_cr in clusters_rects:
            if abs(l_cr[0].get("bbox")[3]-_y0)<2:
                _find = True
                l_cr.append(_rect)
                break
        if not _find:
            clusters_rects.append([_rect])
    # Rows ordered by descending bbox[3], cells inside a row by ascending bbox[0].
    clusters_rects.sort(key=lambda x:x[0].get("bbox")[3],reverse=True)
    for l_cr in clusters_rects:
        l_cr.sort(key=lambda x:x.get("bbox")[0])
    # Debug output: number of cells per row.
    print("=============:")
    for l_r in clusters_rects:
        print(len(l_r))
    # Collect every distinct x / y coordinate; they form the grid used by getspan.
    for _line in clusters_rects:
        for _rect in _line:
            (x0,y0,x1,y1) = _rect.get("bbox")
            set_x.add(x0)
            set_x.add(x1)
            set_y.add(y0)
            set_y.add(y1)
    if len(set_x)==0 or len(set_y)==0:
        return
    list_x = list(set_x)
    list_y = list(set_y)
    list_x.sort(key=lambda x:x)
    list_y.sort(key=lambda x:x,reverse=True)
    _table = []
    # Build fresh cell dicts with recomputed spans.
    for _line in clusters_rects:
        table_line = []
        for _rect in _line:
            (x0,y0,x1,y1) = _rect.get("bbox")
            _cell = {"bbox":(x0,y0,x1,y1),"rect":_rect.get("rect"),"rowspan":self.getspan(list_y,y0,y1,margin),"columnspan":self.getspan(list_x,x0,x1,margin),"text":_rect.get("text","")}
            table_line.append(_cell)
        _table.append(table_line)
    # print("=====================>>")
    # for _line in _table:
    # for _cell in _line:
    # print(_cell,end="\t")
    # print("\n")
    # print("=====================>>")
    # print(_table)
    if fixspan:
        # Column spans: repeat the same cell object (span-1) extra times in its row.
        # NOTE(review): range(len(_line)) is evaluated once while the row grows
        # through insert(), so trailing cells may never be visited - confirm.
        for _line in _table:
            for c_i in range(len(_line)):
                _cell = _line[c_i]
                if _cell.get("columnspan")>1:
                    _cospan = _cell.get("columnspan")
                    _cell["columnspan"] = 1
                    for i in range(1,_cospan):
                        _line.insert(c_i,_cell)
        # Row spans: insert the same cell object at the same index of the following rows.
        # NOTE(review): no bounds check on l_i+i; a rowspan reaching past the
        # last row would raise IndexError.
        for l_i in range(len(_table)):
            _line = _table[l_i]
            for c_i in range(len(_line)):
                _cell = _line[c_i]
                if _cell.get("rowspan")>1:
                    _rospan = _cell.get("rowspan")
                    _cell["rowspan"] = 1
                    for i in range(1,_rospan):
                        _table[l_i+i].insert(c_i,_cell)
    # Table bbox: first two coordinates of the first cell of the first row,
    # last two coordinates of the last cell of the last row.
    table_bbox = (_table[0][0].get("bbox")[0],_table[0][0].get("bbox")[1],_table[-1][-1].get("bbox")[2],_table[-1][-1].get("bbox")[3])
    ta = {"bbox":table_bbox,"table":_table}
    return ta
def rect2table(self, list_textbox, list_rect, in_objs, margin=0.2, fixspan=True,sourceP_LB=True,fixRect=True):
    """Turn cell rectangles and text boxes into a table structure.

    Rectangles are grouped into rows by ``bbox[3]`` (tolerance 2), optional
    filler rectangles are added for horizontal gaps, spans are computed with
    ``self.getspan`` and the text of each text box is appended to the first
    cell for which ``self.inbox`` is truthy.

    Args:
        list_textbox: objects with ``.bbox`` and ``.get_text()``; the list is
            sorted in place.
        list_rect: objects with ``.bbox``; the list is sorted in place.
        in_objs: collection with ``.add()``; every text box that was assigned
            to a cell is added to it.
        margin: tolerance forwarded to ``self.getspan``.
        fixspan: when true, cells with a span greater than 1 are reset to
            span 1 and the same dict object is inserted repeatedly.
        sourceP_LB: used as the ``reverse`` flag for every y-based sort.
            presumably "source point is left-bottom", i.e. y grows upwards - TODO confirm
        fixRect: when true, gaps wider than 5 inside a row are filled with
            new ``LTRect`` objects.

    Returns:
        ``{"bbox": table_bbox, "table": rows}``, or ``None`` when no
        rectangle coordinates were collected. Debug information is printed
        to stdout.
    """
    _table = []
    set_x = set()
    set_y = set()
    clusters_rects = []
    # cluster by y1
    list_rect.sort(key=lambda x:x.bbox[3])
    # A rect joins the first cluster whose first member has bbox[3] within 2
    # of its own; otherwise it starts a new cluster.
    for _rect in list_rect:
        _y0 = _rect.bbox[3]
        _find = False
        for l_cr in clusters_rects:
            if abs(l_cr[0].bbox[3]-_y0)<2:
                _find = True
                l_cr.append(_rect)
                break
        if not _find:
            clusters_rects.append([_rect])
    # Collect the distinct x / y coordinates used later to compute spans.
    for _line in clusters_rects:
        for _rect in _line:
            (x0,y0,x1,y1) = _rect.bbox
            set_x.add(x0)
            set_x.add(x1)
            set_y.add(y0)
            set_y.add(y1)
    if len(set_x)==0 or len(set_y)==0:
        return
    list_x = list(set_x)
    list_y = list(set_y)
    list_x.sort(key=lambda x:x)
    list_y.sort(key=lambda x:x,reverse=sourceP_LB)
    print("clusters_rects", len(clusters_rects))
    clusters_rects.sort(key=lambda x:x[0].bbox[3],reverse=sourceP_LB)
    for l_cr in clusters_rects:
        l_cr.sort(key=lambda x:x.bbox[0])
        if fixRect:
            # Fill horizontal gaps wider than 5 with new rectangles. list_x is
            # still the unmerged coordinate list at this point.
            extend_cr = []
            for cr_i in range(len(l_cr)):
                if cr_i==0:
                    # Gap between the leftmost grid coordinate and the first rect.
                    if abs(l_cr[cr_i].bbox[0]-list_x[0])>5:
                        extend_cr.append(LTRect(1,[list_x[0],l_cr[cr_i].bbox[1],l_cr[cr_i].bbox[0],l_cr[cr_i].bbox[3]]))
                        print("=====extend0",[l_cr[cr_i].bbox[2],l_cr[cr_i].bbox[1],list_x[-1],l_cr[cr_i].bbox[3]])
                if cr_i>=0 and cr_i<len(l_cr)-1:
                    # Gap between this rect and its right neighbour.
                    # NOTE(review): the filler ends at the neighbour's bbox[2],
                    # not bbox[0], so it also covers the neighbour - confirm intent.
                    if abs(l_cr[cr_i].bbox[2]-l_cr[cr_i+1].bbox[0])>5:
                        extend_cr.append(LTRect(1,[l_cr[cr_i].bbox[2],l_cr[cr_i].bbox[1],l_cr[cr_i+1].bbox[2],l_cr[cr_i].bbox[3]]))
                        print("=====extend1",[l_cr[cr_i].bbox[2],l_cr[cr_i].bbox[1],list_x[-1],l_cr[cr_i].bbox[3]])
                        print(l_cr[cr_i].bbox)
                        print(l_cr[cr_i+1].bbox)
                if cr_i==len(l_cr)-1:
                    # Gap between the last rect and the rightmost grid coordinate.
                    if abs(l_cr[cr_i].bbox[2]-list_x[-1])>5:
                        print("=====extend",[l_cr[cr_i].bbox[2],l_cr[cr_i].bbox[1],list_x[-1],l_cr[cr_i].bbox[3]])
                        extend_cr.append(LTRect(1,[l_cr[cr_i].bbox[2],l_cr[cr_i].bbox[1],list_x[-1],l_cr[cr_i].bbox[3]]))
            if extend_cr:
                l_cr.extend(extend_cr)
                l_cr.sort(key=lambda x:x.bbox[0])
    # Merge x coordinates closer than 5: walking from the end, an index is
    # dropped when its value is within 5 of its predecessor.
    pop_x = []
    for i in range(len(list_x)-1):
        _i = len(list_x)-i-1
        l_i = _i-1
        if abs(list_x[_i]-list_x[l_i])<5:
            pop_x.append(_i)
    # Pop from the highest index down so the remaining indices stay valid.
    pop_x.sort(key=lambda x:x,reverse=True)
    for _x in pop_x:
        list_x.pop(_x)
    # Same merge for the y coordinates (the pop_x name is reused).
    pop_x = []
    for i in range(len(list_y)-1):
        _i = len(list_y)-i-1
        l_i = _i-1
        if abs(list_y[_i]-list_y[l_i])<5:
            pop_x.append(_i)
    pop_x.sort(key=lambda x:x,reverse=True)
    for _x in pop_x:
        list_y.pop(_x)
    # print(list_x)
    # print(list_y)
    # Build one cell dict per rect; text is filled in below.
    for _line in clusters_rects:
        table_line = []
        for _rect in _line:
            (x0, y0, x1, y1) = _rect.bbox
            _cell = {"bbox": (x0, y0, x1, y1),
                     "rect": _rect,
                     "rowspan": self.getspan(list_y, y0, y1, margin),
                     "columnspan": self.getspan(list_x, x0, x1, margin),
                     "text": ""}
            table_line.append(_cell)
        _table.append(table_line)
    # Two stable sorts: the result is ordered by bbox[3] (direction given by
    # sourceP_LB) with bbox[0] as the tie-breaker.
    list_textbox.sort(key=lambda x:x.bbox[0])
    list_textbox.sort(key=lambda x:x.bbox[3],reverse=sourceP_LB)
    # Append each text box to the first matching cell, then stop searching.
    for textbox in list_textbox:
        (x0,y0,x1,y1) = textbox.bbox
        _text = textbox.get_text()
        _find = False
        for table_line in _table:
            for _cell in table_line:
                if self.inbox(textbox.bbox,_cell["bbox"]):
                    _cell["text"] += _text
                    in_objs.add(textbox)
                    _find = True
                    break
            if _find:
                break
    if fixspan:
        # Column spans: repeat the same cell object (span-1) extra times in its row.
        # NOTE(review): range(len(_line)) is evaluated once while the row grows
        # through insert(), so trailing cells may never be visited - confirm.
        for _line in _table:
            for c_i in range(len(_line)):
                _cell = _line[c_i]
                if _cell.get("columnspan")>1:
                    _cospan = _cell.get("columnspan")
                    _cell["columnspan"] = 1
                    for i in range(1,_cospan):
                        _line.insert(c_i,_cell)
        # Row spans: insert the same cell object into the following rows.
        for l_i in range(len(_table)):
            _line = _table[l_i]
            for c_i in range(len(_line)):
                _cell = _line[c_i]
                if _cell.get("rowspan")>1:
                    _rospan = _cell.get("rowspan")
                    _cell["rowspan"] = 1
                    for i in range(1,_rospan):
                        # NOTE(review): the bound len(_table)-1 also excludes the
                        # last row - confirm this is not an off-by-one.
                        if l_i+i<len(_table)-1:
                            print(len(_table),l_i+i)
                            _table[l_i+i].insert(c_i,_cell)
    # Debug dump: grid coordinates and the first 10 characters of every cell.
    print("table>=======>")
    print(list_x)
    print(list_y)
    for _line in _table:
        for _cell in _line:
            print("[%s]"%_cell.get("text")[:10].replace("\n",''),end="\t\t")
        print("\n")
    print("===========")
    # Table bbox: first two coordinates of the first cell of the first row,
    # last two coordinates of the last cell of the last row.
    table_bbox = (_table[0][0].get("bbox")[0],
                  _table[0][0].get("bbox")[1],
                  _table[-1][-1].get("bbox")[2],
                  _table[-1][-1].get("bbox")[3])
    ta = {"bbox": table_bbox, "table": _table}
    return ta
  1038. def inbox(self, bbox0, bbox_g):
  1039. # if bbox_g[0]<=bbox0[0] and bbox_g[1]<=bbox0[1] and bbox_g[2]>=bbox0[2] and bbox_g[3]>=bbox0[3]:
  1040. # return 1
  1041. if self.getIOU(bbox0,bbox_g)>0.5:
  1042. return 1
  1043. return 0
  1044. def getIOU(self, bbox0, bbox1):
  1045. width = max(bbox0[2],bbox1[2])-min(bbox0[0],bbox1[0])-(bbox0[2]-bbox0[0]+bbox1[2]-bbox1[0])
  1046. height = max(bbox0[3],bbox1[3])-min(bbox0[1],bbox1[1])-(bbox0[3]-bbox0[1]+bbox1[3]-bbox1[1])
  1047. if width < 0 and height < 0:
  1048. iou = abs(width*height/min(abs((bbox0[2]-bbox0[0])*(bbox0[3]-bbox0[1])),
  1049. abs((bbox1[2]-bbox1[0])*(bbox1[3]-bbox1[1]))))
  1050. # print("getIOU", iou)
  1051. return iou
  1052. return 0
  1053. def getspan(self, _list, x0, x1, margin):
  1054. _count = 0
  1055. (x0,x1) = (min(x0,x1),max(x0,x1))
  1056. for _x in _list:
  1057. if _x >=(x0 - margin) and _x<=(x1 + margin):
  1058. _count += 1
  1059. return _count-1
  1060. def _plot(self, list_line, list_textbox):
  1061. from matplotlib import pyplot as plt
  1062. plt.figure()
  1063. for _line in list_line:
  1064. x0, y0, x1, y1 = _line.__dict__.get("bbox")
  1065. plt.plot([x0, x1], [y0, y1])
  1066. for _line in list_line:
  1067. x0, y0, x1, y1 = _line.bbox
  1068. plt.plot([x0, x1], [y0, y1])
  1069. # for point in list_crosspoints:
  1070. # plt.scatter(point.get("point")[0],point.get("point")[1])
  1071. for textbox in list_textbox:
  1072. x0, y0, x1, y1 = textbox.bbox
  1073. plt.plot([x0, x1], [y0, y1])
  1074. plt.show()
  1075. def get_table_html(table):
  1076. html_text = '<table border="1">' + "\n"
  1077. for row in table:
  1078. html_text += "<tr>" + "\n"
  1079. for col in row:
  1080. row_span = col.get("rowspan")
  1081. col_span = col.get("columnspan")
  1082. bbox_text = col.get("text")
  1083. html_text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
  1084. html_text += bbox_text + "</td>" + "\n"
  1085. html_text += "</tr>" + "\n"
  1086. html_text += "</table>" + "\n"
  1087. return html_text
  1088. def sort_object(obj_list):
  1089. from format_convert.convert_tree import _Table, _Image, _Sentence, _Page
  1090. if len(obj_list) == 0:
  1091. return obj_list
  1092. if isinstance(obj_list[0], (_Table, _Sentence, _Image)):
  1093. obj_list.sort(key=lambda x: x.y)
  1094. return obj_list
  1095. elif isinstance(obj_list[0], _Page):
  1096. obj_list.sort(key=lambda x: x.page_no)
  1097. return obj_list
  1098. else:
  1099. return obj_list
  1100. if __name__ == "__main__":
  1101. strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
  1102. print(slash_replace(strs))