# utils.py
import os
import sys
sys.path.append(os.path.dirname(__file__) + "/../")
import difflib
import logging
import mimetypes
import platform
import re
import traceback
import filetype
from bs4 import BeautifulSoup
from pdfminer.layout import *
  13. def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8]):
  14. """
  15. [0] : continue
  16. [-1]: 逻辑处理错误
  17. [-2]: 接口调用错误
  18. [-3]: 文件格式错误,无法打开
  19. [-4]: 各类文件调用第三方包读取超时
  20. [-5]: 整个转换过程超时
  21. [-6]: 阿里云UDF队列超时
  22. [-7]: 文件需密码,无法打开
  23. [-8]: 调用现成接口报错
  24. """
  25. for c in code:
  26. if _list == [c]:
  27. return True
  28. return False
  29. def add_div(text):
  30. if text == "" or text is None:
  31. return text
  32. # if get_platform() == "Windows":
  33. # print("add_div", text)
  34. if re.findall("<div>", text):
  35. return text
  36. text = "<div>" + text + "\n"
  37. text = re.sub("\n", "</div>\n<div>", text)
  38. # text += "</div>"
  39. if text[-5:] == "<div>":
  40. # print("add_div has cut", text[-30:])
  41. text = text[:-5]
  42. return text
  43. def get_platform():
  44. sys = platform.system()
  45. return sys
  46. def get_html_p(html_path):
  47. logging.info("into get_html_p")
  48. try:
  49. with open(html_path, "r") as ff:
  50. html_str = ff.read()
  51. soup = BeautifulSoup(html_str, 'lxml')
  52. text = ""
  53. for p in soup.find_all("p"):
  54. p_text = p.text
  55. p_text = p_text.strip()
  56. if p.string != "":
  57. text += p_text
  58. text += "\n"
  59. return text
  60. except Exception as e:
  61. logging.info("get_html_p error!")
  62. print("get_html_p", traceback.print_exc())
  63. return [-1]
  64. def string_similarity(str1, str2):
  65. # 去掉<div>和回车
  66. str1 = re.sub("<div>", "", str1)
  67. str1 = re.sub("</div>", "", str1)
  68. str1 = re.sub("\n", "", str1)
  69. str2 = re.sub("<div>", "", str2)
  70. str2 = re.sub("</div>", "", str2)
  71. str2 = re.sub("\n", "", str2)
  72. # print("********************************")
  73. # print("str1", str1)
  74. # print("********************************")
  75. # print("str2", str2)
  76. # print("********************************")
  77. score = difflib.SequenceMatcher(None, str1, str2).ratio()
  78. print("string_similarity", score)
  79. return score
  80. def get_sequential_data(text_list, bbox_list, html=False):
  81. logging.info("into get_sequential_data")
  82. try:
  83. text = ""
  84. order_list = []
  85. for i in range(len(text_list)):
  86. length_start = bbox_list[i][0][0]
  87. length_end = bbox_list[i][1][0]
  88. height_start = bbox_list[i][0][1]
  89. height_end = bbox_list[i][-1][1]
  90. # print([length_start, length_end, height_start, height_end])
  91. order_list.append([text_list[i], length_start, length_end, height_start, height_end])
  92. # text = text + infomation['text'] + "\n"
  93. if get_platform() == "Windows":
  94. print("get_sequential_data", order_list)
  95. if not order_list:
  96. if get_platform() == "Windows":
  97. print("get_sequential_data", "no order list")
  98. return ""
  99. # 根据bbox的坐标对输出排序
  100. order_list.sort(key=lambda x: (x[3], x[1]))
  101. # 根据bbox分行分列
  102. # col_list = []
  103. # height_end = int((order_list[0][4] + order_list[0][3]) / 2)
  104. # for i in range(len(order_list)):
  105. # if height_end - threshold <= order_list[i][3] <= height_end + threshold:
  106. # col_list.append(order_list[i])
  107. # else:
  108. # row_list.append(col_list)
  109. # col_list = []
  110. # height_end = int((order_list[i][4] + order_list[i][3]) / 2)
  111. # col_list.append(order_list[i])
  112. # if i == len(order_list) - 1:
  113. # row_list.append(col_list)
  114. row_list = []
  115. used_box = []
  116. threshold = 5
  117. for box in order_list:
  118. if box in used_box:
  119. continue
  120. height_center = (box[4] + box[3]) / 2
  121. row = []
  122. for box2 in order_list:
  123. if box2 in used_box:
  124. continue
  125. height_center2 = (box2[4] + box2[3]) / 2
  126. if height_center - threshold <= height_center2 <= height_center + threshold:
  127. if box2 not in row:
  128. row.append(box2)
  129. used_box.append(box2)
  130. row.sort(key=lambda x: x[0])
  131. row_list.append(row)
  132. for row in row_list:
  133. if not row:
  134. continue
  135. if len(row) <= 1:
  136. text = text + row[0][0] + "\n"
  137. else:
  138. sub_text = ""
  139. row.sort(key=lambda x: x[1])
  140. for col in row:
  141. sub_text = sub_text + col[0] + " "
  142. sub_text = sub_text + "\n"
  143. text += sub_text
  144. if html:
  145. text = "<div>" + text
  146. text = re.sub("\n", "</div>\n<div>", text)
  147. text += "</div>"
  148. # if text[-5:] == "<div>":
  149. # text = text[:-5]
  150. return text
  151. except Exception as e:
  152. logging.info("get_sequential_data error!")
  153. print("get_sequential_data", traceback.print_exc())
  154. return [-1]
  155. # def get_formatted_table(text_list, text_bbox_list, table_bbox_list, split_line):
  156. # logging.info("into get_formatted_table")
  157. # try:
  158. # # 重新定义text_bbox_list,[point, point, text]
  159. # text_bbox_list = [[text_bbox_list[i][0], text_bbox_list[i][2], text_list[i]] for i in
  160. # range(len(text_bbox_list))]
  161. # # 按纵坐标排序
  162. # text_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
  163. # table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
  164. #
  165. # # print("text_bbox_list", text_bbox_list)
  166. # # print("table_bbox_list", table_bbox_list)
  167. #
  168. # # bbox位置 threshold
  169. # threshold = 5
  170. #
  171. # # 根据split_line分区,可能有个区多个表格 [(), ()]
  172. # area_text_bbox_list = []
  173. # area_table_bbox_list = []
  174. # # print("get_formatted_table, split_line", split_line)
  175. # for j in range(1, len(split_line)):
  176. # last_y = split_line[j - 1][0][1]
  177. # current_y = split_line[j][0][1]
  178. # temp_text_bbox_list = []
  179. # temp_table_bbox_list = []
  180. #
  181. # # 找出该区域下text bbox
  182. # for text_bbox in text_bbox_list:
  183. # # 计算 text bbox 中心点
  184. # text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
  185. # (text_bbox[1][1] + text_bbox[0][1]) / 2)
  186. # if last_y - threshold <= text_bbox_center[1] <= current_y + threshold:
  187. # temp_text_bbox_list.append(text_bbox)
  188. # area_text_bbox_list.append(temp_text_bbox_list)
  189. #
  190. # # 找出该区域下table bbox
  191. # for table_bbox in table_bbox_list:
  192. # # 计算 table bbox 中心点
  193. # table_bbox_center = ((table_bbox[1][0] + table_bbox[0][0]) / 2,
  194. # (table_bbox[1][1] + table_bbox[0][1]) / 2)
  195. # if last_y < table_bbox_center[1] < current_y:
  196. # temp_table_bbox_list.append(table_bbox)
  197. # area_table_bbox_list.append(temp_table_bbox_list)
  198. #
  199. # # for j in range(len(area_text_bbox_list)):
  200. # # print("area_text_bbox_list", j, area_text_bbox_list[j])
  201. #
  202. # # 对每个区域分别进行两个bbox匹配,生成表格
  203. # area_text_list = []
  204. # area_column_list = []
  205. # for j in range(len(area_text_bbox_list)):
  206. # # 每个区域的table bbox 和text bbox
  207. # temp_table_bbox_list = area_table_bbox_list[j]
  208. # temp_text_bbox_list = area_text_bbox_list[j]
  209. #
  210. # # 判断该区域有无表格bbox
  211. # # 若无表格,将该区域文字连接
  212. # if not temp_table_bbox_list:
  213. # # 找出该区域的所有text bbox
  214. # only_text_list = []
  215. # only_bbox_list = []
  216. # for text_bbox in temp_text_bbox_list:
  217. # only_text_list.append(text_bbox[2])
  218. # only_bbox_list.append([text_bbox[0], text_bbox[1]])
  219. # only_text = get_sequential_data(only_text_list, only_bbox_list, True)
  220. # if only_text == [-1]:
  221. # return [-1], [-1]
  222. # area_text_list.append(only_text)
  223. # area_column_list.append(0)
  224. # continue
  225. #
  226. # # 有表格
  227. # # 文本对应的表格格子
  228. # text_in_table = {}
  229. # for i in range(len(temp_text_bbox_list)):
  230. # text_bbox = temp_text_bbox_list[i]
  231. #
  232. # # 计算 text bbox 中心点
  233. # text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
  234. # (text_bbox[1][1] + text_bbox[0][1]) / 2)
  235. #
  236. # # 判断中心点在哪个table bbox中
  237. # for table_bbox in temp_table_bbox_list:
  238. # # 中心点在table bbox中,将text写入字典
  239. # if table_bbox[0][0] <= text_bbox_center[0] <= table_bbox[1][0] and \
  240. # table_bbox[0][1] <= text_bbox_center[1] <= table_bbox[1][1]:
  241. # if str(table_bbox) in text_in_table.keys():
  242. # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
  243. # else:
  244. # text_in_table[str(table_bbox)] = text_bbox[2]
  245. # break
  246. #
  247. # # 如果未找到text bbox匹配的table bbox,加大threshold匹配
  248. # # elif (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
  249. # # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]) or \
  250. # # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
  251. # # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
  252. # # (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
  253. # # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
  254. # # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
  255. # # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]):
  256. # # if str(table_bbox) in text_in_table.keys():
  257. # # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
  258. # # else:
  259. # # text_in_table[str(table_bbox)] = text_bbox[2]
  260. # # break
  261. #
  262. # # 对表格格子进行分行分列,并计算总计多少小列
  263. # # 放入坐标
  264. # all_col_list = []
  265. # all_row_list = []
  266. # for i in range(len(temp_table_bbox_list)):
  267. # table_bbox = temp_table_bbox_list[i]
  268. #
  269. # # 放入所有坐标x
  270. # if table_bbox[0][0] not in all_col_list:
  271. # all_col_list.append(table_bbox[0][0])
  272. # if table_bbox[1][0] not in all_col_list:
  273. # all_col_list.append(table_bbox[1][0])
  274. #
  275. # # 放入所有坐标y
  276. # if table_bbox[0][1] not in all_row_list:
  277. # all_row_list.append(table_bbox[0][1])
  278. # if table_bbox[1][1] not in all_row_list:
  279. # all_row_list.append(table_bbox[1][1])
  280. # all_col_list.sort(key=lambda x: x)
  281. # all_row_list.sort(key=lambda x: x)
  282. #
  283. # # 分行
  284. # row_list = []
  285. # rows = []
  286. # temp_table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0], x[1][1], x[1][0]))
  287. # y_row = temp_table_bbox_list[0][0][1]
  288. # for i in range(len(temp_table_bbox_list)):
  289. # table_bbox = temp_table_bbox_list[i]
  290. #
  291. # if y_row - threshold <= table_bbox[0][1] <= y_row + threshold:
  292. # rows.append(table_bbox)
  293. # else:
  294. # y_row = table_bbox[0][1]
  295. # if rows:
  296. # rows.sort(key=lambda x: x[0][0])
  297. # row_list.append(rows)
  298. # rows = []
  299. # rows.append(table_bbox)
  300. # # print("*" * 30)
  301. # # print(row_list)
  302. #
  303. # if i == len(temp_table_bbox_list) - 1:
  304. # if rows:
  305. # rows.sort(key=lambda x: x[0][0])
  306. # row_list.append(rows)
  307. #
  308. # # 生成表格,包括文字和格子宽度
  309. # area_column = []
  310. # text = '<table border="1">' + "\n"
  311. # for row in row_list:
  312. # text += "<tr>" + "\n"
  313. # for col in row:
  314. # # 计算bbox y坐标之间有多少其他点,+1即为所占行数
  315. # row_span = 1
  316. # for y in all_row_list:
  317. # if col[0][1] < y < col[1][1]:
  318. # if y - col[0][1] >= 2 and col[1][1] - y >= 2:
  319. # row_span += 1
  320. #
  321. # # 计算bbox x坐标之间有多少其他点,+1即为所占列数
  322. # col_span = 1
  323. # for x in all_col_list:
  324. # if col[0][0] < x < col[1][0]:
  325. # if x - col[0][0] >= 2 and col[1][0] - x >= 2:
  326. # col_span += 1
  327. #
  328. # text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
  329. #
  330. # if str(col) in text_in_table.keys():
  331. # text += text_in_table.get(str(col))
  332. # else:
  333. # text += ""
  334. # text += "</td>" + "\n"
  335. # text += "</tr>" + "\n"
  336. # text += "</table>" + "\n"
  337. #
  338. # # 计算最大column
  339. # max_col_num = 0
  340. # for row in row_list:
  341. # col_num = 0
  342. # for col in row:
  343. # col_num += 1
  344. # if max_col_num < col_num:
  345. # max_col_num = col_num
  346. #
  347. # area_text_list.append(text)
  348. # area_column_list.append(max_col_num)
  349. #
  350. # text = ""
  351. # if get_platform() == "Windows":
  352. # print("get_formatted_table area_text_list", area_text_list)
  353. # for area_text in area_text_list:
  354. # text += area_text
  355. # return text, area_column_list
  356. # except Exception as e:
  357. # logging.info("get_formatted_table error!")
  358. # print("get_formatted_table", traceback.print_exc())
  359. # return [-1], [-1]
  360. def rename_inner_files(root_path):
  361. try:
  362. logging.info("into rename_inner_files")
  363. # 获取解压文件夹下所有文件+文件夹,不带根路径
  364. path_list = []
  365. for root, dirs, files in os.walk(root_path, topdown=False):
  366. for name in dirs:
  367. p = os.path.join(root, name) + os.sep
  368. if get_platform() == "Windows":
  369. root_path = slash_replace(root_path)
  370. p = slash_replace(p)
  371. p = re.sub(root_path, "", p)
  372. root_path = slash_replace(root_path, True)
  373. p = slash_replace(p, True)
  374. else:
  375. p = re.sub(root_path, "", p)
  376. path_list.append(p)
  377. for name in files:
  378. p = os.path.join(root, name)
  379. if get_platform() == "Windows":
  380. root_path = slash_replace(root_path)
  381. p = slash_replace(p)
  382. p = re.sub(root_path, "", p)
  383. root_path = slash_replace(root_path, True)
  384. p = slash_replace(p, True)
  385. else:
  386. p = re.sub(root_path, "", p)
  387. path_list.append(p)
  388. # 按路径长度排序
  389. path_list.sort(key=lambda x: len(x), reverse=True)
  390. # 循环改名
  391. for old_path in path_list:
  392. # 按路径分隔符分割
  393. ss = old_path.split(os.sep)
  394. # 判断是否文件夹
  395. is_dir = 0
  396. file_type = ""
  397. if os.path.isdir(root_path + old_path):
  398. ss = ss[:-1]
  399. is_dir = 1
  400. else:
  401. if "." in old_path:
  402. file_type = "." + old_path.split(".")[-1]
  403. else:
  404. file_type = ""
  405. # 最后一级需要用hash改名
  406. new_path = ""
  407. # new_path = re.sub(ss[-1], str(hash(ss[-1])), old_path) + file_type
  408. current_level = 0
  409. for s in ss:
  410. # 路径拼接
  411. if current_level < len(ss) - 1:
  412. new_path += s + os.sep
  413. else:
  414. new_path += str(hash(s)) + file_type
  415. current_level += 1
  416. new_ab_path = root_path + new_path
  417. old_ab_path = root_path + old_path
  418. os.rename(old_ab_path, new_ab_path)
  419. # 重新获取解压文件夹下所有文件+文件夹
  420. new_path_list = []
  421. for root, dirs, files in os.walk(root_path, topdown=False):
  422. for name in dirs:
  423. new_path_list.append(os.path.join(root, name) + os.sep)
  424. for name in files:
  425. new_path_list.append(os.path.join(root, name))
  426. return new_path_list
  427. except:
  428. traceback.print_exc()
  429. return [-1]
  430. def judge_format(path):
  431. guess1 = mimetypes.guess_type(path)
  432. _type = None
  433. if guess1[0]:
  434. _type = guess1[0]
  435. else:
  436. guess2 = filetype.guess(path)
  437. if guess2:
  438. _type = guess2.mime
  439. if _type == "application/pdf":
  440. return "pdf"
  441. if _type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
  442. return "docx"
  443. if _type == "application/x-zip-compressed" or _type == "application/zip":
  444. return "zip"
  445. if _type == "application/x-rar-compressed" or _type == "application/rar":
  446. return "rar"
  447. if _type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
  448. return "xlsx"
  449. if _type == "application/msword":
  450. return "doc"
  451. if _type == "image/png":
  452. return "png"
  453. if _type == "image/jpeg":
  454. return "jpg"
  455. # 猜不到,返回None
  456. return None
  457. def slash_replace(_str, reverse=False):
  458. if reverse:
  459. _str = eval(repr(_str).replace('/', '\\\\'))
  460. else:
  461. _str = eval(repr(_str).replace('\\\\', '/'))
  462. return _str
  463. class LineTable:
  464. def recognize_table(self,list_textbox, list_line,sourceP_LB=True):
  465. self.list_line = list_line
  466. self.list_crosspoints = self.recognize_crosspoints(list_line)
  467. # 聚类
  468. cluster_crosspoints = []
  469. for _point in self.list_crosspoints:
  470. cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})
  471. while 1:
  472. _find = False
  473. new_cluster_crosspoints = []
  474. for l_point in cluster_crosspoints:
  475. _flag = False
  476. for l_n_point in new_cluster_crosspoints:
  477. line1 = l_point.get("lines")
  478. line2 = l_n_point.get("lines")
  479. if len(line1&line2) > 0:
  480. _find = True
  481. _flag = True
  482. l_n_point["lines"] = line1.union(line2)
  483. l_n_point["points"].extend(l_point["points"])
  484. if not _flag:
  485. new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
  486. cluster_crosspoints = new_cluster_crosspoints
  487. if not _find:
  488. break
  489. list_l_rect = []
  490. for table_crosspoint in cluster_crosspoints:
  491. list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
  492. list_l_rect.append(list_rect)
  493. in_objs = set()
  494. list_tables = []
  495. for l_rect in list_l_rect:
  496. _ta = self.rect2table(list_textbox,l_rect,in_objs,sourceP_LB=sourceP_LB)
  497. if _ta:
  498. list_tables.append(_ta)
  499. self._plot(list_line, list_textbox)
  500. return list_tables, in_objs, list_l_rect
  501. def recognize_table_by_rect(self, list_textbox, list_rect, margin=2):
  502. dump_margin = 5
  503. list_rect_tmp = []
  504. # 去重
  505. for _rect in list_rect:
  506. if (_rect.bbox[3]-_rect.bbox[1] < 10) or (abs(_rect.bbox[2]-_rect.bbox[0]) < 5):
  507. continue
  508. _find = False
  509. for _tmp in list_rect_tmp:
  510. for i in range(4):
  511. if abs(_rect.bbox[i]-_tmp.bbox[i]) < dump_margin:
  512. pass
  513. else:
  514. _find = False
  515. break
  516. if i == 3:
  517. _find = True
  518. if _find:
  519. break
  520. if not _find:
  521. list_rect_tmp.append(_rect)
  522. # print("=====",len(list_rect),len(list_rect_tmp))
  523. # print(list_rect_tmp)
  524. # from matplotlib import pyplot as plt
  525. # plt.figure()
  526. # for _rect in list_rect_tmp:
  527. # x0,y0,x1,y1 = _rect.bbox
  528. # plt.boxplot(_rect.bbox)
  529. # plt.show()
  530. cluster_rect = []
  531. for _rect in list_rect:
  532. _find = False
  533. for cr in cluster_rect:
  534. for cr_rect in cr:
  535. if abs((cr_rect.bbox[2]-cr_rect.bbox[0]+_rect.bbox[2]-_rect.bbox[0])-(max(cr_rect.bbox[2],_rect.bbox[2])-min(cr_rect.bbox[0],_rect.bbox[0])))<margin:
  536. _find = True
  537. cr.append(_rect)
  538. break
  539. elif abs((cr_rect.bbox[3]-cr_rect.bbox[1]+_rect.bbox[3]-_rect.bbox[1])-(max(cr_rect.bbox[3],_rect.bbox[3])-min(cr_rect.bbox[1],_rect.bbox[1])))<margin:
  540. _find = True
  541. cr.append(_rect)
  542. break
  543. if _find:
  544. break
  545. if not _find:
  546. cluster_rect.append([_rect])
  547. list_l_rect = cluster_rect
  548. in_objs = set()
  549. list_tables = []
  550. for l_rect in list_l_rect:
  551. _ta = self.rect2table(list_textbox,l_rect,in_objs)
  552. if _ta:
  553. list_tables.append(_ta)
  554. return list_tables,in_objs,list_l_rect
  555. def recognize_crosspoints(self, list_line,fixLine=True):
  556. list_crosspoints = []
  557. # print("lines num",len(list_line))
  558. def getMaxPoints(list_x,margin=5):
  559. clust_x = []
  560. for _x in list_x:
  561. _find = False
  562. for cx in clust_x:
  563. if abs(cx[0]-_x)<margin:
  564. _find = True
  565. cx.append(_x)
  566. break
  567. if not _find:
  568. clust_x.append([_x])
  569. clust_x.sort(key=lambda x:len(x),reverse=True)
  570. return clust_x[0][0],len(clust_x[0])
  571. for _i in range(len(list_line)):
  572. for _j in range(len(list_line)):
  573. line1 = list_line[_i].__dict__.get("bbox")
  574. line2 = list_line[_j].__dict__.get("bbox")
  575. exists,point = self.cross_point(line1,line2)
  576. if exists:
  577. list_crosspoints.append(point)
  578. if fixLine:
  579. #聚类
  580. cluster_crosspoints = []
  581. for _point in list_crosspoints:
  582. cluster_crosspoints.append({"lines":_point.get("lines"),"points":[_point]})
  583. while 1:
  584. _find = False
  585. new_cluster_crosspoints = []
  586. for l_point in cluster_crosspoints:
  587. _flag = False
  588. for l_n_point in new_cluster_crosspoints:
  589. line1 = l_point.get("lines")
  590. line2 = l_n_point.get("lines")
  591. if len(line1&line2)>0:
  592. _find = True
  593. _flag = True
  594. l_n_point["lines"] = line1.union(line2)
  595. l_n_point["points"].extend(l_point["points"])
  596. if not _flag:
  597. new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
  598. cluster_crosspoints = new_cluster_crosspoints
  599. if not _find:
  600. break
  601. for list_cp in cluster_crosspoints:
  602. points = list_cp.get("points")
  603. l_lines = []
  604. for p in points:
  605. l_lines.extend(p.get("p_lines"))
  606. l_lines = list(set(l_lines))
  607. l_lines.sort(key=lambda x:x[0])
  608. min_x,_count = getMaxPoints([l[0] for l in l_lines])
  609. if _count<2:
  610. min_x = None
  611. min_y,_count = getMaxPoints([l[1] for l in l_lines])
  612. if _count<2:
  613. min_y = None
  614. max_x,_count = getMaxPoints([l[2] for l in l_lines])
  615. if _count<2:
  616. max_x = None
  617. max_y,_count = getMaxPoints([l[3] for l in l_lines])
  618. if _count<2:
  619. max_y = None
  620. if min_x and min_y and max_x and max_y:
  621. points.sort(key=lambda x:x["point"][0])
  622. if abs(min_x-points[0]["point"][0])>10:
  623. list_line.append(LTLine(1,(min_x,min_y),(min_x,max_y)))
  624. if abs(max_x-points[-1]["point"][0])>10:
  625. list_line.append(LTLine(1,(max_x,min_y),(max_x,max_y)))
  626. points.sort(key=lambda x:x["point"][1])
  627. if abs(min_y-points[0]["point"][1])>10:
  628. list_line.append(LTLine(1,(min_x,min_y),(max_x,min_y)))
  629. if abs(max_y-points[-1]["point"][1])>10:
  630. list_line.append(LTLine(1,(min_x,max_y),(max_x,max_y)))
  631. list_crosspoints = []
  632. for _i in range(len(list_line)):
  633. for _j in range(len(list_line)):
  634. line1 = list_line[_i].__dict__.get("bbox")
  635. line2 = list_line[_j].__dict__.get("bbox")
  636. exists,point = self.cross_point(line1,line2)
  637. if exists:
  638. list_crosspoints.append(point)
  639. # plt.figure()
  640. # for _line in list_line:
  641. # x0,y0,x1,y1 = _line.__dict__.get("bbox")
  642. # plt.plot([x0,x1],[y0,y1])
  643. # for _line in list_line:
  644. # x0,y0,x1,y1 = _line.bbox
  645. # plt.plot([x0,x1],[y0,y1])
  646. # for point in list_crosspoints:
  647. # plt.scatter(point.get("point")[0],point.get("point")[1])
  648. # plt.show()
  649. from matplotlib import pyplot as plt
  650. plt.figure()
  651. for _line in list_line:
  652. x0,y0,x1,y1 = _line.__dict__.get("bbox")
  653. plt.plot([x0,x1],[y0,y1])
  654. for _line in list_line:
  655. x0,y0,x1,y1 = _line.bbox
  656. plt.plot([x0,x1],[y0,y1])
  657. for point in list_crosspoints:
  658. plt.scatter(point.get("point")[0],point.get("point")[1])
  659. plt.show()
  660. # print(list_crosspoints)
  661. # print("points num",len(list_crosspoints))
  662. return list_crosspoints
  663. def recognize_rect(self, _page):
  664. list_line = []
  665. for _obj in _page._objs:
  666. if isinstance(_obj, (LTLine)):
  667. list_line.append(_obj)
  668. list_crosspoints = self.recognize_crosspoints(list_line)
  669. #聚类
  670. cluster_crosspoints = []
  671. for _point in list_crosspoints:
  672. cluster_crosspoints.append({"lines":_point.get("lines"),"points":[_point]})
  673. while 1:
  674. _find = False
  675. new_cluster_crosspoints = []
  676. for l_point in cluster_crosspoints:
  677. _flag = False
  678. for l_n_point in new_cluster_crosspoints:
  679. line1 = l_point.get("lines")
  680. line2 = l_n_point.get("lines")
  681. if len(line1&line2)>0:
  682. _find = True
  683. _flag = True
  684. l_n_point["lines"] = line1.union(line2)
  685. l_n_point["points"].extend(l_point["points"])
  686. if not _flag:
  687. new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
  688. cluster_crosspoints = new_cluster_crosspoints
  689. if not _find:
  690. break
  691. # print(len(cluster_crosspoints))
  692. list_l_rect = []
  693. for table_crosspoint in cluster_crosspoints:
  694. list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
  695. list_l_rect.append(list_rect)
  696. return list_l_rect
  697. def crosspoint2rect(self, list_crosspoint, margin=4):
  698. dict_line_points = {}
  699. for _point in list_crosspoint:
  700. lines = list(_point.get("lines"))
  701. for _line in lines:
  702. if _line not in dict_line_points:
  703. dict_line_points[_line] = {"direct":None,"points":[]}
  704. dict_line_points[_line]["points"].append(_point)
  705. # 排序
  706. for k, v in dict_line_points.items():
  707. list_x = []
  708. list_y = []
  709. for _p in v["points"]:
  710. list_x.append(_p.get("point")[0])
  711. list_y.append(_p.get("point")[1])
  712. if max(list_x)-min(list_x)>max(list_y)-min(list_y):
  713. v.get("points").sort(key=lambda x:x.get("point")[0])
  714. v["direct"] = "row"
  715. else:
  716. v.get("points").sort(key=lambda x:x.get("point")[1])
  717. v["direct"] = "column"
  718. list_rect = []
  719. for _point in list_crosspoint:
  720. if _point["buttom"]>=margin and _point["right"]>=margin:
  721. lines = list(_point.get("lines"))
  722. _line = lines[0]
  723. if dict_line_points[_line]["direct"]=="column":
  724. _line = lines[1]
  725. next_point = None
  726. for p1 in dict_line_points[_line]["points"]:
  727. if p1["buttom"]>=margin and p1["point"][0]>_point["point"][0]:
  728. next_point = p1
  729. break
  730. if not next_point:
  731. continue
  732. lines = list(next_point.get("lines"))
  733. _line = lines[0]
  734. if dict_line_points[_line]["direct"]=="row":
  735. _line = lines[1]
  736. final_point = None
  737. for p1 in dict_line_points[_line]["points"]:
  738. if p1["left"]>=margin and p1["point"][1]>next_point["point"][1]:
  739. final_point = p1
  740. break
  741. if not final_point:
  742. continue
  743. _r = LTRect(1,(_point["point"][0],_point["point"][1],final_point["point"][0],final_point["point"][1]))
  744. list_rect.append(_r)
  745. tmp_rect = []
  746. set_bbox = set()
  747. for _r in list_rect:
  748. _bbox = "%.2f-%.2f-%.2f-%.2f"%_r.bbox
  749. if _bbox not in set_bbox:
  750. tmp_rect.append(_r)
  751. set_bbox.add(_bbox)
  752. list_rect = tmp_rect
  753. return list_rect
  def cross_point(self, line1, line2, segment=True, margin=2):
      """Compute the intersection of two lines given by their end points.

      Both inputs are first intersected as infinite lines.  When ``segment``
      is true the hit is additionally required to fall inside the bounding
      box of *both* segments, each box widened by ``margin`` on every side.

      Args:
          line1: ``(x1, y1, x2, y2)`` end points of the first line.
          line2: ``(x3, y3, x4, y4)`` end points of the second line.
          segment: Treat the inputs as finite segments instead of infinite
              lines.
          margin: Tolerance added around each segment's bounding box.

      Returns:
          A ``(point_is_exist, info)`` tuple.  ``info`` is a dict with:

          * ``"point"``: ``[x, y]`` of the intersection (``[0, 0]`` when the
            lines are parallel or both vertical).
          * ``"left"``, ``"right"``, ``"top"``, ``"buttom"``:
            ``|min(x1, x3) - x|``, ``|max(x2, x4) - x|``,
            ``|min(y1, y3) - y|`` and ``|max(y2, y4) - y|``.  They are only
            filled in when ``segment`` is true and the hit is accepted;
            otherwise they stay ``0``.
          * ``"lines"``: set with one ``"%.2f-%.2f-%.2f-%.2f"`` key per
            input line.
          * ``"p_lines"``: ``[line1, line2]`` as passed in.
      """
      x1, y1, x2, y2 = line1
      x3, y3, x4, y4 = line2

      # Slope/intercept form of each line; a slope of None marks a vertical
      # line.  The "* 1.0" keeps the division floating point for int input.
      if (x2 - x1) == 0:
          k1 = None
          b1 = 0
      else:
          k1 = (y2 - y1) * 1.0 / (x2 - x1)
          b1 = y1 * 1.0 - x1 * k1 * 1.0

      if (x4 - x3) == 0:
          k2 = None
          b2 = 0
      else:
          k2 = (y4 - y3) * 1.0 / (x4 - x3)
          b2 = y3 * 1.0 - x3 * k2 * 1.0

      point_is_exist = False
      x = y = 0
      if k1 is None:
          # line1 vertical: a hit exists unless line2 is vertical as well.
          if k2 is not None:
              x = x1
              y = k2 * x1 + b2
              point_is_exist = True
      elif k2 is None:
          # line2 vertical, line1 not.  This branch used to compute x/y but
          # leave the flag False, so the crossing was only reported when the
          # vertical line happened to be passed first.
          x = x3
          y = k1 * x3 + b1
          point_is_exist = True
      elif k2 != k1:
          # Two sloped, non-parallel lines.
          x = (b2 - b1) * 1.0 / (k1 - k2)
          y = k1 * x * 1.0 + b1 * 1.0
          point_is_exist = True

      left = right = top = buttom = 0
      if point_is_exist and segment:
          on_line1 = (
              (min(x1, x2) - margin) <= x <= (max(x1, x2) + margin)
              and (min(y1, y2) - margin) <= y <= (max(y1, y2) + margin)
          )
          on_line2 = (
              (min(x3, x4) - margin) <= x <= (max(x3, x4) + margin)
              and (min(y3, y4) - margin) <= y <= (max(y3, y4) + margin)
          )
          if on_line1 and on_line2:
              left = abs(min(x1, x3) - x)
              right = abs(max(x2, x4) - x)
              top = abs(min(y1, y3) - y)
              buttom = abs(max(y2, y4) - y)
          else:
              point_is_exist = False

      line1_key = "%.2f-%.2f-%.2f-%.2f" % (x1, y1, x2, y2)
      line2_key = "%.2f-%.2f-%.2f-%.2f" % (x3, y3, x4, y4)
      return point_is_exist, {
          "point": [x, y],
          "left": left,
          "right": right,
          "top": top,
          "buttom": buttom,
          "lines": {line1_key, line2_key},
          "p_lines": [line1, line2],
      }
  def unionTable(self, list_table, fixspan=True, margin=2):
      """Merge the cells of several tables into one table structure.

      Args:
          list_table: Iterable of tables; each table is a list of rows and
              each row a list of cell dicts carrying at least a ``"bbox"``
              entry ``(x0, y0, x1, y1)`` (``"rect"`` and ``"text"`` are
              copied over when present).
          fixspan: When true, cells with a column/row span above 1 are
              repeated in place so that every grid slot holds a cell.
          margin: Tolerance handed to ``self.getspan``.

      Returns:
          ``{"bbox": ..., "table": rows}`` or ``None`` when there are no
          cells.  Row sizes are also printed to stdout.
      """
      # Flatten all tables, keeping only the first occurrence of each cell
      # object (identity based, not value based).
      seen_ids = set()
      cells = []
      for one_table in list_table:
          for src_row in one_table:
              for cell in src_row:
                  if id(cell) in seen_ids:
                      continue
                  seen_ids.add(id(cell))
                  cells.append(cell)

      # Cluster by y1 (bbox[3]): a cell joins the first cluster whose
      # leading cell is less than 2 units away, otherwise starts a new one.
      cells.sort(key=lambda c: c.get("bbox")[3])
      clusters_rects = []
      for cell in cells:
          y_edge = cell.get("bbox")[3]
          for cluster in clusters_rects:
              if abs(cluster[0].get("bbox")[3] - y_edge) < 2:
                  cluster.append(cell)
                  break
          else:
              clusters_rects.append([cell])

      clusters_rects.sort(key=lambda cl: cl[0].get("bbox")[3], reverse=True)
      for cluster in clusters_rects:
          cluster.sort(key=lambda c: c.get("bbox")[0])

      print("=============:")
      for cluster in clusters_rects:
          print(len(cluster))

      # Every distinct x / y edge; getspan counts how many of them a cell
      # covers.
      set_x = set()
      set_y = set()
      for cluster in clusters_rects:
          for cell in cluster:
              (x0, y0, x1, y1) = cell.get("bbox")
              set_x.update((x0, x1))
              set_y.update((y0, y1))
      if len(set_x) == 0 or len(set_y) == 0:
          return None
      list_x = sorted(set_x)
      list_y = sorted(set_y, reverse=True)

      _table = []
      for cluster in clusters_rects:
          table_line = []
          for cell in cluster:
              (x0, y0, x1, y1) = cell.get("bbox")
              table_line.append({
                  "bbox": (x0, y0, x1, y1),
                  "rect": cell.get("rect"),
                  "rowspan": self.getspan(list_y, y0, y1, margin),
                  "columnspan": self.getspan(list_x, x0, x1, margin),
                  "text": cell.get("text", ""),
              })
          _table.append(table_line)

      if fixspan:
          # Column spans: repeat the same cell dict span-1 times.  The index
          # range is fixed before any insertion, so cells pushed past the
          # initial row length are not visited in this pass.
          for table_line in _table:
              for col_idx in range(len(table_line)):
                  cell = table_line[col_idx]
                  if cell.get("columnspan") > 1:
                      span = cell.get("columnspan")
                      cell["columnspan"] = 1
                      for _ in range(1, span):
                          table_line.insert(col_idx, cell)
          # Row spans: push the cell into the following rows at the same
          # column index.  There is no bounds check on the target row.
          for row_idx, table_line in enumerate(_table):
              for col_idx in range(len(table_line)):
                  cell = table_line[col_idx]
                  if cell.get("rowspan") > 1:
                      span = cell.get("rowspan")
                      cell["rowspan"] = 1
                      for offset in range(1, span):
                          _table[row_idx + offset].insert(col_idx, cell)

      first_bbox = _table[0][0].get("bbox")
      last_bbox = _table[-1][-1].get("bbox")
      table_bbox = (first_bbox[0], first_bbox[1], last_bbox[2], last_bbox[3])
      return {"bbox": table_bbox, "table": _table}
  def rect2table(self, list_textbox, list_rect, in_objs, margin=0.2, fixspan=False, sourceP_LB=True, fixRect=True):
      """Arrange rectangles into table rows and attach text boxes to cells.

      Args:
          list_textbox: Objects exposing ``.bbox`` and ``.get_text()``.
              Sorted in place.
          list_rect: Objects exposing a writable ``.bbox``
              ``(x0, y0, x1, y1)``.  Sorted in place; merged rectangles
              get their ``.bbox`` overwritten with a list.
          in_objs: Collection with ``add``; receives every text box that
              was placed into a cell.
          margin: Tolerance handed to ``self.getspan``.
          fixspan: Repeat spanning cells so each grid slot holds a cell.
          sourceP_LB: Used as the ``reverse`` flag for every y-based sort
              (presumably "source origin is left-bottom" -- TODO confirm).
          fixRect: Merge strongly overlapping neighbours in a row and fill
              horizontal gaps with synthetic ``LTRect`` cells.

      Returns:
          ``{"bbox": ..., "table": rows}`` or ``None`` when ``list_rect``
          yields no edges.  Debug information is printed to stdout.
      """
      # Cluster by y1 (bbox[3]): a rect joins the first cluster whose
      # leading rect is less than 2 units away.
      list_rect.sort(key=lambda r: r.bbox[3])
      clusters_rects = []
      for rect in list_rect:
          y_edge = rect.bbox[3]
          for cluster in clusters_rects:
              if abs(cluster[0].bbox[3] - y_edge) < 2:
                  cluster.append(rect)
                  break
          else:
              clusters_rects.append([rect])

      # Collect every x / y edge for the span computation.
      set_x = set()
      set_y = set()
      for cluster in clusters_rects:
          for rect in cluster:
              (x0, y0, x1, y1) = rect.bbox
              set_x.update((x0, x1))
              set_y.update((y0, y1))
      if len(set_x) == 0 or len(set_y) == 0:
          return None
      list_x = sorted(set_x)
      list_y = sorted(set_y, reverse=sourceP_LB)

      print("clusters_rects", len(clusters_rects))
      clusters_rects.sort(key=lambda cl: cl[0].bbox[3], reverse=sourceP_LB)
      for l_cr in clusters_rects:
          l_cr.sort(key=lambda r: r.bbox[0])
          # NOTE(review): the flattened source does not show how far the
          # `if fixRect` body extends; both the merge and the gap filling
          # are assumed to be guarded by it -- confirm against the repo.
          if fixRect:
              # Walk right-to-left; fold a rect into its left neighbour
              # when self.getIOU (the overlap ratio also used by inbox)
              # exceeds 0.5.
              merged_away = []
              for cr_i in range(len(l_cr) - 1, 0, -1):
                  if self.getIOU(l_cr[cr_i].bbox, l_cr[cr_i - 1].bbox) > 0.5:
                      x0, y0, x1, y1 = l_cr[cr_i].bbox
                      x2, y2, x3, y3 = l_cr[cr_i - 1].bbox
                      l_cr[cr_i - 1].bbox = [min(x0, x2), min(y0, y2), max(x1, x3), max(y1, y3)]
                      merged_away.append(cr_i)
              # Indices are in descending order, so popping is safe.
              for cr_i in merged_away:
                  l_cr.pop(cr_i)
              l_cr.sort(key=lambda r: r.bbox[0])

              # Fill gaps wider than 5 units: before the first rect,
              # between neighbours, and after the last rect.
              extend_cr = []
              last_i = len(l_cr) - 1
              for cr_i, cur in enumerate(l_cr):
                  if cr_i == 0:
                      if abs(cur.bbox[0] - list_x[0]) > 5:
                          extend_cr.append(LTRect(1, [list_x[0], cur.bbox[1], cur.bbox[0], cur.bbox[3]]))
                  if cr_i < last_i:
                      nxt = l_cr[cr_i + 1]
                      if abs(cur.bbox[2] - nxt.bbox[0]) > 5:
                          extend_cr.append(LTRect(1, [cur.bbox[2], cur.bbox[1], nxt.bbox[0], cur.bbox[3]]))
                  if cr_i == last_i:
                      if abs(cur.bbox[2] - list_x[-1]) > 5:
                          extend_cr.append(LTRect(1, [cur.bbox[2], cur.bbox[1], list_x[-1], cur.bbox[3]]))
              if extend_cr:
                  l_cr.extend(extend_cr)
                  l_cr.sort(key=lambda r: r.bbox[0])

      # Drop edges that sit less than 5 units from their predecessor.  The
      # comparison uses the untouched list; removal runs back to front.
      for edges in (list_x, list_y):
          too_close = [
              _i for _i in range(len(edges) - 1, 0, -1)
              if abs(edges[_i] - edges[_i - 1]) < 5
          ]
          for _i in too_close:
              edges.pop(_i)

      _table = []
      for cluster in clusters_rects:
          table_line = []
          for rect in cluster:
              (x0, y0, x1, y1) = rect.bbox
              table_line.append({
                  "bbox": (x0, y0, x1, y1),
                  "rect": rect,
                  "rowspan": self.getspan(list_y, y0, y1, margin),
                  "columnspan": self.getspan(list_x, x0, x1, margin),
                  "text": "",
              })
          _table.append(table_line)

      # Two stable sorts: primary key y1, secondary key x0.
      list_textbox.sort(key=lambda t: t.bbox[0])
      list_textbox.sort(key=lambda t: t.bbox[3], reverse=sourceP_LB)
      for textbox in list_textbox:
          _text = textbox.get_text()
          # Each text box goes to the first cell (row-major) it overlaps.
          target = next(
              (cell for table_line in _table for cell in table_line
               if self.inbox(textbox.bbox, cell["bbox"])),
              None,
          )
          if target is not None:
              target["text"] += _text
              in_objs.add(textbox)

      if fixspan:
          # Column spans: repeat the same cell dict span-1 times.  The index
          # range is fixed before any insertion.
          for table_line in _table:
              for col_idx in range(len(table_line)):
                  cell = table_line[col_idx]
                  if cell.get("columnspan") > 1:
                      span = cell.get("columnspan")
                      cell["columnspan"] = 1
                      for _ in range(1, span):
                          table_line.insert(col_idx, cell)
          for row_idx, table_line in enumerate(_table):
              for col_idx in range(len(table_line)):
                  cell = table_line[col_idx]
                  if cell.get("rowspan") > 1:
                      span = cell.get("rowspan")
                      cell["rowspan"] = 1
                      for offset in range(1, span):
                          # NOTE(review): `< len(_table) - 1` never targets
                          # the last row; it looks like an off-by-one but is
                          # kept as found.  The insert is assumed to sit
                          # under this guard together with the print.
                          if row_idx + offset < len(_table) - 1:
                              print(len(_table), row_idx + offset)
                              _table[row_idx + offset].insert(col_idx, cell)

      # Debug dump of the final grid.
      # NOTE(review): assumed to run regardless of fixspan -- confirm.
      print("=======")
      for table_line in _table:
          for cell in table_line:
              print(cell, end="\t\t")
          print("\n")
      print("===========")

      first_bbox = _table[0][0].get("bbox")
      last_bbox = _table[-1][-1].get("bbox")
      table_bbox = (first_bbox[0], first_bbox[1], last_bbox[2], last_bbox[3])
      return {"bbox": table_bbox, "table": _table}
  def inbox(self, bbox0, bbox_g):
      """Return 1 when ``bbox0`` counts as lying inside ``bbox_g``, else 0.

      The decision is overlap based rather than strict containment: the
      ratio reported by ``self.getIOU`` has to exceed 0.5.
      """
      return 1 if self.getIOU(bbox0, bbox_g) > 0.5 else 0
  def getIOU(self, bbox0, bbox1):
      """Return the overlap ratio of two ``(x0, y0, x1, y1)`` boxes.

      Despite the name this is not a true intersection-over-union: the
      intersection area is divided by the area of the *smaller* box, so a
      box fully contained in another scores 1.0.  Boxes that do not overlap
      (or only touch) score ``0``.
      """
      # Combined extent minus the sum of both extents; negative exactly
      # when the projections on that axis overlap.
      width = max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0]) - (bbox0[2] - bbox0[0] + bbox1[2] - bbox1[0])
      height = max(bbox0[3], bbox1[3]) - min(bbox0[1], bbox1[1]) - (bbox0[3] - bbox0[1] + bbox1[3] - bbox1[1])
      if not (width < 0 and height < 0):
          return 0

      area0 = abs((bbox0[2] - bbox0[0]) * (bbox0[3] - bbox0[1]))
      area1 = abs((bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]))
      return abs(width * height / min(area0, area1))
  def getspan(self, _list, x0, x1, margin):
      """Count how many grid intervals the range ``x0..x1`` covers.

      Args:
          _list: Grid edge coordinates (any order).
          x0: One end of the range; the ends may be given in either order.
          x1: The other end of the range.
          margin: Tolerance by which the range is widened on both sides.

      Returns:
          The number of edges inside the widened range minus one, which is
          ``-1`` when no edge is covered.
      """
      lower = min(x0, x1) - margin
      upper = max(x0, x1) + margin
      covered = sum(1 for edge in _list if edge >= lower and edge <= upper)
      return covered - 1
  def _plot(self, list_line, list_textbox):
      """Debug helper: draw lines and text boxes with matplotlib.

      Every object's ``bbox`` ``(x0, y0, x1, y1)`` is drawn as the segment
      from ``(x0, y0)`` to ``(x1, y1)``; for a text box that is its
      diagonal.  Blocks until the figure window is closed.

      Args:
          list_line: Objects exposing ``.bbox``.
          list_textbox: Objects exposing ``.bbox``.
      """
      # Imported lazily so matplotlib is only needed when plotting.
      from matplotlib import pyplot as plt

      plt.figure()
      # Each line is drawn once.  An earlier revision plotted the same
      # lines twice (via ``__dict__["bbox"]`` and via ``.bbox``), which only
      # stacked a second, differently coloured copy on top.
      for _line in list_line:
          x0, y0, x1, y1 = _line.bbox
          plt.plot([x0, x1], [y0, y1])
      for textbox in list_textbox:
          x0, y0, x1, y1 = textbox.bbox
          plt.plot([x0, x1], [y0, y1])
      plt.show()
  def get_table_html(table):
      """Render a table as an HTML ``<table>`` string.

      Args:
          table: Iterable of rows; each row is an iterable of cell dicts
              read through ``"rowspan"``, ``"columnspan"`` and ``"text"``.

      Returns:
          The HTML markup, one tag per line, ending with a newline.  A cell
          whose ``"text"`` is missing or ``None`` is rendered empty instead
          of raising ``TypeError``.
      """
      # Collect fragments and join once instead of growing a string.
      parts = ['<table border="1">' + "\n"]
      for row in table:
          parts.append("<tr>" + "\n")
          for col in row:
              row_span = col.get("rowspan")
              col_span = col.get("columnspan")
              bbox_text = col.get("text")
              if bbox_text is None:
                  bbox_text = ""
              # NOTE(review): cell text is inserted verbatim (no HTML
              # escaping), exactly as before -- confirm that consumers
              # expect raw text before changing this.
              parts.append("<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">")
              parts.append(bbox_text + "</td>" + "\n")
          parts.append("</tr>" + "\n")
      parts.append("</table>" + "\n")
      return "".join(parts)
  def sort_object(obj_list):
      """Sort a homogeneous list of document-tree objects in place.

      The type of the *first* element selects the sort key: ``_Table``,
      ``_Sentence`` and ``_Image`` lists are ordered by ``.y``, ``_Page``
      lists by ``.page_no``.  Lists of any other type, and empty lists, are
      returned untouched.

      Args:
          obj_list: List to sort.

      Returns:
          The same list object that was passed in.
      """
      # Nothing to sort: return before importing, so the trivial path does
      # not depend on the convert_tree module.
      if len(obj_list) == 0:
          return obj_list

      from format_convert.convert_tree import _Table, _Image, _Sentence, _Page

      first = obj_list[0]
      if isinstance(first, (_Table, _Sentence, _Image)):
          obj_list.sort(key=lambda x: x.y)
      elif isinstance(first, _Page):
          obj_list.sort(key=lambda x: x.page_no)
      return obj_list
  if __name__ == "__main__":
      # Ad-hoc manual check: print what slash_replace returns for a sample
      # Windows path.
      sample_path = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
      print(slash_replace(sample_path))