# utils.py
  1. import os
  2. import sys
  3. sys.path.append(os.path.dirname(__file__) + "/../")
  4. import difflib
  5. import logging
  6. import mimetypes
  7. import platform
  8. import re
  9. import traceback
  10. import filetype
  11. from bs4 import BeautifulSoup
  12. from pdfminer.layout import *
  13. def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8]):
  14. """
  15. [0] : continue
  16. [-1]: 逻辑处理错误
  17. [-2]: 接口调用错误
  18. [-3]: 文件格式错误,无法打开
  19. [-4]: 各类文件调用第三方包读取超时
  20. [-5]: 整个转换过程超时
  21. [-6]: 阿里云UDF队列超时
  22. [-7]: 文件需密码,无法打开
  23. [-8]: 调用现成接口报错
  24. """
  25. for c in code:
  26. if _list == [c]:
  27. return True
  28. return False
  29. def add_div(text):
  30. if text == "" or text is None:
  31. return text
  32. # if get_platform() == "Windows":
  33. # print("add_div", text)
  34. if re.findall("<div>", text):
  35. return text
  36. text = "<div>" + text + "\n"
  37. text = re.sub("\n", "</div>\n<div>", text)
  38. # text += "</div>"
  39. if text[-5:] == "<div>":
  40. # print("add_div has cut", text[-30:])
  41. text = text[:-5]
  42. return text
  43. def get_platform():
  44. sys = platform.system()
  45. return sys
  46. def get_html_p(html_path):
  47. logging.info("into get_html_p")
  48. try:
  49. with open(html_path, "r") as ff:
  50. html_str = ff.read()
  51. soup = BeautifulSoup(html_str, 'lxml')
  52. text = ""
  53. for p in soup.find_all("p"):
  54. p_text = p.text
  55. p_text = p_text.strip()
  56. if p.string != "":
  57. text += p_text
  58. text += "\n"
  59. return text
  60. except Exception as e:
  61. logging.info("get_html_p error!")
  62. print("get_html_p", traceback.print_exc())
  63. return [-1]
  64. def string_similarity(str1, str2):
  65. # 去掉<div>和回车
  66. str1 = re.sub("<div>", "", str1)
  67. str1 = re.sub("</div>", "", str1)
  68. str1 = re.sub("\n", "", str1)
  69. str2 = re.sub("<div>", "", str2)
  70. str2 = re.sub("</div>", "", str2)
  71. str2 = re.sub("\n", "", str2)
  72. # print("********************************")
  73. # print("str1", str1)
  74. # print("********************************")
  75. # print("str2", str2)
  76. # print("********************************")
  77. score = difflib.SequenceMatcher(None, str1, str2).ratio()
  78. print("string_similarity", score)
  79. return score
  80. def get_sequential_data(text_list, bbox_list, html=False):
  81. logging.info("into get_sequential_data")
  82. try:
  83. text = ""
  84. order_list = []
  85. for i in range(len(text_list)):
  86. length_start = bbox_list[i][0][0]
  87. length_end = bbox_list[i][1][0]
  88. height_start = bbox_list[i][0][1]
  89. height_end = bbox_list[i][-1][1]
  90. # print([length_start, length_end, height_start, height_end])
  91. order_list.append([text_list[i], length_start, length_end, height_start, height_end])
  92. # text = text + infomation['text'] + "\n"
  93. if get_platform() == "Windows":
  94. print("get_sequential_data", order_list)
  95. if not order_list:
  96. if get_platform() == "Windows":
  97. print("get_sequential_data", "no order list")
  98. return ""
  99. # 根据bbox的坐标对输出排序
  100. order_list.sort(key=lambda x: (x[3], x[1], x[0]))
  101. # 根据bbox分行分列
  102. # col_list = []
  103. # height_end = int((order_list[0][4] + order_list[0][3]) / 2)
  104. # for i in range(len(order_list)):
  105. # if height_end - threshold <= order_list[i][3] <= height_end + threshold:
  106. # col_list.append(order_list[i])
  107. # else:
  108. # row_list.append(col_list)
  109. # col_list = []
  110. # height_end = int((order_list[i][4] + order_list[i][3]) / 2)
  111. # col_list.append(order_list[i])
  112. # if i == len(order_list) - 1:
  113. # row_list.append(col_list)
  114. row_list = []
  115. used_box = []
  116. threshold = 5
  117. for box in order_list:
  118. if box in used_box:
  119. continue
  120. height_center = (box[4] + box[3]) / 2
  121. row = []
  122. for box2 in order_list:
  123. if box2 in used_box:
  124. continue
  125. height_center2 = (box2[4] + box2[3]) / 2
  126. if height_center - threshold <= height_center2 <= height_center + threshold:
  127. if box2 not in row:
  128. row.append(box2)
  129. used_box.append(box2)
  130. row.sort(key=lambda x: x[0])
  131. row_list.append(row)
  132. for row in row_list:
  133. if not row:
  134. continue
  135. if len(row) <= 1:
  136. text = text + row[0][0] + "\n"
  137. else:
  138. sub_text = ""
  139. row.sort(key=lambda x: x[1])
  140. for col in row:
  141. sub_text = sub_text + col[0] + " "
  142. sub_text = sub_text + "\n"
  143. text += sub_text
  144. if html:
  145. text = "<div>" + text
  146. text = re.sub("\n", "</div>\n<div>", text)
  147. text += "</div>"
  148. # if text[-5:] == "<div>":
  149. # text = text[:-5]
  150. return text
  151. except Exception as e:
  152. logging.info("get_sequential_data error!")
  153. print("get_sequential_data", traceback.print_exc())
  154. return [-1]
  155. # def get_formatted_table(text_list, text_bbox_list, table_bbox_list, split_line):
  156. # logging.info("into get_formatted_table")
  157. # try:
  158. # # 重新定义text_bbox_list,[point, point, text]
  159. # text_bbox_list = [[text_bbox_list[i][0], text_bbox_list[i][2], text_list[i]] for i in
  160. # range(len(text_bbox_list))]
  161. # # 按纵坐标排序
  162. # text_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
  163. # table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
  164. #
  165. # # print("text_bbox_list", text_bbox_list)
  166. # # print("table_bbox_list", table_bbox_list)
  167. #
  168. # # bbox位置 threshold
  169. # threshold = 5
  170. #
  171. # # 根据split_line分区,可能有个区多个表格 [(), ()]
  172. # area_text_bbox_list = []
  173. # area_table_bbox_list = []
  174. # # print("get_formatted_table, split_line", split_line)
  175. # for j in range(1, len(split_line)):
  176. # last_y = split_line[j - 1][0][1]
  177. # current_y = split_line[j][0][1]
  178. # temp_text_bbox_list = []
  179. # temp_table_bbox_list = []
  180. #
  181. # # 找出该区域下text bbox
  182. # for text_bbox in text_bbox_list:
  183. # # 计算 text bbox 中心点
  184. # text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
  185. # (text_bbox[1][1] + text_bbox[0][1]) / 2)
  186. # if last_y - threshold <= text_bbox_center[1] <= current_y + threshold:
  187. # temp_text_bbox_list.append(text_bbox)
  188. # area_text_bbox_list.append(temp_text_bbox_list)
  189. #
  190. # # 找出该区域下table bbox
  191. # for table_bbox in table_bbox_list:
  192. # # 计算 table bbox 中心点
  193. # table_bbox_center = ((table_bbox[1][0] + table_bbox[0][0]) / 2,
  194. # (table_bbox[1][1] + table_bbox[0][1]) / 2)
  195. # if last_y < table_bbox_center[1] < current_y:
  196. # temp_table_bbox_list.append(table_bbox)
  197. # area_table_bbox_list.append(temp_table_bbox_list)
  198. #
  199. # # for j in range(len(area_text_bbox_list)):
  200. # # print("area_text_bbox_list", j, area_text_bbox_list[j])
  201. #
  202. # # 对每个区域分别进行两个bbox匹配,生成表格
  203. # area_text_list = []
  204. # area_column_list = []
  205. # for j in range(len(area_text_bbox_list)):
  206. # # 每个区域的table bbox 和text bbox
  207. # temp_table_bbox_list = area_table_bbox_list[j]
  208. # temp_text_bbox_list = area_text_bbox_list[j]
  209. #
  210. # # 判断该区域有无表格bbox
  211. # # 若无表格,将该区域文字连接
  212. # if not temp_table_bbox_list:
  213. # # 找出该区域的所有text bbox
  214. # only_text_list = []
  215. # only_bbox_list = []
  216. # for text_bbox in temp_text_bbox_list:
  217. # only_text_list.append(text_bbox[2])
  218. # only_bbox_list.append([text_bbox[0], text_bbox[1]])
  219. # only_text = get_sequential_data(only_text_list, only_bbox_list, True)
  220. # if only_text == [-1]:
  221. # return [-1], [-1]
  222. # area_text_list.append(only_text)
  223. # area_column_list.append(0)
  224. # continue
  225. #
  226. # # 有表格
  227. # # 文本对应的表格格子
  228. # text_in_table = {}
  229. # for i in range(len(temp_text_bbox_list)):
  230. # text_bbox = temp_text_bbox_list[i]
  231. #
  232. # # 计算 text bbox 中心点
  233. # text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
  234. # (text_bbox[1][1] + text_bbox[0][1]) / 2)
  235. #
  236. # # 判断中心点在哪个table bbox中
  237. # for table_bbox in temp_table_bbox_list:
  238. # # 中心点在table bbox中,将text写入字典
  239. # if table_bbox[0][0] <= text_bbox_center[0] <= table_bbox[1][0] and \
  240. # table_bbox[0][1] <= text_bbox_center[1] <= table_bbox[1][1]:
  241. # if str(table_bbox) in text_in_table.keys():
  242. # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
  243. # else:
  244. # text_in_table[str(table_bbox)] = text_bbox[2]
  245. # break
  246. #
  247. # # 如果未找到text bbox匹配的table bbox,加大threshold匹配
  248. # # elif (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
  249. # # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]) or \
  250. # # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
  251. # # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
  252. # # (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
  253. # # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
  254. # # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
  255. # # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]):
  256. # # if str(table_bbox) in text_in_table.keys():
  257. # # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
  258. # # else:
  259. # # text_in_table[str(table_bbox)] = text_bbox[2]
  260. # # break
  261. #
  262. # # 对表格格子进行分行分列,并计算总计多少小列
  263. # # 放入坐标
  264. # all_col_list = []
  265. # all_row_list = []
  266. # for i in range(len(temp_table_bbox_list)):
  267. # table_bbox = temp_table_bbox_list[i]
  268. #
  269. # # 放入所有坐标x
  270. # if table_bbox[0][0] not in all_col_list:
  271. # all_col_list.append(table_bbox[0][0])
  272. # if table_bbox[1][0] not in all_col_list:
  273. # all_col_list.append(table_bbox[1][0])
  274. #
  275. # # 放入所有坐标y
  276. # if table_bbox[0][1] not in all_row_list:
  277. # all_row_list.append(table_bbox[0][1])
  278. # if table_bbox[1][1] not in all_row_list:
  279. # all_row_list.append(table_bbox[1][1])
  280. # all_col_list.sort(key=lambda x: x)
  281. # all_row_list.sort(key=lambda x: x)
  282. #
  283. # # 分行
  284. # row_list = []
  285. # rows = []
  286. # temp_table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0], x[1][1], x[1][0]))
  287. # y_row = temp_table_bbox_list[0][0][1]
  288. # for i in range(len(temp_table_bbox_list)):
  289. # table_bbox = temp_table_bbox_list[i]
  290. #
  291. # if y_row - threshold <= table_bbox[0][1] <= y_row + threshold:
  292. # rows.append(table_bbox)
  293. # else:
  294. # y_row = table_bbox[0][1]
  295. # if rows:
  296. # rows.sort(key=lambda x: x[0][0])
  297. # row_list.append(rows)
  298. # rows = []
  299. # rows.append(table_bbox)
  300. # # print("*" * 30)
  301. # # print(row_list)
  302. #
  303. # if i == len(temp_table_bbox_list) - 1:
  304. # if rows:
  305. # rows.sort(key=lambda x: x[0][0])
  306. # row_list.append(rows)
  307. #
  308. # # 生成表格,包括文字和格子宽度
  309. # area_column = []
  310. # text = '<table border="1">' + "\n"
  311. # for row in row_list:
  312. # text += "<tr>" + "\n"
  313. # for col in row:
  314. # # 计算bbox y坐标之间有多少其他点,+1即为所占行数
  315. # row_span = 1
  316. # for y in all_row_list:
  317. # if col[0][1] < y < col[1][1]:
  318. # if y - col[0][1] >= 2 and col[1][1] - y >= 2:
  319. # row_span += 1
  320. #
  321. # # 计算bbox x坐标之间有多少其他点,+1即为所占列数
  322. # col_span = 1
  323. # for x in all_col_list:
  324. # if col[0][0] < x < col[1][0]:
  325. # if x - col[0][0] >= 2 and col[1][0] - x >= 2:
  326. # col_span += 1
  327. #
  328. # text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
  329. #
  330. # if str(col) in text_in_table.keys():
  331. # text += text_in_table.get(str(col))
  332. # else:
  333. # text += ""
  334. # text += "</td>" + "\n"
  335. # text += "</tr>" + "\n"
  336. # text += "</table>" + "\n"
  337. #
  338. # # 计算最大column
  339. # max_col_num = 0
  340. # for row in row_list:
  341. # col_num = 0
  342. # for col in row:
  343. # col_num += 1
  344. # if max_col_num < col_num:
  345. # max_col_num = col_num
  346. #
  347. # area_text_list.append(text)
  348. # area_column_list.append(max_col_num)
  349. #
  350. # text = ""
  351. # if get_platform() == "Windows":
  352. # print("get_formatted_table area_text_list", area_text_list)
  353. # for area_text in area_text_list:
  354. # text += area_text
  355. # return text, area_column_list
  356. # except Exception as e:
  357. # logging.info("get_formatted_table error!")
  358. # print("get_formatted_table", traceback.print_exc())
  359. # return [-1], [-1]
  360. def rename_inner_files(root_path):
  361. try:
  362. logging.info("into rename_inner_files")
  363. # 获取解压文件夹下所有文件+文件夹,不带根路径
  364. path_list = []
  365. for root, dirs, files in os.walk(root_path, topdown=False):
  366. for name in dirs:
  367. p = os.path.join(root, name) + os.sep
  368. if get_platform() == "Windows":
  369. root_path = slash_replace(root_path)
  370. p = slash_replace(p)
  371. p = re.sub(root_path, "", p)
  372. root_path = slash_replace(root_path, True)
  373. p = slash_replace(p, True)
  374. else:
  375. p = re.sub(root_path, "", p)
  376. path_list.append(p)
  377. for name in files:
  378. p = os.path.join(root, name)
  379. if get_platform() == "Windows":
  380. root_path = slash_replace(root_path)
  381. p = slash_replace(p)
  382. p = re.sub(root_path, "", p)
  383. root_path = slash_replace(root_path, True)
  384. p = slash_replace(p, True)
  385. else:
  386. p = re.sub(root_path, "", p)
  387. path_list.append(p)
  388. # 按路径长度排序
  389. path_list.sort(key=lambda x: len(x), reverse=True)
  390. # 循环改名
  391. for old_path in path_list:
  392. # 按路径分隔符分割
  393. ss = old_path.split(os.sep)
  394. # 判断是否文件夹
  395. is_dir = 0
  396. file_type = ""
  397. if os.path.isdir(root_path + old_path):
  398. ss = ss[:-1]
  399. is_dir = 1
  400. else:
  401. if "." in old_path:
  402. file_type = "." + old_path.split(".")[-1]
  403. else:
  404. file_type = ""
  405. # 最后一级需要用hash改名
  406. new_path = ""
  407. # new_path = re.sub(ss[-1], str(hash(ss[-1])), old_path) + file_type
  408. current_level = 0
  409. for s in ss:
  410. # 路径拼接
  411. if current_level < len(ss) - 1:
  412. new_path += s + os.sep
  413. else:
  414. new_path += str(hash(s)) + file_type
  415. current_level += 1
  416. new_ab_path = root_path + new_path
  417. old_ab_path = root_path + old_path
  418. os.rename(old_ab_path, new_ab_path)
  419. # 重新获取解压文件夹下所有文件+文件夹
  420. new_path_list = []
  421. for root, dirs, files in os.walk(root_path, topdown=False):
  422. for name in dirs:
  423. new_path_list.append(os.path.join(root, name) + os.sep)
  424. for name in files:
  425. new_path_list.append(os.path.join(root, name))
  426. return new_path_list
  427. except:
  428. traceback.print_exc()
  429. return [-1]
  430. def judge_format(path):
  431. guess1 = mimetypes.guess_type(path)
  432. _type = None
  433. if guess1[0]:
  434. _type = guess1[0]
  435. else:
  436. guess2 = filetype.guess(path)
  437. if guess2:
  438. _type = guess2.mime
  439. if _type == "application/pdf":
  440. return "pdf"
  441. if _type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
  442. return "docx"
  443. if _type == "application/x-zip-compressed" or _type == "application/zip":
  444. return "zip"
  445. if _type == "application/x-rar-compressed" or _type == "application/rar":
  446. return "rar"
  447. if _type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
  448. return "xlsx"
  449. if _type == "application/msword":
  450. return "doc"
  451. if _type == "image/png":
  452. return "png"
  453. if _type == "image/jpeg":
  454. return "jpg"
  455. # 猜不到,返回None
  456. return None
  457. def slash_replace(_str, reverse=False):
  458. if reverse:
  459. _str = eval(repr(_str).replace('/', '\\\\'))
  460. else:
  461. _str = eval(repr(_str).replace('\\\\', '/'))
  462. return _str
  463. class LineTable:
    def recognize_table(self,list_textbox, list_line,sourceP_LB=True):
        """Build tables from a page's ruling lines and text boxes.

        Steps: store ``list_line`` on the instance, compute the crossing
        points of the lines, cluster crossing points that share a line,
        convert every cluster to rectangles with ``crosspoint2rect`` and hand
        each rectangle group to ``self.rect2table`` (defined outside this
        view).

        Args:
            list_textbox: text boxes, forwarded to ``rect2table``.
            list_line: line objects, forwarded to ``recognize_crosspoints``.
            sourceP_LB: forwarded unchanged to ``rect2table``.

        Returns:
            ``(list_tables, in_objs, list_l_rect)``: the truthy results of
            ``rect2table``, the set passed to every ``rect2table`` call
            (presumably filled there -- verify), and the rectangle groups.
        """
        self.list_line = list_line
        self.list_crosspoints = self.recognize_crosspoints(list_line)

        # Clustering: start with one cluster per crossing point.
        cluster_crosspoints = []
        for _point in self.list_crosspoints:
            cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})
        # Repeat merge passes until a pass performs no merge.
        while 1:
            _find = False
            new_cluster_crosspoints = []
            for l_point in cluster_crosspoints:
                _flag = False
                for l_n_point in new_cluster_crosspoints:
                    line1 = l_point.get("lines")
                    line2 = l_n_point.get("lines")
                    # Clusters sharing at least one line key are merged.
                    # There is no break, so one cluster can be merged into
                    # several existing clusters in the same pass.
                    if len(line1&line2) > 0:
                        _find = True
                        _flag = True
                        l_n_point["lines"] = line1.union(line2)
                        l_n_point["points"].extend(l_point["points"])
                if not _flag:
                    new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
            cluster_crosspoints = new_cluster_crosspoints
            if not _find:
                break

        # One rectangle list per cluster.
        list_l_rect = []
        for table_crosspoint in cluster_crosspoints:
            list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
            list_l_rect.append(list_rect)

        in_objs = set()
        list_tables = []
        for l_rect in list_l_rect:
            _ta = self.rect2table(list_textbox,l_rect,in_objs,sourceP_LB=sourceP_LB)
            if _ta:
                list_tables.append(_ta)
        # self._plot(list_line, list_textbox)
        return list_tables, in_objs, list_l_rect
    def recognize_table_by_rect(self, list_textbox, list_rect, margin=2):
        """Build tables from rectangle objects instead of ruling lines.

        Rectangles are grouped into clusters of adjoining rectangles and each
        cluster is passed to ``self.rect2table`` (defined outside this view).

        Args:
            list_textbox: text boxes, forwarded to ``rect2table``.
            list_rect: objects with a 4-item ``bbox`` (read as
                x0, y0, x1, y1).
            margin: tolerance used when testing whether two rects adjoin.

        Returns:
            ``(list_tables, in_objs, list_l_rect)``: the truthy results of
            ``rect2table``, the set passed to every ``rect2table`` call and
            the rectangle clusters.
        """
        dump_margin = 5
        list_rect_tmp = []
        # De-duplicate: skip small rects (height < 10 or width < 5) and rects
        # whose four bbox values are all within dump_margin of a kept rect.
        # NOTE(review): list_rect_tmp is never read after this loop; the
        # clustering below iterates the original list_rect. Confirm whether
        # that is intended.
        for _rect in list_rect:
            if (_rect.bbox[3]-_rect.bbox[1] < 10) or (abs(_rect.bbox[2]-_rect.bbox[0]) < 5):
                continue
            _find = False
            for _tmp in list_rect_tmp:
                for i in range(4):
                    if abs(_rect.bbox[i]-_tmp.bbox[i]) < dump_margin:
                        pass
                    else:
                        _find = False
                        break
                    # All four coordinates matched.
                    if i == 3:
                        _find = True
                if _find:
                    break
            if not _find:
                list_rect_tmp.append(_rect)

        # Cluster rects: a rect joins the first cluster containing a member
        # for which the sum of both widths (or both heights) differs from the
        # width (or height) of their joint extent by less than margin.
        cluster_rect = []
        for _rect in list_rect:
            _find = False
            for cr in cluster_rect:
                for cr_rect in cr:
                    if abs((cr_rect.bbox[2]-cr_rect.bbox[0]+_rect.bbox[2]-_rect.bbox[0])-(max(cr_rect.bbox[2],_rect.bbox[2])-min(cr_rect.bbox[0],_rect.bbox[0])))<margin:
                        _find = True
                        cr.append(_rect)
                        break
                    elif abs((cr_rect.bbox[3]-cr_rect.bbox[1]+_rect.bbox[3]-_rect.bbox[1])-(max(cr_rect.bbox[3],_rect.bbox[3])-min(cr_rect.bbox[1],_rect.bbox[1])))<margin:
                        _find = True
                        cr.append(_rect)
                        break
                if _find:
                    break
            if not _find:
                cluster_rect.append([_rect])

        list_l_rect = cluster_rect

        in_objs = set()
        list_tables = []
        for l_rect in list_l_rect:
            _ta = self.rect2table(list_textbox,l_rect,in_objs)
            if _ta:
                list_tables.append(_ta)
        return list_tables,in_objs,list_l_rect
    def recognize_crosspoints(self, list_line,fixLine=True):
        """Compute the crossing points of all pairs of lines.

        Args:
            list_line: objects whose ``bbox`` attribute holds the four
                coordinates of a line. When ``fixLine`` is true this list may
                be extended in place with additional border lines.
            fixLine: when true, crossing points are clustered by shared
                lines, missing outer border lines of each cluster are added,
                and the crossing points are recomputed per cluster.

        Returns:
            A list of crossing-point dicts as produced by ``cross_point``.
        """
        list_crosspoints = []
        # print("lines num",len(list_line))

        def getMaxPoints(list_x,margin=5,reverse=False):
            # Group the values: a value joins the first group whose first
            # element is closer than margin. The groups (lists) are then
            # sorted and the first value and size of the first group are
            # returned. Raises IndexError when list_x is empty.
            clust_x = []
            for _x in list_x:
                _find = False
                for cx in clust_x:
                    if abs(cx[0]-_x)<margin:
                        _find = True
                        cx.append(_x)
                        break
                if not _find:
                    clust_x.append([_x])
            clust_x.sort(key=lambda x:x,reverse=reverse)
            return clust_x[0][0],len(clust_x[0])

        # First pass: test every ordered pair of lines (including a line
        # with itself).
        for _i in range(len(list_line)):
            for _j in range(len(list_line)):
                line1 = list_line[_i].__dict__.get("bbox")
                line2 = list_line[_j].__dict__.get("bbox")
                exists,point = self.cross_point(line1,line2)
                if exists:
                    list_crosspoints.append(point)

        if fixLine:
            # Clustering: start with one cluster per crossing point.
            cluster_crosspoints = []
            for _point in list_crosspoints:
                cluster_crosspoints.append({"lines":_point.get("lines"),"points":[_point]})
            # Repeat merge passes until a pass performs no merge.
            while 1:
                _find = False
                new_cluster_crosspoints = []
                for l_point in cluster_crosspoints:
                    _flag = False
                    for l_n_point in new_cluster_crosspoints:
                        line1 = l_point.get("lines")
                        line2 = l_n_point.get("lines")
                        # Clusters sharing at least one line key are merged.
                        if len(line1&line2)>0:
                            _find = True
                            _flag = True
                            l_n_point["lines"] = line1.union(line2)
                            l_n_point["points"].extend(l_point["points"])
                    if not _flag:
                        new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
                cluster_crosspoints = new_cluster_crosspoints
                if not _find:
                    break

            # Recompute the crossing points cluster by cluster.
            list_crosspoints = []
            for list_cp in cluster_crosspoints:
                points = list_cp.get("points")
                # Distinct line bboxes taking part in this cluster.
                l_lines = []
                for p in points:
                    l_lines.extend(p.get("p_lines"))
                l_lines = list(set(l_lines))
                l_lines.sort(key=lambda x:x[0])

                # Candidate border coordinates; a candidate is discarded when
                # too few lines share it.
                # NOTE(review): min_y uses "_count<2" while the other three
                # use "_count<=2" -- confirm whether that is intentional.
                min_x,_count = getMaxPoints([l[0] for l in l_lines],reverse=False)
                if _count<=2:
                    min_x = None
                min_y,_count = getMaxPoints([l[1] for l in l_lines],reverse=False)
                if _count<2:
                    min_y = None
                max_x,_count = getMaxPoints([l[2] for l in l_lines],reverse=True)
                if _count<=2:
                    max_x = None
                max_y,_count = getMaxPoints([l[3] for l in l_lines],reverse=True)
                if _count<=2:
                    max_y = None

                # NOTE(review): this is a truthiness test, so a coordinate of
                # 0 is treated like a missing one.
                if min_x and min_y and max_x and max_y:
                    # Add a border line wherever the outermost crossing point
                    # lies more than 30 units from the candidate border.
                    # LTLine comes from the star import of pdfminer.layout.
                    points.sort(key=lambda x:x["point"][0])
                    if abs(min_x-points[0]["point"][0])>30:
                        _line = LTLine(1,(min_x,min_y),(min_x,max_y))
                        list_line.append(_line)
                        l_lines.append(_line.bbox)
                        # print("add=====",_line.bbox)
                    if abs(max_x-points[-1]["point"][0])>30:
                        _line = LTLine(1,(max_x,min_y),(max_x,max_y))
                        list_line.append(_line)
                        l_lines.append(_line.bbox)
                        # print("add=====1",_line.bbox)
                    points.sort(key=lambda x:x["point"][1])
                    if abs(min_y-points[0]["point"][1])>30:
                        _line = LTLine(1,(min_x,min_y),(max_x,min_y))
                        list_line.append(_line)
                        l_lines.append(_line.bbox)
                        # print("add=====2",_line.bbox)
                    if abs(max_y-points[-1]["point"][1])>30:
                        _line = LTLine(1,(min_x,max_y),(max_x,max_y))
                        list_line.append(_line)
                        l_lines.append(_line.bbox)
                        # print("add=====2",_line.bbox)

                # Crossing points among this cluster's (possibly extended)
                # set of lines.
                for _i in range(len(l_lines)):
                    for _j in range(len(l_lines)):
                        line1 = l_lines[_i]
                        line2 = l_lines[_j]
                        exists,point = self.cross_point(line1,line2)
                        if exists:
                            list_crosspoints.append(point)

        # print("points num",len(list_crosspoints))
        return list_crosspoints
    def recognize_rect(self, _page):
        """Derive rectangle groups from the ``LTLine`` objects of a page.

        Args:
            _page: object whose ``_objs`` attribute is iterated; only
                ``LTLine`` instances are used.

        Returns:
            A list with one rectangle list (from ``crosspoint2rect``) per
            cluster of crossing points that share lines.
        """
        list_line = []
        for _obj in _page._objs:
            if isinstance(_obj, (LTLine)):
                list_line.append(_obj)
        list_crosspoints = self.recognize_crosspoints(list_line)

        # Clustering: start with one cluster per crossing point.
        cluster_crosspoints = []
        for _point in list_crosspoints:
            cluster_crosspoints.append({"lines":_point.get("lines"),"points":[_point]})
        # Repeat merge passes until a pass performs no merge.
        while 1:
            _find = False
            new_cluster_crosspoints = []
            for l_point in cluster_crosspoints:
                _flag = False
                for l_n_point in new_cluster_crosspoints:
                    line1 = l_point.get("lines")
                    line2 = l_n_point.get("lines")
                    # Clusters sharing at least one line key are merged.
                    if len(line1&line2)>0:
                        _find = True
                        _flag = True
                        l_n_point["lines"] = line1.union(line2)
                        l_n_point["points"].extend(l_point["points"])
                if not _flag:
                    new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
            cluster_crosspoints = new_cluster_crosspoints
            if not _find:
                break
        # print(len(cluster_crosspoints))

        # One rectangle list per cluster.
        list_l_rect = []
        for table_crosspoint in cluster_crosspoints:
            list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
            list_l_rect.append(list_rect)
        return list_l_rect
    def crosspoint2rect(self, list_crosspoint, margin=5):
        """Turn a group of crossing points into rectangles (``LTRect``).

        For every crossing point the next point along its "row" line and,
        from there, the next point along that point's "column" line are
        looked up; the starting point and the final point become two
        opposite corners of a rectangle.

        Args:
            list_crosspoint: crossing-point dicts as produced by
                ``cross_point`` (keys "point", "left", "right", "buttom",
                "lines" are read here).
            margin: minimum extent a line must have beyond a point for the
                point to be used, and minimum width/height of a kept rect.

        Returns:
            The list of distinct ``LTRect`` objects larger than ``margin`` in
            both dimensions.
        """
        # Index the points by the key of each line they lie on.
        dict_line_points = {}
        for _point in list_crosspoint:
            lines = list(_point.get("lines"))
            for _line in lines:
                if _line not in dict_line_points:
                    dict_line_points[_line] = {"direct":None,"points":[]}
                dict_line_points[_line]["points"].append(_point)

        # Sort the points of each line along its dominant direction and
        # record that direction: a wider x spread means "row", else "column".
        for k, v in dict_line_points.items():
            list_x = []
            list_y = []
            for _p in v["points"]:
                list_x.append(_p.get("point")[0])
                list_y.append(_p.get("point")[1])
            if max(list_x)-min(list_x)>max(list_y)-min(list_y):
                v.get("points").sort(key=lambda x:x.get("point")[0])
                v["direct"] = "row"
            else:
                v.get("points").sort(key=lambda x:x.get("point")[1])
                v["direct"] = "column"

        list_rect = []
        for _point in list_crosspoint:
            # Only start from points whose "buttom" and "right" extents are
            # at least margin.
            if _point["buttom"]>=margin and _point["right"]>=margin:
                # Pick the point's row line.
                # NOTE(review): lines[1] assumes the point's "lines" set has
                # two entries; a line crossed with itself yields only one.
                lines = list(_point.get("lines"))
                _line = lines[0]
                if dict_line_points[_line]["direct"]=="column":
                    _line = lines[1]
                # First point further along x on that line.
                next_point = None
                for p1 in dict_line_points[_line]["points"]:
                    if p1["buttom"]>=margin and p1["point"][0]>_point["point"][0]:
                        next_point = p1
                        break
                if not next_point:
                    continue
                # Pick the next point's column line.
                lines = list(next_point.get("lines"))
                _line = lines[0]
                if dict_line_points[_line]["direct"]=="row":
                    _line = lines[1]
                # First point further along y on that line.
                final_point = None
                for p1 in dict_line_points[_line]["points"]:
                    if p1["left"]>=margin and p1["point"][1]>next_point["point"][1]:
                        final_point = p1
                        break
                if not final_point:
                    continue
                _r = LTRect(1,(_point["point"][0],_point["point"][1],final_point["point"][0],final_point["point"][1]))
                list_rect.append(_r)

        # Drop rects that are too small and de-duplicate by formatted bbox.
        tmp_rect = []
        set_bbox = set()
        for _r in list_rect:
            _bbox = "%.2f-%.2f-%.2f-%.2f"%_r.bbox
            width = _r.bbox[2]-_r.bbox[0]
            height = _r.bbox[3]-_r.bbox[1]
            if width<=margin or height<=margin:
                continue
            if _bbox not in set_bbox:
                tmp_rect.append(_r)
                set_bbox.add(_bbox)
        list_rect = tmp_rect
        return list_rect
  775. def cross_point(self, line1, line2, segment=True, margin=2):
  776. point_is_exist = False
  777. x = y = 0
  778. x1, y1, x2, y2 = line1
  779. x3, y3, x4, y4 = line2
  780. if (x2 - x1) == 0:
  781. k1 = None
  782. b1 = 0
  783. else:
  784. k1 = (y2 - y1) * 1.0 / (x2 - x1) # 计算k1,由于点均为整数,需要进行浮点数转化
  785. b1 = y1 * 1.0 - x1 * k1 * 1.0 # 整型转浮点型是关键
  786. if (x4 - x3) == 0: # L2直线斜率不存在
  787. k2 = None
  788. b2 = 0
  789. else:
  790. k2 = (y4 - y3) * 1.0 / (x4 - x3) # 斜率存在
  791. b2 = y3 * 1.0 - x3 * k2 * 1.0
  792. if k1 is None:
  793. if not k2 is None:
  794. x = x1
  795. y = k2 * x1 + b2
  796. point_is_exist = True
  797. elif k2 is None:
  798. x = x3
  799. y = k1 * x3 + b1
  800. elif not k2 == k1:
  801. x = (b2 - b1) * 1.0 / (k1 - k2)
  802. y = k1 * x * 1.0 + b1 * 1.0
  803. point_is_exist = True
  804. left = 0
  805. right = 0
  806. top = 0
  807. buttom = 0
  808. if point_is_exist:
  809. if segment:
  810. if x>=(min(x1,x2)-margin) and x<=(max(x1,x2)+margin) and y>=(min(y1,y2)-margin) and y<=(max(y1,y2)+margin):
  811. if x>=(min(x3,x4)-margin) and x<=(max(x3,x4)+margin) and y>=(min(y3,y4)-margin) and y<=(max(y3,y4)+margin):
  812. point_is_exist = True
  813. left = abs(min(x1,x3)-x)
  814. right = abs(max(x2,x4)-x)
  815. top = abs(min(y1,y3)-y)
  816. buttom = abs(max(y2,y4)-y)
  817. else:
  818. point_is_exist = False
  819. else:
  820. point_is_exist = False
  821. line1_key = "%.2f-%.2f-%.2f-%.2f"%(x1, y1, x2, y2)
  822. line2_key = "%.2f-%.2f-%.2f-%.2f"%(x3, y3, x4, y4)
  823. return point_is_exist, {"point": [x, y], "left": left, "right": right,
  824. "top": top, "buttom": buttom, "lines": set([line1_key,line2_key]),"p_lines":[line1,line2]}
  825. def unionTable(self, list_table, fixspan=True, margin=2):
  826. set_x = set()
  827. set_y = set()
  828. list_cell = []
  829. for _t in list_table:
  830. for _line in _t:
  831. list_cell.extend(_line)
  832. clusters_rects = []
  833. #根据y1聚类
  834. set_id = set()
  835. list_cell_dump = []
  836. for _cell in list_cell:
  837. _id = id(_cell)
  838. if _id in set_id:
  839. continue
  840. set_id.add(_id)
  841. list_cell_dump.append(_cell)
  842. list_cell = list_cell_dump
  843. list_cell.sort(key=lambda x:x.get("bbox")[3])
  844. for _rect in list_cell:
  845. _y0 = _rect.get("bbox")[3]
  846. _find = False
  847. for l_cr in clusters_rects:
  848. if abs(l_cr[0].get("bbox")[3]-_y0)<2:
  849. _find = True
  850. l_cr.append(_rect)
  851. break
  852. if not _find:
  853. clusters_rects.append([_rect])
  854. clusters_rects.sort(key=lambda x:x[0].get("bbox")[3],reverse=True)
  855. for l_cr in clusters_rects:
  856. l_cr.sort(key=lambda x:x.get("bbox")[0])
  857. # print("=============:")
  858. # for l_r in clusters_rects:
  859. # print(len(l_r))
  860. for _line in clusters_rects:
  861. for _rect in _line:
  862. (x0,y0,x1,y1) = _rect.get("bbox")
  863. set_x.add(x0)
  864. set_x.add(x1)
  865. set_y.add(y0)
  866. set_y.add(y1)
  867. if len(set_x)==0 or len(set_y)==0:
  868. return
  869. list_x = list(set_x)
  870. list_y = list(set_y)
  871. list_x.sort(key=lambda x:x)
  872. list_y.sort(key=lambda x:x,reverse=True)
  873. _table = []
  874. for _line in clusters_rects:
  875. table_line = []
  876. for _rect in _line:
  877. (x0,y0,x1,y1) = _rect.get("bbox")
  878. _cell = {"bbox":(x0,y0,x1,y1),"rect":_rect.get("rect"),"rowspan":self.getspan(list_y,y0,y1,margin),"columnspan":self.getspan(list_x,x0,x1,margin),"text":_rect.get("text","")}
  879. table_line.append(_cell)
  880. _table.append(table_line)
  881. # print("=====================>>")
  882. # for _line in _table:
  883. # for _cell in _line:
  884. # print(_cell,end="\t")
  885. # print("\n")
  886. # print("=====================>>")
  887. # print(_table)
  888. if fixspan:
  889. for _line in _table:
  890. extend_line = []
  891. for c_i in range(len(_line)):
  892. _cell = _line[c_i]
  893. if _cell.get("columnspan")>1:
  894. _cospan = _cell.get("columnspan")
  895. _cell["columnspan"] = 1
  896. for i in range(1,_cospan):
  897. extend_line.append({"index":c_i+1,"cell":_cell})
  898. extend_line.sort(key=lambda x:x["index"],reverse=True)
  899. for _el in extend_line:
  900. _line.insert(_el["index"],_el["cell"])
  901. for l_i in range(len(_table)):
  902. _line = _table[l_i]
  903. for c_i in range(len(_line)):
  904. _cell = _line[c_i]
  905. if _cell.get("rowspan")>1:
  906. _rospan = _cell.get("rowspan")
  907. _cell["rowspan"] = 1
  908. for i in range(1,_rospan):
  909. _table[l_i+i].insert(c_i,_cell)
  910. table_bbox = (_table[0][0].get("bbox")[0],_table[0][0].get("bbox")[1],_table[-1][-1].get("bbox")[2],_table[-1][-1].get("bbox")[3])
  911. ta = {"bbox":table_bbox,"table":_table}
  912. return ta
    def rect2table(self, list_textbox, list_rect, in_objs, margin=5, fixspan=True,sourceP_LB=True,fixRect=True):
        """Assemble a table from cell rectangles and fill the cells with text.

        Args:
            list_textbox: Objects exposing ``.bbox`` and ``.get_text()``.
                The list is sorted in place.
            list_rect: Objects exposing ``.bbox`` as ``(x0, y0, x1, y1)``.
                The list is sorted in place.
            in_objs: Collection supporting ``add`` and ``in``; every text box
                that is placed into a cell is added to it.
            margin: Tolerance used for row clustering, span counting and gap
                detection between neighbouring cells.
            fixspan: When true, spanning cells are repeated (same dict object)
                in the slots they cover and their span counters reset to 1.
            sourceP_LB: When true rows are clustered on ``bbox[3]`` and sorted
                in descending order; otherwise on ``bbox[1]`` in ascending
                order. Presumably "source point at left-bottom", i.e. the
                y axis grows upwards -- confirm with the caller.
            fixRect: When true, empty filler cells are inserted where a row
                has horizontal gaps or does not reach the outer x bounds.

        Returns:
            ``{"bbox": ..., "table": rows}``, or ``None`` when there are no
            coordinates or at most one rectangle.
        """
        # NOTE(review): this local helper is never called inside this method;
        # text matching goes through self.inbox instead.
        def getIOU(bbox0,bbox1):
            width = max(bbox0[2],bbox1[2])-min(bbox0[0],bbox1[0])-(bbox0[2]-bbox0[0]+bbox1[2]-bbox1[0])
            height = max(bbox0[3],bbox1[3])-min(bbox0[1],bbox1[1])-(bbox0[3]-bbox0[1]+bbox1[3]-bbox1[1])
            if width<0 and height<0:
                return abs(width*height/min(abs((bbox0[2]-bbox0[0])*(bbox0[3]-bbox0[1])),abs((bbox1[2]-bbox1[0])*(bbox1[3]-bbox1[1]))))
            return 0

        _table = []
        set_x = set()
        set_y = set()

        clusters_rects = []
        # Cluster rectangles into rows: a rectangle joins the first cluster
        # whose leading rectangle is closer than `margin` on the chosen y edge.
        if sourceP_LB:
            list_rect.sort(key=lambda x:x.bbox[3])
            for _rect in list_rect:
                _y0 = _rect.bbox[3]
                _find = False
                for l_cr in clusters_rects:
                    if abs(l_cr[0].bbox[3]-_y0)<margin:
                        _find = True
                        l_cr.append(_rect)
                        break
                if not _find:
                    clusters_rects.append([_rect])
        else:
            list_rect.sort(key=lambda x:x.bbox[1])
            for _rect in list_rect:
                _y0 = _rect.bbox[1]
                _find = False
                for l_cr in clusters_rects:
                    if abs(l_cr[0].bbox[1]-_y0)<margin:
                        _find = True
                        l_cr.append(_rect)
                        break
                if not _find:
                    clusters_rects.append([_rect])

        # Collect every x / y edge; they form the grid used to count spans.
        for _line in clusters_rects:
            for _rect in _line:
                (x0,y0,x1,y1) = _rect.bbox
                set_x.add(x0)
                set_x.add(x1)
                set_y.add(y0)
                set_y.add(y1)
        if len(set_x)==0 or len(set_y)==0:
            return
        # A single rectangle is not treated as a table.
        if len(list_rect)<=1:
            return

        list_x = list(set_x)
        list_y = list(set_y)

        list_x.sort(key=lambda x:x)
        list_y.sort(key=lambda x:x,reverse=sourceP_LB)

        # Order the rows (direction depends on sourceP_LB), then the cells of
        # each row from left to right.
        if sourceP_LB:
            clusters_rects.sort(key=lambda x:x[0].bbox[3],reverse=sourceP_LB)
        else:
            clusters_rects.sort(key=lambda x:x[0].bbox[1],reverse=sourceP_LB)

        for l_cr in clusters_rects:
            l_cr.sort(key=lambda x:x.bbox[0])

        # Drop x coordinates lying closer than 5 units to their predecessor.
        # The threshold is hard-coded and independent of `margin`.
        pop_x = []
        for i in range(len(list_x)-1):
            _i = len(list_x)-i-1
            l_i = _i-1
            if abs(list_x[_i]-list_x[l_i])<5:
                pop_x.append(_i)
        # Pop from the highest index down so the remaining indices stay valid.
        pop_x.sort(key=lambda x:x,reverse=True)
        for _x in pop_x:
            list_x.pop(_x)

        # Same de-duplication for the y coordinates.
        pop_x = []
        for i in range(len(list_y)-1):
            _i = len(list_y)-i-1
            l_i = _i-1
            if abs(list_y[_i]-list_y[l_i])<5:
                pop_x.append(_i)
        pop_x.sort(key=lambda x:x,reverse=True)
        for _x in pop_x:
            list_y.pop(_x)

        # Build one cell dict per rectangle; spans are the number of grid
        # coordinates the rectangle covers, as computed by getspan.
        for _line in clusters_rects:
            table_line = []
            for _rect in _line:
                (x0, y0, x1, y1) = _rect.bbox
                _cell = {"bbox": (x0, y0, x1, y1),
                         "rect": _rect,
                         "rowspan": self.getspan(list_y, y0, y1, margin),
                         "columnspan": self.getspan(list_x, x0, x1, margin),
                         "text": ""}
                table_line.append(_cell)
            _table.append(table_line)

        # Two stable sorts: primary key bbox[3], ties ordered by bbox[0].
        list_textbox.sort(key=lambda x:x.bbox[0])
        list_textbox.sort(key=lambda x:x.bbox[3],reverse=sourceP_LB)
        # First pass: put each text box into the first cell accepted by
        # self.inbox, with all whitespace removed from its text.
        for textbox in list_textbox:
            (x0,y0,x1,y1) = textbox.bbox
            _text = re.sub('[\s\r\n]','',textbox.get_text())
            _find = False
            for table_line in _table:
                for _cell in table_line:
                    if self.inbox(textbox.bbox, _cell["bbox"], textbox.get_text()):
                        _cell["text"] += _text
                        in_objs.add(textbox)
                        _find = True
                        break
                if _find:
                    break

        if fixspan:
            # Expand column spans by inserting the same cell object again.
            # NOTE(review): range(len(_line)) is evaluated before the inserts,
            # so cells shifted beyond the original length are not visited and
            # may keep columnspan > 1 -- unionTable collects the inserts first
            # and applies them afterwards; confirm whether that is wanted here.
            for _line in _table:
                for c_i in range(len(_line)):
                    _cell = _line[c_i]
                    if _cell.get("columnspan")>1:
                        _cospan = _cell.get("columnspan")
                        _cell["columnspan"] = 1
                        for i in range(1,_cospan):
                            _line.insert(c_i,_cell)
            # Expand row spans into the same column index of the rows below.
            for l_i in range(len(_table)):
                _line = _table[l_i]
                for c_i in range(len(_line)):
                    _cell = _line[c_i]
                    if _cell.get("rowspan")>1:
                        _rospan = _cell.get("rowspan")
                        _cell["rowspan"] = 1
                        for i in range(1,_rospan):
                            # Ignore the part of a span that runs past the last row.
                            if l_i+i<=len(_table)-1:
                                _table[l_i+i].insert(c_i,_cell)

        if fixRect:
            for _line in _table:
                # Filler cells are collected first and inserted afterwards so
                # the indices used while scanning the row stay valid.
                extend_line = []
                for c_i in range(len(_line)):
                    c_cell = _line[c_i]
                    # Row does not start at the leftmost grid coordinate:
                    # add a filler in front (exact comparison, no margin).
                    if c_i==0 and c_cell["bbox"][0]!=list_x[0]:
                        _bbox = (list_x[0],c_cell["bbox"][1], c_cell["bbox"][0],c_cell["bbox"][3])
                        _cell = {"bbox": _bbox,
                                 "rect": LTRect(1,_bbox),
                                 "rowspan": self.getspan(list_y,_bbox[1], _bbox[3], margin),
                                 "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
                                 "text": ""}
                        extend_line.append({"index":c_i,"cell":_cell})
                    if c_i<len(_line)-1:
                        n_cell = _line[c_i+1]
                        _bbox = c_cell["bbox"]
                        n_bbox = n_cell["bbox"]
                        # Neighbour with the same x range: presumably a copy
                        # produced by the span expansion, nothing to fill.
                        if _bbox[0]==n_bbox[0] and _bbox[2]==n_bbox[2]:
                            continue
                        else:
                            # Horizontal gap wider than margin: add a filler
                            # between the two cells.
                            if abs(_bbox[2]-n_bbox[0])>margin:
                                _bbox = (_bbox[2],_bbox[1], n_bbox[0],_bbox[3])
                                _cell = {"bbox": _bbox,
                                         "rect": LTRect(1,_bbox),
                                         "rowspan": self.getspan(list_y,_bbox[1], _bbox[3], margin),
                                         "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
                                         "text": ""}
                                extend_line.append({"index":c_i+1,"cell":_cell})
                    if c_i==len(_line)-1:
                        # Last cell ends before the rightmost grid coordinate:
                        # add a filler behind it.
                        if abs(c_cell["bbox"][2]-list_x[-1])>margin:
                            _bbox = (c_cell["bbox"][2],c_cell["bbox"][1], list_x[-1],c_cell["bbox"][3])
                            _cell = {"bbox": _bbox,
                                     "rect": LTRect(1,_bbox),
                                     "rowspan": self.getspan(list_y,_bbox[1], _bbox[3], margin),
                                     "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
                                     "text": ""}
                            extend_line.append({"index":c_i+1,"cell":_cell})
                # Insert from the highest index down.
                extend_line.sort(key=lambda x:x["index"],reverse=True)
                for _tmp in extend_line:
                    _line.insert(_tmp["index"],_tmp["cell"])

        # Second pass: text boxes not yet recorded in in_objs get another
        # chance, now also against the filler cells. Unlike the first pass the
        # text is appended without stripping whitespace.
        list_textbox.sort(key=lambda x:x.bbox[0])
        list_textbox.sort(key=lambda x:x.bbox[3],reverse=sourceP_LB)
        for textbox in list_textbox:
            if textbox in in_objs:
                continue
            (x0,y0,x1,y1) = textbox.bbox
            _text = textbox.get_text()
            _find = False
            for table_line in _table:
                for _cell in table_line:
                    if self.inbox(textbox.bbox,_cell["bbox"], textbox.get_text()):
                        _cell["text"] += _text
                        in_objs.add(textbox)
                        _find = True
                        break
                if _find:
                    break

        # Table bbox: first cell of the first row to last cell of the last row.
        table_bbox = (_table[0][0].get("bbox")[0],
                      _table[0][0].get("bbox")[1],
                      _table[-1][-1].get("bbox")[2],
                      _table[-1][-1].get("bbox")[3])

        ta = {"bbox": table_bbox, "table": _table}
        return ta
  1109. def inbox(self, bbox0, bbox_g, text=""):
  1110. # if bbox_g[0]<=bbox0[0] and bbox_g[1]<=bbox0[1] and bbox_g[2]>=bbox0[2] and bbox_g[3]>=bbox0[3]:
  1111. # return 1
  1112. # print("utils inbox", text, self.getIOU(bbox0,bbox_g), bbox0, bbox_g)
  1113. if self.getIOU(bbox0,bbox_g)>0.5:
  1114. return 1
  1115. return 0
  1116. def getIOU(self, bbox0, bbox1):
  1117. width = max(bbox0[2],bbox1[2])-min(bbox0[0],bbox1[0])-(bbox0[2]-bbox0[0]+bbox1[2]-bbox1[0])
  1118. height = max(bbox0[3],bbox1[3])-min(bbox0[1],bbox1[1])-(bbox0[3]-bbox0[1]+bbox1[3]-bbox1[1])
  1119. if width < 0 and height < 0:
  1120. iou = abs(width*height/min(abs((bbox0[2]-bbox0[0])*(bbox0[3]-bbox0[1])),
  1121. abs((bbox1[2]-bbox1[0])*(bbox1[3]-bbox1[1]))))
  1122. # print("getIOU", iou)
  1123. return iou
  1124. return 0
  1125. def getspan(self, _list, x0, x1, margin):
  1126. _count = 0
  1127. (x0,x1) = (min(x0,x1),max(x0,x1))
  1128. for _x in _list:
  1129. if _x>=(x0-margin) and _x<=(x1+margin):
  1130. _count += 1
  1131. return _count-1
  1132. def _plot(self, list_line, list_textbox):
  1133. from matplotlib import pyplot as plt
  1134. plt.figure()
  1135. for _line in list_line:
  1136. x0, y0, x1, y1 = _line.__dict__.get("bbox")
  1137. plt.plot([x0, x1], [y0, y1])
  1138. for _line in list_line:
  1139. x0, y0, x1, y1 = _line.bbox
  1140. plt.plot([x0, x1], [y0, y1])
  1141. # for point in list_crosspoints:
  1142. # plt.scatter(point.get("point")[0],point.get("point")[1])
  1143. for textbox in list_textbox:
  1144. x0, y0, x1, y1 = textbox.bbox
  1145. plt.plot([x0, x1], [y0, y1])
  1146. plt.show()
  1147. def get_table_html(table):
  1148. html_text = '<table border="1">' + "\n"
  1149. for row in table:
  1150. html_text += "<tr>" + "\n"
  1151. for col in row:
  1152. row_span = col.get("rowspan")
  1153. col_span = col.get("columnspan")
  1154. bbox_text = col.get("text")
  1155. html_text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
  1156. html_text += bbox_text + "</td>" + "\n"
  1157. html_text += "</tr>" + "\n"
  1158. html_text += "</table>" + "\n"
  1159. return html_text
  1160. def sort_object(obj_list, is_reverse=False):
  1161. from format_convert.convert_tree import _Table, _Image, _Sentence, _Page
  1162. if len(obj_list) == 0:
  1163. return obj_list
  1164. if isinstance(obj_list[0], (_Table, _Sentence, _Image)):
  1165. obj_list.sort(key=lambda x: (x.y, x.x), reverse=is_reverse)
  1166. return obj_list
  1167. elif isinstance(obj_list[0], _Page):
  1168. obj_list.sort(key=lambda x: x.page_no)
  1169. return obj_list
  1170. else:
  1171. return obj_list
  1172. if __name__ == "__main__":
  1173. # strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
  1174. # print(slash_replace(strs))
  1175. # from matplotlib import pyplot as plt
  1176. # import random
  1177. # fig = plt.figure()
  1178. # plt.xlim(100)
  1179. # plt.ylim(100)
  1180. # fig.add_subplot(111)
  1181. # x0,y0,x1,y1 = (1,2,3,4)
  1182. # plt.gca().add_patch(plt.Rectangle(xy=(x0, y0),
  1183. # width=x1-x0,
  1184. # height=y1-y0,
  1185. # edgecolor=(random.randint(0,255)/255,random.randint(0,255)/255,random.randint(0,255)/255),
  1186. # fill=False, linewidth=2))
  1187. #
  1188. # plt.show()
  1189. import cv2
  1190. import numpy as np
  1191. img = np.zeros(shape=(1800,1800),dtype=np.uint8)
  1192. img += 255
  1193. cv2.imshow("bbox", img)
  1194. cv2.waitKey(0)