utils.py 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080
  1. import os
  2. import sys
  3. sys.path.append(os.path.dirname(__file__) + "/../")
  4. import difflib
  5. import logging
  6. import mimetypes
  7. import platform
  8. import re
  9. import traceback
  10. import filetype
  11. from bs4 import BeautifulSoup
  12. from pdfminer.layout import *
  13. def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8]):
  14. """
  15. [0] : continue
  16. [-1]: 逻辑处理错误
  17. [-2]: 接口调用错误
  18. [-3]: 文件格式错误,无法打开
  19. [-4]: 各类文件调用第三方包读取超时
  20. [-5]: 整个转换过程超时
  21. [-6]: 阿里云UDF队列超时
  22. [-7]: 文件需密码,无法打开
  23. [-8]: 调用现成接口报错
  24. """
  25. for c in code:
  26. if _list == [c]:
  27. return True
  28. return False
  29. def add_div(text):
  30. if text == "" or text is None:
  31. return text
  32. if get_platform() == "Windows":
  33. print("add_div", text)
  34. if re.findall("<div>", text):
  35. return text
  36. text = "<div>" + text + "\n"
  37. text = re.sub("\n", "</div>\n<div>", text)
  38. # text += "</div>"
  39. if text[-5:] == "<div>":
  40. print("add_div has cut", text[-30:])
  41. text = text[:-5]
  42. return text
  43. def get_platform():
  44. sys = platform.system()
  45. return sys
  46. def get_html_p(html_path):
  47. logging.info("into get_html_p")
  48. try:
  49. with open(html_path, "r") as ff:
  50. html_str = ff.read()
  51. soup = BeautifulSoup(html_str, 'lxml')
  52. text = ""
  53. for p in soup.find_all("p"):
  54. p_text = p.text
  55. p_text = p_text.strip()
  56. if p.string != "":
  57. text += p_text
  58. text += "\n"
  59. return text
  60. except Exception as e:
  61. logging.info("get_html_p error!")
  62. print("get_html_p", traceback.print_exc())
  63. return [-1]
  64. def string_similarity(str1, str2):
  65. # 去掉<div>和回车
  66. str1 = re.sub("<div>", "", str1)
  67. str1 = re.sub("</div>", "", str1)
  68. str1 = re.sub("\n", "", str1)
  69. str2 = re.sub("<div>", "", str2)
  70. str2 = re.sub("</div>", "", str2)
  71. str2 = re.sub("\n", "", str2)
  72. # print("********************************")
  73. # print("str1", str1)
  74. # print("********************************")
  75. # print("str2", str2)
  76. # print("********************************")
  77. score = difflib.SequenceMatcher(None, str1, str2).ratio()
  78. print("string_similarity", score)
  79. return score
  80. def get_sequential_data(text_list, bbox_list, html=False):
  81. logging.info("into get_sequential_data")
  82. try:
  83. text = ""
  84. order_list = []
  85. for i in range(len(text_list)):
  86. length_start = bbox_list[i][0][0]
  87. length_end = bbox_list[i][1][0]
  88. height_start = bbox_list[i][0][1]
  89. height_end = bbox_list[i][-1][1]
  90. # print([length_start, length_end, height_start, height_end])
  91. order_list.append([text_list[i], length_start, length_end, height_start, height_end])
  92. # text = text + infomation['text'] + "\n"
  93. if get_platform() == "Windows":
  94. print("get_sequential_data", order_list)
  95. if not order_list:
  96. if get_platform() == "Windows":
  97. print("get_sequential_data", "no order list")
  98. return ""
  99. # 根据bbox的坐标对输出排序
  100. order_list.sort(key=lambda x: (x[3], x[1]))
  101. # 根据bbox分行分列
  102. # col_list = []
  103. # height_end = int((order_list[0][4] + order_list[0][3]) / 2)
  104. # for i in range(len(order_list)):
  105. # if height_end - threshold <= order_list[i][3] <= height_end + threshold:
  106. # col_list.append(order_list[i])
  107. # else:
  108. # row_list.append(col_list)
  109. # col_list = []
  110. # height_end = int((order_list[i][4] + order_list[i][3]) / 2)
  111. # col_list.append(order_list[i])
  112. # if i == len(order_list) - 1:
  113. # row_list.append(col_list)
  114. row_list = []
  115. used_box = []
  116. threshold = 5
  117. for box in order_list:
  118. if box in used_box:
  119. continue
  120. height_center = (box[4] + box[3]) / 2
  121. row = []
  122. for box2 in order_list:
  123. if box2 in used_box:
  124. continue
  125. height_center2 = (box2[4] + box2[3]) / 2
  126. if height_center - threshold <= height_center2 <= height_center + threshold:
  127. if box2 not in row:
  128. row.append(box2)
  129. used_box.append(box2)
  130. row.sort(key=lambda x: x[0])
  131. row_list.append(row)
  132. for row in row_list:
  133. if not row:
  134. continue
  135. if len(row) <= 1:
  136. text = text + row[0][0] + "\n"
  137. else:
  138. sub_text = ""
  139. row.sort(key=lambda x: x[1])
  140. for col in row:
  141. sub_text = sub_text + col[0] + " "
  142. sub_text = sub_text + "\n"
  143. text += sub_text
  144. if html:
  145. text = "<div>" + text
  146. text = re.sub("\n", "</div>\n<div>", text)
  147. text += "</div>"
  148. # if text[-5:] == "<div>":
  149. # text = text[:-5]
  150. return text
  151. except Exception as e:
  152. logging.info("get_sequential_data error!")
  153. print("get_sequential_data", traceback.print_exc())
  154. return [-1]
  155. # def get_formatted_table(text_list, text_bbox_list, table_bbox_list, split_line):
  156. # logging.info("into get_formatted_table")
  157. # try:
  158. # # 重新定义text_bbox_list,[point, point, text]
  159. # text_bbox_list = [[text_bbox_list[i][0], text_bbox_list[i][2], text_list[i]] for i in
  160. # range(len(text_bbox_list))]
  161. # # 按纵坐标排序
  162. # text_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
  163. # table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
  164. #
  165. # # print("text_bbox_list", text_bbox_list)
  166. # # print("table_bbox_list", table_bbox_list)
  167. #
  168. # # bbox位置 threshold
  169. # threshold = 5
  170. #
  171. # # 根据split_line分区,可能有个区多个表格 [(), ()]
  172. # area_text_bbox_list = []
  173. # area_table_bbox_list = []
  174. # # print("get_formatted_table, split_line", split_line)
  175. # for j in range(1, len(split_line)):
  176. # last_y = split_line[j - 1][0][1]
  177. # current_y = split_line[j][0][1]
  178. # temp_text_bbox_list = []
  179. # temp_table_bbox_list = []
  180. #
  181. # # 找出该区域下text bbox
  182. # for text_bbox in text_bbox_list:
  183. # # 计算 text bbox 中心点
  184. # text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
  185. # (text_bbox[1][1] + text_bbox[0][1]) / 2)
  186. # if last_y - threshold <= text_bbox_center[1] <= current_y + threshold:
  187. # temp_text_bbox_list.append(text_bbox)
  188. # area_text_bbox_list.append(temp_text_bbox_list)
  189. #
  190. # # 找出该区域下table bbox
  191. # for table_bbox in table_bbox_list:
  192. # # 计算 table bbox 中心点
  193. # table_bbox_center = ((table_bbox[1][0] + table_bbox[0][0]) / 2,
  194. # (table_bbox[1][1] + table_bbox[0][1]) / 2)
  195. # if last_y < table_bbox_center[1] < current_y:
  196. # temp_table_bbox_list.append(table_bbox)
  197. # area_table_bbox_list.append(temp_table_bbox_list)
  198. #
  199. # # for j in range(len(area_text_bbox_list)):
  200. # # print("area_text_bbox_list", j, area_text_bbox_list[j])
  201. #
  202. # # 对每个区域分别进行两个bbox匹配,生成表格
  203. # area_text_list = []
  204. # area_column_list = []
  205. # for j in range(len(area_text_bbox_list)):
  206. # # 每个区域的table bbox 和text bbox
  207. # temp_table_bbox_list = area_table_bbox_list[j]
  208. # temp_text_bbox_list = area_text_bbox_list[j]
  209. #
  210. # # 判断该区域有无表格bbox
  211. # # 若无表格,将该区域文字连接
  212. # if not temp_table_bbox_list:
  213. # # 找出该区域的所有text bbox
  214. # only_text_list = []
  215. # only_bbox_list = []
  216. # for text_bbox in temp_text_bbox_list:
  217. # only_text_list.append(text_bbox[2])
  218. # only_bbox_list.append([text_bbox[0], text_bbox[1]])
  219. # only_text = get_sequential_data(only_text_list, only_bbox_list, True)
  220. # if only_text == [-1]:
  221. # return [-1], [-1]
  222. # area_text_list.append(only_text)
  223. # area_column_list.append(0)
  224. # continue
  225. #
  226. # # 有表格
  227. # # 文本对应的表格格子
  228. # text_in_table = {}
  229. # for i in range(len(temp_text_bbox_list)):
  230. # text_bbox = temp_text_bbox_list[i]
  231. #
  232. # # 计算 text bbox 中心点
  233. # text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
  234. # (text_bbox[1][1] + text_bbox[0][1]) / 2)
  235. #
  236. # # 判断中心点在哪个table bbox中
  237. # for table_bbox in temp_table_bbox_list:
  238. # # 中心点在table bbox中,将text写入字典
  239. # if table_bbox[0][0] <= text_bbox_center[0] <= table_bbox[1][0] and \
  240. # table_bbox[0][1] <= text_bbox_center[1] <= table_bbox[1][1]:
  241. # if str(table_bbox) in text_in_table.keys():
  242. # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
  243. # else:
  244. # text_in_table[str(table_bbox)] = text_bbox[2]
  245. # break
  246. #
  247. # # 如果未找到text bbox匹配的table bbox,加大threshold匹配
  248. # # elif (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
  249. # # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]) or \
  250. # # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
  251. # # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
  252. # # (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
  253. # # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
  254. # # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
  255. # # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]):
  256. # # if str(table_bbox) in text_in_table.keys():
  257. # # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
  258. # # else:
  259. # # text_in_table[str(table_bbox)] = text_bbox[2]
  260. # # break
  261. #
  262. # # 对表格格子进行分行分列,并计算总计多少小列
  263. # # 放入坐标
  264. # all_col_list = []
  265. # all_row_list = []
  266. # for i in range(len(temp_table_bbox_list)):
  267. # table_bbox = temp_table_bbox_list[i]
  268. #
  269. # # 放入所有坐标x
  270. # if table_bbox[0][0] not in all_col_list:
  271. # all_col_list.append(table_bbox[0][0])
  272. # if table_bbox[1][0] not in all_col_list:
  273. # all_col_list.append(table_bbox[1][0])
  274. #
  275. # # 放入所有坐标y
  276. # if table_bbox[0][1] not in all_row_list:
  277. # all_row_list.append(table_bbox[0][1])
  278. # if table_bbox[1][1] not in all_row_list:
  279. # all_row_list.append(table_bbox[1][1])
  280. # all_col_list.sort(key=lambda x: x)
  281. # all_row_list.sort(key=lambda x: x)
  282. #
  283. # # 分行
  284. # row_list = []
  285. # rows = []
  286. # temp_table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0], x[1][1], x[1][0]))
  287. # y_row = temp_table_bbox_list[0][0][1]
  288. # for i in range(len(temp_table_bbox_list)):
  289. # table_bbox = temp_table_bbox_list[i]
  290. #
  291. # if y_row - threshold <= table_bbox[0][1] <= y_row + threshold:
  292. # rows.append(table_bbox)
  293. # else:
  294. # y_row = table_bbox[0][1]
  295. # if rows:
  296. # rows.sort(key=lambda x: x[0][0])
  297. # row_list.append(rows)
  298. # rows = []
  299. # rows.append(table_bbox)
  300. # # print("*" * 30)
  301. # # print(row_list)
  302. #
  303. # if i == len(temp_table_bbox_list) - 1:
  304. # if rows:
  305. # rows.sort(key=lambda x: x[0][0])
  306. # row_list.append(rows)
  307. #
  308. # # 生成表格,包括文字和格子宽度
  309. # area_column = []
  310. # text = '<table border="1">' + "\n"
  311. # for row in row_list:
  312. # text += "<tr>" + "\n"
  313. # for col in row:
  314. # # 计算bbox y坐标之间有多少其他点,+1即为所占行数
  315. # row_span = 1
  316. # for y in all_row_list:
  317. # if col[0][1] < y < col[1][1]:
  318. # if y - col[0][1] >= 2 and col[1][1] - y >= 2:
  319. # row_span += 1
  320. #
  321. # # 计算bbox x坐标之间有多少其他点,+1即为所占列数
  322. # col_span = 1
  323. # for x in all_col_list:
  324. # if col[0][0] < x < col[1][0]:
  325. # if x - col[0][0] >= 2 and col[1][0] - x >= 2:
  326. # col_span += 1
  327. #
  328. # text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
  329. #
  330. # if str(col) in text_in_table.keys():
  331. # text += text_in_table.get(str(col))
  332. # else:
  333. # text += ""
  334. # text += "</td>" + "\n"
  335. # text += "</tr>" + "\n"
  336. # text += "</table>" + "\n"
  337. #
  338. # # 计算最大column
  339. # max_col_num = 0
  340. # for row in row_list:
  341. # col_num = 0
  342. # for col in row:
  343. # col_num += 1
  344. # if max_col_num < col_num:
  345. # max_col_num = col_num
  346. #
  347. # area_text_list.append(text)
  348. # area_column_list.append(max_col_num)
  349. #
  350. # text = ""
  351. # if get_platform() == "Windows":
  352. # print("get_formatted_table area_text_list", area_text_list)
  353. # for area_text in area_text_list:
  354. # text += area_text
  355. # return text, area_column_list
  356. # except Exception as e:
  357. # logging.info("get_formatted_table error!")
  358. # print("get_formatted_table", traceback.print_exc())
  359. # return [-1], [-1]
  360. def rename_inner_files(root_path):
  361. try:
  362. logging.info("into rename_inner_files")
  363. # 获取解压文件夹下所有文件+文件夹,不带根路径
  364. path_list = []
  365. for root, dirs, files in os.walk(root_path, topdown=False):
  366. for name in dirs:
  367. p = os.path.join(root, name) + os.sep
  368. if get_platform() == "Windows":
  369. root_path = slash_replace(root_path)
  370. p = slash_replace(p)
  371. p = re.sub(root_path, "", p)
  372. root_path = slash_replace(root_path, True)
  373. p = slash_replace(p, True)
  374. else:
  375. p = re.sub(root_path, "", p)
  376. path_list.append(p)
  377. for name in files:
  378. p = os.path.join(root, name)
  379. if get_platform() == "Windows":
  380. root_path = slash_replace(root_path)
  381. p = slash_replace(p)
  382. p = re.sub(root_path, "", p)
  383. root_path = slash_replace(root_path, True)
  384. p = slash_replace(p, True)
  385. else:
  386. p = re.sub(root_path, "", p)
  387. path_list.append(p)
  388. # 按路径长度排序
  389. path_list.sort(key=lambda x: len(x), reverse=True)
  390. # 循环改名
  391. for old_path in path_list:
  392. # 按路径分隔符分割
  393. ss = old_path.split(os.sep)
  394. # 判断是否文件夹
  395. is_dir = 0
  396. file_type = ""
  397. if os.path.isdir(root_path + old_path):
  398. ss = ss[:-1]
  399. is_dir = 1
  400. else:
  401. if "." in old_path:
  402. file_type = "." + old_path.split(".")[-1]
  403. else:
  404. file_type = ""
  405. # 最后一级需要用hash改名
  406. new_path = ""
  407. # new_path = re.sub(ss[-1], str(hash(ss[-1])), old_path) + file_type
  408. current_level = 0
  409. for s in ss:
  410. # 路径拼接
  411. if current_level < len(ss) - 1:
  412. new_path += s + os.sep
  413. else:
  414. new_path += str(hash(s)) + file_type
  415. current_level += 1
  416. new_ab_path = root_path + new_path
  417. old_ab_path = root_path + old_path
  418. os.rename(old_ab_path, new_ab_path)
  419. # 重新获取解压文件夹下所有文件+文件夹
  420. new_path_list = []
  421. for root, dirs, files in os.walk(root_path, topdown=False):
  422. for name in dirs:
  423. new_path_list.append(os.path.join(root, name) + os.sep)
  424. for name in files:
  425. new_path_list.append(os.path.join(root, name))
  426. return new_path_list
  427. except:
  428. traceback.print_exc()
  429. return [-1]
  430. def judge_format(path):
  431. guess1 = mimetypes.guess_type(path)
  432. _type = None
  433. if guess1[0]:
  434. _type = guess1[0]
  435. else:
  436. guess2 = filetype.guess(path)
  437. if guess2:
  438. _type = guess2.mime
  439. if _type == "application/pdf":
  440. return "pdf"
  441. if _type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
  442. return "docx"
  443. if _type == "application/x-zip-compressed" or _type == "application/zip":
  444. return "zip"
  445. if _type == "application/x-rar-compressed" or _type == "application/rar":
  446. return "rar"
  447. if _type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
  448. return "xlsx"
  449. if _type == "application/msword":
  450. return "doc"
  451. if _type == "image/png":
  452. return "png"
  453. if _type == "image/jpeg":
  454. return "jpg"
  455. # 猜不到,返回None
  456. return None
  457. def slash_replace(_str, reverse=False):
  458. if reverse:
  459. _str = eval(repr(_str).replace('/', '\\\\'))
  460. else:
  461. _str = eval(repr(_str).replace('\\\\', '/'))
  462. return _str
  463. class LineTable():
  464. def recognize_table(self,list_textbox, list_line,sourceP_LB=True):
  465. self.list_line = list_line
  466. self.list_crosspoints = self.recognize_crosspoints(list_line)
  467. # 聚类
  468. cluster_crosspoints = []
  469. for _point in self.list_crosspoints:
  470. cluster_crosspoints.append({"lines": _point.get("lines"), "points": [_point]})
  471. while 1:
  472. _find = False
  473. new_cluster_crosspoints = []
  474. for l_point in cluster_crosspoints:
  475. _flag = False
  476. for l_n_point in new_cluster_crosspoints:
  477. line1 = l_point.get("lines")
  478. line2 = l_n_point.get("lines")
  479. if len(line1&line2) > 0:
  480. _find = True
  481. _flag = True
  482. l_n_point["lines"] = line1.union(line2)
  483. l_n_point["points"].extend(l_point["points"])
  484. if not _flag:
  485. new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
  486. cluster_crosspoints = new_cluster_crosspoints
  487. if not _find:
  488. break
  489. list_l_rect = []
  490. for table_crosspoint in cluster_crosspoints:
  491. list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
  492. list_l_rect.append(list_rect)
  493. in_objs = set()
  494. list_tables = []
  495. for l_rect in list_l_rect:
  496. _ta = self.rect2table(list_textbox,l_rect,in_objs,sourceP_LB=sourceP_LB)
  497. if _ta:
  498. list_tables.append(_ta)
  499. self._plot(list_line, list_textbox)
  500. return list_tables, in_objs, list_l_rect
  501. def recognize_table_by_rect(self, list_textbox, list_rect, margin=2):
  502. dump_margin = 5
  503. list_rect_tmp = []
  504. # 去重
  505. for _rect in list_rect:
  506. if (_rect.bbox[3]-_rect.bbox[1] < 10) or (abs(_rect.bbox[2]-_rect.bbox[0]) < 5):
  507. continue
  508. _find = False
  509. for _tmp in list_rect_tmp:
  510. for i in range(4):
  511. if abs(_rect.bbox[i]-_tmp.bbox[i]) < dump_margin:
  512. pass
  513. else:
  514. _find = False
  515. break
  516. if i == 3:
  517. _find = True
  518. if _find:
  519. break
  520. if not _find:
  521. list_rect_tmp.append(_rect)
  522. # print("=====",len(list_rect),len(list_rect_tmp))
  523. # print(list_rect_tmp)
  524. # from matplotlib import pyplot as plt
  525. # plt.figure()
  526. # for _rect in list_rect_tmp:
  527. # x0,y0,x1,y1 = _rect.bbox
  528. # plt.boxplot(_rect.bbox)
  529. # plt.show()
  530. cluster_rect = []
  531. for _rect in list_rect:
  532. _find = False
  533. for cr in cluster_rect:
  534. for cr_rect in cr:
  535. if abs((cr_rect.bbox[2]-cr_rect.bbox[0]+_rect.bbox[2]-_rect.bbox[0])-(max(cr_rect.bbox[2],_rect.bbox[2])-min(cr_rect.bbox[0],_rect.bbox[0])))<margin:
  536. _find = True
  537. cr.append(_rect)
  538. break
  539. elif abs((cr_rect.bbox[3]-cr_rect.bbox[1]+_rect.bbox[3]-_rect.bbox[1])-(max(cr_rect.bbox[3],_rect.bbox[3])-min(cr_rect.bbox[1],_rect.bbox[1])))<margin:
  540. _find = True
  541. cr.append(_rect)
  542. break
  543. if _find:
  544. break
  545. if not _find:
  546. cluster_rect.append([_rect])
  547. list_l_rect = cluster_rect
  548. in_objs = set()
  549. list_tables = []
  550. for l_rect in list_l_rect:
  551. _ta = self.rect2table(list_textbox,l_rect,in_objs)
  552. if _ta:
  553. list_tables.append(_ta)
  554. return list_tables,in_objs,list_l_rect
  555. def recognize_crosspoints(self, list_line):
  556. from matplotlib import pyplot as plt
  557. list_crosspoints = []
  558. # print("lines num",len(list_line))
  559. for _i in range(len(list_line)):
  560. for _j in range(len(list_line)):
  561. line1 = list_line[_i].__dict__.get("bbox")
  562. line2 = list_line[_j].__dict__.get("bbox")
  563. exists,point = self.cross_point(line1,line2)
  564. if exists:
  565. list_crosspoints.append(point)
  566. # plt.figure()
  567. # for _line in list_line:
  568. # x0,y0,x1,y1 = _line.__dict__.get("bbox")
  569. # plt.plot([x0,x1],[y0,y1])
  570. # for _line in list_line:
  571. # x0,y0,x1,y1 = _line.bbox
  572. # plt.plot([x0,x1],[y0,y1])
  573. # for point in list_crosspoints:
  574. # plt.scatter(point.get("point")[0],point.get("point")[1])
  575. # plt.show()
  576. # print(list_crosspoints)
  577. # print("points num",len(list_crosspoints))
  578. return list_crosspoints
  579. def recognize_rect(self, _page):
  580. list_line = []
  581. for _obj in _page._objs:
  582. if isinstance(_obj, (LTLine)):
  583. list_line.append(_obj)
  584. list_crosspoints = self.recognize_crosspoints(list_line)
  585. #聚类
  586. cluster_crosspoints = []
  587. for _point in list_crosspoints:
  588. cluster_crosspoints.append({"lines":_point.get("lines"),"points":[_point]})
  589. while 1:
  590. _find = False
  591. new_cluster_crosspoints = []
  592. for l_point in cluster_crosspoints:
  593. _flag = False
  594. for l_n_point in new_cluster_crosspoints:
  595. line1 = l_point.get("lines")
  596. line2 = l_n_point.get("lines")
  597. if len(line1&line2)>0:
  598. _find = True
  599. _flag = True
  600. l_n_point["lines"] = line1.union(line2)
  601. l_n_point["points"].extend(l_point["points"])
  602. if not _flag:
  603. new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
  604. cluster_crosspoints = new_cluster_crosspoints
  605. if not _find:
  606. break
  607. # print(len(cluster_crosspoints))
  608. list_l_rect = []
  609. for table_crosspoint in cluster_crosspoints:
  610. list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
  611. list_l_rect.append(list_rect)
  612. return list_l_rect
  613. def crosspoint2rect(self, list_crosspoint, margin=4):
  614. dict_line_points = {}
  615. for _point in list_crosspoint:
  616. lines = list(_point.get("lines"))
  617. for _line in lines:
  618. if _line not in dict_line_points:
  619. dict_line_points[_line] = {"direct":None,"points":[]}
  620. dict_line_points[_line]["points"].append(_point)
  621. # 排序
  622. for k, v in dict_line_points.items():
  623. list_x = []
  624. list_y = []
  625. for _p in v["points"]:
  626. list_x.append(_p.get("point")[0])
  627. list_y.append(_p.get("point")[1])
  628. if max(list_x)-min(list_x)>max(list_y)-min(list_y):
  629. v.get("points").sort(key=lambda x:x.get("point")[0])
  630. v["direct"] = "row"
  631. else:
  632. v.get("points").sort(key=lambda x:x.get("point")[1])
  633. v["direct"] = "column"
  634. list_rect = []
  635. for _point in list_crosspoint:
  636. if _point["buttom"]>=margin and _point["right"]>=margin:
  637. lines = list(_point.get("lines"))
  638. _line = lines[0]
  639. if dict_line_points[_line]["direct"]=="column":
  640. _line = lines[1]
  641. next_point = None
  642. for p1 in dict_line_points[_line]["points"]:
  643. if p1["buttom"]>=margin and p1["point"][0]>_point["point"][0]:
  644. next_point = p1
  645. break
  646. if not next_point:
  647. continue
  648. lines = list(next_point.get("lines"))
  649. _line = lines[0]
  650. if dict_line_points[_line]["direct"]=="row":
  651. _line = lines[1]
  652. final_point = None
  653. for p1 in dict_line_points[_line]["points"]:
  654. if p1["left"]>=margin and p1["point"][1]>next_point["point"][1]:
  655. final_point = p1
  656. break
  657. if not final_point:
  658. continue
  659. _r = LTRect(1,(_point["point"][0],_point["point"][1],final_point["point"][0],final_point["point"][1]))
  660. list_rect.append(_r)
  661. return list_rect
  662. def cross_point(self, line1, line2, segment=True, margin=2):
  663. point_is_exist = False
  664. x = y = 0
  665. x1, y1, x2, y2 = line1
  666. x3, y3, x4, y4 = line2
  667. if (x2 - x1) == 0:
  668. k1 = None
  669. b1 = 0
  670. else:
  671. k1 = (y2 - y1) * 1.0 / (x2 - x1) # 计算k1,由于点均为整数,需要进行浮点数转化
  672. b1 = y1 * 1.0 - x1 * k1 * 1.0 # 整型转浮点型是关键
  673. if (x4 - x3) == 0: # L2直线斜率不存在
  674. k2 = None
  675. b2 = 0
  676. else:
  677. k2 = (y4 - y3) * 1.0 / (x4 - x3) # 斜率存在
  678. b2 = y3 * 1.0 - x3 * k2 * 1.0
  679. if k1 is None:
  680. if not k2 is None:
  681. x = x1
  682. y = k2 * x1 + b2
  683. point_is_exist = True
  684. elif k2 is None:
  685. x = x3
  686. y = k1 * x3 + b1
  687. elif not k2 == k1:
  688. x = (b2 - b1) * 1.0 / (k1 - k2)
  689. y = k1 * x * 1.0 + b1 * 1.0
  690. point_is_exist = True
  691. left = 0
  692. right = 0
  693. top = 0
  694. buttom = 0
  695. if point_is_exist:
  696. if segment:
  697. if x>=(min(x1,x2)-margin) and x<=(max(x1,x2)+margin) and y>=(min(y1,y2)-margin) and y<=(max(y1,y2)+margin):
  698. if x>=(min(x3,x4)-margin) and x<=(max(x3,x4)+margin) and y>=(min(y3,y4)-margin) and y<=(max(y3,y4)+margin):
  699. point_is_exist = True
  700. left = abs(min(x1,x3)-x)
  701. right = abs(max(x2,x4)-x)
  702. top = abs(min(y1,y3)-y)
  703. buttom = abs(max(y2,y4)-y)
  704. else:
  705. point_is_exist = False
  706. else:
  707. point_is_exist = False
  708. line1_key = "%.2f-%.2f-%.2f-%.2f"%(x1, y1, x2, y2)
  709. line2_key = "%.2f-%.2f-%.2f-%.2f"%(x3, y3, x4, y4)
  710. return point_is_exist, {"point": [x, y], "left": left, "right": right,
  711. "top": top, "buttom": buttom, "lines": set([line1_key,line2_key])}
  712. def unionTable(self, list_table, fixspan=True, margin=2):
  713. set_x = set()
  714. set_y = set()
  715. list_cell = []
  716. for _t in list_table:
  717. for _line in _t:
  718. list_cell.extend(_line)
  719. clusters_rects = []
  720. #根据y1聚类
  721. set_id = set()
  722. list_cell_dump = []
  723. for _cell in list_cell:
  724. _id = id(_cell)
  725. if _id in set_id:
  726. continue
  727. set_id.add(_id)
  728. list_cell_dump.append(_cell)
  729. list_cell = list_cell_dump
  730. list_cell.sort(key=lambda x:x.get("bbox")[3])
  731. for _rect in list_cell:
  732. _y0 = _rect.get("bbox")[3]
  733. _find = False
  734. for l_cr in clusters_rects:
  735. if abs(l_cr[0].get("bbox")[3]-_y0)<2:
  736. _find = True
  737. l_cr.append(_rect)
  738. break
  739. if not _find:
  740. clusters_rects.append([_rect])
  741. clusters_rects.sort(key=lambda x:x[0].get("bbox")[3],reverse=True)
  742. for l_cr in clusters_rects:
  743. l_cr.sort(key=lambda x:x.get("bbox")[0])
  744. print("=============:")
  745. for l_r in clusters_rects:
  746. print(len(l_r))
  747. for _line in clusters_rects:
  748. for _rect in _line:
  749. (x0,y0,x1,y1) = _rect.get("bbox")
  750. set_x.add(x0)
  751. set_x.add(x1)
  752. set_y.add(y0)
  753. set_y.add(y1)
  754. if len(set_x)==0 or len(set_y)==0:
  755. return
  756. list_x = list(set_x)
  757. list_y = list(set_y)
  758. list_x.sort(key=lambda x:x)
  759. list_y.sort(key=lambda x:x,reverse=True)
  760. _table = []
  761. for _line in clusters_rects:
  762. table_line = []
  763. for _rect in _line:
  764. (x0,y0,x1,y1) = _rect.get("bbox")
  765. _cell = {"bbox":(x0,y0,x1,y1),"rect":_rect.get("rect"),"rowspan":self.getspan(list_y,y0,y1,margin),"columnspan":self.getspan(list_x,x0,x1,margin),"text":_rect.get("text","")}
  766. table_line.append(_cell)
  767. _table.append(table_line)
  768. # print("=====================>>")
  769. # for _line in _table:
  770. # for _cell in _line:
  771. # print(_cell,end="\t")
  772. # print("\n")
  773. # print("=====================>>")
  774. # print(_table)
  775. if fixspan:
  776. for _line in _table:
  777. for c_i in range(len(_line)):
  778. _cell = _line[c_i]
  779. if _cell.get("columnspan")>1:
  780. _cospan = _cell.get("columnspan")
  781. _cell["columnspan"] = 1
  782. for i in range(1,_cospan):
  783. _line.insert(c_i,_cell)
  784. for l_i in range(len(_table)):
  785. _line = _table[l_i]
  786. for c_i in range(len(_line)):
  787. _cell = _line[c_i]
  788. if _cell.get("rowspan")>1:
  789. _rospan = _cell.get("rowspan")
  790. _cell["rowspan"] = 1
  791. for i in range(1,_rospan):
  792. _table[l_i+i].insert(c_i,_cell)
  793. table_bbox = (_table[0][0].get("bbox")[0],_table[0][0].get("bbox")[1],_table[-1][-1].get("bbox")[2],_table[-1][-1].get("bbox")[3])
  794. ta = {"bbox":table_bbox,"table":_table}
  795. return ta
  def rect2table(self, list_textbox, list_rect, in_objs, margin=0.2, fixspan=True,sourceP_LB=True):
      """Build a table structure from cell rectangles and attach text boxes to cells.

      Rectangles are grouped into rows by their ``bbox[3]`` value (rows whose
      value differs by less than 2 are merged), each row is ordered by
      ``bbox[0]``, and row/column spans are derived with ``self.getspan`` from
      the grid lines formed by all rectangle edges.

      :param list_textbox: objects exposing ``bbox`` (x0, y0, x1, y1) and
          ``get_text()``; the list is sorted in place.
      :param list_rect: objects exposing ``bbox`` (x0, y0, x1, y1); the list is
          sorted in place by ``bbox[3]``.
      :param in_objs: collection with an ``add`` method; every text box that was
          placed into a cell is added to it.
      :param margin: tolerance forwarded to ``self.getspan``.
      :param fixspan: when true, spanning cells are expanded so that the same
          cell dict is repeated once per covered column and row, and its span
          counters are reset to 1.
      :param sourceP_LB: used as the ``reverse`` flag when ordering rows and
          y grid lines (presumably True when the coordinate origin is the
          bottom-left corner -- TODO confirm against callers).
      :return: ``{"bbox": ..., "table": rows}`` or ``None`` when ``list_rect``
          yields no coordinates.
      """

      def _merge_close(values):
          # Keep the first value and every value at least 2 units away from
          # its predecessor in the already sorted sequence.
          return [v for i, v in enumerate(values)
                  if i == 0 or abs(v - values[i - 1]) >= 2]

      # Cluster rectangles into rows by y1.
      list_rect.sort(key=lambda r: r.bbox[3])
      clusters_rects = []
      for _rect in list_rect:
          for l_cr in clusters_rects:
              if abs(l_cr[0].bbox[3] - _rect.bbox[3]) < 2:
                  l_cr.append(_rect)
                  break
          else:
              clusters_rects.append([_rect])
      clusters_rects.sort(key=lambda c: c[0].bbox[3], reverse=sourceP_LB)
      for l_cr in clusters_rects:
          l_cr.sort(key=lambda r: r.bbox[0])

      # Collect the grid lines used to calculate spans.
      set_x = set()
      set_y = set()
      for _line in clusters_rects:
          for _rect in _line:
              (x0, y0, x1, y1) = _rect.bbox
              set_x.update((x0, x1))
              set_y.update((y0, y1))
      if not set_x or not set_y:
          return None
      list_x = _merge_close(sorted(set_x))
      list_y = _merge_close(sorted(set_y, reverse=sourceP_LB))

      _table = []
      for _line in clusters_rects:
          table_line = []
          for _rect in _line:
              (x0, y0, x1, y1) = _rect.bbox
              table_line.append({"bbox": (x0, y0, x1, y1),
                                 "rect": _rect,
                                 "rowspan": self.getspan(list_y, y0, y1, margin),
                                 "columnspan": self.getspan(list_x, x0, x1, margin),
                                 "text": ""})
          _table.append(table_line)

      # Two stable sorts: primary key bbox[3], secondary key bbox[0].
      list_textbox.sort(key=lambda t: t.bbox[0])
      list_textbox.sort(key=lambda t: t.bbox[3], reverse=sourceP_LB)
      for textbox in list_textbox:
          _text = textbox.get_text()
          # A text box goes into the first cell (row-major order) that accepts it.
          for _cell in (c for row in _table for c in row):
              if self.inbox(textbox.bbox, _cell["bbox"]):
                  _cell["text"] += _text
                  in_objs.add(textbox)
                  break

      if fixspan:
          # Column expansion: rebuild each row instead of inserting while
          # indexing it, so cells located after a spanning cell are expanded too.
          for _line in _table:
              expanded = []
              for _cell in _line:
                  _cospan = _cell.get("columnspan")
                  if _cospan > 1:
                      _cell["columnspan"] = 1
                      expanded.extend([_cell] * _cospan)
                  else:
                      expanded.append(_cell)
              _line[:] = expanded
          # Row expansion: copies of one cell share a single dict, so the spans
          # are snapshotted before any of them is reset; otherwise only the
          # first column of a multi-column cell would be propagated downwards.
          for l_i, _line in enumerate(_table):
              row_spans = [_cell.get("rowspan") for _cell in _line]
              for c_i, (_cell, _rospan) in enumerate(zip(_line, row_spans)):
                  if _rospan > 1:
                      _cell["rowspan"] = 1
                      for i in range(1, _rospan):
                          # Ignore spans reaching below the table, but do fill
                          # the last row.
                          if l_i + i < len(_table):
                              _table[l_i + i].insert(c_i, _cell)

      first_bbox = _table[0][0].get("bbox")
      last_bbox = _table[-1][-1].get("bbox")
      table_bbox = (first_bbox[0], first_bbox[1], last_bbox[2], last_bbox[3])
      return {"bbox": table_bbox, "table": _table}
  def inbox(self, bbox0, bbox_g):
      """Tell whether ``bbox0`` is considered to lie inside ``bbox_g``.

      The decision is delegated to ``self.getIOU``: the boxes match when the
      value it returns is strictly greater than 0.5.

      :param bbox0: candidate box, passed through to ``self.getIOU``.
      :param bbox_g: reference box, passed through to ``self.getIOU``.
      :return: 1 when the boxes match, otherwise 0.
      """
      overlap_ratio = self.getIOU(bbox0, bbox_g)
      return int(overlap_ratio > 0.5)
  def getIOU(self, bbox0, bbox1):
      """Return how much of the smaller box is covered by the overlap of two boxes.

      Note that, despite the name, the value is not intersection-over-union:
      the overlap area is divided by the area of the smaller of the two boxes,
      so a box fully contained in the other one yields 1.

      :param bbox0: first box, indexed as (x0, y0, x1, y1).
      :param bbox1: second box, indexed as (x0, y0, x1, y1).
      :return: the overlap ratio when the boxes overlap on both axes,
          otherwise 0.
      """
      # Extent of the union minus the sum of both extents: negative exactly
      # when the boxes overlap on that axis (its magnitude is the overlap).
      width = max(bbox0[2],bbox1[2])-min(bbox0[0],bbox1[0])-(bbox0[2]-bbox0[0]+bbox1[2]-bbox1[0])
      height = max(bbox0[3],bbox1[3])-min(bbox0[1],bbox1[1])-(bbox0[3]-bbox0[1]+bbox1[3]-bbox1[1])
      if width < 0 and height < 0:
          area0 = abs((bbox0[2]-bbox0[0])*(bbox0[3]-bbox0[1]))
          area1 = abs((bbox1[2]-bbox1[0])*(bbox1[3]-bbox1[1]))
          return abs(width*height/min(area0, area1))
      return 0
  def getspan(self, _list, x0, x1, margin):
      """Count how many grid intervals the range between ``x0`` and ``x1`` covers.

      :param _list: iterable of grid-line coordinates.
      :param x0: one end of the range (the ends may be given in any order).
      :param x1: the other end of the range.
      :param margin: tolerance added on both sides of the range.
      :return: number of values of ``_list`` inside the widened range, minus
          one (so -1 when no value falls inside).
      """
      lower = min(x0, x1) - margin
      upper = max(x0, x1) + margin
      hits = sum(1 for value in _list if lower <= value <= upper)
      return hits - 1
  def _plot(self, list_line, list_textbox):
      """Show a matplotlib figure with one segment per line and per text box.

      Every object is drawn as the segment from (x0, y0) to (x1, y1) of its
      ``bbox``. The lines are drawn in two passes: first with the ``bbox``
      found in the instance ``__dict__``, then with the ``bbox`` attribute.

      :param list_line: objects carrying a ``bbox`` (x0, y0, x1, y1).
      :param list_textbox: objects carrying a ``bbox`` (x0, y0, x1, y1).
      """
      from matplotlib import pyplot as plt

      def _draw(bbox):
          x0, y0, x1, y1 = bbox
          plt.plot([x0, x1], [y0, y1])

      plt.figure()
      for segment in list_line:
          _draw(segment.__dict__.get("bbox"))
      for segment in list_line:
          _draw(segment.bbox)
      for box in list_textbox:
          _draw(box.bbox)
      plt.show()
  def get_table_html(table):
      """Render a table as an HTML ``<table border="1">`` string.

      :param table: iterable of rows; each row is an iterable of cell dicts
          read through the keys ``"rowspan"``, ``"columnspan"`` and ``"text"``.
      :return: the markup, one tag per line, terminated by a newline.

      The cell text is HTML-escaped (``&``, ``<``, ``>``) so that characters
      coming from the document cannot break the table markup; a missing or
      ``None`` text is rendered as an empty cell.
      """
      from html import escape

      parts = ['<table border="1">\n']
      for row in table:
          parts.append("<tr>\n")
          for col in row:
              text = col.get("text")
              # quote=False: the text is element content, not an attribute value.
              cell_text = "" if text is None else escape(str(text), quote=False)
              parts.append("<td colspan=%s rowspan=%s>%s</td>\n"
                           % (col.get("columnspan"), col.get("rowspan"), cell_text))
          parts.append("</tr>\n")
      parts.append("</table>\n")
      # A single join avoids repeated string concatenation.
      return "".join(parts)
  def sort_object(obj_list):
      """Sort a homogeneous list of conversion-tree objects in place.

      The type of the first element selects the ordering:
      ``_Table`` / ``_Sentence`` / ``_Image`` lists are sorted by ``y`` in
      descending order, ``_Page`` lists by ``page_no`` in ascending order.
      Empty lists and lists of any other type are left untouched.

      :param obj_list: list to sort (mutated).
      :return: the same list object.
      """
      from format_convert.convert_tree import _Table, _Image, _Sentence, _Page

      if len(obj_list) == 0:
          return obj_list

      head = obj_list[0]
      if isinstance(head, (_Table, _Sentence, _Image)):
          obj_list.sort(key=lambda item: item.y, reverse=True)
      elif isinstance(head, _Page):
          obj_list.sort(key=lambda item: item.page_no)
      return obj_list
  if __name__ == "__main__":
      # Manual check: print the result of slash_replace for a sample Windows path.
      sample_path = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
      print(slash_replace(sample_path))