utils.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493
  1. import os
  2. import sys
  3. sys.path.append(os.path.dirname(__file__) + "/../")
  4. import difflib
  5. import logging
  6. import mimetypes
  7. import platform
  8. import re
  9. import traceback
  10. import filetype
  11. from bs4 import BeautifulSoup
  12. def judge_error_code(_list, code=[-1, -2, -3, -4, -5, -7]):
  13. for c in code:
  14. if _list == [c]:
  15. return True
  16. return False
  17. def add_div(text):
  18. if text == "" or text is None:
  19. return text
  20. if get_platform() == "Windows":
  21. print("add_div", text)
  22. if re.findall("<div>", text):
  23. return text
  24. text = "<div>" + text + "\n"
  25. text = re.sub("\n", "</div>\n<div>", text)
  26. # text += "</div>"
  27. if text[-5:] == "<div>":
  28. print("add_div has cut", text[-30:])
  29. text = text[:-5]
  30. return text
  31. def get_platform():
  32. sys = platform.system()
  33. return sys
  34. def get_html_p(html_path):
  35. logging.info("into get_html_p")
  36. try:
  37. with open(html_path, "r") as ff:
  38. html_str = ff.read()
  39. soup = BeautifulSoup(html_str, 'lxml')
  40. text = ""
  41. for p in soup.find_all("p"):
  42. p_text = p.text
  43. p_text = p_text.strip()
  44. if p.string != "":
  45. text += p_text
  46. text += "\n"
  47. return text
  48. except Exception as e:
  49. logging.info("get_html_p error!")
  50. print("get_html_p", traceback.print_exc())
  51. return [-1]
  52. def string_similarity(str1, str2):
  53. # 去掉<div>和回车
  54. str1 = re.sub("<div>", "", str1)
  55. str1 = re.sub("</div>", "", str1)
  56. str1 = re.sub("\n", "", str1)
  57. str2 = re.sub("<div>", "", str2)
  58. str2 = re.sub("</div>", "", str2)
  59. str2 = re.sub("\n", "", str2)
  60. # print("********************************")
  61. # print("str1", str1)
  62. # print("********************************")
  63. # print("str2", str2)
  64. # print("********************************")
  65. score = difflib.SequenceMatcher(None, str1, str2).ratio()
  66. print("string_similarity", score)
  67. return score
  68. def get_sequential_data(text_list, bbox_list, html=False):
  69. logging.info("into get_sequential_data")
  70. try:
  71. text = ""
  72. order_list = []
  73. for i in range(len(text_list)):
  74. length_start = bbox_list[i][0][0]
  75. length_end = bbox_list[i][1][0]
  76. height_start = bbox_list[i][0][1]
  77. height_end = bbox_list[i][-1][1]
  78. # print([length_start, length_end, height_start, height_end])
  79. order_list.append([text_list[i], length_start, length_end, height_start, height_end])
  80. # text = text + infomation['text'] + "\n"
  81. if get_platform() == "Windows":
  82. print("get_sequential_data", order_list)
  83. if not order_list:
  84. if get_platform() == "Windows":
  85. print("get_sequential_data", "no order list")
  86. return ""
  87. # 根据bbox的坐标对输出排序
  88. order_list.sort(key=lambda x: (x[3], x[1]))
  89. # 根据bbox分行分列
  90. # col_list = []
  91. # height_end = int((order_list[0][4] + order_list[0][3]) / 2)
  92. # for i in range(len(order_list)):
  93. # if height_end - threshold <= order_list[i][3] <= height_end + threshold:
  94. # col_list.append(order_list[i])
  95. # else:
  96. # row_list.append(col_list)
  97. # col_list = []
  98. # height_end = int((order_list[i][4] + order_list[i][3]) / 2)
  99. # col_list.append(order_list[i])
  100. # if i == len(order_list) - 1:
  101. # row_list.append(col_list)
  102. row_list = []
  103. used_box = []
  104. threshold = 5
  105. for box in order_list:
  106. if box in used_box:
  107. continue
  108. height_center = (box[4] + box[3]) / 2
  109. row = []
  110. for box2 in order_list:
  111. if box2 in used_box:
  112. continue
  113. height_center2 = (box2[4] + box2[3]) / 2
  114. if height_center - threshold <= height_center2 <= height_center + threshold:
  115. if box2 not in row:
  116. row.append(box2)
  117. used_box.append(box2)
  118. row.sort(key=lambda x: x[0])
  119. row_list.append(row)
  120. for row in row_list:
  121. if not row:
  122. continue
  123. if len(row) <= 1:
  124. text = text + row[0][0] + "\n"
  125. else:
  126. sub_text = ""
  127. row.sort(key=lambda x: x[1])
  128. for col in row:
  129. sub_text = sub_text + col[0] + " "
  130. sub_text = sub_text + "\n"
  131. text += sub_text
  132. if html:
  133. text = "<div>" + text
  134. text = re.sub("\n", "</div>\n<div>", text)
  135. text += "</div>"
  136. # if text[-5:] == "<div>":
  137. # text = text[:-5]
  138. return text
  139. except Exception as e:
  140. logging.info("get_sequential_data error!")
  141. print("get_sequential_data", traceback.print_exc())
  142. return [-1]
  143. def get_formatted_table(text_list, text_bbox_list, table_bbox_list, split_line):
  144. logging.info("into get_formatted_table")
  145. try:
  146. # 重新定义text_bbox_list,[point, point, text]
  147. text_bbox_list = [[text_bbox_list[i][0], text_bbox_list[i][2], text_list[i]] for i in
  148. range(len(text_bbox_list))]
  149. # 按纵坐标排序
  150. text_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
  151. table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
  152. # print("text_bbox_list", text_bbox_list)
  153. # print("table_bbox_list", table_bbox_list)
  154. # bbox位置 threshold
  155. threshold = 5
  156. # 根据split_line分区,可能有个区多个表格 [(), ()]
  157. area_text_bbox_list = []
  158. area_table_bbox_list = []
  159. # print("get_formatted_table, split_line", split_line)
  160. for j in range(1, len(split_line)):
  161. last_y = split_line[j - 1][0][1]
  162. current_y = split_line[j][0][1]
  163. temp_text_bbox_list = []
  164. temp_table_bbox_list = []
  165. # 找出该区域下text bbox
  166. for text_bbox in text_bbox_list:
  167. # 计算 text bbox 中心点
  168. text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
  169. (text_bbox[1][1] + text_bbox[0][1]) / 2)
  170. if last_y - threshold <= text_bbox_center[1] <= current_y + threshold:
  171. temp_text_bbox_list.append(text_bbox)
  172. area_text_bbox_list.append(temp_text_bbox_list)
  173. # 找出该区域下table bbox
  174. for table_bbox in table_bbox_list:
  175. # 计算 table bbox 中心点
  176. table_bbox_center = ((table_bbox[1][0] + table_bbox[0][0]) / 2,
  177. (table_bbox[1][1] + table_bbox[0][1]) / 2)
  178. if last_y < table_bbox_center[1] < current_y:
  179. temp_table_bbox_list.append(table_bbox)
  180. area_table_bbox_list.append(temp_table_bbox_list)
  181. # for j in range(len(area_text_bbox_list)):
  182. # print("area_text_bbox_list", j, area_text_bbox_list[j])
  183. # 对每个区域分别进行两个bbox匹配,生成表格
  184. area_text_list = []
  185. area_column_list = []
  186. for j in range(len(area_text_bbox_list)):
  187. # 每个区域的table bbox 和text bbox
  188. temp_table_bbox_list = area_table_bbox_list[j]
  189. temp_text_bbox_list = area_text_bbox_list[j]
  190. # 判断该区域有无表格bbox
  191. # 若无表格,将该区域文字连接
  192. if not temp_table_bbox_list:
  193. # 找出该区域的所有text bbox
  194. only_text_list = []
  195. only_bbox_list = []
  196. for text_bbox in temp_text_bbox_list:
  197. only_text_list.append(text_bbox[2])
  198. only_bbox_list.append([text_bbox[0], text_bbox[1]])
  199. only_text = get_sequential_data(only_text_list, only_bbox_list, True)
  200. if only_text == [-1]:
  201. return [-1], [-1]
  202. area_text_list.append(only_text)
  203. area_column_list.append(0)
  204. continue
  205. # 有表格
  206. # 文本对应的表格格子
  207. text_in_table = {}
  208. for i in range(len(temp_text_bbox_list)):
  209. text_bbox = temp_text_bbox_list[i]
  210. # 计算 text bbox 中心点
  211. text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
  212. (text_bbox[1][1] + text_bbox[0][1]) / 2)
  213. # 判断中心点在哪个table bbox中
  214. for table_bbox in temp_table_bbox_list:
  215. # 中心点在table bbox中,将text写入字典
  216. if table_bbox[0][0] <= text_bbox_center[0] <= table_bbox[1][0] and \
  217. table_bbox[0][1] <= text_bbox_center[1] <= table_bbox[1][1]:
  218. if str(table_bbox) in text_in_table.keys():
  219. text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
  220. else:
  221. text_in_table[str(table_bbox)] = text_bbox[2]
  222. break
  223. # 如果未找到text bbox匹配的table bbox,加大threshold匹配
  224. # elif (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
  225. # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]) or \
  226. # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
  227. # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
  228. # (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
  229. # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
  230. # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
  231. # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]):
  232. # if str(table_bbox) in text_in_table.keys():
  233. # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
  234. # else:
  235. # text_in_table[str(table_bbox)] = text_bbox[2]
  236. # break
  237. # 对表格格子进行分行分列,并计算总计多少小列
  238. # 放入坐标
  239. all_col_list = []
  240. all_row_list = []
  241. for i in range(len(temp_table_bbox_list)):
  242. table_bbox = temp_table_bbox_list[i]
  243. # 放入所有坐标x
  244. if table_bbox[0][0] not in all_col_list:
  245. all_col_list.append(table_bbox[0][0])
  246. if table_bbox[1][0] not in all_col_list:
  247. all_col_list.append(table_bbox[1][0])
  248. # 放入所有坐标y
  249. if table_bbox[0][1] not in all_row_list:
  250. all_row_list.append(table_bbox[0][1])
  251. if table_bbox[1][1] not in all_row_list:
  252. all_row_list.append(table_bbox[1][1])
  253. all_col_list.sort(key=lambda x: x)
  254. all_row_list.sort(key=lambda x: x)
  255. # 分行
  256. row_list = []
  257. rows = []
  258. temp_table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0], x[1][1], x[1][0]))
  259. y_row = temp_table_bbox_list[0][0][1]
  260. for i in range(len(temp_table_bbox_list)):
  261. table_bbox = temp_table_bbox_list[i]
  262. if y_row - threshold <= table_bbox[0][1] <= y_row + threshold:
  263. rows.append(table_bbox)
  264. else:
  265. y_row = table_bbox[0][1]
  266. if rows:
  267. rows.sort(key=lambda x: x[0][0])
  268. row_list.append(rows)
  269. rows = []
  270. rows.append(table_bbox)
  271. # print("*" * 30)
  272. # print(row_list)
  273. if i == len(temp_table_bbox_list) - 1:
  274. if rows:
  275. rows.sort(key=lambda x: x[0][0])
  276. row_list.append(rows)
  277. # 生成表格,包括文字和格子宽度
  278. area_column = []
  279. text = '<table border="1">' + "\n"
  280. for row in row_list:
  281. text += "<tr>" + "\n"
  282. for col in row:
  283. # 计算bbox y坐标之间有多少其他点,+1即为所占行数
  284. row_span = 1
  285. for y in all_row_list:
  286. if col[0][1] < y < col[1][1]:
  287. if y - col[0][1] >= 2 and col[1][1] - y >= 2:
  288. row_span += 1
  289. # 计算bbox x坐标之间有多少其他点,+1即为所占列数
  290. col_span = 1
  291. for x in all_col_list:
  292. if col[0][0] < x < col[1][0]:
  293. if x - col[0][0] >= 2 and col[1][0] - x >= 2:
  294. col_span += 1
  295. text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
  296. if str(col) in text_in_table.keys():
  297. text += text_in_table.get(str(col))
  298. else:
  299. text += ""
  300. text += "</td>" + "\n"
  301. text += "</tr>" + "\n"
  302. text += "</table>" + "\n"
  303. # 计算最大column
  304. max_col_num = 0
  305. for row in row_list:
  306. col_num = 0
  307. for col in row:
  308. col_num += 1
  309. if max_col_num < col_num:
  310. max_col_num = col_num
  311. area_text_list.append(text)
  312. area_column_list.append(max_col_num)
  313. text = ""
  314. if get_platform() == "Windows":
  315. print("get_formatted_table area_text_list", area_text_list)
  316. for area_text in area_text_list:
  317. text += area_text
  318. return text, area_column_list
  319. except Exception as e:
  320. logging.info("get_formatted_table error!")
  321. print("get_formatted_table", traceback.print_exc())
  322. return [-1], [-1]
  323. def rename_inner_files(root_path):
  324. try:
  325. logging.info("into rename_inner_files")
  326. # 获取解压文件夹下所有文件+文件夹,不带根路径
  327. path_list = []
  328. for root, dirs, files in os.walk(root_path, topdown=False):
  329. for name in dirs:
  330. p = os.path.join(root, name) + os.sep
  331. if get_platform() == "Windows":
  332. root_path = slash_replace(root_path)
  333. p = slash_replace(p)
  334. p = re.sub(root_path, "", p)
  335. root_path = slash_replace(root_path, True)
  336. p = slash_replace(p, True)
  337. else:
  338. p = re.sub(root_path, "", p)
  339. path_list.append(p)
  340. for name in files:
  341. p = os.path.join(root, name)
  342. if get_platform() == "Windows":
  343. root_path = slash_replace(root_path)
  344. p = slash_replace(p)
  345. p = re.sub(root_path, "", p)
  346. root_path = slash_replace(root_path, True)
  347. p = slash_replace(p, True)
  348. else:
  349. p = re.sub(root_path, "", p)
  350. path_list.append(p)
  351. # 按路径长度排序
  352. path_list.sort(key=lambda x: len(x), reverse=True)
  353. # 循环改名
  354. for old_path in path_list:
  355. # 按路径分隔符分割
  356. ss = old_path.split(os.sep)
  357. # 判断是否文件夹
  358. is_dir = 0
  359. file_type = ""
  360. if os.path.isdir(root_path + old_path):
  361. ss = ss[:-1]
  362. is_dir = 1
  363. else:
  364. if "." in old_path:
  365. file_type = "." + old_path.split(".")[-1]
  366. else:
  367. file_type = ""
  368. # 最后一级需要用hash改名
  369. new_path = ""
  370. # new_path = re.sub(ss[-1], str(hash(ss[-1])), old_path) + file_type
  371. current_level = 0
  372. for s in ss:
  373. # 路径拼接
  374. if current_level < len(ss) - 1:
  375. new_path += s + os.sep
  376. else:
  377. new_path += str(hash(s)) + file_type
  378. current_level += 1
  379. new_ab_path = root_path + new_path
  380. old_ab_path = root_path + old_path
  381. os.rename(old_ab_path, new_ab_path)
  382. # 重新获取解压文件夹下所有文件+文件夹
  383. new_path_list = []
  384. for root, dirs, files in os.walk(root_path, topdown=False):
  385. for name in dirs:
  386. new_path_list.append(os.path.join(root, name) + os.sep)
  387. for name in files:
  388. new_path_list.append(os.path.join(root, name))
  389. return new_path_list
  390. except:
  391. traceback.print_exc()
  392. return [-1]
  393. def judge_format(path):
  394. guess1 = mimetypes.guess_type(path)
  395. _type = None
  396. if guess1[0]:
  397. _type = guess1[0]
  398. else:
  399. guess2 = filetype.guess(path)
  400. if guess2:
  401. _type = guess2.mime
  402. if _type == "application/pdf":
  403. return "pdf"
  404. if _type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
  405. return "docx"
  406. if _type == "application/x-zip-compressed" or _type == "application/zip":
  407. return "zip"
  408. if _type == "application/x-rar-compressed" or _type == "application/rar":
  409. return "rar"
  410. if _type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
  411. return "xlsx"
  412. if _type == "application/msword":
  413. return "doc"
  414. if _type == "image/png":
  415. return "png"
  416. if _type == "image/jpeg":
  417. return "jpg"
  418. # 猜不到,返回None
  419. return None
  420. def slash_replace(_str, reverse=False):
  421. if reverse:
  422. _str = eval(repr(_str).replace('/', '\\\\'))
  423. else:
  424. _str = eval(repr(_str).replace('\\\\', '/'))
  425. return _str
  426. if __name__ == "__main__":
  427. strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
  428. print(slash_replace(strs))