convert_layout.py 24 KB


  1. import os
  2. import sys
  3. sys.setrecursionlimit(10000)
  4. sys.path.append(os.path.dirname(__file__) + "/../")
  5. from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
  6. import re
  7. import traceback
  8. from bs4 import BeautifulSoup
  9. from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code
  10. from format_convert.wrapt_timeout_decorator import timeout
  11. class TreeNode:
  12. def __init__(self, data):
  13. self.data = data
  14. self.children = []
  15. def add_child(self, child_node):
  16. self.children.append(child_node)
  17. def print_tree(node, level=0):
  18. print(" " * level + str(node.data))
  19. for child in node.children:
  20. print_tree(child, level + 1)
  21. def print_tree_order(node, div_list, level=0):
  22. text = " " * level + div_list[node.data[0]].text
  23. colors = [(255, 0, 0, 0.7), (0, 255, 0, 0.6), (0, 0, 255, 0.6), (255, 127, 0, 0.2),
  24. # (123, 104, 238, 0.2),
  25. (238, 238, 0, 0.2),
  26. (255, 104, 255, 0.2)
  27. ]
  28. if level < len(colors):
  29. color = colors[level]
  30. else:
  31. color = colors[-1]
  32. text = '<div style="background-color: rgba{}";>'.format(str(color)) + text + '</div>'
  33. if level == 0:
  34. text = '<!DOCTYPE HTML><head><meta charset="UTF-8">' + text
  35. with open('../layout.html', 'a') as f:
  36. f.write(text)
  37. for child in node.children:
  38. print('node.child', child.data[:10])
  39. print_tree_order(child, div_list, level + 1)
  40. class LayoutConvert:
  41. def __init__(self, html):
  42. self.html = html
  43. self.order_type_list = ['[★]?(\d{1,3}[.])+[.\d]?',
  44. '[★]?[A-Z][.、]',
  45. '[★]?[a-z][.、]',
  46. '[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳]',
  47. '[ⅠⅡⅢⅣⅤⅥⅦⅧⅩⅪⅫ]',
  48. '[ⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹ]',
  49. '[❶❷❸❹❻❼❽❾❿]',
  50. '第[一二三四五六七八九十]{1,2}[章节篇]',
  51. '第\d{1,2}[章节篇]',
  52. '[((]\d{1,3}[))]',
  53. '[★]?\d{1,3}、',
  54. '[((][一二三四五六七八九十]{1,3}[))]',
  55. '[一二三四五六七八九十]{1,3}、',
  56. '包[1-9]{1,3}',
  57. '标段[1-9]{1,3}',
  58. ]
  59. self.chinese_arabic_dict = {
  60. '一': 1,
  61. '二': 2,
  62. '三': 3,
  63. '四': 4,
  64. '五': 5,
  65. '六': 6,
  66. '七': 7,
  67. '八': 8,
  68. '九': 9,
  69. '十': 10,
  70. }
  71. def get_layout(self):
  72. return
  73. def recursion_get_tree(self, index_list, div_list, start_index, end_index):
  74. print([start_index, end_index], div_list[start_index].text[:10], '-'*20)
  75. tree_node = TreeNode([start_index, end_index])
  76. if end_index - start_index == 1:
  77. print([start_index, end_index], div_list[end_index-1].text[:10], '='*20)
  78. return tree_node
  79. temp_end_i = index_list[0][0]
  80. for start_i, end_i in index_list:
  81. if not start_index < start_i <= end_i <= end_index:
  82. if start_i == 0:
  83. print('continue not start_index < start_i <= end_i <= end_index', start_i, end_i)
  84. continue
  85. if start_i < temp_end_i:
  86. print('continue start_i < temp_end_i', start_i, temp_end_i, div_list[start_i])
  87. continue
  88. sub_tree_node = self.recursion_get_tree(index_list, div_list, start_i, end_i)
  89. tree_node.add_child(sub_tree_node)
  90. temp_end_i = end_i
  91. print([start_index, end_index], div_list[end_index-1].text[:10], '='*20)
  92. return tree_node
  93. def get_order_number_tree(self, product=None):
  # Purpose: parse self.html, classify every <div> by the ordinal marker at the
  # start of its text, derive [start, end] div-index blocks from repeated ordinal
  # types, build a tree from them with recursion_get_tree, and write the result to
  # '../layout.html' (plus the untouched input to '../origin.html').
  # When `product` is truthy it is used as a regular expression (re.search): for
  # every div matching it, the run of following divs is collected and the longest
  # candidate run is printed.
  # Nothing is returned (the only bare `return` exits early when no block exists).
  # NOTE(review): the embedded line numbers and the lost indentation come from the
  # way this file was captured; the nesting of some statements below cannot be
  # recovered from the text alone, so the code is intentionally left untouched.
  94. def get_order_no(_ti, _div_text):
  # Nested helper: return the numeric value of the ordinal found in `_div_text`
  # as an int, or None when nothing was extracted. `_ti` is the ordinal type id.
  95. _tis = re.split('[.、]', str(_ti))
  96. temp_tis = []
  97. for _t in _tis:
  98. if _t != '':
  99. temp_tis.append(_t)
  100. _tis = temp_tis
  101. _ti_order_no = None
  # str(_ti) yields two or more parts for the float (multi-level) type ids, e.g. 0.1.
  102. if len(_tis) >= 2:
  # NOTE(review): empty pattern and the result is discarded, so this branch
  # appears to leave _ti_order_no as None — looks unfinished, confirm.
  103. re.search('', _div_text)
  104. else:
  # NOTE(review): '[1-9]+' excludes the digit 0, so '10' is read as 1 — confirm.
  105. _match = re.search('[1-9]+', _div_text)
  106. if _match:
  107. _ti_order_no = int(_match.group())
  108. else:
  109. _match = re.search('[一二三四五六七八九十]+', _div_text)
  110. if _match:
  111. _ti_order_no = _match.group()
  112. temp_order_no = ''
  # NOTE(review): each character is mapped and concatenated as text, so '十'
  # contributes '10' (e.g. '十一' becomes 101) — confirm this is intended.
  113. for o in _ti_order_no:
  114. temp_order_no += str(self.chinese_arabic_dict.get(o))
  115. _ti_order_no = int(temp_order_no)
  116. return _ti_order_no
  # Parse the HTML with the 'lxml' parser and collect every <div> element.
  117. soup = BeautifulSoup(self.html, 'lxml')
  118. div_list = soup.findAll('div')
  # type_index_list: one entry per div; -1 when no pattern matched, otherwise the
  #   index into self.order_type_list, or a float for multi-level numbering.
  # range_index_list: [start, end] div-index pairs describing blocks.
  # cut_type_index_dict: per ordinal type, the largest div index at which a block
  #   was recorded for that type.
  119. type_index_list = []
  120. range_index_list = []
  121. cut_type_index_dict = {}
  122. # temp_type_index_list = []
  123. # Determine the ordinal type of every line (div)
  124. for div_index, d in enumerate(div_list):
  125. text = d.text
  126. # Decide which ordinal type this line has
  127. find_type_index = -1
  128. for type_index, reg in enumerate(self.order_type_list):
  # Once a type has been found, the remaining patterns are skipped.
  129. if find_type_index >= 0:
  130. continue
  131. match = re.finditer(reg, text)
  132. for m in match:
  # Only a match that begins at the very start of the text counts.
  133. if m.span()[0] != 0:
  134. continue
  135. order = m.group()
  136. if type_index in [0, 1]:
  137. order = re.sub('[★]', '', order)
  138. # Ordinary case: single-level ordinal
  139. if type_index != 0:
  140. find_type_index = type_index
  141. # Special case: multi-level ordinal
  142. else:
  143. ss = order.split('.')
  144. # if len(re.findall('[.]', m.group())) == 1:
  145. if len(ss) - ss.count('') == 1:
  146. find_type_index = 0
  147. # print('find_type_index1', find_type_index, text[:5])
  148. else:
  149. # Represent a multi-level ordinal as a decimal
  # (digits become '0', dots are dropped, then '0.' + middle + '1' is parsed
  # as a float; e.g. '1.2' -> 0.1 and '1.2.3' -> 0.01)
  150. find_type_index = re.sub('\d+', '0', order)
  151. find_type_index = re.sub('[.]', '', find_type_index)
  152. find_type_index = find_type_index[0] + '.' + find_type_index[1:-1] + '1'
  153. find_type_index = float(find_type_index)
  154. # print('find_type_index2', find_type_index, text[:5])
  155. break
  156. type_index_list.append(find_type_index)
  157. # Split into blocks according to each line's ordinal type
  158. for div_index, d in enumerate(div_list):
  159. find_type_index = type_index_list[div_index]
  # Types of all divs that precede the current one.
  160. sub_type_index_list = type_index_list[:div_index]
  161. text = d.text
  162. print(text)
  163. # No ordinal type: skip
  164. if find_type_index < 0:
  165. # type_index_list.append(find_type_index)
  166. print('continue -1')
  167. print('-'*40)
  168. continue
  169. print('find_type_index, div_index', find_type_index, div_index)
  170. # The same ordinal type has already appeared earlier
  171. if find_type_index in sub_type_index_list:
  172. # # Check whether this is a starting ordinal
  173. # if (find_type_index >= 1 or find_type_index == 0) and len(re.findall('[1一]', text[:3])) == 1 \
  174. # and len(re.findall('[2-9二三四五六七八九十]', text[:3])) == 0:
  175. # # type_index_list.append(find_type_index)
  176. # final_index = None
  177. # for temp_div_index, temp_type in enumerate(sub_type_index_list):
  178. # if find_type_index == temp_type:
  179. # final_index = temp_div_index
  180. # final_block_index = div_index
  181. # min_block_size = 100000
  182. # for block in range_index_list:
  183. # if block[0] <= final_index <= block[1] and block[1] - block[0] < min_block_size:
  184. # min_block_size = block[1] - block[0]
  185. # final_block_index = block[1]+1
  186. # if final_index is not None and [final_index, final_block_index] not in range_index_list:
  187. # range_index_list.append([final_index, final_block_index])
  188. # if cut_type_index_dict.get(find_type_index) is not None:
  189. # if div_index > cut_type_index_dict[find_type_index]:
  190. # cut_type_index_dict[find_type_index] = final_block_index
  191. # else:
  192. # cut_type_index_dict[find_type_index] = final_block_index
  193. # print('continue 1')
  194. # print('cut_type_index_dict', cut_type_index_dict)
  195. # print('-'*40)
  196. # continue
  197. # Check whether this is a starting ordinal
  198. # if 0 < find_type_index < 1 \
  199. # and len(re.findall('[1]', text[len(str(find_type_index))-1:len(str(find_type_index))+1])) == 1 \
  200. # and len(re.findall('[2-9]', text[len(str(find_type_index))-1:len(str(find_type_index))+1])) == 0:
  201. # # type_index_list.append(find_type_index)
  202. # final_index = None
  203. # for temp_div_index, temp_type in enumerate(sub_type_index_list):
  204. # if find_type_index == temp_type:
  205. # final_index = temp_div_index
  206. # final_block_index = div_index
  207. # min_block_size = 100000
  208. # for block in range_index_list:
  209. # if block[0] <= final_index <= block[1] and block[1] - block[0] < min_block_size:
  210. # min_block_size = block[1] - block[0]
  211. # final_block_index = block[1]+1
  212. # if final_index is not None and [final_index, final_block_index] not in range_index_list:
  213. # range_index_list.append([final_index, final_block_index])
  214. # if cut_type_index_dict.get(find_type_index) is not None:
  215. # if div_index > cut_type_index_dict[find_type_index]:
  216. # cut_type_index_dict[find_type_index] = final_block_index
  217. # else:
  218. # cut_type_index_dict[find_type_index] = final_block_index
  219. # print('continue 2')
  220. # print('-'*40)
  221. # continue
  222. # Find the index of the previous line with the same ordinal type; that index must not lie before the cut-off index recorded for this type
  223. last_index = len(sub_type_index_list) - 1 - sub_type_index_list[::-1].index(find_type_index)
  224. print('find_type_index', find_type_index, [last_index, div_index], [sub_type_index_list[0], sub_type_index_list[-1]])
  225. if last_index < cut_type_index_dict.get(find_type_index, 0):
  226. # type_index_list.append(find_type_index)
  227. print('continue 3 last_index < cut_type_index_dict ', last_index, cut_type_index_dict.get(find_type_index, 0))
  228. print('-'*40)
  229. continue
  230. # Add a new block
  231. range_index_list.append([last_index, div_index])
  232. print('find last_index add block', [last_index, div_index])
  233. # Update the cut-off
  234. if cut_type_index_dict.get(find_type_index) is not None:
  235. if div_index > cut_type_index_dict[find_type_index]:
  236. cut_type_index_dict[find_type_index] = div_index
  237. else:
  238. cut_type_index_dict[find_type_index] = div_index
  239. # A block was found, so every ordinal type inside the block is cut off at this block's minimum index
  240. final_type_index_dict = {}
  241. for temp_div_index, temp_type in enumerate(sub_type_index_list[last_index+1:div_index]):
  # Convert the offset within the slice back to an absolute div index.
  242. temp_div_index += last_index + 1
  243. if temp_div_index < cut_type_index_dict.get(temp_type, 0):
  244. continue
  245. # Add a block for the last occurrence of every type present inside the block
  246. if temp_div_index <= range_index_list[-1][0]:
  247. continue
  248. final_type_index_dict[temp_type] = temp_div_index
  249. for temp_type in final_type_index_dict.keys():
  250. final_index = final_type_index_dict.get(temp_type)
  251. if [final_index, div_index] not in range_index_list:
  252. print('add block cut_type_index_dict 1', cut_type_index_dict)
  253. range_index_list.append([final_index, div_index])
  254. print('add block ', [final_index, div_index])
  255. if cut_type_index_dict.get(temp_type) is not None:
  256. if div_index > cut_type_index_dict[temp_type]:
  257. cut_type_index_dict[temp_type] = div_index
  258. else:
  259. cut_type_index_dict[temp_type] = div_index
  260. print('add block cut_type_index_dict 2', cut_type_index_dict)
  261. # temp_type_index_list = []
  262. else:
  263. print('find_type_index not in type_index_list')
  264. print(cut_type_index_dict)
  265. # Store all ordinal types
  266. # type_index_list.append(find_type_index)
  267. # Store the ordinal types inside the block
  268. # temp_type_index_list.append(find_type_index)
  269. print('-'*40)
  # No block was found at all: stop here (returns None).
  270. if not range_index_list:
  271. print('no range_index_list')
  272. return
  273. # Sort
  274. range_index_list.sort(key=lambda x: (x[0], x[1]))
  275. # Generate the final blocks
  # For each integer type id, scan the divs from last to first.
  276. for temp_type in range(len(self.order_type_list)):
  277. for div_index, d in enumerate(div_list[::-1]):
  # Map the position in the reversed list back to the original div index.
  278. div_index = len(div_list) - 1 - div_index
  279. if type_index_list[div_index] != temp_type:
  280. continue
  # NOTE(review): the membership test uses [div_index, div_index+1] but the
  # appended pair ends at len(div_list)-1 — confirm this asymmetry is intended.
  281. if [div_index, div_index+1] not in range_index_list:
  282. range_index_list.append([div_index, len(div_list)-1])
  283. break
  284. # last_block_index = range_index_list[-1][1]
  285. # for div_index, d in enumerate(div_list[last_block_index:]):
  286. # div_index = div_index + last_block_index
  287. # if type_index_list[div_index] < 0:
  288. # continue
  289. # if [div_index, len(div_list)-1] not in range_index_list:
  290. # range_index_list.append([div_index, len(div_list)-1])
  291. # Sort (by start ascending, then by end descending)
  292. range_index_list.sort(key=lambda x: (x[0], -x[1]))
  293. print('type_index_list', type_index_list)
  294. block_dict = {}
  295. index_div_list = []
  # Debug output: print each block's range with the first 20 characters of its text.
  296. for range_index in range_index_list:
  297. _text = ''
  298. for d in div_list[range_index[0]:range_index[1]]:
  299. _text += d.text
  300. print(range_index, _text[:20])
  301. # Merge overlapping ones
  # (the merge logic below is commented out, so this list stays empty)
  302. delete_range_index_list = []
  303. # for i, range_index in enumerate(range_index_list):
  304. # if range_index in delete_range_index_list:
  305. # continue
  306. # for j in range(i+1, len(range_index_list)):
  307. # range_index2 = range_index_list[j]
  308. # if range_index2 in delete_range_index_list:
  309. # continue
  310. # if range_index[0] == range_index2[0] or range_index[1] == range_index2[1]:
  311. # delete_range_index_list.append(range_index2)
  312. # Fill in the gaps in between
  313. add_range_index_list = []
  # Divs located before the first block each get a one-div range.
  314. if range_index_list[0][0] != 0:
  315. for j in range(0, range_index_list[0][0]):
  316. add_range_index_list.append([j, j+1])
  317. for i in range(1, len(range_index_list)):
  318. range_index1 = range_index_list[i-1]
  319. range_index2 = range_index_list[i]
  320. if range_index1[1] != range_index2[0] or (range_index1[1] - range_index1[0] > 1 and range_index1[0] != range_index2[0]):
  321. for j in range(range_index1[0], range_index2[0]):
  322. add_range_index_list.append([j, j+1])
  323. # add_range_index_list.append([range_index1[0], range_index2[0]])
  324. # if range_index1[1] - range_index1[0] > 1 and range_index1[0] != range_index2[0]:
  325. # add_range_index_list.append([range_index1[0]+1, range_index2[0]])
  326. print('delete_range_index_list', delete_range_index_list)
  327. print('add_range_index_list', add_range_index_list)
  328. print('len(range_index_list)', len(range_index_list))
  329. for range_index in delete_range_index_list:
  330. if range_index in range_index_list:
  331. range_index_list.remove(range_index)
  332. print('len(range_index_list)', len(range_index_list))
  333. range_index_list += add_range_index_list
  334. range_index_list.sort(key=lambda x: (x[0], -x[1]))
  335. print('len(range_index_list)', len(range_index_list))
  # Build the tree for the whole div range.
  336. tree_root = self.recursion_get_tree(range_index_list, div_list, 0, len(div_list))
  337. # print_tree(tree_root)
  # Empty '../layout.html' first; print_tree_order is then called to fill it.
  # NOTE(review): both files are opened without an explicit encoding.
  338. with open('../layout.html', 'w') as f:
  339. f.write('')
  340. print_tree_order(tree_root, div_list)
  341. with open('../origin.html', 'w') as f:
  342. f.write(self.html)
  343. # Print the parameters of a given product
  344. if product:
  345. candidate_div_list = []
  346. for i, div in enumerate(div_list):
  347. div = div.text
  # The first and the last div are never used as a search hit.
  348. if i == 0 or i == len(div_list)-1:
  349. continue
  # `product` is interpreted as a regular expression here.
  350. if not re.search(product, div):
  351. continue
  352. print('find product', div[:20])
  353. type_index = type_index_list[i]
  # Nearest following / preceding ordinal type that is not -1 (None when absent).
  354. type_index_after = None
  355. for ti in type_index_list[i+1:]:
  356. if ti != -1:
  357. type_index_after = ti
  358. break
  359. type_index_before = None
  360. for ti in type_index_list[:i][::-1]:
  361. if ti != -1:
  362. type_index_before = ti
  363. break
  364. print('type_index, type_index_before, type_index_after1', type_index, type_index_before, type_index_after)
  365. # The ordinal style is reused (the next ordinal has the same type as the hit)
  366. dup_type_index_flag = 0
  367. if type_index_after == type_index:
  368. dup_type_index_flag = 1
  369. print('type_index, type_index_before, type_index_after2', type_index, type_index_before, type_index_after)
  # block_type_list / block_div_list: types and texts of the divs collected after the hit.
  # no_order_type_list: -1 entries collected since it was last reset.
  370. block_type_list = []
  371. block_div_list = []
  372. no_order_type_list = []
  373. sub_type_index_list = type_index_list[i:]
  374. type_index_pair1 = [type_index_before, type_index]
  375. type_index_pair2 = [type_index, type_index_after]
  376. for j, ti in enumerate(sub_type_index_list):
  377. real_j = j + i
  # Skip the hit itself (j == 0) and the last div.
  378. if j == 0 or j == len(sub_type_index_list) - 1:
  379. continue
  380. ti_previous = sub_type_index_list[j-1]
  381. ti_next = sub_type_index_list[j+1]
  382. ti_pair1 = [ti_previous, ti_next]
  383. ti_pair2 = [ti, ti_next]
  384. _div = div_list[real_j].text
  385. # Determine multi-level vs single-level, and whether it is the first one
  386. tis = re.split('[.、]', str(ti))
  387. temp_tis = []
  388. for _ti in tis:
  389. if _ti != '':
  390. temp_tis.append(_ti)
  391. tis = temp_tis
  392. break_flag1 = 0
  393. if len(tis) >= 2:
  394. if len(re.findall('[1一]{2,}', tis[-1])) >= 1 or len(re.findall('[2-9二三四五六七八九十]', tis[-1])) != 0:
  395. break_flag1 = 1
  396. else:
  397. if len(re.findall('[1一]{2,}', _div[:3])) >= 1 or len(re.findall('[2-9二三四五六七八九十]', _div[:6])) != 0:
  398. break_flag1 = 1
  399. # Reuse case: same type as the searched type_index and consecutive with it, but not consecutive with the number of the previous line of the same type_index
  400. break_flag2 = 0
  401. if dup_type_index_flag and type_index == ti and ti in block_type_list:
  # Position of the most recent collected div with this type.
  402. last_ti_index = block_type_list[::-1].index(ti)
  403. last_ti_index = len(block_type_list) - 1 - last_ti_index
  404. last_ti_div = block_div_list[last_ti_index]
  405. last_ti_order_no = get_order_no(ti, last_ti_div)
  406. ti_order_no = get_order_no(ti, _div)
  407. type_index_order_no = get_order_no(type_index, div)
  408. print('last_ti_order_no, ti_order_no, type_index_order_no', last_ti_order_no, ti_order_no, type_index_order_no)
  409. print(last_ti_div[:10], _div[:10], div[:10])
  410. if None not in [type_index_order_no, last_ti_order_no, ti_order_no]:
  411. if ti_order_no - type_index_order_no == 1 and ti_order_no - last_ti_order_no != 1:
  412. break_flag2 = 1
  413. if break_flag2:
  414. break
  415. # Hit a very high-level ordinal type
  # (indices 7 and 8 of self.order_type_list are the two '第…[章节篇]' patterns)
  416. elif ti in [7, 8]:
  417. break
  418. # Hit one that does not start from 1
  419. elif ti == -1:
  420. no_order_type_list.append(ti)
  421. block_type_list.append(ti)
  422. block_div_list.append(_div)
  423. elif ti not in block_type_list and break_flag1:
  424. print('not 1 start break', _div[:6], len(re.findall('[1一]', _div[:3])), len(re.findall('[2-9二三四五六七八九十]', _div[:6])))
  425. print(block_div_list)
  426. print(block_type_list)
  427. break
  428. elif not dup_type_index_flag and ti not in [type_index, type_index_before, type_index_after]:
  429. block_type_list.append(ti)
  430. block_div_list.append(_div)
  431. no_order_type_list = []
  432. else:
  433. # Hit the same combination of types
  434. if not dup_type_index_flag and (type_index_pair1 == ti_pair1):
  435. block_type_list.append(ti)
  436. block_div_list.append(_div)
  437. print('type_index_pair1 == ti_pair1 or type_index_pair2 == ti_pair2 break',
  438. _div[:6], type_index_pair1, ti_pair1, type_index_pair2, ti_pair2)
  439. break
  440. else:
  441. no_order_type_list = []
  442. block_type_list.append(ti)
  443. block_div_list.append(_div)
  444. if not block_type_list:
  445. continue
  446. # Drop trailing lines that have no ordinal
  447. if block_type_list[-1] == -1:
  448. block_type_list = block_type_list[:len(block_type_list)-len(no_order_type_list)]
  449. block_div_list = block_div_list[:len(block_div_list)-len(no_order_type_list)]
  450. candidate_div_list.append(block_div_list)
  451. print('len(candidate_div_list)', len(candidate_div_list))
  452. print('candidate_div_list', candidate_div_list)
  # Sort candidates by length and print the longest one line by line.
  453. if candidate_div_list:
  454. candidate_div_list.sort(key=lambda x: len(x))
  455. for div in candidate_div_list:
  456. print(len(div), div)
  457. print('='*10, product, '='*10)
  458. for div in candidate_div_list[-1]:
  459. print(div)
  460. # print(d.text)
  461. def order_show_in_layout(self, tree_root, div_list):
  462. print_tree_order(tree_root, div_list)
  463. # with open('../result.html', 'r') as f:
  464. with open(r'C:\Users\Administrator\Desktop\test_layout\4.html', 'r') as f:
  465. html = f.read()
  466. LayoutConvert(html).get_order_number_tree('连续性血液净化设备')
  467. _list = [1, 3, 5, 7, 9]
  468. print(len(_list) - 1 - _list[::-1].index(3))