123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562 |
- import os
- import sys
- sys.setrecursionlimit(10000)
- sys.path.append(os.path.dirname(__file__) + "/../")
- from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
- import re
- import traceback
- from bs4 import BeautifulSoup
- from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code
- from format_convert.wrapt_timeout_decorator import timeout
class TreeNode:
    """Node of a simple n-ary tree: a payload plus an ordered list of children."""

    def __init__(self, data):
        """Create a childless node that carries ``data``."""
        # Children are attached afterwards through add_child().
        self.children = []
        self.data = data

    def add_child(self, child_node):
        """Attach ``child_node`` as the last child of this node."""
        self.children += [child_node]
def print_tree(node, level=0):
    """Print ``node`` and all of its descendants, one per line, depth first.

    Each line is ``str(node.data)`` preceded by one space per nesting level.

    Args:
        node: object exposing ``data`` and an iterable ``children``.
        level: nesting depth of ``node``; callers normally leave it at 0.
    """
    indent = " " * level
    print(indent + str(node.data))
    deeper = level + 1
    for sub_node in node.children:
        print_tree(sub_node, deeper)
def print_tree_order(node, div_list, level=0, out_path='../layout.html'):
    """Append a colour-coded HTML rendering of a block tree to a file.

    For ``node`` and, recursively, each of its children, one ``<div>`` is
    appended to ``out_path``. The div contains the text of the line the node
    starts at (``div_list[node.data[0]].text``), preceded by one space per
    nesting level, on a background colour chosen by the nesting level.

    The file is opened in append mode, so the caller is responsible for
    truncating it before rendering a new tree.

    Args:
        node: tree node whose ``data`` is a ``[start, end]`` index pair and
            whose ``children`` are nodes of the same kind.
        div_list: sequence of objects exposing ``text``, indexed by
            ``node.data[0]``.
        level: nesting depth of ``node``; callers normally leave it at 0.
        out_path: file to append to. Defaults to the historical location.
    """
    # Local import: the text comes from arbitrary documents and must not be
    # able to inject markup into the generated page.
    from html import escape

    colors = [
        (255, 0, 0, 0.7),
        (0, 255, 0, 0.6),
        (0, 0, 255, 0.6),
        (255, 127, 0, 0.2),
        (238, 238, 0, 0.2),
        (255, 104, 255, 0.2),
    ]
    # Levels deeper than the palette reuse its last colour.
    color = colors[level] if level < len(colors) else colors[-1]

    text = escape(" " * level + div_list[node.data[0]].text, quote=False)
    # str(color) yields "(r, g, b, a)", which completes the CSS rgba() call.
    text = '<div style="background-color: rgba{};">'.format(str(color)) + text + '</div>'
    if level == 0:
        text = '<!DOCTYPE HTML><head><meta charset="UTF-8">' + text

    # The page declares UTF-8, so the bytes on disk have to be UTF-8 as well
    # regardless of the platform's default encoding.
    with open(out_path, 'a', encoding='utf-8') as f:
        f.write(text)

    for child in node.children:
        print('node.child', child.data[:10])
        print_tree_order(child, div_list, level + 1, out_path)
class LayoutConvert:
    """Group the ``<div>`` lines of an HTML document by their order markers.

    Every ``<div>`` is treated as one line. A line that starts with an order
    marker ("1.", "1.2", "A.", "第一章", "一、" ...) is given a *type index*;
    lines of the same type delimit blocks, and the blocks are nested into a
    tree. Progress is traced on stdout with ``print``.
    """

    def __init__(self, html):
        """Store the HTML text and set up the order-marker patterns.

        Args:
            html: HTML source whose ``<div>`` elements are the lines to analyse.
        """
        self.html = html
        # One regex per marker style. The list position is the "type index"
        # used everywhere else (positions 7 and 8 are the chapter / section /
        # part headings), so the entries must not be reordered.
        self.order_type_list = [
            r'[★]?(\d{1,3}[.])+[.\d]?',
            r'[★]?[A-Z][.、]',
            r'[★]?[a-z][.、]',
            '[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳]',
            '[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]',
            '[ⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹ]',
            '[❶❷❸❹❺❻❼❽❾❿]',
            '第[一二三四五六七八九十]{1,2}[章节篇]',
            r'第\d{1,2}[章节篇]',
            # \uff08 / \uff09 are the full-width parentheses.
            r'[(\uff08]\d{1,3}[)\uff09]',
            r'[★]?\d{1,3}、',
            r'[(\uff08][一二三四五六七八九十]{1,3}[)\uff09]',
            '[一二三四五六七八九十]{1,3}、',
            '包[1-9]{1,3}',
            '标段[1-9]{1,3}',
        ]
        # Value of each Chinese numeral character.
        self.chinese_arabic_dict = {
            '一': 1,
            '二': 2,
            '三': 3,
            '四': 4,
            '五': 5,
            '六': 6,
            '七': 7,
            '八': 8,
            '九': 9,
            '十': 10,
        }

    def get_layout(self):
        """Placeholder; currently does nothing and returns ``None``."""
        return

    def recursion_get_tree(self, index_list, div_list, start_index, end_index):
        """Build the tree node for ``[start_index, end_index]`` recursively.

        Args:
            index_list: ``[start, end]`` pairs of all known blocks. The caller
                in this class passes them sorted by start ascending, then end
                descending.
            div_list: the lines; only ``.text`` is read, for the trace output.
            start_index: first line index of the block.
            end_index: end index of the block.

        Returns:
            A ``TreeNode`` whose ``data`` is ``[start_index, end_index]`` and
            whose children are the blocks of ``index_list`` that start after
            ``start_index``, end no later than ``end_index`` and do not start
            before the end of the previously accepted child.
        """
        print([start_index, end_index], div_list[start_index].text[:10], '-'*20)
        tree_node = TreeNode([start_index, end_index])

        # A block spanning a single line is a leaf.
        if end_index - start_index != 1:
            temp_end_i = index_list[0][0]
            for start_i, end_i in index_list:
                if not start_index < start_i <= end_i <= end_index:
                    if start_i == 0:
                        print('continue not start_index < start_i <= end_i <= end_index', start_i, end_i)
                    continue
                # Starts inside the child accepted just before: it belongs to
                # that child's subtree, not to this node.
                if start_i < temp_end_i:
                    print('continue start_i < temp_end_i', start_i, temp_end_i, div_list[start_i])
                    continue
                tree_node.add_child(self.recursion_get_tree(index_list, div_list, start_i, end_i))
                temp_end_i = end_i

        print([start_index, end_index], div_list[end_index-1].text[:10], '='*20)
        return tree_node

    def _get_order_type(self, text):
        """Return the type index of the order marker ``text`` starts with.

        Returns:
            ``-1`` when no pattern of ``order_type_list`` matches at the start
            of ``text``; otherwise the position of the first matching pattern.
            Dotted numbers with two or more levels (pattern 0, e.g. "1.2") are
            reported as a float below 1 instead: 0.1 for two levels, 0.01 for
            three, and so on.
        """
        for type_index, reg in enumerate(self.order_type_list):
            marker = re.match(reg, text)
            if marker is None:
                continue
            if type_index != 0:
                return type_index
            order = marker.group().replace('★', '')
            levels = [part for part in order.split('.') if part]
            if len(levels) == 1:
                return 0
            return float('0.' + '0' * (len(levels) - 2) + '1')
        return -1

    def _chinese_to_int(self, numeral):
        """Convert a short Chinese numeral such as 三, 十一 or 二十 to ``int``.

        Returns ``None`` when a numeral containing 十 is not of the form
        ``[digit]十[digit]``.
        """
        values = self.chinese_arabic_dict
        if '十' not in numeral:
            return int(''.join(str(values[char]) for char in numeral))
        tens, _, ones = numeral.partition('十')
        if len(tens) > 1 or len(ones) > 1 or '十' in ones:
            return None
        return (values[tens] if tens else 1) * 10 + (values[ones] if ones else 0)

    def _get_order_no(self, type_index, text):
        """Return the number carried by the order marker at the start of ``text``.

        Only the marker itself is inspected, so digits further down the line
        cannot be mistaken for the order number.

        Returns:
            The number as ``int``, or ``None`` for multi-level (float) types,
            for unordered lines, and for markers that carry neither Arabic
            digits nor Chinese numerals (letters, circled digits, ...).
        """
        if not isinstance(type_index, int) or not 0 <= type_index < len(self.order_type_list):
            return None
        marker = re.match(self.order_type_list[type_index], text)
        if marker is None:
            return None
        arabic = re.search(r'\d+', marker.group())
        if arabic:
            return int(arabic.group())
        chinese = re.search('[一二三四五六七八九十]+', marker.group())
        if chinese:
            return self._chinese_to_int(chinese.group())
        return None

    def _collect_blocks(self, div_list, type_index_list):
        """Pair every ordered line with the previous line of the same type.

        Returns:
            List of ``[start, end]`` index pairs in the order they were found.
        """
        range_index_list = []
        # type -> index of the latest line that closed a block of that type;
        # earlier lines of that type may no longer open a block.
        cut_type_index_dict = {}

        for div_index, d in enumerate(div_list):
            find_type_index = type_index_list[div_index]
            print(d.text)
            # Lines without an order marker never close a block.
            if find_type_index < 0:
                print('continue -1')
                print('-'*40)
                continue
            print('find_type_index, div_index', find_type_index, div_index)

            # Closest earlier line of the same type, if any.
            last_index = next(
                (k for k in range(div_index - 1, -1, -1)
                 if type_index_list[k] == find_type_index),
                None,
            )
            if last_index is None:
                print('find_type_index not in type_index_list')
            else:
                print('find_type_index', find_type_index, [last_index, div_index],
                      [type_index_list[0], type_index_list[div_index - 1]])
                # The earlier line lies before the cut of its type: skip.
                if last_index < cut_type_index_dict.get(find_type_index, 0):
                    print('continue 3 last_index < cut_type_index_dict ', last_index,
                          cut_type_index_dict.get(find_type_index, 0))
                    print('-'*40)
                    continue

                # New block from the earlier line up to this one.
                range_index_list.append([last_index, div_index])
                print('find last_index add block', [last_index, div_index])
                cut_type_index_dict[find_type_index] = max(
                    div_index, cut_type_index_dict.get(find_type_index, div_index))

                # Every type seen inside the new block is closed here as well:
                # its last line (not yet cut off) gets a block ending at this
                # line. Unordered lines (type -1) take part like any other type.
                final_type_index_dict = {}
                for temp_div_index in range(last_index + 1, div_index):
                    temp_type = type_index_list[temp_div_index]
                    if temp_div_index < cut_type_index_dict.get(temp_type, 0):
                        continue
                    final_type_index_dict[temp_type] = temp_div_index
                for temp_type, final_index in final_type_index_dict.items():
                    if [final_index, div_index] not in range_index_list:
                        print('add block cut_type_index_dict 1', cut_type_index_dict)
                        range_index_list.append([final_index, div_index])
                        print('add block ', [final_index, div_index])
                        cut_type_index_dict[temp_type] = max(
                            div_index, cut_type_index_dict.get(temp_type, div_index))
                        print('add block cut_type_index_dict 2', cut_type_index_dict)

            print(cut_type_index_dict)
            print('-'*40)
        return range_index_list

    def _complete_blocks(self, div_list, type_index_list, range_index_list):
        """Add trailing blocks and single-line filler blocks.

        Returns:
            A new list of ``[start, end]`` pairs sorted by start ascending and
            end descending. It may contain duplicates of single-line blocks.
        """
        range_index_list = list(range_index_list)
        last_div_index = len(div_list) - 1

        # The last line of each single-level type opens a block that runs to
        # ``len(div_list) - 1``, unless a one-line block already starts there.
        # Multi-level (float) types are not covered by this loop.
        for temp_type in range(len(self.order_type_list)):
            for div_index in range(last_div_index, -1, -1):
                if type_index_list[div_index] != temp_type:
                    continue
                if [div_index, div_index + 1] not in range_index_list:
                    range_index_list.append([div_index, last_div_index])
                break

        range_index_list.sort(key=lambda x: (x[0], -x[1]))
        print('type_index_list', type_index_list)
        for start, end in range_index_list:
            _text = ''.join(d.text for d in div_list[start:end])
            print([start, end], _text[:20])

        # Fill the holes with one-line blocks: everything before the first
        # block, and the lines between two neighbours that do not connect.
        add_range_index_list = [[j, j + 1] for j in range(range_index_list[0][0])]
        for range_index1, range_index2 in zip(range_index_list, range_index_list[1:]):
            if range_index1[1] != range_index2[0] or (
                    range_index1[1] - range_index1[0] > 1 and range_index1[0] != range_index2[0]):
                add_range_index_list.extend(
                    [j, j + 1] for j in range(range_index1[0], range_index2[0]))

        # Nothing is ever scheduled for deletion; the trace lines are kept so
        # the stdout output stays the same.
        print('delete_range_index_list', [])
        print('add_range_index_list', add_range_index_list)
        print('len(range_index_list)', len(range_index_list))
        print('len(range_index_list)', len(range_index_list))
        range_index_list += add_range_index_list
        range_index_list.sort(key=lambda x: (x[0], -x[1]))
        print('len(range_index_list)', len(range_index_list))
        return range_index_list

    @staticmethod
    def _is_later_item(ti, text):
        """Tell whether a line of type ``ti`` looks like a non-first list item.

        For single-level types the start of ``text`` is checked for "11"/"一一"
        or any of 2-9 / 二-十.
        """
        tis = [part for part in re.split('[.、]', str(ti)) if part]
        later_item = '[2-9二三四五六七八九十]'
        if len(tis) >= 2:
            # NOTE(review): for multi-level types this inspects the digits of
            # the type index itself (only 0s and a 1), so it never fires.
            # Presumably the marker text was meant - confirm before changing.
            return bool(re.search('[1一]{2,}', tis[-1]) or re.search(later_item, tis[-1]))
        return bool(re.search('[1一]{2,}', text[:3]) or re.search(later_item, text[:6]))

    def _find_product_block(self, product, div_list, type_index_list):
        """Print and return the lines that follow a mention of ``product``.

        For every line (first and last excluded) matching the regex
        ``product``, the following lines are collected until a stop rule hits.
        The longest candidate is printed and returned.

        Returns:
            List of line texts of the longest candidate, or ``None`` when no
            candidate was collected.
        """
        candidate_div_list = []
        last_div_index = len(div_list) - 1
        for i, div in enumerate(div_list):
            div = div.text
            if i == 0 or i == last_div_index:
                continue
            if not re.search(product, div):
                continue
            print('find product', div[:20])

            # Type of this line and of the nearest ordered lines around it.
            type_index = type_index_list[i]
            type_index_after = next((ti for ti in type_index_list[i+1:] if ti != -1), None)
            type_index_before = next((ti for ti in reversed(type_index_list[:i]) if ti != -1), None)
            print('type_index, type_index_before, type_index_after1', type_index, type_index_before, type_index_after)
            # The marker style of the product line is reused right below it.
            dup_type_index_flag = 1 if type_index_after == type_index else 0
            print('type_index, type_index_before, type_index_after2', type_index, type_index_before, type_index_after)

            block_type_list = []
            block_div_list = []
            # Unordered lines collected since the last ordered line.
            no_order_type_list = []
            sub_type_index_list = type_index_list[i:]
            type_index_pair1 = [type_index_before, type_index]
            type_index_pair2 = [type_index, type_index_after]
            # The product line itself and the very last line are skipped.
            for j in range(1, len(sub_type_index_list) - 1):
                ti = sub_type_index_list[j]
                ti_next = sub_type_index_list[j+1]
                # NOTE(review): pairs previous with *next*, not with the
                # current type - kept as in the original, intent unconfirmed.
                ti_pair1 = [sub_type_index_list[j-1], ti_next]
                ti_pair2 = [ti, ti_next]
                _div = div_list[i + j].text

                # Reused style: same type as the product line whose number
                # follows the product line but not the last collected line.
                break_flag2 = 0
                if dup_type_index_flag and type_index == ti and ti in block_type_list:
                    last_ti_index = len(block_type_list) - 1 - block_type_list[::-1].index(ti)
                    last_ti_div = block_div_list[last_ti_index]
                    last_ti_order_no = self._get_order_no(ti, last_ti_div)
                    ti_order_no = self._get_order_no(ti, _div)
                    type_index_order_no = self._get_order_no(type_index, div)
                    print('last_ti_order_no, ti_order_no, type_index_order_no', last_ti_order_no, ti_order_no, type_index_order_no)
                    print(last_ti_div[:10], _div[:10], div[:10])
                    if None not in [type_index_order_no, last_ti_order_no, ti_order_no]:
                        if ti_order_no - type_index_order_no == 1 and ti_order_no - last_ti_order_no != 1:
                            break_flag2 = 1

                if break_flag2:
                    break
                # Chapter / section / part heading ends the block.
                elif ti in [7, 8]:
                    break
                elif ti == -1:
                    no_order_type_list.append(ti)
                    block_type_list.append(ti)
                    block_div_list.append(_div)
                # A type new to the block that does not start at 1.
                elif ti not in block_type_list and self._is_later_item(ti, _div):
                    print('not 1 start break', _div[:6], len(re.findall('[1一]', _div[:3])), len(re.findall('[2-9二三四五六七八九十]', _div[:6])))
                    print(block_div_list)
                    print(block_type_list)
                    break
                elif not dup_type_index_flag and ti not in [type_index, type_index_before, type_index_after]:
                    block_type_list.append(ti)
                    block_div_list.append(_div)
                    no_order_type_list = []
                else:
                    # Same combination of types as around the product line.
                    if not dup_type_index_flag and (type_index_pair1 == ti_pair1):
                        block_type_list.append(ti)
                        block_div_list.append(_div)
                        print('type_index_pair1 == ti_pair1 or type_index_pair2 == ti_pair2 break',
                              _div[:6], type_index_pair1, ti_pair1, type_index_pair2, ti_pair2)
                        break
                    else:
                        no_order_type_list = []
                        block_type_list.append(ti)
                        block_div_list.append(_div)

            if not block_type_list:
                continue
            # Drop the unordered lines trailing the block.
            if block_type_list[-1] == -1:
                block_div_list = block_div_list[:len(block_div_list)-len(no_order_type_list)]
            candidate_div_list.append(block_div_list)

        print('len(candidate_div_list)', len(candidate_div_list))
        print('candidate_div_list', candidate_div_list)
        if not candidate_div_list:
            return None
        candidate_div_list.sort(key=len)
        for div in candidate_div_list:
            print(len(div), div)
        print('='*10, product, '='*10)
        for div in candidate_div_list[-1]:
            print(div)
        return candidate_div_list[-1]

    def get_order_number_tree(self, product=None):
        """Build the block tree of the document and dump it as HTML.

        Steps: parse ``self.html``, classify every ``<div>`` by its order
        marker, derive the blocks, nest them into a tree, write the colour
        coded tree to ``../layout.html`` and the input to ``../origin.html``.

        Args:
            product: optional regex. When given, the lines following each
                mention of it are collected and the longest run is printed.

        Returns:
            The longest run of line texts found for ``product``; ``None`` when
            ``product`` is not given, nothing was found, or the document
            yields no block at all (in which case no file is written).
        """
        soup = BeautifulSoup(self.html, 'lxml')
        div_list = soup.findAll('div')

        # Order-marker type of every line.
        type_index_list = [self._get_order_type(d.text) for d in div_list]

        # Split into blocks according to the marker types.
        range_index_list = self._collect_blocks(div_list, type_index_list)
        if not range_index_list:
            print('no range_index_list')
            return None
        range_index_list = self._complete_blocks(div_list, type_index_list, range_index_list)

        tree_root = self.recursion_get_tree(range_index_list, div_list, 0, len(div_list))

        # print_tree_order appends, so start from an empty file.
        with open('../layout.html', 'w', encoding='utf-8') as f:
            f.write('')
        print_tree_order(tree_root, div_list)
        with open('../origin.html', 'w', encoding='utf-8') as f:
            f.write(self.html)

        # Print the parameters of one product.
        if product:
            return self._find_product_block(product, div_list, type_index_list)
        return None

    def order_show_in_layout(self, tree_root, div_list):
        """Append the HTML rendering of ``tree_root`` via ``print_tree_order``."""
        print_tree_order(tree_root, div_list)
# Manual smoke test. Guarded so that importing this module no longer reads a
# machine-specific file and runs the whole conversion as a side effect.
if __name__ == '__main__':
    # Alternative input: '../result.html'
    with open(r'C:\Users\Administrator\Desktop\test_layout\4.html', 'r') as f:
        html = f.read()
    LayoutConvert(html).get_order_number_tree('连续性血液净化设备')

    # Scratch check of the "index of the last occurrence" idiom.
    _list = [1, 3, 5, 7, 9]
    print(len(_list) - 1 - _list[::-1].index(3))
|