import os
import sys
sys.setrecursionlimit(10000)
sys.path.append(os.path.dirname(__file__) + "/../")
from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
import re
import traceback
from bs4 import BeautifulSoup
from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator, get_garble_code
from format_convert.wrapt_timeout_decorator import timeout
class TreeNode:
    """A node of a generic ordered tree."""

    def __init__(self, data):
        """Create a node carrying ``data`` and no children."""
        # Payload of the node; callers in this file store a
        # [start_index, end_index] pair here.
        self.data = data
        # Child nodes, kept in insertion order.
        self.children = []

    def add_child(self, child_node):
        """Append ``child_node`` as the last child of this node."""
        self.children.append(child_node)

    def __repr__(self):
        # Show only the child count so printing a deep tree stays readable.
        return '{}(data={!r}, children={})'.format(
            type(self).__name__, self.data, len(self.children))
def print_tree(node, level=0):
    """Print the tree rooted at ``node`` in pre-order, one node per line.

    Each line is ``str(node.data)`` prefixed with one space per depth level;
    ``level`` is the depth assigned to ``node`` itself.
    """
    # Explicit stack instead of recursion; children are pushed in reverse so
    # they are popped (and printed) in their original left-to-right order.
    pending = [(node, level)]
    while pending:
        current, depth = pending.pop()
        print(" " * depth + str(current.data))
        pending.extend((child, depth + 1) for child in reversed(current.children))
def print_tree_order(node, div_list, level=0, out_path='../layout.html'):
    """Append an HTML rendering of the tree rooted at ``node`` to ``out_path``.

    Every node becomes one ``<div>`` holding the text of the first line of the
    node's range (``div_list[node.data[0]].text``), tinted by tree depth.
    Nodes are written in pre-order, parents before their children.

    :param node: tree node exposing ``data`` (``data[0]`` indexes ``div_list``)
        and ``children``.
    :param div_list: sequence of objects exposing a ``text`` attribute.
    :param level: depth of ``node``; selects the colour and the indent.
    :param out_path: file that is appended to; the caller is responsible for
        truncating it beforehand.
    """
    from html import escape

    # (r, g, b, alpha) per depth; depths beyond the table reuse the last entry.
    colors = [(255, 0, 0, 0.7), (0, 255, 0, 0.6), (0, 0, 255, 0.6), (255, 127, 0, 0.2),
              (238, 238, 0, 0.2),
              (255, 104, 255, 0.2)
              ]
    color = colors[level] if level < len(colors) else colors[-1]

    # The line text comes from a parsed document, so escape it before
    # embedding it in markup.
    text = " " * level + escape(div_list[node.data[0]].text)
    # str() of the tuple is already valid CSS argument syntax: rgba(255, 0, 0, 0.7).
    # NOTE(review): the wrapper markup in the original source was not
    # recoverable (its format string had no placeholder, so the colour was
    # never applied) - confirm this markup matches the intended layout.
    text = '<div style="background-color: rgba{};">'.format(str(color)) + text + '</div>\n'
    if level == 0:
        # Declare the encoding once, ahead of the root, so browsers decode
        # the UTF-8 file correctly.
        text = '<meta charset="utf-8">\n' + text

    with open(out_path, 'a', encoding='utf-8') as f:
        f.write(text)

    for child in node.children:
        print('node.child', child.data[:10])
        print_tree_order(child, div_list, level + 1, out_path)
class LayoutConvert:
def __init__(self, html):
self.html = html
self.order_type_list = ['[★]?(\d{1,3}[.])+[.\d]?',
'[★]?[A-Z][.、]',
'[★]?[a-z][.、]',
'[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮⑯⑰⑱⑲⑳]',
'[ⅠⅡⅢⅣⅤⅥⅦⅧⅩⅪⅫ]',
'[ⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹ]',
'[❶❷❸❹❻❼❽❾❿]',
'第[一二三四五六七八九十]{1,2}[章节篇]',
'第\d{1,2}[章节篇]',
'[((]\d{1,3}[))]',
'[★]?\d{1,3}、',
'[((][一二三四五六七八九十]{1,3}[))]',
'[一二三四五六七八九十]{1,3}、',
'包[1-9]{1,3}',
'标段[1-9]{1,3}',
]
self.chinese_arabic_dict = {
'一': 1,
'二': 2,
'三': 3,
'四': 4,
'五': 5,
'六': 6,
'七': 7,
'八': 8,
'九': 9,
'十': 10,
}
def get_layout(self):
return
def recursion_get_tree(self, index_list, div_list, start_index, end_index):
    """Build the tree of ranges nested inside ``[start_index, end_index]``.

    ``index_list`` holds ``[start, end]`` pairs. A pair becomes a child of
    this node when it starts strictly after ``start_index``, ends no later
    than ``end_index`` and does not start before the end of the previously
    accepted child. Each child is expanded recursively against the same
    ``index_list``. A range of length 1 is a leaf.

    Progress is printed on entry and on exit of every node.

    :return: a ``TreeNode`` whose ``data`` is ``[start_index, end_index]``.
    """
    print([start_index, end_index], div_list[start_index].text[:10], '-' * 20)
    node = TreeNode([start_index, end_index])

    if end_index - start_index != 1:
        # End of the most recently accepted child; later candidates that
        # start before it overlap that child and are skipped.
        covered_until = index_list[0][0]
        for child_start, child_end in index_list:
            nested = start_index < child_start <= child_end <= end_index
            if not nested:
                if child_start == 0:
                    print('continue not start_index < start_i <= end_i <= end_index', child_start, child_end)
                continue
            if child_start < covered_until:
                print('continue start_i < temp_end_i', child_start, covered_until, div_list[child_start])
                continue
            node.add_child(self.recursion_get_tree(index_list, div_list, child_start, child_end))
            covered_until = child_end

    print([start_index, end_index], div_list[end_index - 1].text[:10], '=' * 20)
    return node
def _chinese_numeral_to_int(self, numerals):
    """Convert a run of Chinese numerals (一..十) to an int, or None if malformed."""
    digit_of = self.chinese_arabic_dict
    if '十' not in numerals:
        # No tens marker: read the characters as decimal digits ("一二" -> 12).
        return int(''.join(str(digit_of[ch]) for ch in numerals))
    # Positional forms up to 99: 十 -> 10, 十一 -> 11, 二十 -> 20, 二十三 -> 23.
    tens, _, ones = numerals.partition('十')
    if len(tens) > 1 or len(ones) > 1 or ones == '十':
        return None
    return (digit_of[tens] if tens else 1) * 10 + (digit_of[ones] if ones else 0)

def _order_no_from_text(self, type_index, text):
    """Return the ordinal number found in a single-level numbered line.

    Multi-level types (whose ``str(type_index)`` has two or more parts when
    split on '.'/'、', i.e. the fractional type indices) are not parsed and
    yield ``None``. Otherwise the first run of Arabic digits in ``text`` is
    used, falling back to the first run of Chinese numerals.
    """
    levels = [part for part in re.split('[.、]', str(type_index)) if part != '']
    if len(levels) >= 2:
        return None
    arabic = re.search('[0-9]+', text)
    if arabic:
        return int(arabic.group())
    chinese = re.search('[一二三四五六七八九十]+', text)
    if chinese:
        return self._chinese_numeral_to_int(chinese.group())
    return None

def _classify_order_type(self, text):
    """Return the ordinal type index of a line, or -1 when it has no ordinal.

    The first pattern of ``order_type_list`` that matches at the very start
    of ``text`` wins. Pattern 0 is refined: a single-level number ("1.")
    stays 0, while a multi-level number is encoded as a fraction whose
    number of decimals grows with the depth ("1.2" -> 0.1, "1.2.3" -> 0.01).
    """
    for type_index, reg in enumerate(self.order_type_list):
        m = re.match(reg, text)
        if m is None:
            continue
        # Ordinary case: single-level ordinal.
        if type_index != 0:
            return type_index
        # Special case: possibly a multi-level ordinal.
        order = m.group().replace('★', '')
        levels = [part for part in order.split('.') if part != '']
        if len(levels) == 1:
            return 0
        # Encode the depth as a fraction.
        code = re.sub(r'\d+', '0', order).replace('.', '')
        return float(code[0] + '.' + code[1:-1] + '1')
    return -1

def _pair_same_type_ranges(self, div_list, type_index_list):
    """Pair each numbered line with the previous line of the same type.

    Returns a list of ``[start, end]`` index pairs. Whenever line ``end`` has
    the same ordinal type as an earlier line ``start`` (the nearest one), the
    pair is recorded, and for every type occurring strictly between them a
    pair from that type's last occurrence to ``end`` is recorded too.
    ``cut`` remembers, per type, the latest index at which a block of that
    type was closed, so that earlier lines are not paired again.
    """
    range_index_list = []
    cut_type_index_dict = {}
    for div_index, d in enumerate(div_list):
        find_type_index = type_index_list[div_index]
        sub_type_index_list = type_index_list[:div_index]
        print(d.text)

        # No ordinal on this line: skip it.
        if find_type_index < 0:
            print('continue -1')
            print('-' * 40)
            continue
        print('find_type_index, div_index', find_type_index, div_index)

        # First occurrence of this ordinal type: nothing to pair with yet.
        if find_type_index not in sub_type_index_list:
            print('find_type_index not in type_index_list')
            print(cut_type_index_dict)
            print('-' * 40)
            continue

        # Nearest previous line of the same type; it must not lie before
        # the cut recorded for that type.
        last_index = len(sub_type_index_list) - 1 - sub_type_index_list[::-1].index(find_type_index)
        print('find_type_index', find_type_index, [last_index, div_index],
              [sub_type_index_list[0], sub_type_index_list[-1]])
        if last_index < cut_type_index_dict.get(find_type_index, 0):
            print('continue 3 last_index < cut_type_index_dict ', last_index,
                  cut_type_index_dict.get(find_type_index, 0))
            print('-' * 40)
            continue

        # New block, then move the cut of this type forward.
        range_index_list.append([last_index, div_index])
        print('find last_index add block', [last_index, div_index])
        cut_type_index_dict[find_type_index] = max(
            cut_type_index_dict.get(find_type_index, div_index), div_index)

        # A block was found: every type seen inside it is closed here as
        # well, using the last line of each such type as the block start.
        final_type_index_dict = {}
        for temp_div_index in range(last_index + 1, div_index):
            temp_type = type_index_list[temp_div_index]
            if temp_div_index < cut_type_index_dict.get(temp_type, 0):
                continue
            if temp_div_index <= range_index_list[-1][0]:
                continue
            final_type_index_dict[temp_type] = temp_div_index
        for temp_type, final_index in final_type_index_dict.items():
            if [final_index, div_index] not in range_index_list:
                print('add block cut_type_index_dict 1', cut_type_index_dict)
                range_index_list.append([final_index, div_index])
                print('add block ', [final_index, div_index])
                cut_type_index_dict[temp_type] = max(
                    cut_type_index_dict.get(temp_type, div_index), div_index)
                print('add block cut_type_index_dict 2', cut_type_index_dict)
        print('-' * 40)
    return range_index_list

def _print_product_params(self, product, div_list, type_index_list):
    """Print the block of lines that appears to describe ``product``.

    ``product`` is used as a regular expression. For every line (except the
    first and last) that matches it, the following lines are collected until
    a heuristic stop condition is met; the longest collected candidate is
    printed last.
    """
    candidate_div_list = []
    for i, div_node in enumerate(div_list):
        div = div_node.text
        if i == 0 or i == len(div_list) - 1:
            continue
        if not re.search(product, div):
            continue
        print('find product', div[:20])

        type_index = type_index_list[i]
        # Nearest numbered line after / before the matching line.
        type_index_after = next((ti for ti in type_index_list[i + 1:] if ti != -1), None)
        type_index_before = next((ti for ti in reversed(type_index_list[:i]) if ti != -1), None)
        print('type_index, type_index_before, type_index_after1', type_index, type_index_before, type_index_after)

        # The ordinal style of the matching line is reused by the next numbered line.
        dup_type_index_flag = type_index_after == type_index
        print('type_index, type_index_before, type_index_after2', type_index, type_index_before, type_index_after)

        block_type_list = []
        block_div_list = []
        no_order_type_list = []
        sub_type_index_list = type_index_list[i:]
        type_index_pair1 = [type_index_before, type_index]
        type_index_pair2 = [type_index, type_index_after]
        for j, ti in enumerate(sub_type_index_list):
            if j == 0 or j == len(sub_type_index_list) - 1:
                continue
            ti_pair1 = [sub_type_index_list[j - 1], sub_type_index_list[j + 1]]
            ti_pair2 = [ti, sub_type_index_list[j + 1]]
            _div = div_list[j + i].text

            # Multi-level or single-level, and does the ordinal look like
            # something other than a first item ("1" / "一")?
            tis = [part for part in re.split('[.、]', str(ti)) if part != '']
            if len(tis) >= 2:
                break_flag1 = bool(re.search('[1一]{2,}', tis[-1])
                                   or re.search('[2-9二三四五六七八九十]', tis[-1]))
            else:
                break_flag1 = bool(re.search('[1一]{2,}', _div[:3])
                                   or re.search('[2-9二三四五六七八九十]', _div[:6]))

            # Reused style: same type as the searched line and directly
            # following it in number, but not following the previous line
            # of that type inside the block.
            break_flag2 = False
            if dup_type_index_flag and type_index == ti and ti in block_type_list:
                last_ti_index = len(block_type_list) - 1 - block_type_list[::-1].index(ti)
                last_ti_div = block_div_list[last_ti_index]
                last_ti_order_no = self._order_no_from_text(ti, last_ti_div)
                ti_order_no = self._order_no_from_text(ti, _div)
                type_index_order_no = self._order_no_from_text(type_index, div)
                print('last_ti_order_no, ti_order_no, type_index_order_no', last_ti_order_no, ti_order_no, type_index_order_no)
                print(last_ti_div[:10], _div[:10], div[:10])
                if None not in [type_index_order_no, last_ti_order_no, ti_order_no]:
                    if ti_order_no - type_index_order_no == 1 and ti_order_no - last_ti_order_no != 1:
                        break_flag2 = True

            if break_flag2:
                break
            # A chapter-level ordinal ends the block.
            elif ti in [7, 8]:
                break
            # Unnumbered line: keep it, but remember it may be trailing.
            elif ti == -1:
                no_order_type_list.append(ti)
                block_type_list.append(ti)
                block_div_list.append(_div)
            # A new ordinal type that does not start from 1 ends the block.
            elif ti not in block_type_list and break_flag1:
                print('not 1 start break', _div[:6], len(re.findall('[1一]', _div[:3])), len(re.findall('[2-9二三四五六七八九十]', _div[:6])))
                print(block_div_list)
                print(block_type_list)
                break
            elif not dup_type_index_flag and ti not in [type_index, type_index_before, type_index_after]:
                block_type_list.append(ti)
                block_div_list.append(_div)
                no_order_type_list = []
            else:
                block_type_list.append(ti)
                block_div_list.append(_div)
                # The same combination of types as around the searched line.
                if not dup_type_index_flag and type_index_pair1 == ti_pair1:
                    print('type_index_pair1 == ti_pair1 or type_index_pair2 == ti_pair2 break',
                          _div[:6], type_index_pair1, ti_pair1, type_index_pair2, ti_pair2)
                    break
                no_order_type_list = []

        if not block_type_list:
            continue
        # Drop trailing unnumbered lines.
        if block_type_list[-1] == -1:
            block_div_list = block_div_list[:len(block_div_list) - len(no_order_type_list)]
        candidate_div_list.append(block_div_list)

    print('len(candidate_div_list)', len(candidate_div_list))
    print('candidate_div_list', candidate_div_list)
    if candidate_div_list:
        candidate_div_list.sort(key=len)
        for candidate in candidate_div_list:
            print(len(candidate), candidate)
        print('=' * 10, product, '=' * 10)
        # The longest candidate is taken as the product's parameter block.
        for line in candidate_div_list[-1]:
            print(line)

def get_order_number_tree(self, product=None):
    """Group the document's ``<div>`` lines into a tree by their ordinals.

    Steps: classify the ordinal type of every ``<div>``; pair lines of the
    same type into ``[start, end]`` ranges; add a closing range for the last
    line of each type; fill uncovered stretches with one-line ranges; build
    the tree with ``recursion_get_tree``.

    Side effects: prints extensive debugging output, truncates and rewrites
    ``../layout.html`` (through ``print_tree_order``) and writes the input
    HTML to ``../origin.html``.

    :param product: optional regular expression; when given, the lines that
        appear to describe the matching product are printed.
    :return: always ``None``; returns early when no range could be formed.
    """
    soup = BeautifulSoup(self.html, 'lxml')
    div_list = soup.find_all('div')

    # Ordinal type of every line.
    type_index_list = [self._classify_order_type(d.text) for d in div_list]

    # Split into blocks according to the ordinal type of every line.
    range_index_list = self._pair_same_type_ranges(div_list, type_index_list)
    if not range_index_list:
        print('no range_index_list')
        return

    # Closing blocks: one for the last line of every integer ordinal type.
    # NOTE(review): the membership test uses [i, i+1] while the appended
    # range ends at len(div_list)-1, although range ends are otherwise
    # exclusive (the root is [0, len(div_list)]) - confirm the intended end.
    for temp_type in range(len(self.order_type_list)):
        for div_index in range(len(div_list) - 1, -1, -1):
            if type_index_list[div_index] != temp_type:
                continue
            if [div_index, div_index + 1] not in range_index_list:
                range_index_list.append([div_index, len(div_list) - 1])
            break

    # Outer ranges first: ascending start, then descending end.
    range_index_list.sort(key=lambda x: (x[0], -x[1]))
    print('type_index_list', type_index_list)
    for range_index in range_index_list:
        _text = ''.join(d.text for d in div_list[range_index[0]:range_index[1]])
        print(range_index, _text[:20])

    # Merging of overlapping ranges is currently disabled; nothing is deleted.
    delete_range_index_list = []

    # Fill the stretches no range starts in with one-line ranges.
    add_range_index_list = [[j, j + 1] for j in range(range_index_list[0][0])]
    for range_index1, range_index2 in zip(range_index_list, range_index_list[1:]):
        if range_index1[1] != range_index2[0] \
                or (range_index1[1] - range_index1[0] > 1 and range_index1[0] != range_index2[0]):
            add_range_index_list.extend(
                [j, j + 1] for j in range(range_index1[0], range_index2[0]))

    print('delete_range_index_list', delete_range_index_list)
    print('add_range_index_list', add_range_index_list)
    print('len(range_index_list)', len(range_index_list))
    print('len(range_index_list)', len(range_index_list))
    range_index_list += add_range_index_list
    range_index_list.sort(key=lambda x: (x[0], -x[1]))
    print('len(range_index_list)', len(range_index_list))

    tree_root = self.recursion_get_tree(range_index_list, div_list, 0, len(div_list))

    # Start from an empty layout file; print_tree_order appends to it.
    with open('../layout.html', 'w') as f:
        f.write('')
    print_tree_order(tree_root, div_list)
    with open('../origin.html', 'w') as f:
        f.write(self.html)

    # Print the parameters of the requested product.
    if product:
        self._print_product_params(product, div_list, type_index_list)
def order_show_in_layout(self, tree_root, div_list):
    """Render the tree rooted at ``tree_root`` by delegating to ``print_tree_order``.

    ``div_list`` is passed through unchanged; nothing is returned.
    """
    print_tree_order(tree_root, div_list)
if __name__ == '__main__':
    # Manual run against a local sample file. Guarded so that importing this
    # module neither reads the file nor runs the analysis.
    # with open('../result.html', 'r') as f:
    with open(r'C:\Users\Administrator\Desktop\test_layout\4.html', 'r') as f:
        html = f.read()
    LayoutConvert(html).get_order_number_tree('连续性血液净化设备')

    # Scratch check of the "index of the last occurrence" expression.
    _list = [1, 3, 5, 7, 9]
    print(len(_list) - 1 - _list[::-1].index(3))