|
@@ -26,7 +26,7 @@ from BiddingKG.dl.entityLink.entityLink import *
|
|
|
|
|
|
|
|
|
|
#
|
|
#
|
|
-def tableToText(soup):
|
|
|
|
|
|
+def tableToText(soup, docid=None):
|
|
'''
|
|
'''
|
|
@param:
|
|
@param:
|
|
soup:网页html的soup
|
|
soup:网页html的soup
|
|
@@ -181,7 +181,7 @@ def tableToText(soup):
|
|
count_1 = 0
|
|
count_1 = 0
|
|
count_0 = 0
|
|
count_0 = 0
|
|
for i in range(len(line)):
|
|
for i in range(len(line)):
|
|
- if line[i][0]==fix_value:
|
|
|
|
|
|
+ if line[i][0] == fix_value:
|
|
continue
|
|
continue
|
|
if line[i][1]==1:
|
|
if line[i][1]==1:
|
|
if first_1==-1:
|
|
if first_1==-1:
|
|
@@ -211,12 +211,12 @@ def tableToText(soup):
|
|
"""
|
|
"""
|
|
@summary: 计算每个节点受到的挤压度来判断是否需要染色
|
|
@summary: 计算每个节点受到的挤压度来判断是否需要染色
|
|
"""
|
|
"""
|
|
- #print("B",inner_table[index])
|
|
|
|
|
|
+ # print("B",inner_table[index])
|
|
min_presure = 3
|
|
min_presure = 3
|
|
list_dye = []
|
|
list_dye = []
|
|
first = None
|
|
first = None
|
|
count = 0
|
|
count = 0
|
|
- temp_set = set()
|
|
|
|
|
|
+ temp_set = set(['~~'])
|
|
_index = 0
|
|
_index = 0
|
|
for item in inner_table[index]:
|
|
for item in inner_table[index]:
|
|
if first is None:
|
|
if first is None:
|
|
@@ -272,7 +272,7 @@ def tableToText(soup):
|
|
dye_set.add((inner_table[index][h][0],dye_type))
|
|
dye_set.add((inner_table[index][h][0],dye_type))
|
|
key_set.add(inner_table[index][h][0])
|
|
key_set.add(inner_table[index][h][0])
|
|
begin = end
|
|
begin = end
|
|
- #print("E",inner_table[index])
|
|
|
|
|
|
+ # print("E",inner_table[index])
|
|
|
|
|
|
def otherrepair(inner_table,index,dye_set,key_set):
|
|
def otherrepair(inner_table,index,dye_set,key_set):
|
|
list_provide_repair = []
|
|
list_provide_repair = []
|
|
@@ -327,25 +327,254 @@ def tableToText(soup):
|
|
return
|
|
return
|
|
repairTable(inner_table, dye_set, key_set)
|
|
repairTable(inner_table, dye_set, key_set)
|
|
|
|
|
|
- def repair_table2(inner_table):
|
|
|
|
|
|
+ def repair_table2(inner_table, show=0, row_no=0):
|
|
"""
|
|
"""
|
|
@summary: 修复表头识别,将明显错误的进行修正
|
|
@summary: 修复表头识别,将明显错误的进行修正
|
|
"""
|
|
"""
|
|
- # 修复第一第二第三中标候选人作为列表头
|
|
|
|
- if len(inner_table) >= 2 and len(inner_table[0]) >= 3:
|
|
|
|
- for i in range(len(inner_table[:3])):
|
|
|
|
- for j in range(len(inner_table[i])-2):
|
|
|
|
- if inner_table[i][j][0] == '第一中标候选人' \
|
|
|
|
- and inner_table[i][j+1][0] == '第二中标候选人' \
|
|
|
|
- and inner_table[i][j+2][0] == '第三中标候选人' \
|
|
|
|
- and i+1 < len(inner_table) \
|
|
|
|
- and inner_table[i+1][j][1] == 0 \
|
|
|
|
- and inner_table[i+1][j+1][1] == 0 \
|
|
|
|
- and inner_table[i+1][j+2][1] == 0:
|
|
|
|
|
|
+
|
|
|
|
+ # 循环处理单元格,一次获取需要的
|
|
|
|
+ one_head_index_list = []
|
|
|
|
+ zero_head_index_list = []
|
|
|
|
+ all_head_index_list = []
|
|
|
|
+ for i in range(len(inner_table)):
|
|
|
|
+ head_cnt = 0
|
|
|
|
+ for j in range(len(inner_table[i])):
|
|
|
|
+ # 删除前后逗号
|
|
|
|
+ inner_table[i][j][0] = re.sub('^[,,]+', '', inner_table[i][j][0])
|
|
|
|
+ inner_table[i][j][0] = re.sub('[,,]+$', '', inner_table[i][j][0])
|
|
|
|
+
|
|
|
|
+ # 统计表头数
|
|
|
|
+ if inner_table[i][j][1] == 1:
|
|
|
|
+ head_cnt += 1
|
|
|
|
+
|
|
|
|
+ # 表头数list
|
|
|
|
+ if head_cnt == 0:
|
|
|
|
+ zero_head_index_list.append(i)
|
|
|
|
+ elif head_cnt == 1:
|
|
|
|
+ one_head_index_list.append(i)
|
|
|
|
+ elif head_cnt == len(inner_table[i]):
|
|
|
|
+ all_head_index_list.append(i)
|
|
|
|
+
|
|
|
|
+ # 修复冒号在文本中间的,不能作为表头;(冒号后面需多个字)
|
|
|
|
+ # 冒号在括号中的除外
|
|
|
|
+ # 冒号在最后的,判断后一个格子是否有重复的文字
|
|
|
|
+ for i in range(len(inner_table)):
|
|
|
|
+ for j in range(len(inner_table[i])):
|
|
|
|
+ _text = inner_table[i][j][0]
|
|
|
|
+ if len(_text) >= 3 and inner_table[i][j][1] == 1:
|
|
|
|
+ match = re.search('[::]', _text)
|
|
|
|
+ if match:
|
|
|
|
+ start_index, end_index = match.span()
|
|
|
|
+ if start_index == 0:
|
|
|
|
+ continue
|
|
|
|
+ if end_index == len(_text):
|
|
|
|
+ if len(inner_table[i]) == 2 and j <= len(inner_table[i]) - 2 and (_text in inner_table[i][j+1][0] or inner_table[i][j+1][0] in _text):
|
|
|
|
+ inner_table[i][j][1] = 0
|
|
|
|
+ inner_table[i][j+1][1] = 0
|
|
|
|
+ else:
|
|
|
|
+ continue
|
|
|
|
+ if re.search('[((]', _text[:start_index]) and re.search('[))]', _text[end_index:]):
|
|
|
|
+ continue
|
|
|
|
+
|
|
|
|
+ m1 = re.search('[\u4e00-\u9fa50-9a-zA-Z]', _text[:start_index])
|
|
|
|
+ m2 = re.search('[\u4e00-\u9fa50-9a-zA-Z]', _text[end_index:])
|
|
|
|
+ if m1 and m2 and (len(m2.group()) >= 2 or m2.group() in ['是', '否']):
|
|
|
|
+ inner_table[i][j][1] = 0
|
|
|
|
+
|
|
|
|
+ if show:
|
|
|
|
+ print('inner_table[i]1', inner_table[row_no])
|
|
|
|
+
|
|
|
|
+ # 修复实际只有几列,但有一列由于重复占了太多行表头识别错误
|
|
|
|
+ # for i in range(len(inner_table)):
|
|
|
|
+ # head_flag_dict = {}
|
|
|
|
+ # for j in range(len(inner_table[i])):
|
|
|
|
+ # if inner_table[i][j][0] in head_flag_dict.keys():
|
|
|
|
+ # head_flag_dict[inner_table[i][j][0]] += [inner_table[i][j][1]]
|
|
|
|
+ # else:
|
|
|
|
+ # head_flag_dict[inner_table[i][j][0]] = [inner_table[i][j][1]]
|
|
|
|
+ #
|
|
|
|
+ # if len(head_flag_dict.keys()) == 2:
|
|
|
|
+ # col_flag = None
|
|
|
|
+ # col_value = None
|
|
|
|
+ # for key in head_flag_dict.keys():
|
|
|
|
+ # flag_list = head_flag_dict[key]
|
|
|
|
+ # if len(flag_list) >= 4 and len(set(flag_list)) == 2 and len(set(flag_list[1:])) == 1:
|
|
|
|
+ # col_flag = flag_list[0]
|
|
|
|
+ # col_value = key
|
|
|
|
+ # break
|
|
|
|
+ #
|
|
|
|
+ # if col_flag is not None:
|
|
|
|
+ # for j in range(len(inner_table[i])):
|
|
|
|
+ # if inner_table[i][j][0] == col_value:
|
|
|
|
+ # inner_table[i][j][1] = col_flag
|
|
|
|
+
|
|
|
|
+ # 多个重复列的预测值不同,以第一个为准
|
|
|
|
+ for i in range(len(inner_table)):
|
|
|
|
+ col = inner_table[i][0]
|
|
|
|
+ for j in range(len(inner_table[i])):
|
|
|
|
+ if inner_table[i][j][0] == col[0]:
|
|
|
|
+ if inner_table[i][j][1] != col[1]:
|
|
|
|
+ inner_table[i][j][1] = col[1]
|
|
|
|
+ else:
|
|
|
|
+ col = inner_table[i][j]
|
|
|
|
+
|
|
|
|
+ if show:
|
|
|
|
+ print('inner_table[i]2', inner_table[row_no])
|
|
|
|
+
|
|
|
|
+ # 修复多个重复的单元格表头不一致
|
|
|
|
+ # for i in range(len(inner_table)):
|
|
|
|
+ # for j in range(len(inner_table[i])-1):
|
|
|
|
+ # only_chinese1 = ''.join(re.findall('[\u4e00-\u9fa5]+', inner_table[i][j][0]))
|
|
|
|
+ # only_chinese2 = ''.join(re.findall('[\u4e00-\u9fa5]+', inner_table[i][j+1][0]))
|
|
|
|
+ # if only_chinese1 == only_chinese2 and inner_table[i][j][1] != inner_table[i][j+1][1]:
|
|
|
|
+ # inner_table[i][j][1] = 1
|
|
|
|
+ # inner_table[i][j+1][1] = 1
|
|
|
|
+
|
|
|
|
+ # if show:
|
|
|
|
+ # print('inner_table[i]3', inner_table[row_no])
|
|
|
|
+
|
|
|
|
+ # # 修复一行几乎都是表头,个别不是;或者一行几乎都是非表头,个别是
|
|
|
|
+ # for i in range(len(inner_table)):
|
|
|
|
+ # head_dict = {}
|
|
|
|
+ # not_head_dict = {}
|
|
|
|
+ # for j in range(len(inner_table[i])):
|
|
|
|
+ # if inner_table[i][j][1] == 1:
|
|
|
|
+ # if inner_table[i][j][0] not in head_dict:
|
|
|
|
+ # head_dict[inner_table[i][j][0]] = 1
|
|
|
|
+ # else:
|
|
|
|
+ # if inner_table[i][j][0] not in not_head_dict:
|
|
|
|
+ # not_head_dict[inner_table[i][j][0]] = 1
|
|
|
|
+ #
|
|
|
|
+ # # 非表头:表头 <= 1:3
|
|
|
|
+ # # if len(head_dict.keys()) > 0 and len(not_head_dict.keys()) / len(head_dict.keys()) <= 1/3 and len(head_dict.keys()) >= 3:
|
|
|
|
+ # # for j in range(len(inner_table[i])):
|
|
|
|
+ # # if len(re.sub(' ', '', inner_table[i][j][0])) > 0:
|
|
|
|
+ # # inner_table[i][j][1] = 1
|
|
|
|
+ #
|
|
|
|
+ # # 表头数一个且非表头数大于2且上一行都是表头
|
|
|
|
+ # if i > 0 and len(head_dict.keys()) == 1 and len(not_head_dict.keys()) >= 2 and inner_table[i][0][1] == 0:
|
|
|
|
+ # last_row = inner_table[i-1]
|
|
|
|
+ # col_list = []
|
|
|
|
+ # for j in range(len(last_row)):
|
|
|
|
+ # if len(re.sub(' ', '', last_row[j][0])) > 0:
|
|
|
|
+ # if last_row[j][1] == 0:
|
|
|
|
+ # col_list = []
|
|
|
|
+ # break
|
|
|
|
+ # col_list.append(last_row[j][0])
|
|
|
|
+ # if col_list:
|
|
|
|
+ # col_list = list(set(col_list))
|
|
|
|
+ # if len(col_list) > 2:
|
|
|
|
+ # for j in range(len(inner_table[i])):
|
|
|
|
+ # if inner_table[i][j][1] == 1:
|
|
|
|
+ # inner_table[i][j][1] = 0
|
|
|
|
+
|
|
|
|
+ # 一整个大表格,第一行为表头,下面行中有个别格子被识别为表头
|
|
|
|
+ # 候选人后面修复
|
|
|
|
+ for index in one_head_index_list:
|
|
|
|
+ if (index - 1 in zero_head_index_list and index - 2 in zero_head_index_list) \
|
|
|
|
+ or (index - 1 in zero_head_index_list and index - 2 in all_head_index_list) \
|
|
|
|
+ or (index - 1 in all_head_index_list):
|
|
|
|
+ for j in range(len(inner_table[index])):
|
|
|
|
+ inner_table[index][j][1] = 0
|
|
|
|
+ zero_head_index_list.append(index)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ if show:
|
|
|
|
+ print('inner_table[i]4', inner_table[row_no])
|
|
|
|
+
|
|
|
|
+ # 修复第一第二第三中标候选人作为表头
|
|
|
|
+ first_tenderer = ['第一中标候选人', '第一中标人', '第一中标(成交)人', '第一候选人']
|
|
|
|
+ second_tenderer = ['第二中标候选人', '第二中标(成交)候选人', '第二候选人']
|
|
|
|
+ third_tenderer = ['第三中标候选人', '第三中标(成交)候选人', '第三候选人']
|
|
|
|
+ # n1 next one, n2 next two, l1 last one, l2 last two
|
|
|
|
+ for i in range(len(inner_table)):
|
|
|
|
+ row = inner_table[i]
|
|
|
|
+ n1_row, n2_row = None, None
|
|
|
|
+ if i+1 < len(inner_table):
|
|
|
|
+ n1_row = inner_table[i+1]
|
|
|
|
+ if i+2 < len(inner_table):
|
|
|
|
+ n2_row = inner_table[i+2]
|
|
|
|
+ for j in range(len(row)):
|
|
|
|
+ row_col = row[j]
|
|
|
|
+ n1_row_col, n2_row_col = None, None
|
|
|
|
+ row_n1_col, row_n2_col = None, None
|
|
|
|
+ n1_row_n1_col, n2_row_n1_col, n1_row_n2_col = None, None, None
|
|
|
|
+ if n1_row:
|
|
|
|
+ n1_row_col = n1_row[j]
|
|
|
|
+ if n2_row:
|
|
|
|
+ n2_row_col = n2_row[j]
|
|
|
|
+ if j+1 < len(row):
|
|
|
|
+ row_n1_col = row[j+1]
|
|
|
|
+ if j+2 < len(row):
|
|
|
|
+ row_n2_col = row[j+2]
|
|
|
|
+ if n1_row and j+1 < len(n1_row):
|
|
|
|
+ n1_row_n1_col = n1_row[j+1]
|
|
|
|
+ if n2_row and j+1 < len(n2_row):
|
|
|
|
+ n2_row_n1_col = n2_row[j+1]
|
|
|
|
+ if n1_row and j+2 < len(n1_row):
|
|
|
|
+ n1_row_n2_col = n1_row[j+2]
|
|
|
|
+
|
|
|
|
+ # 连续作为行表头
|
|
|
|
+ if row_col[0] in first_tenderer and row_n1_col and row_n1_col[1] == 0:
|
|
|
|
+ if n1_row_col and n1_row_col[0] in second_tenderer and n1_row_n1_col and n1_row_n1_col[1] == 0:
|
|
|
|
+ inner_table[i][j][1] = 1
|
|
|
|
+ inner_table[i+1][j][1] = 1
|
|
|
|
+ if n2_row_col and n2_row_col[0] in third_tenderer and n2_row_n1_col and n2_row_n1_col[1] == 0:
|
|
|
|
+ inner_table[i+2][j][1] = 1
|
|
|
|
+
|
|
|
|
+ # 连续作为列表头
|
|
|
|
+ if row_col[0] in first_tenderer and n1_row_col and n1_row_col[1] == 0:
|
|
|
|
+ if row_n1_col and row_n1_col[0] in second_tenderer and n1_row_n1_col and n1_row_n1_col[1] == 0:
|
|
inner_table[i][j][1] = 1
|
|
inner_table[i][j][1] = 1
|
|
inner_table[i][j+1][1] = 1
|
|
inner_table[i][j+1][1] = 1
|
|
|
|
+ if row_n2_col and row_n2_col[0] in third_tenderer and n1_row_n2_col and n1_row_n2_col[1] == 0:
|
|
inner_table[i][j+2][1] = 1
|
|
inner_table[i][j+2][1] = 1
|
|
- break
|
|
|
|
|
|
+
|
|
|
|
+ if show:
|
|
|
|
+ print('inner_table[i]5', inner_table[row_no])
|
|
|
|
+
|
|
|
|
+ # 修复表头关键词未作为表头
|
|
|
|
+ # 文本匹配关键词,直接作为表头
|
|
|
|
+ head_keyword = ['供应商', '总价']
|
|
|
|
+ # 末尾匹配关键词且前一列为表头且与前一列文本不同,直接不做表头
|
|
|
|
+ head_keyword2 = ['管理中心', '有限公司', '项目采购', ]
|
|
|
|
+ # 开头匹配关键词,直接不做表头
|
|
|
|
+ head_keyword3 = ['详见', '选定', '咨询服务', '标准物资', '电汇', '承兑']
|
|
|
|
+ # 文本匹配关键词且前一列为表头,直接作为表头
|
|
|
|
+ head_keyword4 = ['综合排名']
|
|
|
|
+ # 文本在关键词中,直接不做表头
|
|
|
|
+ head_keyword5 = ['殡葬用地']
|
|
|
|
+
|
|
|
|
+ # n1 next one, n2 next two, l1 last one, l2 last two
|
|
|
|
+ for i in range(len(inner_table)):
|
|
|
|
+ row = inner_table[i]
|
|
|
|
+ for j in range(len(row)):
|
|
|
|
+ row_col = row[j]
|
|
|
|
+ row_l1_col = None
|
|
|
|
+ if j-1 > 0:
|
|
|
|
+ row_l1_col = row[j-1]
|
|
|
|
+
|
|
|
|
+ match = re.search('[\u4e00-\u9fa50-9a-zA-Z::]+', row_col[0])
|
|
|
|
+ if inner_table[i][j][1] == 0 and match and match.group() in head_keyword:
|
|
|
|
+ inner_table[i][j][1] = 1
|
|
|
|
+ for key in head_keyword2:
|
|
|
|
+ match = re.search(key+'$', row_col[0])
|
|
|
|
+ if j > 0 and row_l1_col and row_l1_col[1] == 1 and row_l1_col[0] != row_col[0] and match and row_col[1] == 1:
|
|
|
|
+ inner_table[i][j][1] = 0
|
|
|
|
+ for key in head_keyword3:
|
|
|
|
+ match = re.search('^'+key, row_col[0])
|
|
|
|
+ if match and row_col[1] == 1:
|
|
|
|
+ inner_table[i][j][1] = 0
|
|
|
|
+ for key in head_keyword4:
|
|
|
|
+ match = re.search(key, row_col[0])
|
|
|
|
+ if j > 0 and row_l1_col and row_l1_col[1] == 1 and match and row_col[1] == 0:
|
|
|
|
+ inner_table[i][j][1] = 1
|
|
|
|
+ if row_col[0] in head_keyword5:
|
|
|
|
+ inner_table[i][j][1] = 0
|
|
|
|
+
|
|
|
|
+ if show:
|
|
|
|
+ print('inner_table[i]6', inner_table[row_no])
|
|
|
|
|
|
# 修复姓名被作为表头 # 2023-02-10 取消修复,避免项目名称、编号,单位、单价等作为了非表头
|
|
# 修复姓名被作为表头 # 2023-02-10 取消修复,避免项目名称、编号,单位、单价等作为了非表头
|
|
# surname = [
|
|
# surname = [
|
|
@@ -358,6 +587,7 @@ def tableToText(soup):
|
|
# and (inner_table[i][j][0][0] in surname or inner_table[i][j][0][:2] in surname) \
|
|
# and (inner_table[i][j][0][0] in surname or inner_table[i][j][0][:2] in surname) \
|
|
# and re.search("[^\u4e00-\u9fa5]", inner_table[i][j][0]) is None:
|
|
# and re.search("[^\u4e00-\u9fa5]", inner_table[i][j][0]) is None:
|
|
# inner_table[i][j][1] = 0
|
|
# inner_table[i][j][1] = 0
|
|
|
|
+
|
|
return inner_table
|
|
return inner_table
|
|
|
|
|
|
def sliceTable(inner_table,fix_value="~~"):
|
|
def sliceTable(inner_table,fix_value="~~"):
|
|
@@ -456,7 +686,7 @@ def tableToText(soup):
|
|
|
|
|
|
return inner_table,head_list
|
|
return inner_table,head_list
|
|
|
|
|
|
- def set_head_model(inner_table):
|
|
|
|
|
|
+ def set_head_model(inner_table, show=0):
|
|
origin_inner_table = copy.deepcopy(inner_table)
|
|
origin_inner_table = copy.deepcopy(inner_table)
|
|
for i in range(len(inner_table)):
|
|
for i in range(len(inner_table)):
|
|
for j in range(len(inner_table[i])):
|
|
for j in range(len(inner_table[i])):
|
|
@@ -469,16 +699,30 @@ def tableToText(soup):
|
|
# 模型预测表头
|
|
# 模型预测表头
|
|
predict_list = predict(inner_table)
|
|
predict_list = predict(inner_table)
|
|
|
|
|
|
|
|
+ start_time = time.time()
|
|
|
|
+ predict_list = predict(inner_table)
|
|
|
|
+ print('table head predict cost: ', time.time()-start_time)
|
|
|
|
+
|
|
# 组合结果
|
|
# 组合结果
|
|
for i in range(len(inner_table)):
|
|
for i in range(len(inner_table)):
|
|
for j in range(len(inner_table[i])):
|
|
for j in range(len(inner_table[i])):
|
|
inner_table[i][j] = [origin_inner_table[i][j][0], int(predict_list[i][j])]
|
|
inner_table[i][j] = [origin_inner_table[i][j][0], int(predict_list[i][j])]
|
|
|
|
|
|
- # print("table_head before repair", inner_table)
|
|
|
|
|
|
+ if show:
|
|
|
|
+ print("table_head before repair")
|
|
|
|
+ for r in inner_table:
|
|
|
|
+ print('row', r)
|
|
|
|
+ print("="*80)
|
|
|
|
|
|
# 表头修正
|
|
# 表头修正
|
|
- repairTable(inner_table)
|
|
|
|
- inner_table = repair_table2(inner_table)
|
|
|
|
|
|
+ # repairTable(inner_table)
|
|
|
|
+ inner_table = table_head_repair_process(inner_table, docid)
|
|
|
|
+
|
|
|
|
+ if show:
|
|
|
|
+ print("="*80)
|
|
|
|
+ print("table_head after repair")
|
|
|
|
+ for r in inner_table:
|
|
|
|
+ print('row', r)
|
|
|
|
|
|
# 按表头分割表格
|
|
# 按表头分割表格
|
|
head_list = sliceTable(inner_table)
|
|
head_list = sliceTable(inner_table)
|
|
@@ -1139,6 +1383,7 @@ def tableToText(soup):
|
|
if 'class' in _part.attrs and "richTextFetch" in _part['class']:
|
|
if 'class' in _part.attrs and "richTextFetch" in _part['class']:
|
|
in_attachment = True
|
|
in_attachment = True
|
|
#逆序处理嵌套表格
|
|
#逆序处理嵌套表格
|
|
|
|
+ # print('len(tbodies)1', len(tbodies))
|
|
for tbody_index in range(1,len(tbodies)+1):
|
|
for tbody_index in range(1,len(tbodies)+1):
|
|
tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
|
|
tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
|
|
inner_table = trunTable(tbody,_in_attachment)
|
|
inner_table = trunTable(tbody,_in_attachment)
|
|
@@ -1155,6 +1400,7 @@ def tableToText(soup):
|
|
if 'class' in _part.attrs and "richTextFetch" in _part['class']:
|
|
if 'class' in _part.attrs and "richTextFetch" in _part['class']:
|
|
in_attachment = True
|
|
in_attachment = True
|
|
#逆序处理嵌套表格
|
|
#逆序处理嵌套表格
|
|
|
|
+ # print('len(tbodies)2', len(tbodies))
|
|
for tbody_index in range(1,len(tbodies)+1):
|
|
for tbody_index in range(1,len(tbodies)+1):
|
|
tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
|
|
tbody,_in_attachment = tbodies[len(tbodies)-tbody_index]
|
|
inner_table = trunTable(tbody,_in_attachment)
|
|
inner_table = trunTable(tbody,_in_attachment)
|
|
@@ -1163,6 +1409,284 @@ def tableToText(soup):
|
|
return soup
|
|
return soup
|
|
# return list_innerTable
|
|
# return list_innerTable
|
|
|
|
|
|
|
|
+
|
|
|
|
def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0):
    """Apply rule-based repairs to the header flags of a parsed table.

    ``_inner_table`` is a list of rows, each row a list of mutable
    ``[text, flag]`` cells; ``flag == 1`` marks a header cell and
    ``flag == 0`` a non-header cell.  Cells are modified in place and the
    same table object is returned.

    The repairs run in a fixed order: colon rule, keyword rule, tenderer
    rule, duplicate rule, surrounding-rows rule, then the tenderer and
    keyword rules once more.  Whenever a stage changes the table, a line
    containing the stage number and ``docid`` is written through ``log``.

    Punctuation is matched in both its ASCII and full-width form, since
    cell text may contain either.

    @param _inner_table: table to repair (rows may differ in length)
    @param docid: document identifier, only used in log messages
    @param show: truthy to print one row after every stage (debugging)
    @param show_row_index: index of the row printed when ``show`` is set
    @return: the repaired ``_inner_table``
    """
    leading_comma_re = re.compile('^[,\uff0c]+')
    trailing_comma_re = re.compile('[,\uff0c]+$')
    colon_re = re.compile('[:\uff1a]')
    open_paren_re = re.compile('[(\uff08]')
    close_paren_re = re.compile('[)\uff09]')
    word_char_re = re.compile('[\u4e00-\u9fa50-9a-zA-Z]')
    word_run_re = re.compile('[\u4e00-\u9fa50-9a-zA-Z]+')

    def pre_process(inner_table):
        """Strip leading and trailing commas from every cell text."""
        for row in inner_table:
            for cell in row:
                cell[0] = leading_comma_re.sub('', cell[0])
                cell[0] = trailing_comma_re.sub('', cell[0])
        return inner_table

    def repair_by_colon(inner_table):
        """Clear header flags of cells that look like ``key: value`` text.

        Only header cells with at least three characters are inspected and
        only the first colon counts.  A colon at position 0 or enclosed in
        parentheses is ignored.
        """
        for row in inner_table:
            for j, cell in enumerate(row):
                text = cell[0]
                if len(text) < 3 or cell[1] != 1:
                    continue
                match = colon_re.search(text)
                if not match:
                    continue
                start_index, end_index = match.span()
                if start_index == 0:
                    continue
                if end_index == len(text):
                    # Trailing colon: in a two-cell row whose second cell
                    # repeats (or is contained in) this text, neither cell
                    # is a header.
                    if len(row) == 2 and j == 0 \
                            and (text in row[1][0] or row[1][0] in text):
                        cell[1] = 0
                        row[1][1] = 0
                    continue
                if open_paren_re.search(text[:start_index]) \
                        and close_paren_re.search(text[end_index:]):
                    continue

                before = word_char_re.search(text[:start_index])
                # Match the whole run after the colon so the "two or more
                # characters" test can actually succeed; a single-character
                # match made that test unreachable.
                after = word_run_re.search(text[end_index:])
                if before and after \
                        and (len(after.group()) >= 2 or after.group() in ('是', '否')):
                    cell[1] = 0
        return inner_table

    def repair_by_duplicate(inner_table):
        """Make adjacent cells with identical text share one header flag.

        A run that starts at the first cell of the row copies the
        duplicate's flag onto the first cell; a run that starts later copies
        the flag of the run's first cell onto the duplicate.
        """
        for row in inner_table:
            if not row:
                # Nothing to compare in an empty row.
                continue
            col = row[0]
            for j in range(1, len(row)):
                cell = row[j]
                if cell[0] != col[0]:
                    col = cell
                    continue
                if cell[1] == col[1]:
                    continue
                if col != row[0]:
                    cell[1] = col[1]
                else:
                    row[0][1] = cell[1]
                    col = row[0]
        return inner_table

    def repair_by_around(inner_table):
        """Clear a lone header flag inside a row of a larger data table.

        A candidate row has exactly one header cell, at least one non-header
        cell before it and more than two distinct non-blank texts.  It is
        cleared when the rows above it are header-free or entirely headers.
        """
        one_head_rows = []
        zero_head_rows = set()
        all_head_rows = set()
        for i, row in enumerate(inner_table):
            head_cnt = 0
            head_index = None
            distinct_texts = set()
            for j, cell in enumerate(row):
                if cell[1] == 1:
                    head_cnt += 1
                    head_index = j
                if cell[0] not in ('~~', '', ' '):
                    distinct_texts.add(cell[0])
            if head_cnt == 0:
                zero_head_rows.add(i)
            elif head_cnt == 1:
                value_before_head = any(row[k][1] == 0 for k in range(head_index))
                if value_before_head and len(distinct_texts) > 2:
                    one_head_rows.append(i)
            elif head_cnt == len(row):
                all_head_rows.add(i)

        for index in one_head_rows:
            if (index - 1 in zero_head_rows and index - 2 in zero_head_rows) \
                    or (index - 1 in zero_head_rows and index - 2 in all_head_rows) \
                    or (index - 1 in all_head_rows):
                for cell in inner_table[index]:
                    cell[1] = 0
                # A cleared row counts as header-free for the rows below it.
                zero_head_rows.add(index)
        return inner_table

    def repair_by_tenderer(inner_table):
        """Flag first/second/third bid-candidate labels as headers.

        The labels must appear consecutively either down a column (each
        followed by a non-header cell to its right) or along a row (each
        with a non-header cell below it).
        """
        first_tenderer = ('第一中标候选人', '第一中标人', '第一中标(成交)人',
                          '第一中标\uff08成交\uff09人', '第一候选人')
        second_tenderer = ('第二中标候选人', '第二中标(成交)候选人',
                           '第二中标\uff08成交\uff09候选人', '第二候选人')
        third_tenderer = ('第三中标候选人', '第三中标(成交)候选人',
                          '第三中标\uff08成交\uff09候选人', '第三候选人')

        def cell_at(row, col_index):
            # Rows may be shorter than the current one, so every neighbour
            # lookup is bounds-checked; a missing neighbour is None.
            if row and col_index < len(row):
                return row[col_index]
            return None

        row_count = len(inner_table)
        # n1 = next one, n2 = next two (row or column, as named)
        for i, row in enumerate(inner_table):
            n1_row = inner_table[i + 1] if i + 1 < row_count else None
            n2_row = inner_table[i + 2] if i + 2 < row_count else None
            for j, row_col in enumerate(row):
                if row_col[0] not in first_tenderer:
                    continue
                n1_row_col = cell_at(n1_row, j)
                n2_row_col = cell_at(n2_row, j)
                row_n1_col = cell_at(row, j + 1)
                row_n2_col = cell_at(row, j + 2)
                n1_row_n1_col = cell_at(n1_row, j + 1)
                n2_row_n1_col = cell_at(n2_row, j + 1)
                n1_row_n2_col = cell_at(n1_row, j + 2)

                # Labels stacked vertically, values to their right.
                if row_n1_col and row_n1_col[1] == 0:
                    if n1_row_col and n1_row_col[0] in second_tenderer \
                            and n1_row_n1_col and n1_row_n1_col[1] == 0:
                        row_col[1] = 1
                        n1_row_col[1] = 1
                    if n2_row_col and n2_row_col[0] in third_tenderer \
                            and n2_row_n1_col and n2_row_n1_col[1] == 0:
                        n2_row_col[1] = 1

                # Labels side by side, values below them.
                if n1_row_col and n1_row_col[1] == 0:
                    if row_n1_col and row_n1_col[0] in second_tenderer \
                            and n1_row_n1_col and n1_row_n1_col[1] == 0:
                        row_col[1] = 1
                        row_n1_col[1] = 1
                    if row_n2_col and row_n2_col[0] in third_tenderer \
                            and n1_row_n2_col and n1_row_n2_col[1] == 0:
                        row_n2_col[1] = 1
        return inner_table

    def repair_by_keywords(inner_table):
        """Set or clear header flags from fixed keyword lists.

        The rules are applied per cell in the order written below, so a
        later rule can override an earlier one.
        """
        # Text ends with keyword -> header.  Entries are regex fragments.
        head_keyword = ['供应商', '总价', '总价(元)', '总价\uff08元\uff09',
                        r'总价\(元\)', '品目一', '品目二', '品目三']
        # Text ends with keyword, previous cell is a header with different
        # text -> not a header.
        head_keyword2 = ['管理中心', '有限公司', '项目采购', '确定。']
        # Text starts with keyword -> not a header.
        head_keyword3 = ['详见', '选定', '咨询服务', '标准物资', '电汇', '承兑']
        # Text contains keyword and previous cell is a header -> header.
        head_keyword4 = ['综合排名']
        # Text equals keyword -> not a header.
        head_keyword5 = ['殡葬用地']
        # Text contains keyword -> not a header.
        head_keyword6 = ['市场行情', '有限公司']

        ends_as_head = [re.compile(key + '$') for key in head_keyword]
        ends_as_value = [re.compile(key + '$') for key in head_keyword2]
        starts_as_value = [re.compile('^' + key) for key in head_keyword3]
        contains_as_head = [re.compile(key) for key in head_keyword4]
        contains_as_value = [re.compile(key) for key in head_keyword6]

        def hits(patterns, text):
            return any(pattern.search(text) for pattern in patterns)

        for row in inner_table:
            for j, cell in enumerate(row):
                text = cell[0]
                previous = row[j - 1] if j > 0 else None
                after_head = bool(previous) and previous[1] == 1

                if hits(ends_as_head, text):
                    cell[1] = 1
                if after_head and cell[1] == 1 and previous[0] != text \
                        and hits(ends_as_value, text):
                    cell[1] = 0
                if cell[1] == 1 and hits(starts_as_value, text):
                    cell[1] = 0
                if after_head and cell[1] == 0 and hits(contains_as_head, text):
                    cell[1] = 1
                if text in head_keyword5:
                    cell[1] = 0
                if hits(contains_as_value, text):
                    cell[1] = 0
        return inner_table

    def show_row(step_no, table):
        # Debug output only; skipped when the requested row does not exist.
        if show and -len(table) <= show_row_index < len(table):
            print('table_head_repair_process%d' % step_no, show_row_index,
                  table[show_row_index])

    _inner_table = pre_process(_inner_table)
    compare_inner_table = copy.deepcopy(_inner_table)
    show_row(1, _inner_table)

    # The tenderer and keyword rules run a second time at the end so they
    # have the final say over the duplicate and surrounding-rows rules.
    stages = (
        repair_by_colon,
        repair_by_keywords,
        repair_by_tenderer,
        repair_by_duplicate,
        repair_by_around,
        repair_by_tenderer,
        repair_by_keywords,
    )
    for step_no, repair in enumerate(stages, start=2):
        _inner_table = repair(_inner_table)
        if _inner_table != compare_inner_table:
            compare_inner_table = copy.deepcopy(_inner_table)
            log('table_head repair%d ' % step_no + str(docid))
        show_row(step_no, _inner_table)

    return _inner_table
|
|
|
|
+
|
|
|
|
+
|
|
re_num = re.compile("[二三四五六七八九]十[一二三四五六七八九]?|十[一二三四五六七八九]|[一二三四五六七八九十]")
|
|
re_num = re.compile("[二三四五六七八九]十[一二三四五六七八九]?|十[一二三四五六七八九]|[一二三四五六七八九十]")
|
|
num_dict = {
|
|
num_dict = {
|
|
"一": 1, "二": 2,
|
|
"一": 1, "二": 2,
|
|
@@ -2191,7 +2715,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
|
|
|
|
|
|
article_processed = get_preprocessed_outline(article_processed)
|
|
article_processed = get_preprocessed_outline(article_processed)
|
|
# print('article_processed')
|
|
# print('article_processed')
|
|
- article_processed = tableToText(article_processed)
|
|
|
|
|
|
+ article_processed = tableToText(article_processed, doc_id)
|
|
article_processed = segment(article_processed)
|
|
article_processed = segment(article_processed)
|
|
|
|
|
|
article_processed = article_processed.replace('(', '(').replace(')', ')') #2022/8/10 统一为中文括号
|
|
article_processed = article_processed.replace('(', '(').replace(')', ')') #2022/8/10 统一为中文括号
|