|
@@ -698,7 +698,7 @@ def tableToText(soup, docid=None):
|
|
|
inner_table[i][j] = col
|
|
|
|
|
|
# 模型预测表头
|
|
|
- predict_list = predict(inner_table)
|
|
|
+ # predict_list = predict(inner_table)
|
|
|
|
|
|
start_time = time.time()
|
|
|
predict_list = predict(inner_table)
|
|
@@ -719,6 +719,11 @@ def tableToText(soup, docid=None):
|
|
|
# repairTable(inner_table)
|
|
|
inner_table = table_head_repair_process(inner_table, docid)
|
|
|
|
|
|
+ # 组合结果
|
|
|
+ for i in range(len(inner_table)):
|
|
|
+ for j in range(len(inner_table[i])):
|
|
|
+ inner_table[i][j] = [origin_inner_table[i][j][0], int(inner_table[i][j][1])]
|
|
|
+
|
|
|
if show:
|
|
|
print("="*80)
|
|
|
print("table_head after repair")
|
|
@@ -1420,8 +1425,9 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
|
|
|
for i in range(len(inner_table)):
|
|
|
for j in range(len(inner_table[i])):
|
|
|
# 删除前后逗号
|
|
|
- inner_table[i][j][0] = re.sub('^[,,]+', '', inner_table[i][j][0])
|
|
|
- inner_table[i][j][0] = re.sub('[,,]+$', '', inner_table[i][j][0])
|
|
|
+ inner_table[i][j][0] = re.sub('^[,, ]+', '', inner_table[i][j][0])
|
|
|
+ inner_table[i][j][0] = re.sub('[,, ]+$', '', inner_table[i][j][0])
|
|
|
+ inner_table[i][j][0] = re.sub('[, ]+', '', inner_table[i][j][0])
|
|
|
return inner_table
|
|
|
|
|
|
def repair_by_colon(inner_table):
|
|
@@ -1442,7 +1448,7 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
|
|
|
if start_index == 0:
|
|
|
continue
|
|
|
if end_index == len(_text):
|
|
|
- if len(inner_table[i]) == 2 and j <= len(inner_table[i]) - 2 and (_text in inner_table[i][j+1][0] or inner_table[i][j+1][0] in _text):
|
|
|
+ if len(inner_table[i]) == 2 and j <= len(inner_table[i]) - 2 and inner_table[i][j+1][0] and (_text in inner_table[i][j+1][0] or inner_table[i][j+1][0] in _text):
|
|
|
inner_table[i][j][1] = 0
|
|
|
inner_table[i][j+1][1] = 0
|
|
|
else:
|
|
@@ -1461,19 +1467,121 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
|
|
|
"""
|
|
|
根据列重复修复当前格子的表头值
|
|
|
"""
|
|
|
+ # 统计每个值的表头情况
|
|
|
+ col_head_dict = {}
|
|
|
+ for i in range(len(inner_table)):
|
|
|
+ for j in range(len(inner_table[i])):
|
|
|
+ col = inner_table[i][j]
|
|
|
+ if col[0] in col_head_dict.keys():
|
|
|
+ col_head_dict[col[0]] += [col[1]]
|
|
|
+ else:
|
|
|
+ col_head_dict[col[0]] = [col[1]]
|
|
|
+
|
|
|
# 多个重复列的预测值不同,以第一个为准
|
|
|
for i in range(len(inner_table)):
|
|
|
col = inner_table[i][0]
|
|
|
- for j in range(1, len(inner_table[i])):
|
|
|
+ key = col[0] + '\t' + str(0)
|
|
|
+ dup_dict = {}
|
|
|
+ for j in range(len(inner_table[i])):
|
|
|
if inner_table[i][j][0] == col[0]:
|
|
|
- if inner_table[i][j][1] != col[1]:
|
|
|
- if col != inner_table[i][0]:
|
|
|
- inner_table[i][j][1] = col[1]
|
|
|
- else:
|
|
|
- inner_table[i][0][1] = inner_table[i][j][1]
|
|
|
- col = inner_table[i][0]
|
|
|
+ if key in dup_dict.keys():
|
|
|
+ dup_dict[key] += [j]
|
|
|
+ else:
|
|
|
+ dup_dict[key] = [j]
|
|
|
+
|
|
|
+ # if inner_table[i][j][1] != col[1]:
|
|
|
+ # if col != inner_table[i][0]:
|
|
|
+ # inner_table[i][j][1] = col[1]
|
|
|
+ # else:
|
|
|
+ # inner_table[i][0][1] = inner_table[i][j][1]
|
|
|
+ # col = inner_table[i][0]
|
|
|
else:
|
|
|
col = inner_table[i][j]
|
|
|
+ key = col[0] + '\t' + str(j)
|
|
|
+ dup_dict[key] = [j]
|
|
|
+
|
|
|
+ # print('dup_dict', dup_dict)
|
|
|
+ #
|
|
|
+ for key in dup_dict.keys():
|
|
|
+ index_list = dup_dict.get(key)
|
|
|
+ if len(index_list) <= 1:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 需要表头不同
|
|
|
+ table_head_list = []
|
|
|
+ for index in index_list:
|
|
|
+ table_head_list.append(inner_table[i][index][1])
|
|
|
+ table_head_list = list(set(table_head_list))
|
|
|
+ if len(table_head_list) <= 1:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 若是职业特殊处理
|
|
|
+ col = key.split('\t')[0]
|
|
|
+ if re.search('([^人]员|工程师|建造师|经理|安全负责人|技术负责人|合同商务负责人)$', col):
|
|
|
+ table_head_flag = 0
|
|
|
+ # 看是否包含表头
|
|
|
+ else:
|
|
|
+ table_head_flag = 0
|
|
|
+ for index in index_list:
|
|
|
+ if inner_table[i][index][1] == 1:
|
|
|
+ table_head_flag = 1
|
|
|
+ break
|
|
|
+
|
|
|
+ table_head = None
|
|
|
+ if index_list[0] > 0 and index_list[-1] == len(inner_table[i]) - 1:
|
|
|
+ table_head = inner_table[i][index_list[0]][1]
|
|
|
+ elif index_list[0] > 0 and index_list[-1] != len(inner_table[i]) - 1:
|
|
|
+ # 查看前面是否有表头-非表头表达
|
|
|
+ is_head_not_head = 0
|
|
|
+ for k in range(index_list[0]):
|
|
|
+ if k+1 < index_list[0] and inner_table[i][k][1] == 1 and inner_table[i][k+1][1] == 0:
|
|
|
+ is_head_not_head = 1
|
|
|
+ break
|
|
|
+ # 查看前后有没有表头
|
|
|
+ start_has_head = 0
|
|
|
+ end_has_head = 0
|
|
|
+ if not is_head_not_head:
|
|
|
+ for k in range(index_list[0], len(inner_table[i])):
|
|
|
+ if inner_table[i][k][0] == inner_table[i][index_list[0]][0]:
|
|
|
+ continue
|
|
|
+ if inner_table[i][k][1] == 1:
|
|
|
+ end_has_head = 1
|
|
|
+ break
|
|
|
+ for k in range(index_list[0]):
|
|
|
+ if inner_table[i][k][0] == inner_table[i][index_list[0]][0]:
|
|
|
+ continue
|
|
|
+ if inner_table[i][k][1] == 1:
|
|
|
+ start_has_head = 1
|
|
|
+ break
|
|
|
+
|
|
|
+ head_list = col_head_dict.get(inner_table[i][index_list[0]][0])
|
|
|
+ if is_head_not_head:
|
|
|
+ table_head = table_head_flag
|
|
|
+ elif len(head_list) >= 4:
|
|
|
+ if head_list.count(0) > head_list.count(1):
|
|
|
+ table_head = 0
|
|
|
+ else:
|
|
|
+ table_head = 1
|
|
|
+ elif not start_has_head and not end_has_head:
|
|
|
+ table_head = 0
|
|
|
+ else:
|
|
|
+ table_head = table_head_flag
|
|
|
+ elif index_list[0] == 0 and index_list[-1] == len(inner_table[i]) - 1:
|
|
|
+ table_head = table_head_flag
|
|
|
+ elif index_list[0] == 0 and index_list[-1] != len(inner_table[i]) - 1:
|
|
|
+ head_list = col_head_dict.get(inner_table[i][index_list[0]][0])
|
|
|
+ if len(head_list) >= 4:
|
|
|
+ if head_list.count(0) > head_list.count(1):
|
|
|
+ table_head = 0
|
|
|
+ else:
|
|
|
+ table_head = 1
|
|
|
+ else:
|
|
|
+ table_head = table_head_flag
|
|
|
+
|
|
|
+ if table_head is not None:
|
|
|
+ for index in index_list:
|
|
|
+ inner_table[i][index][1] = table_head
|
|
|
+
|
|
|
return inner_table
|
|
|
|
|
|
def repair_by_around(inner_table):
|
|
@@ -1581,18 +1689,24 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
|
|
|
根据关键词修复当前格子的表头值
|
|
|
"""
|
|
|
# 修复表头关键词未作为表头
|
|
|
- # 末尾匹配匹配关键词,直接作为表头
|
|
|
+ # 末尾匹配匹配关键词且字数小于7,直接作为表头
|
|
|
head_keyword = ['供应商', '总价', '总价(元)', '总价\(元\)', '品目一', '品目二', '品目三']
|
|
|
# 末尾匹配关键词且前一列为表头且与前一列文本不同,直接不做表头
|
|
|
- head_keyword2 = ['管理中心', '有限公司', '项目采购', '确定。']
|
|
|
+ head_keyword2 = ['管理中心', '有限公司', '项目采购', '确定。', ]
|
|
|
# 开头匹配关键词,直接不做表头
|
|
|
- head_keyword3 = ['详见', '选定', '咨询服务', '标准物资', '电汇', '承兑']
|
|
|
+ head_keyword3 = ['详见', '选定', '咨询服务', '标准物资', '电汇', '承兑', '低档', '高档',
|
|
|
+ '更换配置']
|
|
|
# 文本匹配关键词且前一列为表头,直接作为表头
|
|
|
head_keyword4 = ['综合排名']
|
|
|
# 文本在关键词中,直接不做表头
|
|
|
- head_keyword5 = ['殡葬用地']
|
|
|
+ head_keyword5 = ['殡葬用地', '电脑包', '电池']
|
|
|
# 文本匹配关键词,直接不作表头
|
|
|
- head_keyword6 = ['市场行情', '有限公司']
|
|
|
+ head_keyword6 = ['市场行情', '有限公司', '能提供']
|
|
|
+ # 末尾匹配关键词,直接不做表头
|
|
|
+ head_keyword7 = ['基金', '结转', '结余', '税', '结余分配', '协议供货', '房屋',
|
|
|
+ '纳税人', '自然人', '计算所得额']
|
|
|
+ # 文本匹配关键词且整行都是表头,直接做表头
|
|
|
+ head_keyword8 = ['备注']
|
|
|
|
|
|
# n1 next one, n2 next two, l1 last one, l2 last two
|
|
|
for i in range(len(inner_table)):
|
|
@@ -1608,7 +1722,7 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
|
|
|
# inner_table[i][j][1] = 1
|
|
|
for key in head_keyword:
|
|
|
match = re.search(key+'$', row_col[0])
|
|
|
- if match:
|
|
|
+ if match and len(inner_table[i][j][0]) <= 6:
|
|
|
inner_table[i][j][1] = 1
|
|
|
for key in head_keyword2:
|
|
|
match = re.search(key+'$', row_col[0])
|
|
@@ -1628,9 +1742,34 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
|
|
|
match = re.search(key, row_col[0])
|
|
|
if match:
|
|
|
inner_table[i][j][1] = 0
|
|
|
+ for key in head_keyword7:
|
|
|
+ match = re.search(key+'$', row_col[0])
|
|
|
+ if match and row_col[1] == 1:
|
|
|
+ inner_table[i][j][1] = 0
|
|
|
+ if row_col[0] in head_keyword8 and row_col[1] == 0:
|
|
|
+ print('head_keyword8 row_col0', row_col[0])
|
|
|
+ all_head_flag = 1
|
|
|
+ for k in range(len(row)):
|
|
|
+ if row[k][0] in ['', row_col[0]]:
|
|
|
+ continue
|
|
|
+ if row[k][1] == 0:
|
|
|
+ print('row[k]', row[k])
|
|
|
+ all_head_flag = 0
|
|
|
+ break
|
|
|
+ print('all_head_flag', all_head_flag)
|
|
|
+ if all_head_flag:
|
|
|
+ inner_table[i][j][1] = 1
|
|
|
+
|
|
|
|
|
|
return inner_table
|
|
|
|
|
|
def repair_by_length(inner_table, length_threshold=30):
    """Clear the header flag of every cell whose text is long.

    Each cell is expected to be a mutable two-item sequence: the cell text
    at index 0 and an integer flag at index 1 (elsewhere in this file the
    value 1 is treated as "is a table header"). Any cell whose text holds
    at least ``length_threshold`` characters gets its flag set to 0.

    Args:
        inner_table: List of rows, each row a list of ``[text, flag]`` cells.
            The cells are modified in place.
        length_threshold: Minimum text length at which the flag is cleared.
            Defaults to 30, the cut-off that used to be hard-coded.

    Returns:
        The same ``inner_table`` object that was passed in.
    """
    for row in inner_table:
        for cell in row:
            # The comparison is inclusive: a text of exactly
            # ``length_threshold`` characters is also cleared.
            if len(cell[0]) >= length_threshold:
                cell[1] = 0
    return inner_table
|
|
|
+
|
|
|
_inner_table = pre_process(_inner_table)
|
|
|
compare_inner_table = copy.deepcopy(_inner_table)
|
|
|
if show:
|
|
@@ -1685,6 +1824,13 @@ def table_head_repair_process(_inner_table, docid=None, show=0, show_row_index=0
|
|
|
if show:
|
|
|
print('table_head_repair_process8', show_row_index, _inner_table[show_row_index])
|
|
|
|
|
|
+ _inner_table = repair_by_length(_inner_table)
|
|
|
+ if _inner_table != compare_inner_table:
|
|
|
+ compare_inner_table = copy.deepcopy(_inner_table)
|
|
|
+ print('table_head repair9 ' + str(docid))
|
|
|
+ if show:
|
|
|
+ print('table_head_repair_process9', show_row_index, _inner_table[show_row_index])
|
|
|
+
|
|
|
return _inner_table
|
|
|
|
|
|
|