|
@@ -166,10 +166,10 @@ def tableToText(soup):
|
|
inner_table[i][j][0] = inner_table[i-1][j][0]
|
|
inner_table[i][j][0] = inner_table[i-1][j][0]
|
|
inner_table[i][j][1] = inner_table[i-1][j][1]
|
|
inner_table[i][j][1] = inner_table[i-1][j][1]
|
|
|
|
|
|
- def repairTable(inner_table,dye_set = set(),key_set = set(),fix_value="~~"):
|
|
|
|
- '''
|
|
|
|
|
|
+ def repairTable(inner_table, dye_set=set(), key_set=set(), fix_value="~~"):
|
|
|
|
+ """
|
|
@summary: 修复表头识别,将明显错误的进行修正
|
|
@summary: 修复表头识别,将明显错误的进行修正
|
|
- '''
|
|
|
|
|
|
+ """
|
|
def repairNeeded(line):
|
|
def repairNeeded(line):
|
|
first_1 = -1
|
|
first_1 = -1
|
|
last_1 = -1
|
|
last_1 = -1
|
|
@@ -192,22 +192,22 @@ def tableToText(soup):
|
|
count_0 += 1
|
|
count_0 += 1
|
|
if first_1 ==-1 or last_0 == -1:
|
|
if first_1 ==-1 or last_0 == -1:
|
|
return False
|
|
return False
|
|
- #异常情况:第一个不是表头;最后一个是表头;表头个数远大于属性值个数
|
|
|
|
- if first_1-0>0 or last_0-len(line)+1<0 or last_1==len(line)-1 or count_1-count_0>=3:
|
|
|
|
|
|
+ # 异常情况:第一个不是表头;最后一个是表头;表头个数远大于属性值个数
|
|
|
|
+ if first_1-0 > 0 or last_0-len(line)+1 < 0 or last_1 == len(line)-1 or count_1-count_0 >= 3:
|
|
return True
|
|
return True
|
|
return False
|
|
return False
|
|
|
|
|
|
- def getsimilarity(line,line1):
|
|
|
|
|
|
+ def getsimilarity(line, line1):
|
|
same_count = 0
|
|
same_count = 0
|
|
- for item,item1 in zip(line,line1):
|
|
|
|
- if item[1]==item1[1]:
|
|
|
|
|
|
+ for item, item1 in zip(line,line1):
|
|
|
|
+ if item[1] == item1[1]:
|
|
same_count += 1
|
|
same_count += 1
|
|
return same_count/len(line)
|
|
return same_count/len(line)
|
|
|
|
|
|
def selfrepair(inner_table,index,dye_set,key_set):
|
|
def selfrepair(inner_table,index,dye_set,key_set):
|
|
- '''
|
|
|
|
|
|
+ """
|
|
@summary: 计算每个节点受到的挤压度来判断是否需要染色
|
|
@summary: 计算每个节点受到的挤压度来判断是否需要染色
|
|
- '''
|
|
|
|
|
|
+ """
|
|
#print("B",inner_table[index])
|
|
#print("B",inner_table[index])
|
|
min_presure = 3
|
|
min_presure = 3
|
|
list_dye = []
|
|
list_dye = []
|
|
@@ -239,7 +239,7 @@ def tableToText(soup):
|
|
for i in range(len(list_dye)):
|
|
for i in range(len(list_dye)):
|
|
end = list_dye[i][2]
|
|
end = list_dye[i][2]
|
|
dye_flag = False
|
|
dye_flag = False
|
|
- #首尾要求压力减一
|
|
|
|
|
|
+ # 首尾要求压力减一
|
|
if i==0:
|
|
if i==0:
|
|
if list_dye[i+1][1]-list_dye[i][1]+1>=min_presure-1:
|
|
if list_dye[i+1][1]-list_dye[i][1]+1>=min_presure-1:
|
|
dye_flag = True
|
|
dye_flag = True
|
|
@@ -271,9 +271,6 @@ def tableToText(soup):
|
|
begin = end
|
|
begin = end
|
|
#print("E",inner_table[index])
|
|
#print("E",inner_table[index])
|
|
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-
|
|
|
|
def otherrepair(inner_table,index,dye_set,key_set):
|
|
def otherrepair(inner_table,index,dye_set,key_set):
|
|
list_provide_repair = []
|
|
list_provide_repair = []
|
|
if index==0 and len(inner_table)>1:
|
|
if index==0 and len(inner_table)>1:
|
|
@@ -298,26 +295,27 @@ def tableToText(soup):
|
|
dye_set.add((inner_table[index][i][0],inner_table[provide_index][i][1]))
|
|
dye_set.add((inner_table[index][i][0],inner_table[provide_index][i][1]))
|
|
key_set.add(inner_table[index][i][0])
|
|
key_set.add(inner_table[index][i][0])
|
|
inner_table[index][i][1] = 0 if inner_table[provide_index][i][1] ==1 else 1
|
|
inner_table[index][i][1] = 0 if inner_table[provide_index][i][1] ==1 else 1
|
|
|
|
+
|
|
len_dye_set = len(dye_set)
|
|
len_dye_set = len(dye_set)
|
|
height = len(inner_table)
|
|
height = len(inner_table)
|
|
for i in range(height):
|
|
for i in range(height):
|
|
if repairNeeded(inner_table[i]):
|
|
if repairNeeded(inner_table[i]):
|
|
- selfrepair(inner_table,i,dye_set,key_set)
|
|
|
|
|
|
+ selfrepair(inner_table, i, dye_set, key_set)
|
|
#otherrepair(inner_table,i,dye_set,key_set)
|
|
#otherrepair(inner_table,i,dye_set,key_set)
|
|
for h in range(len(inner_table)):
|
|
for h in range(len(inner_table)):
|
|
for w in range(len(inner_table[0])):
|
|
for w in range(len(inner_table[0])):
|
|
if inner_table[h][w][0] in key_set:
|
|
if inner_table[h][w][0] in key_set:
|
|
for item in dye_set:
|
|
for item in dye_set:
|
|
- if inner_table[h][w][0]==item[0]:
|
|
|
|
|
|
+ if inner_table[h][w][0] == item[0]:
|
|
inner_table[h][w][1] = item[1]
|
|
inner_table[h][w][1] = item[1]
|
|
- #如果两个set长度不相同,则有同一个key被反复染色,将导致无限迭代
|
|
|
|
- if len(dye_set)!=len(key_set):
|
|
|
|
|
|
+ # 如果两个set长度不相同,则有同一个key被反复染色,将导致无限迭代
|
|
|
|
+ if len(dye_set) != len(key_set):
|
|
for i in range(height):
|
|
for i in range(height):
|
|
if repairNeeded(inner_table[i]):
|
|
if repairNeeded(inner_table[i]):
|
|
selfrepair(inner_table,i,dye_set,key_set)
|
|
selfrepair(inner_table,i,dye_set,key_set)
|
|
#otherrepair(inner_table,i,dye_set,key_set)
|
|
#otherrepair(inner_table,i,dye_set,key_set)
|
|
return
|
|
return
|
|
- if len(dye_set)==len_dye_set:
|
|
|
|
|
|
+ if len(dye_set) == len_dye_set:
|
|
'''
|
|
'''
|
|
for i in range(height):
|
|
for i in range(height):
|
|
if repairNeeded(inner_table[i]):
|
|
if repairNeeded(inner_table[i]):
|
|
@@ -439,6 +437,11 @@ def tableToText(soup):
|
|
for i in range(len(inner_table)):
|
|
for i in range(len(inner_table)):
|
|
for j in range(len(inner_table[i])):
|
|
for j in range(len(inner_table[i])):
|
|
inner_table[i][j] = [origin_inner_table[i][j][0], int(predict_list[i][j])]
|
|
inner_table[i][j] = [origin_inner_table[i][j][0], int(predict_list[i][j])]
|
|
|
|
+
|
|
|
|
+ # 表头修正
|
|
|
|
+ repairTable(inner_table)
|
|
|
|
+
|
|
|
|
+ # 按表头分割表格
|
|
head_list = sliceTable(inner_table)
|
|
head_list = sliceTable(inner_table)
|
|
return inner_table, head_list
|
|
return inner_table, head_list
|
|
|
|
|