|
@@ -155,201 +155,175 @@ def tableToText(soup):
|
|
if inner_table[i][j][0]==pad_col:
|
|
if inner_table[i][j][0]==pad_col:
|
|
inner_table[i][j][0] = inner_table[i-1][j][0]
|
|
inner_table[i][j][0] = inner_table[i-1][j][0]
|
|
inner_table[i][j][1] = inner_table[i-1][j][1]
|
|
inner_table[i][j][1] = inner_table[i-1][j][1]
|
|
-
|
|
|
|
- def setHead_initem(inner_table,pat_head,fix_value="~~",prob_min=0.5):
|
|
|
|
- def repairTable(inner_table,dye_set = set(),key_set = set()):
|
|
|
|
|
|
+
|
|
|
|
+ def repairTable(inner_table,dye_set = set(),key_set = set(),fix_value="~~"):
|
|
|
|
+ '''
|
|
|
|
+ @summary: 修复表头识别,将明显错误的进行修正
|
|
|
|
+ '''
|
|
|
|
+ def repairNeeded(line):
|
|
|
|
+ first_1 = -1
|
|
|
|
+ last_1 = -1
|
|
|
|
+ first_0 = -1
|
|
|
|
+ last_0 = -1
|
|
|
|
+ count_1 = 0
|
|
|
|
+ count_0 = 0
|
|
|
|
+ for i in range(len(line)):
|
|
|
|
+ if line[i][0]==fix_value:
|
|
|
|
+ continue
|
|
|
|
+ if line[i][1]==1:
|
|
|
|
+ if first_1==-1:
|
|
|
|
+ first_1 = i
|
|
|
|
+ last_1 = i
|
|
|
|
+ count_1 += 1
|
|
|
|
+ if line[i][1]==0:
|
|
|
|
+ if first_0 == -1:
|
|
|
|
+ first_0 = i
|
|
|
|
+ last_0 = i
|
|
|
|
+ count_0 += 1
|
|
|
|
+ if first_1 ==-1 or last_0 == -1:
|
|
|
|
+ return False
|
|
|
|
+ #异常情况:第一个不是表头;最后一个是表头;表头个数远大于属性值个数
|
|
|
|
+ if first_1-0>0 or last_0-len(line)+1<0 or last_1==len(line)-1 or count_1-count_0>=3:
|
|
|
|
+ return True
|
|
|
|
+ return False
|
|
|
|
+
|
|
|
|
+ def getsimilarity(line,line1):
|
|
|
|
+ same_count = 0
|
|
|
|
+ for item,item1 in zip(line,line1):
|
|
|
|
+ if item[1]==item1[1]:
|
|
|
|
+ same_count += 1
|
|
|
|
+ return same_count/len(line)
|
|
|
|
+
|
|
|
|
+ def selfrepair(inner_table,index,dye_set,key_set):
|
|
'''
|
|
'''
|
|
- @summary: 修复表头识别,将明显错误的进行修正
|
|
|
|
|
|
+ @summary: 计算每个节点受到的挤压度来判断是否需要染色
|
|
'''
|
|
'''
|
|
- def repairNeeded(line):
|
|
|
|
- first_1 = -1
|
|
|
|
- last_1 = -1
|
|
|
|
- first_0 = -1
|
|
|
|
- last_0 = -1
|
|
|
|
- count_1 = 0
|
|
|
|
- count_0 = 0
|
|
|
|
- for i in range(len(line)):
|
|
|
|
- if line[i][0]==fix_value:
|
|
|
|
- continue
|
|
|
|
- if line[i][1]==1:
|
|
|
|
- if first_1==-1:
|
|
|
|
- first_1 = i
|
|
|
|
- last_1 = i
|
|
|
|
- count_1 += 1
|
|
|
|
- if line[i][1]==0:
|
|
|
|
- if first_0 == -1:
|
|
|
|
- first_0 = i
|
|
|
|
- last_0 = i
|
|
|
|
- count_0 += 1
|
|
|
|
- if first_1 ==-1 or last_0 == -1:
|
|
|
|
- return False
|
|
|
|
- #异常情况:第一个不是表头;最后一个是表头;表头个数远大于属性值个数
|
|
|
|
- if first_1-0>0 or last_0-len(line)+1<0 or last_1==len(line)-1 or count_1-count_0>=3:
|
|
|
|
- return True
|
|
|
|
- return False
|
|
|
|
-
|
|
|
|
- def getsimilarity(line,line1):
|
|
|
|
- same_count = 0
|
|
|
|
- for item,item1 in zip(line,line1):
|
|
|
|
- if item[1]==item1[1]:
|
|
|
|
- same_count += 1
|
|
|
|
- return same_count/len(line)
|
|
|
|
-
|
|
|
|
- def selfrepair(inner_table,index,dye_set,key_set):
|
|
|
|
- '''
|
|
|
|
- @summary: 计算每个节点受到的挤压度来判断是否需要染色
|
|
|
|
- '''
|
|
|
|
- #print("B",inner_table[index])
|
|
|
|
- min_presure = 3
|
|
|
|
- list_dye = []
|
|
|
|
- first = None
|
|
|
|
- count = 0
|
|
|
|
- temp_set = set()
|
|
|
|
- _index = 0
|
|
|
|
- for item in inner_table[index]:
|
|
|
|
- if first is None:
|
|
|
|
- first = item[1]
|
|
|
|
|
|
+ #print("B",inner_table[index])
|
|
|
|
+ min_presure = 3
|
|
|
|
+ list_dye = []
|
|
|
|
+ first = None
|
|
|
|
+ count = 0
|
|
|
|
+ temp_set = set()
|
|
|
|
+ _index = 0
|
|
|
|
+ for item in inner_table[index]:
|
|
|
|
+ if first is None:
|
|
|
|
+ first = item[1]
|
|
|
|
+ if item[0] not in temp_set:
|
|
|
|
+ count += 1
|
|
|
|
+ temp_set.add(item[0])
|
|
|
|
+ else:
|
|
|
|
+ if first == item[1]:
|
|
if item[0] not in temp_set:
|
|
if item[0] not in temp_set:
|
|
- count += 1
|
|
|
|
temp_set.add(item[0])
|
|
temp_set.add(item[0])
|
|
|
|
+ count += 1
|
|
else:
|
|
else:
|
|
- if first == item[1]:
|
|
|
|
- if item[0] not in temp_set:
|
|
|
|
- temp_set.add(item[0])
|
|
|
|
- count += 1
|
|
|
|
- else:
|
|
|
|
- list_dye.append([first,count,_index])
|
|
|
|
- first = item[1]
|
|
|
|
- temp_set.add(item[0])
|
|
|
|
- count = 1
|
|
|
|
- _index += 1
|
|
|
|
- list_dye.append([first,count,_index])
|
|
|
|
- if len(list_dye)>1:
|
|
|
|
- begin = 0
|
|
|
|
- end = 0
|
|
|
|
- for i in range(len(list_dye)):
|
|
|
|
- end = list_dye[i][2]
|
|
|
|
- dye_flag = False
|
|
|
|
- #首尾要求压力减一
|
|
|
|
- if i==0:
|
|
|
|
- if list_dye[i+1][1]-list_dye[i][1]+1>=min_presure-1:
|
|
|
|
|
|
+ list_dye.append([first,count,_index])
|
|
|
|
+ first = item[1]
|
|
|
|
+ temp_set.add(item[0])
|
|
|
|
+ count = 1
|
|
|
|
+ _index += 1
|
|
|
|
+ list_dye.append([first,count,_index])
|
|
|
|
+ if len(list_dye)>1:
|
|
|
|
+ begin = 0
|
|
|
|
+ end = 0
|
|
|
|
+ for i in range(len(list_dye)):
|
|
|
|
+ end = list_dye[i][2]
|
|
|
|
+ dye_flag = False
|
|
|
|
+ #首尾要求压力减一
|
|
|
|
+ if i==0:
|
|
|
|
+ if list_dye[i+1][1]-list_dye[i][1]+1>=min_presure-1:
|
|
|
|
+ dye_flag = True
|
|
|
|
+ dye_type = list_dye[i+1][0]
|
|
|
|
+ elif i==len(list_dye)-1:
|
|
|
|
+ if list_dye[i-1][1]-list_dye[i][1]+1>=min_presure-1:
|
|
|
|
+ dye_flag = True
|
|
|
|
+ dye_type = list_dye[i-1][0]
|
|
|
|
+ else:
|
|
|
|
+ if list_dye[i][1]>1:
|
|
|
|
+ if list_dye[i+1][1]-list_dye[i][1]+1>=min_presure:
|
|
dye_flag = True
|
|
dye_flag = True
|
|
dye_type = list_dye[i+1][0]
|
|
dye_type = list_dye[i+1][0]
|
|
- elif i==len(list_dye)-1:
|
|
|
|
- if list_dye[i-1][1]-list_dye[i][1]+1>=min_presure-1:
|
|
|
|
|
|
+ if list_dye[i-1][1]-list_dye[i][1]+1>=min_presure:
|
|
dye_flag = True
|
|
dye_flag = True
|
|
dye_type = list_dye[i-1][0]
|
|
dye_type = list_dye[i-1][0]
|
|
else:
|
|
else:
|
|
- if list_dye[i][1]>1:
|
|
|
|
- if list_dye[i+1][1]-list_dye[i][1]+1>=min_presure:
|
|
|
|
- dye_flag = True
|
|
|
|
- dye_type = list_dye[i+1][0]
|
|
|
|
- if list_dye[i-1][1]-list_dye[i][1]+1>=min_presure:
|
|
|
|
- dye_flag = True
|
|
|
|
- dye_type = list_dye[i-1][0]
|
|
|
|
- else:
|
|
|
|
- if list_dye[i+1][1]+list_dye[i-1][1]-list_dye[i][1]+1>=min_presure:
|
|
|
|
- dye_flag = True
|
|
|
|
- dye_type = list_dye[i+1][0]
|
|
|
|
- if list_dye[i+1][1]+list_dye[i-1][1]-list_dye[i][1]+1>=min_presure:
|
|
|
|
- dye_flag = True
|
|
|
|
- dye_type = list_dye[i-1][0]
|
|
|
|
- if dye_flag:
|
|
|
|
- for h in range(begin,end):
|
|
|
|
- inner_table[index][h][1] = dye_type
|
|
|
|
- dye_set.add((inner_table[index][h][0],dye_type))
|
|
|
|
- key_set.add(inner_table[index][h][0])
|
|
|
|
- begin = end
|
|
|
|
- #print("E",inner_table[index])
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-
|
|
|
|
- def otherrepair(inner_table,index,dye_set,key_set):
|
|
|
|
- list_provide_repair = []
|
|
|
|
- if index==0 and len(inner_table)>1:
|
|
|
|
- list_provide_repair.append(index+1)
|
|
|
|
- elif index==len(inner_table)-1:
|
|
|
|
- list_provide_repair.append(index-1)
|
|
|
|
- else:
|
|
|
|
- list_provide_repair.append(index+1)
|
|
|
|
- list_provide_repair.append(index-1)
|
|
|
|
- for provide_index in list_provide_repair:
|
|
|
|
- if not repairNeeded(inner_table[provide_index]):
|
|
|
|
- same_prob = getsimilarity(inner_table[index], inner_table[provide_index])
|
|
|
|
- if same_prob>=0.8:
|
|
|
|
- for i in range(len(inner_table[provide_index])):
|
|
|
|
- if inner_table[index][i][1]!=inner_table[provide_index][i][1]:
|
|
|
|
- dye_set.add((inner_table[index][i][0],inner_table[provide_index][i][1]))
|
|
|
|
- key_set.add(inner_table[index][i][0])
|
|
|
|
- inner_table[index][i][1] = inner_table[provide_index][i][1]
|
|
|
|
- elif same_prob<=0.2:
|
|
|
|
- for i in range(len(inner_table[provide_index])):
|
|
|
|
- if inner_table[index][i][1]==inner_table[provide_index][i][1]:
|
|
|
|
- dye_set.add((inner_table[index][i][0],inner_table[provide_index][i][1]))
|
|
|
|
- key_set.add(inner_table[index][i][0])
|
|
|
|
- inner_table[index][i][1] = 0 if inner_table[provide_index][i][1] ==1 else 1
|
|
|
|
- len_dye_set = len(dye_set)
|
|
|
|
- height = len(inner_table)
|
|
|
|
|
|
+ if list_dye[i+1][1]+list_dye[i-1][1]-list_dye[i][1]+1>=min_presure:
|
|
|
|
+ dye_flag = True
|
|
|
|
+ dye_type = list_dye[i+1][0]
|
|
|
|
+ if list_dye[i+1][1]+list_dye[i-1][1]-list_dye[i][1]+1>=min_presure:
|
|
|
|
+ dye_flag = True
|
|
|
|
+ dye_type = list_dye[i-1][0]
|
|
|
|
+ if dye_flag:
|
|
|
|
+ for h in range(begin,end):
|
|
|
|
+ inner_table[index][h][1] = dye_type
|
|
|
|
+ dye_set.add((inner_table[index][h][0],dye_type))
|
|
|
|
+ key_set.add(inner_table[index][h][0])
|
|
|
|
+ begin = end
|
|
|
|
+ #print("E",inner_table[index])
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ def otherrepair(inner_table,index,dye_set,key_set):
|
|
|
|
+ list_provide_repair = []
|
|
|
|
+ if index==0 and len(inner_table)>1:
|
|
|
|
+ list_provide_repair.append(index+1)
|
|
|
|
+ elif index==len(inner_table)-1:
|
|
|
|
+ list_provide_repair.append(index-1)
|
|
|
|
+ else:
|
|
|
|
+ list_provide_repair.append(index+1)
|
|
|
|
+ list_provide_repair.append(index-1)
|
|
|
|
+ for provide_index in list_provide_repair:
|
|
|
|
+ if not repairNeeded(inner_table[provide_index]):
|
|
|
|
+ same_prob = getsimilarity(inner_table[index], inner_table[provide_index])
|
|
|
|
+ if same_prob>=0.8:
|
|
|
|
+ for i in range(len(inner_table[provide_index])):
|
|
|
|
+ if inner_table[index][i][1]!=inner_table[provide_index][i][1]:
|
|
|
|
+ dye_set.add((inner_table[index][i][0],inner_table[provide_index][i][1]))
|
|
|
|
+ key_set.add(inner_table[index][i][0])
|
|
|
|
+ inner_table[index][i][1] = inner_table[provide_index][i][1]
|
|
|
|
+ elif same_prob<=0.2:
|
|
|
|
+ for i in range(len(inner_table[provide_index])):
|
|
|
|
+ if inner_table[index][i][1]==inner_table[provide_index][i][1]:
|
|
|
|
+ dye_set.add((inner_table[index][i][0],inner_table[provide_index][i][1]))
|
|
|
|
+ key_set.add(inner_table[index][i][0])
|
|
|
|
+ inner_table[index][i][1] = 0 if inner_table[provide_index][i][1] ==1 else 1
|
|
|
|
+ len_dye_set = len(dye_set)
|
|
|
|
+ height = len(inner_table)
|
|
|
|
+ for i in range(height):
|
|
|
|
+ if repairNeeded(inner_table[i]):
|
|
|
|
+ selfrepair(inner_table,i,dye_set,key_set)
|
|
|
|
+ #otherrepair(inner_table,i,dye_set,key_set)
|
|
|
|
+ for h in range(len(inner_table)):
|
|
|
|
+ for w in range(len(inner_table[0])):
|
|
|
|
+ if inner_table[h][w][0] in key_set:
|
|
|
|
+ for item in dye_set:
|
|
|
|
+ if inner_table[h][w][0]==item[0]:
|
|
|
|
+ inner_table[h][w][1] = item[1]
|
|
|
|
+ #如果两个set长度不相同,则有同一个key被反复染色,将导致无限迭代
|
|
|
|
+ if len(dye_set)!=len(key_set):
|
|
for i in range(height):
|
|
for i in range(height):
|
|
if repairNeeded(inner_table[i]):
|
|
if repairNeeded(inner_table[i]):
|
|
selfrepair(inner_table,i,dye_set,key_set)
|
|
selfrepair(inner_table,i,dye_set,key_set)
|
|
#otherrepair(inner_table,i,dye_set,key_set)
|
|
#otherrepair(inner_table,i,dye_set,key_set)
|
|
- for h in range(len(inner_table)):
|
|
|
|
- for w in range(len(inner_table[0])):
|
|
|
|
- if inner_table[h][w][0] in key_set:
|
|
|
|
- for item in dye_set:
|
|
|
|
- if inner_table[h][w][0]==item[0]:
|
|
|
|
- inner_table[h][w][1] = item[1]
|
|
|
|
- #如果两个set长度不相同,则有同一个key被反复染色,将导致无限迭代
|
|
|
|
- if len(dye_set)!=len(key_set):
|
|
|
|
- for i in range(height):
|
|
|
|
- if repairNeeded(inner_table[i]):
|
|
|
|
- selfrepair(inner_table,i,dye_set,key_set)
|
|
|
|
- #otherrepair(inner_table,i,dye_set,key_set)
|
|
|
|
- return
|
|
|
|
- if len(dye_set)==len_dye_set:
|
|
|
|
- '''
|
|
|
|
- for i in range(height):
|
|
|
|
- if repairNeeded(inner_table[i]):
|
|
|
|
- otherrepair(inner_table,i,dye_set,key_set)
|
|
|
|
- '''
|
|
|
|
- return
|
|
|
|
- repairTable(inner_table, dye_set, key_set)
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-
|
|
|
|
-
|
|
|
|
- set_item = set()
|
|
|
|
|
|
+ return
|
|
|
|
+ if len(dye_set)==len_dye_set:
|
|
|
|
+ '''
|
|
|
|
+ for i in range(height):
|
|
|
|
+ if repairNeeded(inner_table[i]):
|
|
|
|
+ otherrepair(inner_table,i,dye_set,key_set)
|
|
|
|
+ '''
|
|
|
|
+ return
|
|
|
|
+ repairTable(inner_table, dye_set, key_set)
|
|
|
|
+
|
|
|
|
+ def sliceTable(inner_table,fix_value="~~"):
|
|
|
|
+ #进行分块
|
|
height = len(inner_table)
|
|
height = len(inner_table)
|
|
width = len(inner_table[0])
|
|
width = len(inner_table[0])
|
|
- for i in range(height):
|
|
|
|
- for j in range(width):
|
|
|
|
- item = inner_table[i][j][0]
|
|
|
|
- set_item.add(item)
|
|
|
|
- list_item = list(set_item)
|
|
|
|
- x = []
|
|
|
|
- for item in list_item:
|
|
|
|
- x.append(getPredictor("form").encode(item))
|
|
|
|
- predict_y = getPredictor("form").predict(np.array(x),type="item")
|
|
|
|
- _dict = dict()
|
|
|
|
-
|
|
|
|
- for item,values in zip(list_item,list(predict_y)):
|
|
|
|
- _dict[item] = values[1]
|
|
|
|
- # print("##",item,values)
|
|
|
|
- #print(_dict)
|
|
|
|
- for i in range(height):
|
|
|
|
- for j in range(width):
|
|
|
|
- item = inner_table[i][j][0]
|
|
|
|
- inner_table[i][j][1] = 1 if _dict[item]>prob_min else (1 if re.search(pat_head,item) is not None and len(item)<8 else 0)
|
|
|
|
-
|
|
|
|
- repairTable(inner_table)
|
|
|
|
-
|
|
|
|
- #进行分块
|
|
|
|
head_list = []
|
|
head_list = []
|
|
head_list.append(0)
|
|
head_list.append(0)
|
|
last_head = None
|
|
last_head = None
|
|
|
|
+ last_is_same_value = False;
|
|
for h in range(height):
|
|
for h in range(height):
|
|
is_all_key = True#是否是全表头行
|
|
is_all_key = True#是否是全表头行
|
|
is_all_value = True#是否是全属性值
|
|
is_all_value = True#是否是全属性值
|
|
@@ -361,13 +335,13 @@ def tableToText(soup):
|
|
if last_head is not None:
|
|
if last_head is not None:
|
|
if inner_table[h-1][w][0]!=fix_value and inner_table[h-1][w][1] == 0:
|
|
if inner_table[h-1][w][0]!=fix_value and inner_table[h-1][w][1] == 0:
|
|
is_all_key = False
|
|
is_all_key = False
|
|
-
|
|
|
|
|
|
+
|
|
if inner_table[h][w][0]==1:
|
|
if inner_table[h][w][0]==1:
|
|
is_all_value = False
|
|
is_all_value = False
|
|
-
|
|
|
|
|
|
+
|
|
if inner_table[h][w][1]!= inner_table[h-1][w][1]:
|
|
if inner_table[h][w][1]!= inner_table[h-1][w][1]:
|
|
is_same_with_lastHead = False
|
|
is_same_with_lastHead = False
|
|
-
|
|
|
|
|
|
+
|
|
if inner_table[h][w][0]!=fix_value and inner_table[h][w][0]!=same_value:
|
|
if inner_table[h][w][0]!=fix_value and inner_table[h][w][0]!=same_value:
|
|
is_same_value = False
|
|
is_same_value = False
|
|
else:
|
|
else:
|
|
@@ -375,20 +349,85 @@ def tableToText(soup):
|
|
is_same_value = False
|
|
is_same_value = False
|
|
if h>0 and inner_table[h][0][0]!=inner_table[h-1][0][0]:
|
|
if h>0 and inner_table[h][0][0]!=inner_table[h-1][0][0]:
|
|
is_same_first_item = False
|
|
is_same_first_item = False
|
|
-
|
|
|
|
|
|
+
|
|
last_head = h
|
|
last_head = h
|
|
-
|
|
|
|
|
|
+
|
|
|
|
+ if last_is_same_value:
|
|
|
|
+ last_is_same_value = is_same_value
|
|
|
|
+ continue
|
|
|
|
+
|
|
if is_same_value:
|
|
if is_same_value:
|
|
head_list.append(h)
|
|
head_list.append(h)
|
|
- head_list.append(h+1)
|
|
|
|
|
|
+ last_is_same_value = is_same_value
|
|
continue
|
|
continue
|
|
if not is_all_key:
|
|
if not is_all_key:
|
|
if not is_same_with_lastHead:
|
|
if not is_same_with_lastHead:
|
|
head_list.append(h)
|
|
head_list.append(h)
|
|
-
|
|
|
|
-
|
|
|
|
|
|
+
|
|
|
|
+
|
|
head_list.append(height)
|
|
head_list.append(height)
|
|
|
|
+ return head_list
|
|
|
|
+
|
|
|
|
+ def setHead_initem(inner_table,pat_head,fix_value="~~",prob_min=0.5):
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ set_item = set()
|
|
|
|
+ height = len(inner_table)
|
|
|
|
+ width = len(inner_table[0])
|
|
|
|
+ for i in range(height):
|
|
|
|
+ for j in range(width):
|
|
|
|
+ item = inner_table[i][j][0]
|
|
|
|
+ set_item.add(item)
|
|
|
|
+ list_item = list(set_item)
|
|
|
|
+ x = []
|
|
|
|
+ for item in list_item:
|
|
|
|
+ x.append(getPredictor("form").encode(item))
|
|
|
|
+ predict_y = getPredictor("form").predict(np.array(x),type="item")
|
|
|
|
+ _dict = dict()
|
|
|
|
+
|
|
|
|
+ for item,values in zip(list_item,list(predict_y)):
|
|
|
|
+ _dict[item] = values[1]
|
|
|
|
+ # print("##",item,values)
|
|
|
|
+ #print(_dict)
|
|
|
|
+ for i in range(height):
|
|
|
|
+ for j in range(width):
|
|
|
|
+ item = inner_table[i][j][0]
|
|
|
|
+ inner_table[i][j][1] = 1 if _dict[item]>prob_min else (1 if re.search(pat_head,item) is not None and len(item)<8 else 0)
|
|
|
|
+
|
|
|
|
+ # print("=====")
|
|
|
|
+ # for item in inner_table:
|
|
|
|
+ # print(item)
|
|
|
|
+ # print("======")
|
|
|
|
+
|
|
|
|
+ repairTable(inner_table)
|
|
|
|
+ head_list = sliceTable(inner_table)
|
|
|
|
+
|
|
|
|
|
|
|
|
+ return inner_table,head_list
|
|
|
|
+
|
|
|
|
+ def setHead_incontext(inner_table,pat_head,fix_value="~~",prob_min=0.5):
|
|
|
|
+
|
|
|
|
+ data_x,data_position = getPredictor("form").getModel("context").encode(inner_table)
|
|
|
|
+ predict_y = getPredictor("form").getModel("context").predict(data_x)
|
|
|
|
+
|
|
|
|
+ for _position,_y in zip(data_position,predict_y):
|
|
|
|
+ _w = _position[0]
|
|
|
|
+ _h = _position[1]
|
|
|
|
+ if _y[1]>prob_min:
|
|
|
|
+ inner_table[_h][_w][1] = 1
|
|
|
|
+ else:
|
|
|
|
+ inner_table[_h][_w][1] = 0
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ # print("=====")
|
|
|
|
+ # for item in inner_table:
|
|
|
|
+ # print(item)
|
|
|
|
+ # print("======")
|
|
|
|
+
|
|
|
|
+ repairTable(inner_table)
|
|
|
|
+ head_list = sliceTable(inner_table)
|
|
|
|
+
|
|
|
|
+
|
|
return inner_table,head_list
|
|
return inner_table,head_list
|
|
|
|
|
|
#设置表头
|
|
#设置表头
|
|
@@ -588,7 +627,7 @@ def tableToText(soup):
|
|
def getTableText(inner_table,head_list,key_direct=False):
|
|
def getTableText(inner_table,head_list,key_direct=False):
|
|
# packPattern = "(标包|[标包][号段名])"
|
|
# packPattern = "(标包|[标包][号段名])"
|
|
packPattern = "(标包|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))" # 2020/11/23 大网站规则,补充采购类包名
|
|
packPattern = "(标包|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))" # 2020/11/23 大网站规则,补充采购类包名
|
|
- rankPattern = "(排名|排序|名次|序号|评标结果|评审结果)" # 2020/11/23 大网站规则,添加序号为排序
|
|
|
|
|
|
+ rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标)" # 2020/11/23 大网站规则,添加序号为排序
|
|
entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)"
|
|
entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)"
|
|
height = len(inner_table)
|
|
height = len(inner_table)
|
|
width = len(inner_table[0])
|
|
width = len(inner_table[0])
|
|
@@ -770,7 +809,8 @@ def tableToText(soup):
|
|
if len(inner_table)>0 and len(inner_table[0])>0:
|
|
if len(inner_table)>0 and len(inner_table[0])>0:
|
|
#inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
|
|
#inner_table,head_list = setHead_withRule(inner_table,pat_head,pat_value,3)
|
|
#inner_table,head_list = setHead_inline(inner_table)
|
|
#inner_table,head_list = setHead_inline(inner_table)
|
|
- inner_table,head_list = setHead_initem(inner_table,pat_head)
|
|
|
|
|
|
+ # inner_table,head_list = setHead_initem(inner_table,pat_head)
|
|
|
|
+ inner_table,head_list = setHead_incontext(inner_table,pat_head)
|
|
# print(inner_table)
|
|
# print(inner_table)
|
|
# for begin in range(len(head_list[:-1])):
|
|
# for begin in range(len(head_list[:-1])):
|
|
# for item in inner_table[head_list[begin]:head_list[begin+1]]:
|
|
# for item in inner_table[head_list[begin]:head_list[begin+1]]:
|
|
@@ -779,12 +819,11 @@ def tableToText(soup):
|
|
|
|
|
|
removeFix(inner_table)
|
|
removeFix(inner_table)
|
|
|
|
|
|
- '''
|
|
|
|
- print("----")
|
|
|
|
- print(head_list)
|
|
|
|
- for item in inner_table:
|
|
|
|
- print(item)
|
|
|
|
- '''
|
|
|
|
|
|
+ # print("----")
|
|
|
|
+ # print(head_list)
|
|
|
|
+ # for item in inner_table:
|
|
|
|
+ # print(item)
|
|
|
|
+
|
|
|
|
|
|
tbody.string = getTableText(inner_table,head_list)
|
|
tbody.string = getTableText(inner_table,head_list)
|
|
#print(tbody.string)
|
|
#print(tbody.string)
|