Эх сурвалжийг харах

表头识别添加修复过程

fangjiasheng 2 жил өмнө
parent
commit
7fe09f2cfd

+ 22 - 19
BiddingKG/dl/interface/Preprocessing.py

@@ -166,10 +166,10 @@ def tableToText(soup):
                     inner_table[i][j][0] = inner_table[i-1][j][0]
                     inner_table[i][j][1] = inner_table[i-1][j][1]
 
-    def repairTable(inner_table,dye_set = set(),key_set = set(),fix_value="~~"):
-        '''
+    def repairTable(inner_table, dye_set=set(), key_set=set(), fix_value="~~"):
+        """
         @summary: 修复表头识别,将明显错误的进行修正
-        '''
+        """
         def repairNeeded(line):
             first_1 = -1
             last_1 = -1
@@ -192,22 +192,22 @@ def tableToText(soup):
                     count_0 += 1
             if first_1 ==-1 or last_0 == -1:
                 return False
-            #异常情况:第一个不是表头;最后一个是表头;表头个数远大于属性值个数
-            if first_1-0>0 or last_0-len(line)+1<0 or last_1==len(line)-1 or count_1-count_0>=3:
+            # 异常情况:第一个不是表头;最后一个是表头;表头个数远大于属性值个数
+            if first_1-0 > 0 or last_0-len(line)+1 < 0 or last_1 == len(line)-1 or count_1-count_0 >= 3:
                 return True
             return False
 
-        def getsimilarity(line,line1):
+        def getsimilarity(line, line1):
             same_count = 0
-            for item,item1 in zip(line,line1):
-                if item[1]==item1[1]:
+            for item, item1 in zip(line,line1):
+                if item[1] == item1[1]:
                     same_count += 1
             return same_count/len(line)
 
         def selfrepair(inner_table,index,dye_set,key_set):
-            '''
+            """
             @summary: 计算每个节点受到的挤压度来判断是否需要染色
-            '''
+            """
             #print("B",inner_table[index])
             min_presure = 3
             list_dye = []
@@ -239,7 +239,7 @@ def tableToText(soup):
                 for i in range(len(list_dye)):
                     end = list_dye[i][2]
                     dye_flag = False
-                    #首尾要求压力减一
+                    # 首尾要求压力减一
                     if i==0:
                         if list_dye[i+1][1]-list_dye[i][1]+1>=min_presure-1:
                             dye_flag = True
@@ -271,9 +271,6 @@ def tableToText(soup):
                     begin = end
                 #print("E",inner_table[index])
 
-
-
-
         def otherrepair(inner_table,index,dye_set,key_set):
             list_provide_repair = []
             if index==0 and len(inner_table)>1:
@@ -298,26 +295,27 @@ def tableToText(soup):
                                 dye_set.add((inner_table[index][i][0],inner_table[provide_index][i][1]))
                                 key_set.add(inner_table[index][i][0])
                             inner_table[index][i][1] = 0 if inner_table[provide_index][i][1] ==1 else 1
+
         len_dye_set = len(dye_set)
         height = len(inner_table)
         for i in range(height):
             if repairNeeded(inner_table[i]):
-                selfrepair(inner_table,i,dye_set,key_set)
+                selfrepair(inner_table, i, dye_set, key_set)
                 #otherrepair(inner_table,i,dye_set,key_set)
         for h in range(len(inner_table)):
             for w in range(len(inner_table[0])):
                 if inner_table[h][w][0] in key_set:
                     for item in dye_set:
-                        if inner_table[h][w][0]==item[0]:
+                        if inner_table[h][w][0] == item[0]:
                             inner_table[h][w][1] = item[1]
-        #如果两个set长度不相同,则有同一个key被反复染色,将导致无限迭代
-        if len(dye_set)!=len(key_set):
+        # 如果两个set长度不相同,则有同一个key被反复染色,将导致无限迭代
+        if len(dye_set) != len(key_set):
             for i in range(height):
                 if repairNeeded(inner_table[i]):
                     selfrepair(inner_table,i,dye_set,key_set)
                     #otherrepair(inner_table,i,dye_set,key_set)
             return
-        if len(dye_set)==len_dye_set:
+        if len(dye_set) == len_dye_set:
             '''
             for i in range(height):
                 if repairNeeded(inner_table[i]):
@@ -439,6 +437,11 @@ def tableToText(soup):
         for i in range(len(inner_table)):
             for j in range(len(inner_table[i])):
                 inner_table[i][j] = [origin_inner_table[i][j][0], int(predict_list[i][j])]
+
+        # 表头修正
+        repairTable(inner_table)
+
+        # 按表头分割表格
         head_list = sliceTable(inner_table)
         return inner_table, head_list