|
@@ -2925,7 +2925,15 @@ class ProductAttributesPredictor():
|
|
|
return False
|
|
|
elif len(table.find_all(['table'])) >= 1:
|
|
|
# print('过滤表格:包含多个表格的为假表格')
|
|
|
- return False
|
|
|
+ inner_table_num = len(table.find_all(['table']))
|
|
|
+ text_num = 0 # 表格只有一格作文本框的数量,docid:631910513
|
|
|
+ for inner_table in table.find_all(['table']):
|
|
|
+ if len(inner_table.find_all(['tr']))==0 or (len(inner_table.find_all(['tr']))==1 and len(inner_table.find_all(['tr'])[0].find_all(['td']))<=1):
|
|
|
+ text_num += 1
|
|
|
+ if inner_table_num - text_num > 0:
|
|
|
+ return False
|
|
|
+ else:
|
|
|
+ return True
|
|
|
else:
|
|
|
return True
|
|
|
|
|
@@ -3339,11 +3347,10 @@ class ProductAttributesPredictor():
|
|
|
continue
|
|
|
if not self.isTrueTable(table):
|
|
|
continue
|
|
|
+ self.fixSpan(table)
|
|
|
+ inner_table = self.getTable(table)
|
|
|
|
|
|
- # self.fixSpan(table)
|
|
|
- # inner_table = self.getTable(table)
|
|
|
-
|
|
|
- inner_table = self.tb.table2list(table)
|
|
|
+ # inner_table = self.tb.table2list(table)
|
|
|
table.extract()
|
|
|
# print(inner_table)
|
|
|
i = 0
|