浏览代码

采购意向表格处理优化

znj 2 天之前
父节点
当前提交
7f677bc9fd
共有 2 个文件被更改,包括 16 次插入和 9 次删除
  1. 4 4
      BiddingKG/dl/channel/channel_bert.py
  2. 12 5
      BiddingKG/dl/interface/predictor.py

+ 4 - 4
BiddingKG/dl/channel/channel_bert.py

@@ -427,12 +427,12 @@ def channel_predict(title,text):
     text = re.sub("##attachment##。?","",text)
     text = text_process(text)
 
-    if len(text)<=100:
+    if len(text)<=200:
         # 正文内容过短时,不预测
         return
-    elif len(text)<=150:
-        # 正文内容过短时,重复正文
-        text = text * 2
+    # elif len(text)<=150:
+    #     # 正文内容过短时,重复正文
+    #     text = text * 2
     text = text[:2000]
     title = text_process(title)
     title = title[:100]

+ 12 - 5
BiddingKG/dl/interface/predictor.py

@@ -2925,7 +2925,15 @@ class ProductAttributesPredictor():
             return False
         elif len(table.find_all(['table'])) >= 1:
             # print('过滤表格:包含多个表格的为假表格')
-            return False
+            inner_table_num = len(table.find_all(['table']))
+            text_num = 0 # 表格只有一格作文本框的数量,docid:631910513
+            for inner_table in table.find_all(['table']):
+                if len(inner_table.find_all(['tr']))==0 or (len(inner_table.find_all(['tr']))==1 and len(inner_table.find_all(['tr'])[0].find_all(['td']))<=1):
+                    text_num += 1
+            if inner_table_num - text_num > 0:
+                return False
+            else:
+                return True
         else:
             return True
 
@@ -3339,11 +3347,10 @@ class ProductAttributesPredictor():
                 continue
             if not self.isTrueTable(table):
                 continue
+            self.fixSpan(table)
+            inner_table = self.getTable(table)
 
-            # self.fixSpan(table)
-            # inner_table = self.getTable(table)
-
-            inner_table = self.tb.table2list(table)
+            # inner_table = self.tb.table2list(table)
             table.extract()
             # print(inner_table)
             i = 0