Browse Source

修复表格提取问题

lsm 2 năm trước
mục cha
commit
334ca54ee1
1 tập tin đã thay đổi với 10 bổ sung và 0 xóa
  1. 10 0
      BiddingKG/dl/interface/predictor.py

+ 10 - 0
BiddingKG/dl/interface/predictor.py

@@ -4672,6 +4672,11 @@ class TablePremExtractor(object):
             bid_amount_ = df.loc[i, headers['bid_amount'][0]] if "bid_amount" in headers else ""
             win_sort = df.loc[i, headers['win_sort'][0]] if "win_sort" in headers else ""
 
+            if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset != set(): # 只要有一项为表头 停止匹配
+                break
+            if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) - set(['', ' ']) == set():  # 全部为空 停止匹配
+                break
+
             if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}$', project_name):
                 package_code_raw = project_name
                 project_name = ""
@@ -4923,6 +4928,11 @@ class CandidateExtractor(object):
             second_tenderer = df.loc[i, headers['second_tenderer'][0]] if "second_tenderer" in headers else ""
             third_tenderer = df.loc[i, headers['third_tenderer'][0]] if "third_tenderer" in headers else ""
 
+            if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_sort, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配
+                break
+            if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_sort, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' ']) == set():  # 全部为空 停止匹配
+                break
+
             if candidate_ != "" and win_sort == "" and headers['candidate'][0] > 0: # 修复某些表头不说 排名,直接用候选人代替
                 col_indx = headers['candidate'][0] -1
                 pre_col = df.loc[i, col_indx]