Ver código fonte

更新招标、代理角色与表格提取冲突处理;更新过滤包含业绩的表格处理

lsm 2 anos atrás
pai
commit
a4135a8673

+ 7 - 0
BiddingKG/dl/interface/getAttributes.py

@@ -3514,6 +3514,13 @@ def update_prem(old_prem, new_prem):
             for k in del_k:
                 old_prem.pop(k)
 
+        if 'Project' in old_prem:
+            for d in old_prem['Project']['roleList']:
+                if d['role_name'] in ['tenderee', 'agency']:
+                    tenderree_ = d['role_text']
+                    if tenderree_ in str(new_prem) and re.search('公司', tenderree_):
+                        old_prem['Project']['roleList'].remove(d) # 如果旧预测的招标人/代理人在表格预测里面去掉,防止错误召回,以表格提取的为准
+
         for k, v in new_prem.items():
             if k == 'Project':
                 if 'Project' in old_prem:

+ 13 - 2
BiddingKG/dl/interface/predictor.py

@@ -705,7 +705,7 @@ class PREMPredict():
                 elif re.search('第[4-9四五六]中标候选人', front):  #修复第4以上的预测错为中标人
                     label = 5
                     values[label] = 0.5
-                elif re.search('(序号|排名|排序|名次):[4-9],', front): # 293225236 附件中 排名预测错误
+                elif re.search('(序号|排名|排序|名次):([4-9]|\d{2,}),', front): # 293225236 附件中 排名预测错误
                     values[2] = 0.5
                     label = 5
             elif re.search('是否中标:是,供应商', front) and label == 5:
@@ -4722,7 +4722,7 @@ class TablePremExtractor(object):
                             header_dic['budget'] = (i, text)
                             break
             if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
-                     'tenderee' in header_dic or 'tenderer' in header_dic or'budget' in header_dic): # 包含标段及招标人或招标金额或中标人的进行提取
+                     'tenderer' in header_dic or'budget' in header_dic): # 包含标段及招标金额或中标人的进行提取
                 return flag, contain_header, header_dic
             elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
                 return flag,contain_header, header_dic
@@ -4901,6 +4901,17 @@ class TablePremExtractor(object):
 
         rs_dic = {}
         for table in tables:
+
+            text = table.text.strip()
+            previous = table.findPreviousSibling()
+            text2 = previous .text.strip() if previous else ""
+            # text2 = table.findPreviousSibling().text.strip() if table.findPreviousSibling() != None else ""
+            if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+text2): # 包含业绩的表格过滤掉,不进行处理
+                tb_ex = table.extract()
+                if previous:
+                    sib = previous.extract()
+                continue
+
             trs = self.tb.table2list(table)
             # table.extract()
             i = 0