ソースを参照

优化表格提取及表格提取替换默认包中标人

lsm 1年前
コミット
d2ed3d3e83

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -345,7 +345,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-03-01'}
+    version_date = {'version_date': '2024-03-11'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys)
 
     '''最终检查修正招标、中标金额'''

+ 7 - 2
BiddingKG/dl/interface/getAttributes.py

@@ -3948,7 +3948,7 @@ def update_prem(old_prem, new_prem):
                     del_k.append(k)
             for k in del_k:
                 old_prem.pop(k)
-
+        multi_tendereeMoney = [] # 多包招标金额
         for k, v in new_prem.items():
             if k == 'Project':
                 if 'Project' in old_prem:
@@ -3972,6 +3972,8 @@ def update_prem(old_prem, new_prem):
                 else:
                     old_prem[k] = v
             else:
+                if v['tendereeMoney'] != 0:
+                    multi_tendereeMoney.append(v['tendereeMoney'])
                 if k not in old_prem:  # 新有旧没有的包直接添加
                     old_prem[k] = v
                 else:
@@ -3992,10 +3994,13 @@ def update_prem(old_prem, new_prem):
                     for d2 in v['roleList']:
                         if d2 not in tmp_l: # 把新预测有,旧没有的角色添加上去
                             old_prem[k]['roleList'].append(d2)
-        if len(new_prem)>1 and 'Project' in old_prem and 'win_tenderer' in str(new_prem): # 表格提取到中标人的,去掉project包中标人
+        if (len(new_prem)>1 or 'Project' not in new_prem) and 'Project' in old_prem and 'win_tenderer' in str(new_prem): # 表格提取到中标人的,去掉project包中标人
             for d in old_prem['Project']['roleList']:
                 if d['role_name'] in ['win_tenderer', 'second_tenderer', 'third_tenderer']:
                     old_prem['Project']['roleList'].remove(d) # 提取到其他包,去掉 project 里面的中标角色
+        if multi_tendereeMoney and float(old_prem['Project']['tendereeMoney'])!=0: # 表格提取到多标段招标金额,去掉Project包招标金额
+            old_prem['Project']['tendereeMoney'] = 0
+
     # return old_prem
 
 def fix_single_source(prem, channel_dic, original_docchannel):

+ 4 - 3
BiddingKG/dl/interface/predictor.py

@@ -5796,7 +5796,7 @@ class TablePremExtractor(object):
             "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
             "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(单价|总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
-            "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价",
+            "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价|含税价",
         }
 
         with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
@@ -5906,7 +5906,8 @@ class TablePremExtractor(object):
         not_package = True if 'project_name' in headers and re.search('(货物|商品|产品|通用|主要标的)(名称?|内容)', headers['project_name'][1]) and \
                           'package_code' not in headers and 'budget' not in headers and "bid_amount" not in headers else False
 
-        if set(['project_code', 'package_code', 'tenderee', 'tenderer']) & set(headers) == set(): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683
+        if set(['project_code', 'package_code', 'tenderee', 'tenderer']) & set(headers) == set() and ('project_name' not in headers # 补充没有项目名称或有项目名称且是货物的才过滤掉
+            or re.search('(货物|商品|产品|通用|主要标的)(名称?|内容)', headers['project_name'][1])): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683;  补充避免423647863采购意向被过滤
             # print('没有包号及角色的不要')
             return {}
         for i in df.index:
@@ -6143,7 +6144,7 @@ class CandidateExtractor(object):
             "win_sort": "排名|排序|名次|推荐顺序",
             'win_or_not': '是否中标|是否入围|是否入库|入围结论',
             "candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称)?$",
-            "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
+            "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价|含税价",
             "win_tenderer": "第一名|第一(中标|成交)?候选人",
             "second_tenderer": "第二名|第二(中标|成交)?候选人",
             "third_tenderer": "第三名|第三(中标|成交)?候选人",