Procházet zdrojové kódy

优化表格提取:对于没有包号、不同产品对应同一中标人的表格,每个中标人只提取一个包

lsm před 2 roky
rodič
revize
27c55271da

binární
BiddingKG/dl/interface/header_set.pkl


+ 13 - 6
BiddingKG/dl/interface/predictor.py

@@ -4900,8 +4900,8 @@ class TablePremExtractor(object):
         self.head_rule_dic = {
             'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
             'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
-            "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|主要标的)(名称?|内容)",
-            "win_sort": "是否(中标|成交)|排名|排序|名次|未(中标|成交)原因",
+            "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|通用|主要标的)(名称?|内容)",
+            "win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因",
             "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
             "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(单价|总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
@@ -5000,6 +5000,8 @@ class TablePremExtractor(object):
         multi_same_package = False # 非连续的重复包号
         package_fix2raw = dict()  # 处理后包号:处理前包号 字典
         link_set = set()
+        not_package = True if re.search('(货物|商品|产品|通用|主要标的)(名称?|内容)', headers['project_name'][1]) and \
+                          'package_code' not in headers and 'budget' not in headers and "bid_amount" not in headers else False
         for i in df.index:
             same_package = False  # 连续重复包号,一般是 rowspan 造成;一包 多个采购
             project_code = df.loc[i, headers['project_code'][0]] if "project_code" in headers else ""
@@ -5033,7 +5035,7 @@ class TablePremExtractor(object):
 
             if win_sort != "" and re.search('排名|排序|名次', headers['win_sort'][1]) and re.search('[一1]', win_sort) == None:
                 continue
-            if win_sort != "" and re.search('是否(中标|成交)', headers['win_sort'][1]) and re.search('否|未(中标|成交)', win_sort):
+            if win_sort != "" and re.search('是否(中标|成交|中选)', headers['win_sort'][1]) and re.search('否|未(中标|成交|中选)', win_sort):
                 continue
             if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
                 continue
@@ -5048,9 +5050,14 @@ class TablePremExtractor(object):
 
             if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
                 break
-            if (project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_) in link_set:
-                continue
-            link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_))
+            if not_package:
+                if (project_code, package_code, tenderee, tenderer, budget_, bid_amount_) in link_set:
+                    continue
+                link_set.add((project_code, package_code, tenderee, tenderer, budget_, bid_amount_))
+            else:
+                if (project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_) in link_set:
+                    continue
+                link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_))
 
             package = package_code if package_code else str(len(prem_dic)+1) #str(i+1) # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
             package = uniform_package_name(package)