|
@@ -4900,8 +4900,8 @@ class TablePremExtractor(object):
|
|
|
self.head_rule_dic = {
|
|
|
'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
|
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
|
|
|
- "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|主要标的)(名称?|内容)",
|
|
|
- "win_sort": "是否(中标|成交)|排名|排序|名次|未(中标|成交)原因",
|
|
|
+ "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|通用|主要标的)(名称?|内容)",
|
|
|
+ "win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因",
|
|
|
"tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
|
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
|
"budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(单价|总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
|
|
@@ -5000,6 +5000,8 @@ class TablePremExtractor(object):
|
|
|
multi_same_package = False # 非连续的重复包号
|
|
|
package_fix2raw = dict() # 处理后包号:处理前包号 字典
|
|
|
link_set = set()
|
|
|
+ not_package = True if re.search('(货物|商品|产品|通用|主要标的)(名称?|内容)', headers['project_name'][1]) and \
|
|
|
+ 'package_code' not in headers and 'budget' not in headers and "bid_amount" not in headers else False
|
|
|
for i in df.index:
|
|
|
same_package = False # 连续重复包号,一般是 rowspan 造成;一包 多个采购
|
|
|
project_code = df.loc[i, headers['project_code'][0]] if "project_code" in headers else ""
|
|
@@ -5033,7 +5035,7 @@ class TablePremExtractor(object):
|
|
|
|
|
|
if win_sort != "" and re.search('排名|排序|名次', headers['win_sort'][1]) and re.search('[一1]', win_sort) == None:
|
|
|
continue
|
|
|
- if win_sort != "" and re.search('是否(中标|成交)', headers['win_sort'][1]) and re.search('否|未(中标|成交)', win_sort):
|
|
|
+ if win_sort != "" and re.search('是否(中标|成交|中选)', headers['win_sort'][1]) and re.search('否|未(中标|成交|中选)', win_sort):
|
|
|
continue
|
|
|
if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
|
|
|
continue
|
|
@@ -5048,9 +5050,14 @@ class TablePremExtractor(object):
|
|
|
|
|
|
if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
|
|
|
break
|
|
|
- if (project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_) in link_set:
|
|
|
- continue
|
|
|
- link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_))
|
|
|
+ if not_package:
|
|
|
+ if (project_code, package_code, tenderee, tenderer, budget_, bid_amount_) in link_set:
|
|
|
+ continue
|
|
|
+ link_set.add((project_code, package_code, tenderee, tenderer, budget_, bid_amount_))
|
|
|
+ else:
|
|
|
+ if (project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_) in link_set:
|
|
|
+ continue
|
|
|
+ link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_))
|
|
|
|
|
|
package = package_code if package_code else str(len(prem_dic)+1) #str(i+1) # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
|
|
|
package = uniform_package_name(package)
|