|
@@ -705,7 +705,7 @@ class PREMPredict():
|
|
|
elif re.search('第[4-9四五六]中标候选人', front): #修复第4以上的预测错为中标人
|
|
|
label = 5
|
|
|
values[label] = 0.5
|
|
|
- elif re.search('(序号|排名|排序|名次):[4-9],', front): # 293225236 附件中 排名预测错误
|
|
|
+ elif re.search('(序号|排名|排序|名次):([4-9]|\d{2,}),', front): # 293225236 附件中 排名预测错误
|
|
|
values[2] = 0.5
|
|
|
label = 5
|
|
|
elif re.search('是否中标:是,供应商', front) and label == 5:
|
|
@@ -4475,7 +4475,7 @@ class DistrictPredictor():
|
|
|
return ''
|
|
|
|
|
|
def get_project_addr(text):
|
|
|
- p1 = '(项目|建设|工程|服务|交货|送货|收货)(地址|地点|位置|所在地区?):(\w{2,8}[省市州区县][^\w]*)+'
|
|
|
+ p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货)(地址|地点|位置|所在地区?):(\w{2,8}[省市州区县][^\w]*)+'
|
|
|
if re.search(p1, text):
|
|
|
return re.search(p1, text).group(0)
|
|
|
else:
|
|
@@ -4669,14 +4669,14 @@ class TablePremExtractor(object):
|
|
|
def __init__(self):
|
|
|
'''各要素表头规则'''
|
|
|
self.head_rule_dic = {
|
|
|
- 'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])编号",
|
|
|
+ 'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])(编号|编码)",
|
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
|
|
|
"project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程|货物|商品|主要标的)(名称?|内容)",
|
|
|
- "win_sort": "是否中标|排名|排序|名次|未(中标|成交)原因",
|
|
|
- "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源)?供应商(名称)?$",
|
|
|
+ "win_sort": "是否(中标|成交)|排名|排序|名次|未(中标|成交)原因",
|
|
|
+ "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
|
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
|
- "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|拦标价|(采购|招标|项目)预算|(预算|招标|采购|计划)金额|挂牌价",
|
|
|
- "bid_amount": "投标[报总]?价|报价金额|总报价|^\w{,3}报价|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
|
|
|
+ "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(单价|总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
|
|
|
+ "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
|
|
|
}
|
|
|
|
|
|
with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
|
|
@@ -4686,7 +4686,7 @@ class TablePremExtractor(object):
|
|
|
|
|
|
|
|
|
def find_header(self, td_list):
|
|
|
- td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]+、|(([\w、×*/]{1,20}))$', '', it) for it in td_list]
|
|
|
+ td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]+、|(([\w、×*/]{1,20}))$|/万?元', '', it) for it in td_list]
|
|
|
header_dic = dict()
|
|
|
flag = False
|
|
|
contain_header = False
|
|
@@ -4721,9 +4721,11 @@ class TablePremExtractor(object):
|
|
|
if re.search('^金额((万?元))?$', text):
|
|
|
header_dic['budget'] = (i, text)
|
|
|
break
|
|
|
- if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic or 'tenderer' in header_dic) and (
|
|
|
- 'budget' in header_dic or 'bid_amount' in header_dic):
|
|
|
+ if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
|
|
|
+ 'tenderer' in header_dic or'budget' in header_dic): # 包含标段及招标金额或中标人的进行提取
|
|
|
return flag, contain_header, header_dic
|
|
|
+ elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
|
|
|
+ return flag,contain_header, header_dic
|
|
|
elif len(set(td_list) & self.headerset) >= 2 or (len(set(td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
|
|
|
contain_header = True
|
|
|
return flag, contain_header, dict()
|
|
@@ -4800,9 +4802,11 @@ class TablePremExtractor(object):
|
|
|
|
|
|
if win_sort != "" and re.search('排名|排序|名次', headers['win_sort'][1]) and re.search('[一1]', win_sort) == None:
|
|
|
continue
|
|
|
- if win_sort != "" and re.search('是否中标', headers['win_sort'][1]) and re.search('否', win_sort) == None:
|
|
|
+ if win_sort != "" and re.search('是否(中标|成交)', headers['win_sort'][1]) and re.search('否|未(中标|成交)', win_sort):
|
|
|
+ continue
|
|
|
+ if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
|
|
|
continue
|
|
|
- if win_sort == "" and "tenderer" in headers and re.search('候选|入围', headers['tenderer'][1]):
|
|
|
+ if win_sort == "" and "tenderer" in headers and re.search('候选|入围', headers['tenderer'][1]) and re.search('推荐中标候选人', headers['tenderer'][1])==None:
|
|
|
tenderer = ""
|
|
|
|
|
|
# tenderee = tenderee if self.is_role(tenderee) else ""
|
|
@@ -4817,7 +4821,7 @@ class TablePremExtractor(object):
|
|
|
continue
|
|
|
link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_))
|
|
|
|
|
|
- package = package_code if package_code else str(i+1)
|
|
|
+ package = package_code if package_code else str(len(prem_dic)+1) #str(i+1) # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
|
|
|
package = uniform_package_name(package)
|
|
|
|
|
|
if multi_same_package == False and package not in package_fix2raw: # 如果处理后的标段号 已经在列表里面,采用原始标段号文本
|
|
@@ -4897,6 +4901,17 @@ class TablePremExtractor(object):
|
|
|
|
|
|
rs_dic = {}
|
|
|
for table in tables:
|
|
|
+
|
|
|
+ text = table.text.strip()
|
|
|
+ previous = table.findPreviousSibling()
|
|
|
+ text2 = previous .text.strip() if previous else ""
|
|
|
+ # text2 = table.findPreviousSibling().text.strip() if table.findPreviousSibling() != None else ""
|
|
|
+ if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+text2): # 包含业绩的表格过滤掉,不进行处理
|
|
|
+ tb_ex = table.extract()
|
|
|
+ if previous:
|
|
|
+ sib = previous.extract()
|
|
|
+ continue
|
|
|
+
|
|
|
trs = self.tb.table2list(table)
|
|
|
# table.extract()
|
|
|
i = 0
|
|
@@ -4960,7 +4975,7 @@ class CandidateExtractor(object):
|
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
|
'win_or_not': '是否中标|是否入围|是否入库|入围结论',
|
|
|
"candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单)(名称|名单|全称|\d)?$|^供应商(名称)?$",
|
|
|
- "bid_amount": "投标[报总]?价|报价金额|总报价|^\w{,3}报价|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
|
|
|
+ "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
|
|
|
"win_tenderer": "第一名|第一(中标|成交)?候选人",
|
|
|
"second_tenderer": "第二名|第二(中标|成交)?候选人",
|
|
|
"third_tenderer": "第三名|第三(中标|成交)?候选人",
|