|
@@ -1509,7 +1509,7 @@ class RoleRuleFinalAdd():
|
|
sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
|
|
sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
|
|
sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
|
|
sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
|
|
sear_ent2 = re.search('[,:](户名|开户名称|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
sear_ent2 = re.search('[,:](户名|开户名称|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
|
|
- sear_ent3 = re.search('(买家信息|所有权人|土地权属单位|报名咨询|[收送交]货地点|)[,:](?P<entity>[\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
|
|
|
|
|
|
+ sear_ent3 = re.search('(买家信息|所有权人|土地权属单位|报名咨询|[收送交]货地点)[,:](?P<entity>[\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
|
|
sear_ent4 = re.search('(发布(?:人|单位|机构|企业)|项目业主|所属公司|寻源单位)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})[,。]', list_articles[0].content[:5000])
|
|
sear_ent4 = re.search('(发布(?:人|单位|机构|企业)|项目业主|所属公司|寻源单位)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})[,。]', list_articles[0].content[:5000])
|
|
sear_list = [sear_ent4 , sear_ent3 , sear_ent2 ,sear_ent1, sear_ent]
|
|
sear_list = [sear_ent4 , sear_ent3 , sear_ent2 ,sear_ent1, sear_ent]
|
|
|
|
|
|
@@ -4475,7 +4475,7 @@ class DistrictPredictor():
|
|
return ''
|
|
return ''
|
|
|
|
|
|
def get_project_addr(text):
|
|
def get_project_addr(text):
|
|
- p1 = '(项目|建设|工程|服务|交货|送货|收货)(地址|地点|位置|所在地区?):(\w{2,8}[省市州区县][^\w]*)+'
|
|
|
|
|
|
+ p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货)(地址|地点|位置|所在地区?):(\w{2,8}[省市州区县][^\w]*)+'
|
|
if re.search(p1, text):
|
|
if re.search(p1, text):
|
|
return re.search(p1, text).group(0)
|
|
return re.search(p1, text).group(0)
|
|
else:
|
|
else:
|
|
@@ -4669,14 +4669,14 @@ class TablePremExtractor(object):
|
|
def __init__(self):
|
|
def __init__(self):
|
|
'''各要素表头规则'''
|
|
'''各要素表头规则'''
|
|
self.head_rule_dic = {
|
|
self.head_rule_dic = {
|
|
- 'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])编号",
|
|
|
|
|
|
+ 'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])(编号|编码)",
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
|
|
"project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程|货物|商品|主要标的)(名称?|内容)",
|
|
"project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程|货物|商品|主要标的)(名称?|内容)",
|
|
- "win_sort": "是否中标|排名|排序|名次|未(中标|成交)原因",
|
|
|
|
- "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源)?供应商(名称)?$",
|
|
|
|
|
|
+ "win_sort": "是否(中标|成交)|排名|排序|名次|未(中标|成交)原因",
|
|
|
|
+ "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
- "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|拦标价|(采购|招标|项目)预算|(预算|招标|采购|计划)金额|挂牌价",
|
|
|
|
- "bid_amount": "投标[报总]?价|报价金额|总报价|^\w{,3}报价|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
|
|
|
|
|
|
+ "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(单价|总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
|
|
|
|
+ "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
|
|
}
|
|
}
|
|
|
|
|
|
with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
|
|
with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
|
|
@@ -4686,7 +4686,7 @@ class TablePremExtractor(object):
|
|
|
|
|
|
|
|
|
|
def find_header(self, td_list):
|
|
def find_header(self, td_list):
|
|
- td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]+、|(([\w、×*/]{1,20}))$', '', it) for it in td_list]
|
|
|
|
|
|
+ td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]+、|(([\w、×*/]{1,20}))$|/万?元', '', it) for it in td_list]
|
|
header_dic = dict()
|
|
header_dic = dict()
|
|
flag = False
|
|
flag = False
|
|
contain_header = False
|
|
contain_header = False
|
|
@@ -4721,9 +4721,11 @@ class TablePremExtractor(object):
|
|
if re.search('^金额((万?元))?$', text):
|
|
if re.search('^金额((万?元))?$', text):
|
|
header_dic['budget'] = (i, text)
|
|
header_dic['budget'] = (i, text)
|
|
break
|
|
break
|
|
- if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic or 'tenderer' in header_dic) and (
|
|
|
|
- 'budget' in header_dic or 'bid_amount' in header_dic):
|
|
|
|
|
|
+ if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
|
|
|
|
+ 'tenderee' in header_dic or 'tenderer' in header_dic or'budget' in header_dic): # 包含标段及招标人或招标金额或中标人的进行提取
|
|
return flag, contain_header, header_dic
|
|
return flag, contain_header, header_dic
|
|
|
|
+ elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
|
|
|
|
+ return flag,contain_header, header_dic
|
|
elif len(set(td_list) & self.headerset) >= 2 or (len(set(td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
|
|
elif len(set(td_list) & self.headerset) >= 2 or (len(set(td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
|
|
contain_header = True
|
|
contain_header = True
|
|
return flag, contain_header, dict()
|
|
return flag, contain_header, dict()
|
|
@@ -4800,9 +4802,11 @@ class TablePremExtractor(object):
|
|
|
|
|
|
if win_sort != "" and re.search('排名|排序|名次', headers['win_sort'][1]) and re.search('[一1]', win_sort) == None:
|
|
if win_sort != "" and re.search('排名|排序|名次', headers['win_sort'][1]) and re.search('[一1]', win_sort) == None:
|
|
continue
|
|
continue
|
|
- if win_sort != "" and re.search('是否中标', headers['win_sort'][1]) and re.search('否', win_sort) == None:
|
|
|
|
|
|
+ if win_sort != "" and re.search('是否(中标|成交)', headers['win_sort'][1]) and re.search('否|未(中标|成交)', win_sort):
|
|
|
|
+ continue
|
|
|
|
+ if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
|
|
continue
|
|
continue
|
|
- if win_sort == "" and "tenderer" in headers and re.search('候选|入围', headers['tenderer'][1]):
|
|
|
|
|
|
+ if win_sort == "" and "tenderer" in headers and re.search('候选|入围', headers['tenderer'][1]) and re.search('推荐中标候选人', headers['tenderer'][1])==None:
|
|
tenderer = ""
|
|
tenderer = ""
|
|
|
|
|
|
# tenderee = tenderee if self.is_role(tenderee) else ""
|
|
# tenderee = tenderee if self.is_role(tenderee) else ""
|
|
@@ -4817,7 +4821,7 @@ class TablePremExtractor(object):
|
|
continue
|
|
continue
|
|
link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_))
|
|
link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_))
|
|
|
|
|
|
- package = package_code if package_code else str(i+1)
|
|
|
|
|
|
+ package = package_code if package_code else str(len(prem_dic)+1) #str(i+1) # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
|
|
package = uniform_package_name(package)
|
|
package = uniform_package_name(package)
|
|
|
|
|
|
if multi_same_package == False and package not in package_fix2raw: # 如果处理后的标段号 已经在列表里面,采用原始标段号文本
|
|
if multi_same_package == False and package not in package_fix2raw: # 如果处理后的标段号 已经在列表里面,采用原始标段号文本
|
|
@@ -4960,7 +4964,7 @@ class CandidateExtractor(object):
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
'win_or_not': '是否中标|是否入围|是否入库|入围结论',
|
|
'win_or_not': '是否中标|是否入围|是否入库|入围结论',
|
|
"candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单)(名称|名单|全称|\d)?$|^供应商(名称)?$",
|
|
"candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单)(名称|名单|全称|\d)?$|^供应商(名称)?$",
|
|
- "bid_amount": "投标[报总]?价|报价金额|总报价|^\w{,3}报价|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
|
|
|
|
|
|
+ "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
|
|
"win_tenderer": "第一名|第一(中标|成交)?候选人",
|
|
"win_tenderer": "第一名|第一(中标|成交)?候选人",
|
|
"second_tenderer": "第二名|第二(中标|成交)?候选人",
|
|
"second_tenderer": "第二名|第二(中标|成交)?候选人",
|
|
"third_tenderer": "第三名|第三(中标|成交)?候选人",
|
|
"third_tenderer": "第三名|第三(中标|成交)?候选人",
|