|
@@ -2326,11 +2326,11 @@ class RoleGrade():
|
|
# log('如果org中标人同时为招标人角色,降低中标概率:%s, %s' % (ent.entity_text, ent.label))
|
|
# log('如果org中标人同时为招标人角色,降低中标概率:%s, %s' % (ent.entity_text, ent.label))
|
|
ent.values[2] = 0.6
|
|
ent.values[2] = 0.6
|
|
flag = 1
|
|
flag = 1
|
|
- if flag == 0 and company_winner != []:
|
|
|
|
- for ent in org_winner:
|
|
|
|
- if ent.label == 2 and ent.values[2] > 0.6:
|
|
|
|
- # log('如果同时包含org和company中标人,降低org中标人概率为0.6:%s, %s' % (ent.entity_text, ent.values[2]))
|
|
|
|
- ent.values[2] = 0.6
|
|
|
|
|
|
+ # if flag == 0 and company_winner != []: # 2024/04/18 注释掉 避免提取不到 273351465 供应商(乙方:湖南省第二测绘院
|
|
|
|
+ # for ent in org_winner:
|
|
|
|
+ # if ent.label == 2 and ent.values[2] > 0.6:
|
|
|
|
+ # # log('如果同时包含org和company中标人,降低org中标人概率为0.6:%s, %s' % (ent.entity_text, ent.values[2]))
|
|
|
|
+ # ent.values[2] = 0.6
|
|
|
|
|
|
|
|
|
|
class MoneyGrade():
|
|
class MoneyGrade():
|
|
@@ -3913,7 +3913,7 @@ class DocChannel():
|
|
'中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书|中标$', # |开标(记录|信息|情况)
|
|
'中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书|中标$', # |开标(记录|信息|情况)
|
|
'资审结果': '((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示',
|
|
'资审结果': '((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示',
|
|
'招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
|
|
'招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
|
|
- '开标记录': '开标记录|截标信息|评委名单公示|开标安排|开标数据表|开标信息|开标情况|开标一览表|开标结果',
|
|
|
|
|
|
+ '开标记录': '开标记录|截标信息|评委名单公示|开标安排|开标数据表|开标信息|开标情况|开标一览表|开标结果|开标会',
|
|
'验收合同': '(验收|履约)(公告|公示)|(验收|履约)(结果|报告|意见|单)(公告|公示)'
|
|
'验收合同': '(验收|履约)(公告|公示)|(验收|履约)(结果|报告|意见|单)(公告|公示)'
|
|
}
|
|
}
|
|
|
|
|
|
@@ -6060,8 +6060,8 @@ class TablePremExtractor(object):
|
|
'''各要素表头规则'''
|
|
'''各要素表头规则'''
|
|
self.head_rule_dic = {
|
|
self.head_rule_dic = {
|
|
'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
|
|
'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
|
|
- 'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
|
|
|
|
- "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|通用|主要标的|^包)(名称?|内容)",
|
|
|
|
|
|
+ 'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$|^品目$",
|
|
|
|
+ "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
|
|
"win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因|推荐顺序",
|
|
"win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因|推荐顺序",
|
|
"tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
|
|
"tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
@@ -6121,7 +6121,7 @@ class TablePremExtractor(object):
|
|
'tenderer' in header_dic or'budget' in header_dic): # 包含标段及招标金额或中标人的进行提取
|
|
'tenderer' in header_dic or'budget' in header_dic): # 包含标段及招标金额或中标人的进行提取
|
|
return flag, contain_header, header_dic
|
|
return flag, contain_header, header_dic
|
|
elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
|
|
elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
|
|
- if re.search('^(候选)?供应商(名称)?', header_dic['tenderer'][1]) and 'win_sort' not in header_dic: # 只有供应商名称 没排名和包号的去掉,预防错误包提取 334205629
|
|
|
|
|
|
+ if re.search('^(候选)?供应商(名称)?', header_dic['tenderer'][1]) and 'win_sort' not in header_dic and re.search('(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)', header_dic['bid_amount'][1])==None: # 只有供应商名称 没排名和包号的去掉,预防错误包提取 334205629
|
|
# print('只有供应商名称 没排名和包号的去掉')
|
|
# print('只有供应商名称 没排名和包号的去掉')
|
|
return flag, contain_header, dict()
|
|
return flag, contain_header, dict()
|
|
return flag,contain_header, header_dic
|
|
return flag,contain_header, header_dic
|
|
@@ -6162,7 +6162,7 @@ class TablePremExtractor(object):
|
|
for ner in ners[0]:
|
|
for ner in ners[0]:
|
|
if ner[2] in ['org', 'company', 'location']:
|
|
if ner[2] in ['org', 'company', 'location']:
|
|
roles.append(ner[3])
|
|
roles.append(ner[3])
|
|
- if roles and len(''.join(roles)) > len(text)*0.8:
|
|
|
|
|
|
+ if roles and (len(''.join(roles)) > len(text)*0.8 or text.startswith(roles[0])):
|
|
return roles[0]
|
|
return roles[0]
|
|
else:
|
|
else:
|
|
return ''
|
|
return ''
|
|
@@ -6177,7 +6177,7 @@ class TablePremExtractor(object):
|
|
'package_code' not in headers and 'budget' not in headers and "bid_amount" not in headers else False
|
|
'package_code' not in headers and 'budget' not in headers and "bid_amount" not in headers else False
|
|
|
|
|
|
if set(['project_code', 'package_code', 'tenderee', 'tenderer']) & set(headers) == set() and ('project_name' not in headers # 补充没有项目名称或有项目名称且是货物的才过滤掉
|
|
if set(['project_code', 'package_code', 'tenderee', 'tenderer']) & set(headers) == set() and ('project_name' not in headers # 补充没有项目名称或有项目名称且是货物的才过滤掉
|
|
- or re.search('(货物|商品|产品|通用|主要标的)(名称?|内容)', headers['project_name'][1])): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683; 补充避免423647863采购意向被过滤
|
|
|
|
|
|
+ or re.search('(货物|商品|产品|设备|通用|主要标的)(名称?|内容)', headers['project_name'][1])): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683; 补充避免423647863采购意向被过滤
|
|
# print('没有包号及角色的不要')
|
|
# print('没有包号及角色的不要')
|
|
return {}
|
|
return {}
|
|
for i in df.index:
|
|
for i in df.index:
|
|
@@ -6332,6 +6332,25 @@ class TablePremExtractor(object):
|
|
prem_dic[v] = prem_dic.pop(k)
|
|
prem_dic[v] = prem_dic.pop(k)
|
|
return prem_dic
|
|
return prem_dic
|
|
|
|
|
|
|
|
def update_prem(self, rs_dic, tmp_dic):
    '''
    Merge newly extracted package results into the accumulated result, in place.

    Packages that are not yet in ``rs_dic`` are added as-is. For packages
    that already exist, a field is only overwritten when the existing
    value carries no information: it is missing, an empty string, 0, an
    empty list, or a ``roleList`` whose first role has no money amount.

    :param rs_dic: accumulated result, mapping package -> field dict (mutated in place)
    :param tmp_dic: result to merge in, mapping package -> field dict
    :return: None
    '''
    for pack, incoming in tmp_dic.items():
        if pack not in rs_dic:
            # Unknown package: take the whole entry.
            rs_dic[pack] = incoming
            continue

        existing = rs_dic[pack]
        for k, new_value in incoming.items():
            if k not in existing:
                # The original indexed existing[k] unconditionally and raised
                # KeyError here; a missing field is treated as empty instead.
                existing[k] = new_value
                continue

            current = existing[k]
            if current in ['', 0] or current == []:
                existing[k] = new_value
            elif k == 'roleList' and len(current) > 0:
                # Replace a role list whose first role has no money amount;
                # a role_money of None is treated like a missing one.
                role_money = current[0].get('role_money', {}) or {}
                if role_money.get('money', 0) == 0:
                    existing[k] = new_value
|
|
|
|
+
|
|
def get_prem(self, soup, web_source_name=''):
|
|
def get_prem(self, soup, web_source_name=''):
|
|
tables = soup.find_all('table')
|
|
tables = soup.find_all('table')
|
|
tables.reverse()
|
|
tables.reverse()
|
|
@@ -6373,7 +6392,8 @@ class TablePremExtractor(object):
|
|
df = pd.DataFrame(table_items)
|
|
df = pd.DataFrame(table_items)
|
|
prem_ = self.extract_from_df(df, headers, web_source_name)
|
|
prem_ = self.extract_from_df(df, headers, web_source_name)
|
|
# rs_dic.update(prem_)
|
|
# rs_dic.update(prem_)
|
|
- table_prem.update(prem_)
|
|
|
|
|
|
+ # table_prem.update(prem_)
|
|
|
|
+ self.update_prem(table_prem, prem_)
|
|
i = j - 1
|
|
i = j - 1
|
|
i += 1
|
|
i += 1
|
|
if table_prem and len(trs) == 2 and 'package_code' not in headers and '1' in table_prem and table.find_previous_sibling(): # 一个表格只有两行且没有标段的,从上一个兄弟标签找标段
|
|
if table_prem and len(trs) == 2 and 'package_code' not in headers and '1' in table_prem and table.find_previous_sibling(): # 一个表格只有两行且没有标段的,从上一个兄弟标签找标段
|
|
@@ -6385,7 +6405,8 @@ class TablePremExtractor(object):
|
|
package_sib = uniform_package_name(package_sib)
|
|
package_sib = uniform_package_name(package_sib)
|
|
table_prem[package_sib] = table_prem.pop('1')
|
|
table_prem[package_sib] = table_prem.pop('1')
|
|
if table_prem:
|
|
if table_prem:
|
|
- rs_dic.update(table_prem)
|
|
|
|
|
|
+ # rs_dic.update(table_prem)
|
|
|
|
+ self.update_prem(rs_dic, table_prem)
|
|
table.extract()
|
|
table.extract()
|
|
return rs_dic
|
|
return rs_dic
|
|
|
|
|