|
@@ -870,7 +870,7 @@ class PREMPredict():
|
|
|
elif re.search('^,?(投标报价|(资格性审查:|符合性审查:)?(不通过|不符合))', behind) and re.search('中标|成交|中选|排名|排序|名次|第[一1]名', front)==None:
|
|
|
values[2] = 0.5
|
|
|
label = 5
|
|
|
- elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单):$|确定为标的的受让方,$|[主次出]入口?,?$|确定(项目|\w{,2})成交供应商,$|,承刻单位:$', front): # 234501112 民币元,序号:1,债务人: 东营市海宁工贸有限责任公司 ,债权本金: 262414286 八、中标后签约单位,合同签约单位: 241929628 1月9,承刻单位: 肃宁县超凡网络光敏印章刻印部 ,印章预留印模
|
|
|
+ elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单):$|确定为标的的受让方,$|[主次出]入口?,?$|确定(项目|\w{,2})成交供应商,$|,承刻单位:$|乙方接受为$', front): # 234501112 民币元,序号:1,债务人: 东营市海宁工贸有限责任公司 ,债权本金: 262414286 八、中标后签约单位,合同签约单位: 241929628 1月9,承刻单位: 肃宁县超凡网络光敏印章刻印部 ,印章预留印模
|
|
|
label = 5
|
|
|
elif re.search(',来源:$', front) and re.search('^,', behind): # 修复 472062585 项目采购-关于定制手机询比价采购中标公告,来源:深圳市网联安瑞网络科技有限公司 预测为中标
|
|
|
label = 0
|
|
@@ -1559,7 +1559,7 @@ class RoleRulePredictor():
|
|
|
return (_label, _prob, _flag, keyword)
|
|
|
|
|
|
|
|
|
- def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5, all_winner=False):
|
|
|
+ def predict(self, list_articles, list_sentences, list_entitys, list_codenames, channel_dic, on_value=0.5, all_winner=False):
|
|
|
|
|
|
for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences,
|
|
|
list_codenames):
|
|
@@ -1827,9 +1827,7 @@ class RoleRulePredictor():
|
|
|
p_entity.values[0] = 0.8 + p_entity.values[0] / 10
|
|
|
p_entity.label = 0
|
|
|
# print('规则召回预算金额2:', p_entity.entity_text, _sentence.sentence_text[:p_entity.wordOffset_begin])
|
|
|
- if notfound_tenderer and len(set([ent.entity_text for ent in candidates])) == 1 and re.search(
|
|
|
- '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|磋商|交易|评审)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书',
|
|
|
- article.title+article.content[:100]):
|
|
|
+ if notfound_tenderer and len(set([ent.entity_text for ent in candidates])) == 1 and channel_dic['docchannel']['docchannel'] in ['中标信息', '候选人公示', '合同公告']:
|
|
|
for p_entity in candidates:
|
|
|
# print('只有一个候选人的作为中标人', p_entity.entity_text)
|
|
|
p_entity.label = 2
|
|
@@ -3562,6 +3560,8 @@ class ProductAttributesPredictor():
|
|
|
if link['unitPrice'] != "" and link['quantity'] != '':
|
|
|
try:
|
|
|
total_product_money += float(link['unitPrice'])*float(link['quantity']) if float(link['quantity'])<50000 else 0
|
|
|
+ if float(link['unitPrice'])>10000 and float(link['quantity'])>100: # 修复 325105750 总价做单价 造成中标金额错误
|
|
|
+ total_product_money = 0
|
|
|
except:
|
|
|
log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
|
|
|
|
|
@@ -4034,9 +4034,9 @@ class DocChannel():
|
|
|
'招标预告': '预公?告|预公示|报建公告|(批前|标前)公示|(供应|招标)计划表?$|(论证|征求|征集)(供应商)?意见|意见征询|需求评审公告|需求(公告|公示|意见)',
|
|
|
'公告变更': '第[\d一二]次变更|(变更|更正(事项)?|更改|延期|暂停)(招标|采购)?的?(公告|公示|通知)|变更$|更正$',
|
|
|
'招标答疑': '质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)(公示|公告|$)',
|
|
|
- '废标公告': '(终止|中止|废标|废除|废置|流标|失败|作废|异常|撤销|撤回|取消成?交?|流拍)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)|关于废置',
|
|
|
+ '废标公告': '(终止|中止|废标|废除|废置|流标|失败|作废|异常|撤销|撤回|取消成?交?|流拍|停止)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)|关于废置',
|
|
|
'合同公告': '(合同(成交|变更)?)(公告|公示|信息|公式|公开|签订)|合同备案|合同书|合同$', # |(履约|验收)(结果)?
|
|
|
- '候选人公示': '候选人(变更)?公示|评标(结果)?公示|评审结果', #中标前公示|中标预公示|
|
|
|
+ '候选人公示': '候选人(变更)?公示|评标(结果)?(公[告示]|报告)|评审结果', #中标前公示|中标预公示|
|
|
|
'中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书|中标$|项目中标', # |开标(记录|信息|情况)
|
|
|
'资审结果': '((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示',
|
|
|
'招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
|
|
@@ -4271,14 +4271,13 @@ class DocChannel():
|
|
|
log('正则把中标信息修改为空')
|
|
|
return channel_dic
|
|
|
|
|
|
- def predict_merge(self, title, list_sentence, html, bidway, prem, original_docchannel='', web_source_no=''):
|
|
|
+ def predict_merge(self, title, list_sentence, html, original_docchannel='', web_source_no=''):
|
|
|
'''
|
|
|
正则,模型混合预测,返回公告类型及生命周期
|
|
|
:param title: 公告标题
|
|
|
:param content: 预处理后的返回的句子实体列表 list_sentence
|
|
|
:param html: 公告原文 html 内容
|
|
|
:param bidway: 招标方式
|
|
|
- :param prem: 提取的prem 字典
|
|
|
:return: {'docchannel': {'docchannel':'中标信息', 'doctype':'采招数据'}} 字典格式
|
|
|
'''
|
|
|
def cut_single_cn_space(text):
|
|
@@ -4315,12 +4314,6 @@ class DocChannel():
|
|
|
kw.append(re.search(p, text).group(0))
|
|
|
return num, ';'.join(kw)
|
|
|
|
|
|
- def is_contain_winner(extract_json):
|
|
|
- if re.search('win_tenderer', extract_json):
|
|
|
- return True
|
|
|
- else:
|
|
|
- return False
|
|
|
-
|
|
|
def is_single_source(bidway, title):
|
|
|
if re.search('单一来源|单一性采购', title):
|
|
|
return True
|
|
@@ -4407,12 +4400,16 @@ class DocChannel():
|
|
|
if '采购意向' in life_kw_title or '采购意向' in life_list:
|
|
|
if '中标信息' in life_kw_title or '中标信息' in life_list:
|
|
|
return '中标信息', msc
|
|
|
+ elif '候选人公示' in life_kw_title:
|
|
|
+ return '候选人公示', msc
|
|
|
elif set(['候选人公示', '合同公告']) & set(life_kw_title) != set():
|
|
|
return '', msc
|
|
|
return '采购意向', msc
|
|
|
elif '招标预告' in life_kw_title or '招标预告' in life_list:
|
|
|
if '中标信息' in life_kw_title or '中标信息' in life_list:
|
|
|
return '中标信息', msc
|
|
|
+ elif '候选人公示' in life_kw_title:
|
|
|
+ return '候选人公示', msc
|
|
|
elif set(['候选人公示', '合同公告']) & set(life_kw_title) != set():
|
|
|
return '', msc
|
|
|
return '招标预告', msc
|
|
@@ -4435,8 +4432,6 @@ class DocChannel():
|
|
|
return '', msc
|
|
|
return '招标答疑', msc
|
|
|
elif '开标记录' in life_kw_title:
|
|
|
- if '开标结果' in title and is_contain_winner(prem_json):
|
|
|
- return '中标信息', msc
|
|
|
return '开标记录', msc
|
|
|
elif '验收合同' in life_kw_title:
|
|
|
return '验收合同', msc
|
|
@@ -4514,86 +4509,6 @@ class DocChannel():
|
|
|
prob = pred[0][id]
|
|
|
return id, prob
|
|
|
|
|
|
- def final_change(msc):
|
|
|
- '''
|
|
|
- 修改逻辑:
|
|
|
- 1、中标公告、合同公告无中标人且原始为非中标,返回原类型
|
|
|
- 2、废标公告有中标人且标题无废标关键词,返回中标信息
|
|
|
- 3、答疑公告标题无答疑关键且原始为招标,返回原始类别
|
|
|
- 4、招标公告有中标人且原始为中标,返回中标信息
|
|
|
- 5、预测为招标,原始为预告、意向,返回原始类别
|
|
|
- 6、预测及原始均在变更、答疑,返回原始类别
|
|
|
- 7、预测为采招数据,原始为产权且有关键词,返回原始类别
|
|
|
- 8、废标公告原始为招标、预告且标题无废标关键期,返回原始类别
|
|
|
- 9、若预测为非采招数据且源网为采招数据且有招标关键词返回采招数据
|
|
|
- 10、招标公告有中标人,且标题有直购关键词,改为中标信息
|
|
|
- 11、预测预告,原始为意向、招标且标题无预告关键词,返回原始类别
|
|
|
- '''
|
|
|
- if result['docchannel']['docchannel'] in ['中标信息', '合同公告'] and origin_dic.get(
|
|
|
- original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(
|
|
|
- prem_json)==False and re.search(self.title_life_dic['中标信息'], title)==None:
|
|
|
- result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
|
|
|
- msc += '最终规则修改:中标公告、合同公告无中标人且原始为非中标,返回原类型'
|
|
|
- elif result['docchannel']['docchannel'] == '废标公告' and is_contain_winner(prem_json) and re.search(
|
|
|
- self.title_life_dic['废标公告'], title) == None:
|
|
|
- result['docchannel']['docchannel'] = '中标信息'
|
|
|
- msc += '最终规则修改:预测为废标却有中标人且标题无废标关键词改为中标信息;'
|
|
|
- elif result['docchannel']['docchannel'] in ['招标答疑'] and re.search(
|
|
|
- self.title_life_dic['招标答疑'], title) == None and origin_dic.get(
|
|
|
- original_docchannel, '') in ['招标公告', '采购意向', '招标预告']:
|
|
|
- result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
|
|
|
- msc += '最终规则修改:答疑公告标题无答疑关键且原始为招标,返回原始类别;'
|
|
|
- elif result['docchannel']['docchannel'] == '招标公告' and is_contain_winner(prem_json) and origin_dic.get(
|
|
|
- original_docchannel, '') == '中标信息':
|
|
|
- result['docchannel']['docchannel'] = '中标信息'
|
|
|
- msc += '最终规则修改:预测为招标公告却有中标人且原始为中标改为中标信息;'
|
|
|
- elif result['docchannel']['docchannel'] in ['招标公告'] and origin_dic.get(
|
|
|
- original_docchannel, '') in ['采购意向', '招标预告']:
|
|
|
- result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
|
|
|
- msc += '最终规则修改:预测为招标,原始为预告、意向,返回原始类别'
|
|
|
- elif result['docchannel']['docchannel'] in ['招标预告'] and origin_dic.get(
|
|
|
- original_docchannel, '') in ['采购意向', '招标公告'] and re.search(
|
|
|
- self.title_life_dic['招标预告'], title)==None:
|
|
|
- result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
|
|
|
- msc += '最终规则修改:预测预告,原始为意向、招标且标题无预告关键词,返回原始类别'
|
|
|
- elif result['docchannel']['docchannel'] in ['招标答疑', '公告变更'] and origin_dic.get(
|
|
|
- original_docchannel, '') in ['招标答疑', '公告变更']:
|
|
|
- result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
|
|
|
- msc += '最终规则修改:预测及原始均在答疑、变更,返回原始类别'
|
|
|
- elif result['docchannel']['doctype'] == '采招数据' and origin_dic.get(
|
|
|
- original_docchannel, '') in ['产权交易', '土地矿产'] and re.search('产权|转让|受让|招租|出租|承租|竞价|资产', text):
|
|
|
- result['docchannel']['doctype'] = origin_dic.get(original_docchannel, '')
|
|
|
- msc += '最终规则修改:预测为采招数据,原始为产权且有关键词,返回原始类别'
|
|
|
- elif result['docchannel']['docchannel'] == '废标公告' and origin_dic.get(
|
|
|
- original_docchannel, '') in ['招标公告', '采购意向', '招标预告'] and re.search(
|
|
|
- self.title_life_dic['废标公告'], title) == None:
|
|
|
- result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
|
|
|
- msc += '最终规则修改:废标公告原始为招标、预告且标题无废标关键期,返回原始类别;'
|
|
|
- elif result['docchannel']['docchannel'] in ['招标公告', '招标预告'] and is_contain_winner(
|
|
|
- prem_json) and re.search('直购', title):
|
|
|
- result['docchannel']['docchannel'] = '中标信息'
|
|
|
- msc += "最终规则修改:预测为招标却有中标人且标题有直购关键词返回中标"
|
|
|
-
|
|
|
- if result['docchannel']['doctype'] in ['产权交易', '土地矿产', '拍卖出让'] and origin_dic.get(
|
|
|
- original_docchannel, '') not in ['产权交易', '土地矿产', '拍卖出让'] \
|
|
|
- and (re.search(self.title_type_dic['采招数据'], title) or re.search('工程|服务|采购|询价|磋商', title) or re.search('(采购|招投?标|投标)(信息|内容|项目|公告|数量|人|单位|方式)|(建设|工程|服务|施工|监理|勘察|设计)项目|(%s)'%self.type_dic['采招数据'], text)):
|
|
|
- result['docchannel']['doctype'] = '采招数据'
|
|
|
- msc += ' 最终规则修改:预测为非采招数据,原始为采招数据且有招标关键词,返回采招数据'
|
|
|
- elif result['docchannel']['doctype'] in ['土地矿产'] and origin_dic.get(original_docchannel, '') in ['拍卖出让', '产权交易']:
|
|
|
- if origin_dic.get(original_docchannel, '') in ['拍卖出让'] and (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)):
|
|
|
- result['docchannel']['doctype'] = '拍卖出让'
|
|
|
- msc += "最终规则修改:预测为土地矿产原始为拍卖且有拍卖关键词,返回拍卖"
|
|
|
- elif (re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text)):
|
|
|
- result['docchannel']['doctype'] = '产权交易'
|
|
|
- msc += "最终规则修改:预测为土地矿产原始为产权交易且有产权交易关键词,返回产权交易"
|
|
|
-
|
|
|
- '''下面是新格式增加返回字段'''
|
|
|
- if result['docchannel']['docchannel'] != '': # 预测到生命周期的复制到life_docchannel,否则用数据源结果
|
|
|
- result['docchannel']['life_docchannel'] = result['docchannel']['docchannel']
|
|
|
- else:
|
|
|
- result['docchannel']['life_docchannel'] = origin_dic.get(original_docchannel, '原始类别')
|
|
|
- return msc
|
|
|
-
|
|
|
not_extract_dic = {
|
|
|
104: '招标文件',
|
|
|
106: '法律法规',
|
|
@@ -4628,10 +4543,13 @@ class DocChannel():
|
|
|
118: '废标公告',
|
|
|
119: '候选人公示',
|
|
|
120: '合同公告'}
|
|
|
+
|
|
|
+ self.origin_dic = origin_dic
|
|
|
+
|
|
|
if original_docchannel in not_extract_dic:
|
|
|
return {'docchannel': {'docchannel': '', 'doctype': not_extract_dic[original_docchannel], 'life_docchannel': origin_dic.get(original_docchannel, '原始类别')}}, '公告类别不在提取范围'
|
|
|
if web_source_no in ['02104-7', '04733', 'DX007628-6']: # 这些数据源无法识别
|
|
|
- return {'docchannel': {'docchannel': '', 'doctype': '采招数据', 'life_docchannel': origin_dic.get(original_docchannel, '原始类别')}}, '此数据源公告分类不明确,返回数据源类别'
|
|
|
+ return {'docchannel': {'docchannel': origin_dic.get(original_docchannel, '原始类别'), 'doctype': '采招数据', 'life_docchannel': origin_dic.get(original_docchannel, '原始类别')}}, '此数据源公告分类不明确,返回数据源类别'
|
|
|
if original_docchannel == 303:
|
|
|
return {'docchannel': {'docchannel': '处罚公告', 'doctype': '处罚公告', 'life_docchannel': '处罚公告'}}, "源类别为处罚公告"
|
|
|
|
|
@@ -4640,7 +4558,7 @@ class DocChannel():
|
|
|
title = title[:20] + title[-30:]
|
|
|
|
|
|
text = html2text(html)
|
|
|
- prem_json = json.dumps(prem, ensure_ascii=False)
|
|
|
+
|
|
|
result = {'docchannel': {'docchannel': '', 'doctype': ''}}
|
|
|
|
|
|
doc_type, type_kw = get_type(title, text)
|
|
@@ -4674,10 +4592,113 @@ class DocChannel():
|
|
|
result['docchannel']['docchannel'] = life_model
|
|
|
msc += life_model + ' 概率:%.4f;\n'%life_prob
|
|
|
|
|
|
- msc = final_change(msc)
|
|
|
+ # msc = final_change(msc)
|
|
|
# print('channel ', msc)
|
|
|
return result, msc
|
|
|
|
|
|
def final_change(self, result, prem, title, text, original_docchannel, msc):
    '''
    Apply the final rule-based corrections to a predicted channel result.

    The predicted life-cycle label (``result['docchannel']['docchannel']``) and
    document type (``result['docchannel']['doctype']``) are compared with the
    category supplied by the source site (looked up in ``self.origin_dic``) and
    with whether the serialised ``prem`` data contains ``win_tenderer``.
    ``result`` is modified in place and ``life_docchannel`` is always filled in.

    ``self.origin_dic`` is assigned inside ``predict_merge``, so that method
    presumably has to run on the same instance first -- TODO confirm at the
    call site.

    Life-cycle rules (first match wins):
      1. predicted winner/contract notice, no winner found, source says
         tender/intent/preview/change and the title has no winner keyword
         -> use the source category
      2. predicted failed-bid notice, winner found, no failed-bid keyword in
         the title -> winner notice
      3. predicted Q&A, no Q&A keyword in the title, source says
         tender/intent/preview -> use the source category
      4. predicted tender notice, winner found, source says winner notice
         -> winner notice
      5. predicted tender notice, source says intent/preview
         -> use the source category
      6. predicted preview, source says intent/tender, no preview keyword in
         the title -> use the source category
      7. prediction and source are both Q&A/change -> use the source category
      8. predicted doctype is procurement data, source says property/land and
         the text has a property keyword -> use the source category as doctype
      9. predicted failed-bid notice, source says tender/intent/preview, no
         failed-bid keyword in the title -> use the source category
      10. predicted tender/preview, winner found, title contains "直购"
          -> winner notice
      11. predicted bid-opening record, title contains "开标结果", winner
          found -> winner notice

    Document-type rules (evaluated after the chain above):
      * predicted property/land/auction, source is none of those, and the
        title or text has procurement keywords -> procurement data
      * predicted land, source says auction or property, and matching
        keywords are present -> auction / property

    :param result: channel result dict, ``{'docchannel': {'docchannel': ..., 'doctype': ...}}``
    :param prem: extracted prem data; only serialised to JSON and searched for ``win_tenderer``
    :param title: announcement title
    :param text: announcement body text
    :param original_docchannel: category key supplied by the source site
    :param msc: remark string; a note is appended for every rule that fires
    :return: ``(result, msc)``
    '''
    origin_dic = self.origin_dic
    channel = result['docchannel']
    # Category the source site assigned, '' when the key is unknown.
    origin = origin_dic.get(original_docchannel, '')
    # A winner counts as present when the key name occurs anywhere in the JSON dump.
    has_winner = 'win_tenderer' in json.dumps(prem, ensure_ascii=False)

    predicted = channel['docchannel']
    if (predicted in ['中标信息', '合同公告']
            and origin in ['招标公告', '采购意向', '招标预告', '公告变更']
            and not has_winner
            and re.search(self.title_life_dic['中标信息'], title) is None):
        channel['docchannel'] = origin
        msc += '最终规则修改:中标公告、合同公告无中标人且原始为非中标,返回原类型'
    elif (predicted == '废标公告' and has_winner
            and re.search(self.title_life_dic['废标公告'], title) is None):
        channel['docchannel'] = '中标信息'
        msc += '最终规则修改:预测为废标却有中标人且标题无废标关键词改为中标信息;'
    elif (predicted in ['招标答疑']
            and re.search(self.title_life_dic['招标答疑'], title) is None
            and origin in ['招标公告', '采购意向', '招标预告']):
        channel['docchannel'] = origin
        msc += '最终规则修改:答疑公告标题无答疑关键且原始为招标,返回原始类别;'
    elif predicted == '招标公告' and has_winner and origin == '中标信息':
        channel['docchannel'] = '中标信息'
        msc += '最终规则修改:预测为招标公告却有中标人且原始为中标改为中标信息;'
    elif predicted in ['招标公告'] and origin in ['采购意向', '招标预告']:
        channel['docchannel'] = origin
        msc += '最终规则修改:预测为招标,原始为预告、意向,返回原始类别'
    elif (predicted in ['招标预告']
            and origin in ['采购意向', '招标公告']
            and re.search(self.title_life_dic['招标预告'], title) is None):
        channel['docchannel'] = origin
        msc += '最终规则修改:预测预告,原始为意向、招标且标题无预告关键词,返回原始类别'
    elif predicted in ['招标答疑', '公告变更'] and origin in ['招标答疑', '公告变更']:
        channel['docchannel'] = origin
        msc += '最终规则修改:预测及原始均在答疑、变更,返回原始类别'
    elif (channel['doctype'] == '采招数据'
            and origin in ['产权交易', '土地矿产']
            and re.search('产权|转让|受让|招租|出租|承租|竞价|资产', text)):
        channel['doctype'] = origin
        msc += '最终规则修改:预测为采招数据,原始为产权且有关键词,返回原始类别'
    elif (predicted == '废标公告'
            and origin in ['招标公告', '采购意向', '招标预告']
            and re.search(self.title_life_dic['废标公告'], title) is None):
        channel['docchannel'] = origin
        msc += '最终规则修改:废标公告原始为招标、预告且标题无废标关键期,返回原始类别;'
    elif predicted in ['招标公告', '招标预告'] and has_winner and re.search('直购', title):
        channel['docchannel'] = '中标信息'
        msc += "最终规则修改:预测为招标却有中标人且标题有直购关键词返回中标"
    elif predicted == '开标记录' and '开标结果' in title and has_winner:
        msc += "最终规则修改:开标结果包含中标人的作为中标信息"
        channel['docchannel'] = '中标信息'

    # The doctype is re-read here because the chain above may have changed it.
    doctype = channel['doctype']
    property_types = ['产权交易', '土地矿产', '拍卖出让']
    if doctype in property_types and origin not in property_types and (
            re.search(self.title_type_dic['采招数据'], title)
            or re.search('工程|服务|采购|询价|磋商', title)
            or re.search(
                '(采购|招投?标|投标)(信息|内容|项目|公告|数量|人|单位|方式)|(建设|工程|服务|施工|监理|勘察|设计)项目|(%s)' % self.type_dic['采招数据'],
                text)):
        channel['doctype'] = '采招数据'
        msc += ' 最终规则修改:预测为非采招数据,原始为采招数据且有招标关键词,返回采招数据'
    elif doctype in ['土地矿产'] and origin in ['拍卖出让', '产权交易']:
        if origin in ['拍卖出让'] and (
                re.search(self.title_type_dic['拍卖出让'], title)
                or re.search(self.type_dic['拍卖出让'], text)):
            channel['doctype'] = '拍卖出让'
            msc += "最终规则修改:预测为土地矿产原始为拍卖且有拍卖关键词,返回拍卖"
        elif (re.search(self.title_type_dic['产权交易'], title)
                or re.search(self.type_dic['产权交易'], text)):
            channel['doctype'] = '产权交易'
            msc += "最终规则修改:预测为土地矿产原始为产权交易且有产权交易关键词,返回产权交易"

    # Extra field of the new return format: copy the life-cycle label when one
    # was predicted, otherwise fall back to the source category.
    if channel['docchannel'] != '':
        channel['life_docchannel'] = channel['docchannel']
    else:
        channel['life_docchannel'] = origin_dic.get(original_docchannel, '原始类别')
    return result, msc
|
|
|
+
|
|
|
# 保证金支付方式提取
|
|
|
class DepositPaymentWay():
|
|
|
def __init__(self,):
|
|
@@ -6001,7 +6022,7 @@ class DistrictPredictor():
|
|
|
return province_l, city_l, district_l
|
|
|
|
|
|
def get_pro_city_dis_score(text, text_weight=1):
|
|
|
- text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾', ' ', text)
|
|
|
+ text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区', ' ', text)
|
|
|
text = re.sub('珠海城市', '珠海', text) # 修复 426624023 珠海城市 预测为海城市
|
|
|
text = re.sub('怒江州', '怒江傈僳族自治州', text) # 修复 423589589 所属地域:怒江州 识别为广西 - 崇左 - 江州
|
|
|
text = re.sub('茂名滨海新区', '茂名市', text)
|
|
@@ -6275,6 +6296,8 @@ class DistrictPredictor():
|
|
|
text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1) # 预防提取错 合肥 路南 新会 等地区
|
|
|
|
|
|
if pro_addr and re.search('\w{2,}([省市县旗盟]|自治[区州县旗])', pro_addr):
|
|
|
+ if re.search('[市县旗盟]', pro_addr)==None: # 修复 486623506 项目地址不完整
|
|
|
+ pro_addr = text1 + ' '+ pro_addr
|
|
|
msc += '## 使用项目地址输入:%s ##;' % pro_addr
|
|
|
rs = self.get_area(pro_addr, '')
|
|
|
msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % (
|
|
@@ -6422,7 +6445,6 @@ def is_head_line(list_item):
|
|
|
predict_y = getPredictor("form").predict(np.array(x), type="item")
|
|
|
count = 0
|
|
|
for item, values in zip(list_item, list(predict_y)):
|
|
|
- print(item, values[1])
|
|
|
if values[1] > 0.6:
|
|
|
count += 1
|
|
|
if count/len(list_item)>0.6:
|
|
@@ -6511,6 +6533,8 @@ class TablePremExtractor(object):
|
|
|
elif re.search('^((投标|应答|响应|候选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|(存款|投标)?银行|供应商)(名称)?$|^机构名称$|^单位(名称)?$', text) and re.search('未', text)==None:
|
|
|
other_tenderer2 = (i, text)
|
|
|
if num>1:
|
|
|
+ if re.search(self.head_rule_dic['project_code'], text) and re.search(self.head_rule_dic['package_code'], text): # 修复 528486798 分标编号-包号
|
|
|
+ continue
|
|
|
# print('表头错误,一个td匹配到两个表头:', header_dic)
|
|
|
return flag, contain_header, dict(), not_sure_winner
|
|
|
if re.search(';金额((万?元))?;', ';'.join(td_list)): # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额
|
|
@@ -7399,21 +7423,41 @@ class WebsourceTenderee():
|
|
|
]}
|
|
|
return prem
|
|
|
|
|
|
def get_header_line(list_item):
    '''
    Label every text in ``list_item`` as a table header (1) or not (0).

    All items are encoded with the "form" predictor and scored in one batch.
    An item is labelled 1 when the second component of its prediction is
    above 0.5 (presumably the header probability -- TODO confirm against the
    form model).  A few literal strings are then forced to a fixed label
    regardless of the model score.

    :param list_item: list of texts, e.g. ['批复结果', '许可/同意', '批复文号']
    :return: list of 0/1 labels, one per item, in input order; ``[]`` for
             empty input (the model is not called in that case)
    '''
    if not list_item:
        return []

    # Texts the model is overridden on: approval results are never headers,
    # while these two long captions always are.
    forced_values = ('许可/同意', '办结(通过)', '办结(准予许可)', '批准')
    forced_headers = ('环境影响评价机构', '建设单位或地方政府作出的相关环保承诺')

    form_predictor = getPredictor("form")  # looked up once instead of per item
    encoded = [form_predictor.encode(item) for item in list_item]
    predict_y = form_predictor.predict(np.array(encoded), type="item")

    labels = []
    for item, values in zip(list_item, list(predict_y)):
        if item in forced_values:
            label = 0
        elif item in forced_headers:
            label = 1
        else:
            label = 1 if values[1] > 0.5 else 0
        labels.append(label)
    return labels
|
|
|
+
|
|
|
class ApprovalPredictor():
|
|
|
def __init__(self):
|
|
|
'''
|
|
|
项目(法人)单位
|
|
|
'''
|
|
|
self.other_part = {
|
|
|
- "project_name": "((项目|工程|采购|招标|计划|建设|规划)名称?|生产建设项目|申请项目):(?P<main>[^:。]{5,50})[,。](\w{2,10}:|$)?", # 项目名称
|
|
|
- "project_code": "(立案号|项目(统一)?代码|(项目|工程|采购|招标|计划|任务|备案|索引)(编[号码]|号)):?(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-]{5,30}号?)(\w{2,10}:|$)?", # 项目编号
|
|
|
- "doc_num": "((审[批查核]|批[复准]|立项|[定知]书|[公发批]文|用地|决定|备案|核准|许可|确认|受理|申请报告|文件|意见书|办件)[文编]?号|综合受理号|文书?号|合格书号):?(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-.]{5,30}号?)[,。]?(\w{2,10}:|$)?", # 文号
|
|
|
- "pro_type": "((申[报请]|审核备|项目|立项)(类型|种类)|项目所属行业|行业(分类|归属)|产业领域|项目行业):(?P<main>[^:。]{2,30})[,。](\w{2,10}:|$)?", # 项目类型
|
|
|
- "year_limit": "((建设|工程|服务|项目)(起止|\w{,2})?(年限|期限|时长|工期)):(约|超过|大概|建设工期|共计|合计)?(?P<main>[\d一二三四五六七八九十]+个月|\d{1,3}(日?历?天|小时)|20\d{2}[年/-](\d{1,2}[月/-]?)?(\d{1,2}日?)?([至—-]+20\d{2}[年/-](\d{1,2}[月/-]?)?(\d{1,2}日?)?)?)[(,。](\w{2,10}:|$)?", # 建设年限
|
|
|
- "construction_scale": "(建设内容[及和](建设)?规模|建设规模[及和](主要)?(建设)?内容|(建设|招标|采购))?内容|(建设|工程|项目)(主要)?(规模|内容|概况|面积)([及和](主要)?(规模|内容|概况|面积))?(如下)?):(?P<main>[^:。]{2,250})[,。](\w{2,10}:|$)?", # 建设规模
|
|
|
- "approval_items": "((审[批查核]|批[复准]|申请|监管)(事项|内容|名称)|事项名称|事项审批):(?P<main>[^:。]{2,70})[,。](\w{2,10}:|$)?", # 审批事项
|
|
|
- "properties": "((建设|工程|项目)性质):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?", # 建设性质
|
|
|
- "approval_result": "((审[批查核]|批[复准]|核[发准]|许可|抽查|备案)(结果|决定|结论|状态|回复|意见)|(办[理件]|,)(状态|意见|结果)|项目(当前|目前)?状态):(?P<main>[^:。]{2,20})[,。](\w{2,10}:|$)?", # 审批结果
|
|
|
+ "project_name": "((项目|工程|采购|招标|计划|建设|规划)名称?|生产建设项目|申请项目):(?P<main>[^:。]{5,50})[,。]([\w()]{2,15}:|$)?", # 项目名称
|
|
|
+ "project_code": "(立案号|项目(统一)?代码|(项目|工程|采购|招标|计划|任务|备案|索引)([编代][号码]|号)):?(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-]{5,30}号?)([\w()]{2,15}:|$)?", # 项目编号
|
|
|
+ "doc_num": "((环评|\w{,3})(审[批查核]|批[复准]|立项|[定知文]书|[公发批]文|用地|决定|备案|核准|许可|确认|受理|申请报告|文[件书]|意见书|办件)[文编证]?号|综合受理号|文书?号|合格书号|申报号|(办件|事项)[编代][号码]|收件号))?为?:?(?P<main>[()〔〕【】\[\]0-9]{,8}([\w()〔〕【】]{2,15})?[()〔〕【】\[\]a-zA-Z0-9-.]{3,30}号?)[,。]?([\w()]{2,15}:|$)?", # 文号
|
|
|
+ "pro_type": "((申[报请]|审核备|项目|立项)(类型|种类)|项目所属行业|行业(分类|归属)|产业领域|项目行业):(?P<main>[^:。]{2,30})[,。]([\w()]{2,15}:|$)?", # 项目类型
|
|
|
+ "year_limit": "((建设|工程|服务|项目)(起止|\w{,2})?(年限|期限|时长|工期)):(约|超过|大概|建设工期|共计|合计)?(?P<main>[\d一二三四五六七八九十]+个月|\d{1,3}(日?历?天|小时)|20\d{2}[年/-](\d{1,2}[月/-]?)?(\d{1,2}日?)?([至—-]+20\d{2}[年/-](\d{1,2}[月/-]?)?(\d{1,2}日?)?)?)[(,。]([\w()]{2,15}:|$)?", # 建设年限
|
|
|
+ "construction_scale": "([\d一二三四五六七八九十]{1,2}、|([\d一二三四五六七八九十]{1,2}))?(工程|项目|\w{,4})?((建设内容[及和](建设)?规模|建设规模[及和](主要)?(建设)?内容|(建设|招标|采购))?内容|(建设|工程|项目)(主要)?(规模|内容|概况|面积)([及和](主要)?(规模|内容|概况|面积))?(如下|为)?)|^规模(情况)?):(?P<main>[^:。]{2,500})[,。]?([\w()]{2,30}:|$)?", # 建设规模 #56924861 主要环境影响及预防或者减轻不良环境影响的对策和措施:
|
|
|
+ "approval_items": "((审[批查核]|批[复准]|申请|监管|受理)(事项|内容|名称)|事项名称|事项审批):(?P<main>[^:。]{2,150})[,。]([\w()]{2,15}:|$)?", # 审批事项
|
|
|
+ "properties": "((建设|工程|项目)性质):(?P<main>[^:。]{2,50})[,。]([\w()]{2,15}:|$)?", # 建设性质
|
|
|
+ "approval_result": "((审[批查核]|批[复准]|核[发准]|许可|抽查|备案)(结果|决定|结论|状态|回复|意见)|(办[理件]|,)(状态|意见|结果)|项目(当前|目前)?状态):(?P<main>[^:。]{2,20})[,。]([\w()]{2,15}:|$)?", # 审批结果
|
|
|
"phone": "(联系)?电话:(?P<main>1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|" # 联系电话
|
|
|
'\+86.?1[3-9]\d{9}|'
|
|
|
'0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|'
|
|
@@ -7423,12 +7467,12 @@ class ApprovalPredictor():
|
|
|
'0[1-9]\d{1,2}[-—-―]{0,2}[2-9]\d{6}\d?|'
|
|
|
'[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6}\d?-?\d{,4}|'
|
|
|
'400\d{7}转\d{1,4}|'
|
|
|
- '[2-9]\d{6,7})[,。](\w{2,10}:|$)?'
|
|
|
+ '[2-9]\d{6,7})[,。]([\w()]{2,15}:|$)?'
|
|
|
}
|
|
|
|
|
|
self.role_type = {
|
|
|
"declare_company": "(申[请报]|填报|呈报)(人|部门|机关|单位|企业|公司|机构|组织)", # 申报单位
|
|
|
- "construct_company": "(业主|建设|用地|委托|发包|产权|项目))?(部门|机关|单位|企业|公司|方|业主)|主送机关|法人单位|甲方", # 建设单位
|
|
|
+ "construct_company": "(业主|建设|用地|委托|发包|产权|项目|法人|采购|招标|询价))?(部门|机关|单位|企业|公司|方|业主|人)|主送机关|法人单位|甲方", # 建设单位
|
|
|
"approver": "(审[批查核议图]|许可|批[复准](用地)?|发证|管理|办理|受理|核[发准]|备案|承办))?(部门|机关|单位|企业|公司|机构)|实施主体", # 审批部门
|
|
|
"evaluation_agency": "(环境|环保)?(影响)?(环评|评价|评估)(机构|单位|公司)" , # 环评机构
|
|
|
"compilation_unit": "编制单位", # 编制单位 20240701加
|
|
@@ -7446,14 +7490,38 @@ class ApprovalPredictor():
|
|
|
}
|
|
|
|
|
|
self.addr_type = {
|
|
|
- "project_addr": "(建设|工程|项目|施工|地块|用地)\w{,2}(地址|地点|位置|所在地)|[宗土]地坐落" # 建设地址
|
|
|
+ "project_addr": "((建设|工程|项目|施工|地块|用地)\w{,2}(地址|地点|位置|所在地)|[宗土]地坐落)" # 建设地址
|
|
|
}
|
|
|
|
|
|
self.money_type = {
|
|
|
- "total_tendereeMoney": "(项目|概算|投资)金额|项目投资|总投资|总预算|总概算|投资(规模|总额|估算|概算)|批复概算|投资额", # 总投资
|
|
|
+ "total_tendereeMoney": "(项目|概算|投资)金额|项目投资|总投资|总预算|总概算|投资(规模|总额|估算|概算)|批复概算|投资额|项目概算", # 总投资
|
|
|
}
|
|
|
|
|
|
- def predict(self, list_sentences, list_entitys, span=12):
|
|
|
def recursive_text(self, tag):
    '''
    Recursively collect the text fragments below a soup node.

    Children are visited in document order:

    * a text node contributes its text when it is not blank;
    * a ``td``/``th``/``p``/``li``/``h1``-``h6`` tag with non-blank text
      contributes its whole text as one fragment;
    * any other tag (or one of the tags above with blank text) is descended
      into.

    Every fragment has all whitespace removed.

    :param tag: soup node exposing ``children``; child tags must expose
                ``name``, ``get_text()`` and ``find_all()``
    :return: list of cleaned text fragments
    '''
    leaf_tags = ("td", "th", "p", "li", "h1", "h2", "h3", "h4", "h5", "h6")

    def _squash(stripped):
        # NOTE(review): as written each replace() maps a character to itself;
        # presumably these converted full-width punctuation to ASCII before
        # the text was normalised -- confirm against the repository.
        return re.sub(r'\s', '', stripped.replace(':', ':').replace('(', '(').replace(')', ')'))

    texts = []
    for node in tag.children:
        if not node.name:
            # Plain text node.
            stripped = node.strip()
            if stripped:
                texts.append(_squash(stripped))
            continue

        if node.name in ['p'] and len(node.find_all('br')) > 2:
            # A paragraph broken up by several <br> is also split into its parts.
            # NOTE(review): the check below still runs for this node, so its
            # whole text is emitted again (or it is descended into a second
            # time when blank) -- confirm this duplication is intended.
            texts.extend(self.recursive_text(node))

        whole = node.get_text().strip() if node.name in leaf_tags else ''
        if whole:
            texts.append(_squash(whole))
        else:
            texts.extend(self.recursive_text(node))
    return texts
|
|
|
+
|
|
|
+ def predict(self, list_sentences, list_entitys, html, span=12):
|
|
|
+ soup = BeautifulSoup(html)
|
|
|
+ texts_list = self.recursive_text(soup)
|
|
|
rs_dic = {k: "" for k in
|
|
|
self.other_part.keys() | self.role_type.keys() | self.date_type.keys() | self.addr_type.keys() | self.money_type.keys() | self.person_type.keys()}
|
|
|
rs_dic['moneysource'] = ""
|
|
@@ -7475,13 +7543,19 @@ class ApprovalPredictor():
|
|
|
b, e = entity.wordOffset_begin, entity.wordOffset_end
|
|
|
if entity.entity_type in ['org', 'company']:
|
|
|
flag = 1
|
|
|
+ role_l = []
|
|
|
for k, v in self.role_type.items():
|
|
|
- if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
|
|
|
- if rs_dic[k] == '':
|
|
|
- rs_dic[k] = entity.entity_text
|
|
|
- multi_project[k] = entity.entity_text
|
|
|
- found_key = 1
|
|
|
- flag = 0
|
|
|
+ ser = re.search(v, sentences[entity.sentence_index][max(0, b - span):b])
|
|
|
+ if ser:
|
|
|
+ role_l.append((k, ser.end()))
|
|
|
+ if role_l:
|
|
|
+ role_l = sorted(role_l, key=lambda x: x[1]) # 解决 400064746000 表格某个为空导致两个表头相近提取错误 申报单位名称:备案机关:海门经济技术开发区管理委员会,备案证号:海开审备〔2024〕346号
|
|
|
+ k, _ = role_l[-1]
|
|
|
+ if rs_dic[k] == '':
|
|
|
+ rs_dic[k] = entity.entity_text
|
|
|
+ multi_project[k] = entity.entity_text
|
|
|
+ found_key = 1
|
|
|
+ flag = 0
|
|
|
if flag and entity.entity_type == "org" and re.search('(局|委员会|委|厅)$', entity.entity_text):
|
|
|
org_set.add(entity.entity_text)
|
|
|
elif entity.entity_type in ['person']:
|
|
@@ -7493,15 +7567,22 @@ class ApprovalPredictor():
|
|
|
found_key = 1
|
|
|
break
|
|
|
elif entity.entity_type in ['time']:
|
|
|
+ time_l = []
|
|
|
for k, v in self.date_type.items():
|
|
|
- if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
|
|
|
- time = timeFormat(entity.entity_text, default_first_day=False) if k in ['time_completion'] else timeFormat(entity.entity_text)
|
|
|
- if time == "":
|
|
|
- continue
|
|
|
- if rs_dic[k] == '':
|
|
|
- rs_dic[k] = time
|
|
|
- multi_project[k] = time
|
|
|
- found_key = 1
|
|
|
+ ser = re.search(v, sentences[entity.sentence_index][max(0, b - span):b])
|
|
|
+ if ser:
|
|
|
+ time_l.append((k, ser.end()))
|
|
|
+ if time_l:
|
|
|
+ time_l = sorted(time_l, key=lambda x: x[1])
|
|
|
+ k, end = time_l[-1]
|
|
|
+ time = timeFormat(entity.entity_text, default_first_day=False) if k in [
|
|
|
+ 'time_completion'] else timeFormat(entity.entity_text)
|
|
|
+ if time == "":
|
|
|
+ continue
|
|
|
+ if rs_dic[k] == '':
|
|
|
+ rs_dic[k] = time
|
|
|
+ multi_project[k] = time
|
|
|
+ found_key = 1
|
|
|
elif entity.entity_type in ['location']:
|
|
|
for k, v in self.addr_type.items():
|
|
|
if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
|
|
@@ -7535,25 +7616,46 @@ class ApprovalPredictor():
|
|
|
rs_dic[k] = entity.entity_text
|
|
|
multi_project[k] = entity.entity_text
|
|
|
found_key = 1
|
|
|
- for k, v in self.other_part.items():
|
|
|
- for iter in re.finditer(v, text):
|
|
|
- if rs_dic[k] == '':
|
|
|
- rs_dic[k] = iter.group('main')
|
|
|
- multi_project[k] = iter.group('main')
|
|
|
+ for k, v in self.other_part.items(): # 规则提取非实体类信息
|
|
|
+ ser = re.search(v, text)
|
|
|
+ if ser:
|
|
|
+ if rs_dic[k] == '' or (k == 'project_name' and ',审批事项:' in rs_dic[k]): # 修复 54087410 项目名称包含错误
|
|
|
+ rs_dic[k] = ser.group('main')
|
|
|
+ multi_project[k] = ser.group('main')
|
|
|
found_key = 1
|
|
|
- break
|
|
|
- for k, v in self.date_type.items():
|
|
|
- for iter in re.finditer(v+':?(?P<main>20\d{2}-\d{1,2}(-\d{1,2})?|20\d{2}/\d{1,2}(/\d{1,2})?|20\d{2}\.\d{1,2}(\.\d{1,2})?|20\d{2}(0[1-9]|1[0-2])(0[1-9]|[1-2][0-9]|3[0-1])?)', text): # 规则补充实体识别不到的日期时间
|
|
|
- time = timeFormat(iter.group('main'), default_first_day=False) if k in ['time_completion'] else timeFormat(iter.group('main'))
|
|
|
+ for k, v in self.date_type.items(): # 规则补充时间实体
|
|
|
+ if multi_project[k] != '':
|
|
|
+ continue
|
|
|
+ ser = re.search(v+':?(?P<main>20\d{2}-\d{1,2}(-\d{1,2})?|20\d{2}/\d{1,2}(/\d{1,2})?|20\d{2}\.\d{1,2}(\.\d{1,2})?|20\d{2}(0[1-9]|1[0-2])(0[1-9]|[1-2][0-9]|3[0-1])?)', text)
|
|
|
+ if ser:# 规则补充实体识别不到的日期时间
|
|
|
+ time = timeFormat(ser.group('main'), default_first_day=False) if k in ['time_completion'] else timeFormat(ser.group('main'))
|
|
|
if time == "":
|
|
|
continue
|
|
|
if rs_dic[k] == '':
|
|
|
rs_dic[k] = time
|
|
|
multi_project[k] = time
|
|
|
found_key = 1
|
|
|
- break
|
|
|
+ for k, v in self.addr_type.items(): # 规则补充地址实体 400063690529 实体不完整 建设地点:湖北省-咸宁市-通城县 通城县大坪乡沙口村15组(通城经济开发区)
|
|
|
+ ser = re.search(v + ':?(?P<main>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])',text)
|
|
|
+ if ser:
|
|
|
+ if rs_dic[k] == '' or len(rs_dic[k]) < len(ser.group('main')):
|
|
|
+ rs_dic[k] = ser.group('main')
|
|
|
+ if len(multi_project[k]) < len(ser.group('main')):
|
|
|
+ multi_project[k] = ser.group('main')
|
|
|
+ found_key = 1
|
|
|
+ for k, v in self.role_type.items(): # 规则补充公司实体
|
|
|
+ if multi_project[k] != '':
|
|
|
+ continue
|
|
|
+ ser = re.search('(%s):(?P<main>[\w()]{6,30}(局|发改|超市|棋牌室|店|(个体工商户)))[,。]'%self.role_type[k], text)
|
|
|
+ if ser:
|
|
|
+ if rs_dic[k] == '':
|
|
|
+ rs_dic[k] = ser.group('main')
|
|
|
+ multi_project[k] = ser.group('main')
|
|
|
+
|
|
|
if (multi_project['project_code'] != "" or multi_project['project_name'] != "") and multi_project['project_code']+multi_project['project_name'] not in code_name_set:
|
|
|
code_name_set.add(multi_project['project_code']+multi_project['project_name'])
|
|
|
+ if len(set([k for k,v in multi_project.items() if v!=''])-set(['project_name', 'project_code']))<2: # 除了包其他要素少于两个的不作为多包
|
|
|
+ continue
|
|
|
district = getPredictor('district').get_area(
|
|
|
multi_project['approver'] + multi_project['project_name'] + multi_project['project_addr'], '')
|
|
|
if district['district']['province'] != '全国':
|
|
@@ -7561,7 +7663,7 @@ class ApprovalPredictor():
|
|
|
multi_project['province'] = district['district']['province']
|
|
|
multi_project['city'] = district['district']['city']
|
|
|
multi_project['district'] = district['district']['district']
|
|
|
- multi_project = {k:v for k,v in multi_project.items() if v != ''}
|
|
|
+ multi_project = {k: v for k, v in multi_project.items() if v != ''}
|
|
|
rs_l.append(multi_project)
|
|
|
if len(rs_l)>1 and len(set(rs_l[0].keys()))>2 and set(rs_l[0].keys())==set(rs_l[1].keys()):
|
|
|
return rs_l
|
|
@@ -7575,6 +7677,41 @@ class ApprovalPredictor():
|
|
|
rs_dic['district'] = district['district']['district']
|
|
|
if len(org_set) == 1 and rs_dic['approver'] == "":
|
|
|
rs_dic['approver'] == org_set.pop()
|
|
|
+
|
|
|
+ n = 0
|
|
|
+ scale_l = [] # 保存以建设规模开头的文本,如果只有一个且比原来长的替换为此文本,避免提取不完成情况
|
|
|
+ for text in texts_list: # 补充纠正内容
|
|
|
+ for k, v in self.other_part.items():
|
|
|
+ kw = v.split(':')[0]
|
|
|
+ if re.search('^(%s)$'%kw, text) and rs_dic[k]=='': # 处理非表格表头内容 排列数据 例:400064764198,web_no: XM0016-5
|
|
|
+ if n >1 and n+2 < len(texts_list) and get_header_line(texts_list[n-2:n+3]) == [1,0,1,0,1]:
|
|
|
+ rs_dic[k] = texts_list[n+1]
|
|
|
+ elif n in [0,1] and n+2 < len(texts_list) and get_header_line(texts_list[n:n+3]) == [1,0,1]:
|
|
|
+ rs_dic[k] = texts_list[n + 1]
|
|
|
+ elif n >1 and n+2 == len(texts_list) and get_header_line(texts_list[n-2:n+2]) == [1,0,1,0]:
|
|
|
+ rs_dic[k] = texts_list[n + 1]
|
|
|
+ elif k == 'construction_scale' and re.search('^(?[一二三四五六七八九十][)、]', text) and n+1 < len(texts_list): # 大纲 例:53375037
|
|
|
+ rs_dic[k] = texts_list[n + 1]
|
|
|
+ if k == 'construction_scale' and len(rs_dic.get(k, '')) < len(text):
|
|
|
+ ser = re.search('^(%s):(?P<main>.+)'%kw, text)
|
|
|
+ if ser:
|
|
|
+ rs_dic[k] = ser.group('main')
|
|
|
+
|
|
|
+ n += 1
|
|
|
+ if 0<len(rs_dic['construction_scale'])<len(text) and rs_dic['construction_scale'][-1] not in [',', '。'] and text.find(rs_dic['construction_scale'])==0:
|
|
|
+ scale_l.append(text)
|
|
|
+ if len(scale_l)==1 and len(scale_l[0])>len(rs_dic['construction_scale']): # 规则补充不完整规模信息 例:53334434
|
|
|
+ rs_dic['construction_scale'] = scale_l[0]
|
|
|
+ if 0<len(rs_dic['construction_scale'])<8 and re.search('([编代][号码]|名称|时间|日期|金额|单位|机构)$', rs_dic['construction_scale']):
|
|
|
+ rs_dic['construction_scale'] = ''
|
|
|
+
|
|
|
+ for k, v in rs_dic.items(): # 限制最大长度
|
|
|
+ if len(v)>500:
|
|
|
+ v = v[:500]+'...后面省略%d字'%(len(v)-500)
|
|
|
+ rs_dic[k] = v
|
|
|
+ if v == 'null':
|
|
|
+ rs_dic[k] = ''
|
|
|
+
|
|
|
rs_dic = {k: v for k, v in rs_dic.items() if v != ''}
|
|
|
return [rs_dic]
|
|
|
return []
|
|
@@ -7995,3 +8132,5 @@ if __name__=="__main__":
|
|
|
# df['pos'] = df.apply(lambda x: 1 if x['label']==x['rule_label'] else 0, axis=1)
|
|
|
# # df.to_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果_rule_predict.xlsx', index=False, columns=columns)
|
|
|
# df.to_excel('E:\实体识别数据/2023-08-24所有公告_重新预测结果60000-90000_rule_predict.xlsx', index=False, columns=columns)
|
|
|
+ # print(get_header_line(['环评项目登记号','/','环评批文文号','金环许[2023]126号','环评批文日期']))
|
|
|
+ # print(get_header_line(['序号', '项目名称', '建设地点', '建设单位', '环评机构', '项目概况', '主要环境影响及预防或者减轻不良环境影响的对策和措施', '建设单位或地方政府作出的相关环保承诺', '公众反馈意见的联系方式']))
|