|
@@ -846,7 +846,10 @@ class PREMPredict():
|
|
|
elif re.search('第一候补|第一后备|备选', front):
|
|
|
label = 3
|
|
|
values[label] = 0.6
|
|
|
- elif re.search('放弃中标资格$|是否中标:否|^(中标|成交)(公示|公告)', behind):
|
|
|
+ elif re.search('^放弃中标资格|是否中标:否|^(中标|成交)(公示|公告)', behind):
|
|
|
+ values[2] = 0.5
|
|
|
+ label = 5
|
|
|
+ elif re.search('^,?(投标报价|(资格性审查:|符合性审查:)?(不通过|不符合))', behind) and re.search('中标|成交|中选|排名|排序|名次|第[一1]名', front)==None:
|
|
|
values[2] = 0.5
|
|
|
label = 5
|
|
|
elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单):$|确定为标的的受让方,$|[主次出]入口?,?$|确定(项目|\w{,2})成交供应商,$', front): # 234501112 民币元,序号:1,债务人: 东营市海宁工贸有限责任公司 ,债权本金: 262414286 八、中标后签约单位,合同签约单位:
|
|
@@ -954,7 +957,7 @@ class PREMPredict():
|
|
|
values[label] = 0.49
|
|
|
elif re.search('^(以[上下])?按[\d.%]+收取|^以[上下]|^[()]?[+×*-][\d.%]+', behind):
|
|
|
values[label] = 0.49
|
|
|
- elif re.search('(含|在|包括|[大小等高低]于)$|[\d.%]+[+×*-]$', front):
|
|
|
+ elif re.search('(含|在|包括|[大小等高低]于|达到)$|[\d.%]+[+×*-]$', front):
|
|
|
values[label] = 0.49
|
|
|
elif entity.notes == '单价' and float(entity.entity_text)<5000:
|
|
|
label = 2
|
|
@@ -1515,6 +1518,8 @@ class RoleRulePredictor():
|
|
|
_label = 5
|
|
|
elif _label == 2 and re.search('评委|未中标', after[:5]): # 397194341 过滤掉错误召回中标人
|
|
|
_label = 5
|
|
|
+ elif _label == 2 and re.search('^,?(投标报价|(资格性审查:|符合性审查:)?(不通过|不符合))', after) and re.search('中标|成交|中选|排名|排序|名次|第[一1]名', before[-10:])==None: #20240705 处理类似 493939047 错误
|
|
|
+ _label = 5
|
|
|
if _label == 5:
|
|
|
_label, _prob, keyword = self.ser_role(self.pattern_whole, before + center + after, entity_text) # 前后文匹配
|
|
|
keyword = 'whole_'+ keyword[:keyword.find(entity_text)] if keyword!="" else keyword
|
|
@@ -4453,7 +4458,8 @@ class DocChannel():
|
|
|
11、预测预告,原始为意向、招标且标题无预告关键词,返回原始类别
|
|
|
'''
|
|
|
if result['docchannel']['docchannel'] in ['中标信息', '合同公告'] and origin_dic.get(
|
|
|
- original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(prem_json)==False:
|
|
|
+ original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(
|
|
|
+ prem_json)==False and re.search(self.title_life_dic['中标信息'], title)==None:
|
|
|
result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
|
|
|
msc += '最终规则修改:中标公告、合同公告无中标人且原始为非中标,返回原类型'
|
|
|
elif result['docchannel']['docchannel'] == '废标公告' and is_contain_winner(prem_json) and re.search(
|
|
@@ -5888,15 +5894,54 @@ class DistrictPredictor():
|
|
|
name, b, e = it
|
|
|
area_list.append((name, (e - b + e) / max_len / 2))
|
|
|
return area_list
|
|
|
+
|
|
|
def find_whole_areas(text):
    """
    Collect province / city / district mentions from ``text`` in a single regex pass.

    The combined pattern is assembled from ``p_pro``, ``p_city`` and ``p_dis``
    (regex alternations such as ``广东省|广西省|...``), which are not parameters
    and must be resolvable from the surrounding scope.  It tries, in order:
    province [+ city] [+ district], then city [+ district], then a lone
    district.  Matching the levels together keeps a string like 海南昌江 from
    being read as 海南 + 南昌.

    :param text: text to scan
    :return: ``(province_l, city_l, district_l)``; every item is a
             ``(name, start, end)`` tuple with offsets into ``text``
    """
    pattern = (
        "((?P<prov>%s)(?P<city>%s)?(?P<dist>%s)?)"
        "|((?P<city1>%s)(?P<dist1>%s)?)"
        "|(?P<dist2>%s)"
    ) % (p_pro, p_city, p_dis, p_city, p_dis, p_dis)
    province_l, city_l, district_l = [], [], []
    for it in re.finditer(pattern, text):
        whole = it.group(0)
        # A match without an administrative suffix that is directly followed by
        # a village/street/road/hotel word is treated as part of another name.
        if re.search('[省市区县旗盟]$', whole) is None and re.search(
                '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆)', text[it.end():]):
            continue
        # 20240314: e.g. 中铁二局新建沪苏湖铁路工程站前VI标项目 was wrongly resolved
        # to 辽宁 / 营口 / 站前, so a bare 站前 match is dropped.
        if whole == '站前':
            continue
        for k, v in it.groupdict().items():
            if v is None:
                continue
            item = (v, it.start(k), it.end(k))
            if k in ['prov']:
                province_l.append(item)
            elif k in ['city', 'city1']:
                tail = text[it.end(k):]
                # A city name followed by "development zone / new area" names a
                # zone, not the city (e.g. 滨州北海经济开发区, 北海新区 -> not 北海).
                if re.search('^(经济开发区|开发区|新区)', tail):
                    continue
                city_l.append(item)
                # Appended a second time when followed by a branch / section /
                # station / project word, so this city appears twice in the list.
                if re.search(r'^([分支](公司|局|行|校|院|干?线)|\w{,3}段|地铁|(火车|高铁)?站|\w{,3}项目)', tail):
                    city_l.append(item)
            elif k in ['dist', 'dist1', 'dist2']:
                # 昌江 is reported as 昌江黎族 unless the match itself contains 景德镇.
                if v == '昌江' and '景德镇' not in whole:
                    district_l.append(('昌江黎族', it.start(k), it.end(k)))
                else:
                    district_l.append(item)
    return province_l, city_l, district_l
|
|
|
+
|
|
|
def get_pro_city_dis_score(text, text_weight=1):
|
|
|
text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾', ' ', text)
|
|
|
text = re.sub('珠海城市', '珠海', text) # 修复 426624023 珠海城市 预测为海城市
|
|
|
text = re.sub('怒江州', '怒江傈僳族自治州', text) # 修复 423589589 所属地域:怒江州 识别为广西 - 崇左 - 江州
|
|
|
text = re.sub('茂名滨海新区', '茂名市', text)
|
|
|
text = re.sub('中山([东南西][部区环]|黄圃|南头|东凤|小榄|石岐|翠亨|南朗)', '中山市', text)
|
|
|
- province_l = find_areas(p_pro, text)
|
|
|
- city_l = find_areas(p_city, text)
|
|
|
- district_l = find_areas(p_dis, text)
|
|
|
+ ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
|
|
|
+ if ser and '黎族' not in ser.group(0):
|
|
|
+ text = text.replace(ser.group(0), ser.group(0)+'黎族')
|
|
|
+ # province_l = find_areas(p_pro, text)
|
|
|
+ # city_l = find_areas(p_city, text)
|
|
|
+ # district_l = find_areas(p_dis, text)
|
|
|
+
|
|
|
+ province_l, city_l, district_l = find_whole_areas(text) # 20240703 优化地址提取,解决类似 海南昌江 得到 海南 南昌 结果
|
|
|
|
|
|
if len(province_l) == len(city_l) == 0:
|
|
|
district_l = [it for it in district_l if
|
|
@@ -6076,8 +6121,11 @@ class DistrictPredictor():
|
|
|
|
|
|
def get_project_addr(text):
    """
    Return the project address stated in ``text``, or ``''`` when none is found.

    Two patterns are tried in order and the first hit wins:

    1. a labelled field such as ``项目地址:`` / ``施工地点:`` followed either by
       one or more segments ending in an administrative suffix, or by 2-15
       word characters terminated by ``,`` or ``。``;
    2. the phrase ``项目位于`` followed by ``XX市XX区``.

    :param text: text to search
    :return: content of the ``addr`` group of the first matching pattern
    """
    p1 = r'(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)(位于)?:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
    p2 = r'项目位于(?P<addr>\w{2}市\w{2,4}区)'
    for pattern in (p1, p2):
        # Search once per pattern and reuse the match object.
        hit = re.search(pattern, text)
        if hit:
            return hit.group('addr')
    return ''
|
|
|
|
|
@@ -7163,15 +7211,15 @@ class ApprovalPredictor():
|
|
|
项目(法人)单位
|
|
|
'''
|
|
|
self.other_part = {
|
|
|
- "project_name": "(项目|工程|采购|招标|计划)名称?:(?P<main>[^:。]{5,50})[,。](\w{2,10}:|$)?", # 项目名称
|
|
|
- "project_code": "(立案号|项目(统一)?代码|(项目|工程|采购|招标|计划|任务|备案)(编[号码]|号)):(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-]{5,30}号?)(\w{2,10}:|$)?", # 项目编号
|
|
|
- "doc_num": "((审[批查核]|批[复准]|立项|[定知]书|[公发批]文|用地|决定|备案|核准|许可|确认)[文编]?号|综合受理号|文书号):(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-.]{5,30}号?)[,。]?(\w{2,10}:|$)?", # 文号
|
|
|
- "pro_type": "(申[报请](类型|种类)|项目所属行业|行业(分类|归属)|产业领域|项目行业|项目类型|立项类型):(?P<main>[^:。]{2,30})[,。](\w{2,10}:|$)?", # 项目类型
|
|
|
- "year_limit": "((建设|工程|服务|项目)(年限|期限|时长)):(?P<main>[\d个年月日.-]{2,20})[,。](\w{2,10}:|$)?", # 建设年限
|
|
|
- "construction_scale": "(建设内容[及和](建设)?规模|建设规模[及和](主要)?(建设)?内容|(建设|工程|项目)规模(如下)?):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?", # 建设规模
|
|
|
- "approval_items": "((审[批查核]|批[复准]申请)(事项|内容)|事项名称|事项审批):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?", # 审批事项
|
|
|
+ "project_name": "((项目|工程|采购|招标|计划|建设|规划)名称?|生产建设项目|申请项目):(?P<main>[^:。]{5,50})[,。](\w{2,10}:|$)?", # 项目名称
|
|
|
+ "project_code": "(立案号|项目(统一)?代码|(项目|工程|采购|招标|计划|任务|备案|索引)(编[号码]|号)):?(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-]{5,30}号?)(\w{2,10}:|$)?", # 项目编号
|
|
|
+ "doc_num": "((审[批查核]|批[复准]|立项|[定知]书|[公发批]文|用地|决定|备案|核准|许可|确认|受理|申请报告|文件|意见书|办件)[文编]?号|综合受理号|文书?号|合格书号):?(?P<main>(\w{2,8})?[()〔〕【】\[\]a-zA-Z0-9-.]{5,30}号?)[,。]?(\w{2,10}:|$)?", # 文号
|
|
|
+ "pro_type": "((申[报请]|审核备|项目|立项)(类型|种类)|项目所属行业|行业(分类|归属)|产业领域|项目行业):(?P<main>[^:。]{2,30})[,。](\w{2,10}:|$)?", # 项目类型
|
|
|
+ "year_limit": "((建设|工程|服务|项目)(起止|\w{,2})?(年限|期限|时长|工期)):(约|超过|大概|建设工期|共计|合计)?(?P<main>[\d一二三四五六七八九十]+个月|\d{1,3}(日?历?天|小时)|20\d{2}[年/-](\d{1,2}[月/-]?)?(\d{1,2}日?)?([至—-]+20\d{2}[年/-](\d{1,2}[月/-]?)?(\d{1,2}日?)?)?)[(,。](\w{2,10}:|$)?", # 建设年限
|
|
|
+ "construction_scale": "(建设内容[及和](建设)?规模|建设规模[及和](主要)?(建设)?内容|(建设|招标|采购))?内容|(建设|工程|项目)(主要)?(规模|内容|概况|面积)([及和](主要)?(规模|内容|概况|面积))?(如下)?):(?P<main>[^:。]{2,250})[,。](\w{2,10}:|$)?", # 建设规模
|
|
|
+ "approval_items": "((审[批查核]|批[复准]|申请|监管)(事项|内容|名称)|事项名称|事项审批):(?P<main>[^:。]{2,70})[,。](\w{2,10}:|$)?", # 审批事项
|
|
|
"properties": "((建设|工程|项目)性质):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?", # 建设性质
|
|
|
- "approval_result": "((审[批查核]|批[复准])(结果|决定|结论|状态|回复)|(办理|,)(状态|意见|结果)):(?P<main>[^:。]{2,50})[,。](\w{2,10}:|$)?", # 审批结果
|
|
|
+ "approval_result": "((审[批查核]|批[复准]|核[发准]|许可|抽查|备案)(结果|决定|结论|状态|回复|意见)|(办[理件]|,)(状态|意见|结果)|项目(当前|目前)?状态):(?P<main>[^:。]{2,20})[,。](\w{2,10}:|$)?", # 审批结果
|
|
|
"phone": "(联系)?电话:(?P<main>1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|" # 联系电话
|
|
|
'\+86.?1[3-9]\d{9}|'
|
|
|
'0[1-9]\d{1,2}[-—-―][2-9]\d{6}\d?[-—-―]\d{1,4}|'
|
|
@@ -7185,22 +7233,26 @@ class ApprovalPredictor():
|
|
|
}
|
|
|
|
|
|
self.role_type = {
|
|
|
- "declare_company": "(申[请报]|填报|呈报)(部门|机关|单位|企业|公司|机构|组织)", # 申报单位
|
|
|
- "construct_company": "(业主|建设|用地|委托|发包|产权|项目))?(部门|机关|单位|企业|公司|方)|主送机关|法人单位|甲方", # 建设单位
|
|
|
- "approver": "(审[批查核]|许可|批准|发证|批复|管理)(部门|机关|单位|企业|公司|机构)", # 审批部门
|
|
|
- "evaluation_agency": "(环境|环保)?(影响)?(环评|评价|评估)(机构|单位|公司)" # 环评机构
|
|
|
+ "declare_company": "(申[请报]|填报|呈报)(人|部门|机关|单位|企业|公司|机构|组织)", # 申报单位
|
|
|
+ "construct_company": "(业主|建设|用地|委托|发包|产权|项目))?(部门|机关|单位|企业|公司|方|业主)|主送机关|法人单位|甲方", # 建设单位
|
|
|
+ "approver": "(审[批查核议图]|许可|批[复准](用地)?|发证|管理|办理|受理|核[发准]|备案|承办)(部门|机关|单位|企业|公司|机构)|实施主体", # 审批部门
|
|
|
+ "evaluation_agency": "(环境|环保)?(影响)?(环评|评价|评估)(机构|单位|公司)" , # 环评机构
|
|
|
+ "compilation_unit": "编制单位", # 编制单位 20240701加
|
|
|
+ "publisher": "(发布|发文|公示|公告)(人|部门|机关|单位|企业|公司|机构|组织)" # 发布机构 20240703加
|
|
|
}
|
|
|
self.person_type = {
|
|
|
"legal_person": "项目法人|法定代表人|企业法人" # 项目法人
|
|
|
}
|
|
|
self.date_type = {
|
|
|
"time_declare": "(申[请报]|填报|呈报)(时间|日期)", # 申报时间
|
|
|
- "time_commencement": "(开工|动工|施工开始)(时间|日期)", # 开工时间
|
|
|
- "time_completion": "(竣工|完工|验收|(项目|建设|工程)(完成|结束))(备案)?(时间|日期)" # 竣工时间
|
|
|
+ "time_commencement": "(开工|动工|(项目|建设|工程|施工)开始)(时间|日期)", # 开工时间
|
|
|
+ "time_completion": "(竣工|完工|验收|(项目|建设|工程|施工)(完成|结束))(备案)?(时间|日期)", # 竣工时间
|
|
|
+ "time_approval": "(审[批查核查议]|许可|批[复准](用地)?|发证|管理|办理|受理|核[发准]|备案|决定)(时间|日期)", # 审批时间 20240701加
|
|
|
+ "time_release": "(发布|发文|公告|生成|成文)(时间|日期)" # 发布时间
|
|
|
}
|
|
|
|
|
|
self.addr_type = {
|
|
|
- "project_addr": "(建设|工程|项目|施工)(地址|地点|位置|所在地)|[宗土]地坐落|用地位置" # 建设地址
|
|
|
+ "project_addr": "(建设|工程|项目|施工|地块|用地)\w{,2}(地址|地点|位置|所在地)|[宗土]地坐落" # 建设地址
|
|
|
}
|
|
|
|
|
|
self.money_type = {
|
|
@@ -7216,6 +7268,7 @@ class ApprovalPredictor():
|
|
|
rs_l = []
|
|
|
found_key = 0
|
|
|
code_name_set = set() # 项目编号、名称集合
|
|
|
+ org_set = set() # 保存可能为审批部门的角色
|
|
|
for entity in list_entitys[0]:
|
|
|
entities[entity.sentence_index].append(entity)
|
|
|
|
|
@@ -7227,12 +7280,16 @@ class ApprovalPredictor():
|
|
|
for entity in entities[i]:
|
|
|
b, e = entity.wordOffset_begin, entity.wordOffset_end
|
|
|
if entity.entity_type in ['org', 'company']:
|
|
|
+ flag = 1
|
|
|
for k, v in self.role_type.items():
|
|
|
if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
|
|
|
if rs_dic[k] == '':
|
|
|
rs_dic[k] = entity.entity_text
|
|
|
multi_project[k] = entity.entity_text
|
|
|
found_key = 1
|
|
|
+ flag = 0
|
|
|
+ if flag and entity.entity_type == "org" and re.search('(局|委员会|委|厅)$', entity.entity_text):
|
|
|
+ org_set.add(entity.entity_text)
|
|
|
elif entity.entity_type in ['person']:
|
|
|
for k, v in self.person_type.items():
|
|
|
if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
|
|
@@ -7244,9 +7301,12 @@ class ApprovalPredictor():
|
|
|
elif entity.entity_type in ['time']:
|
|
|
for k, v in self.date_type.items():
|
|
|
if re.search(v, sentences[entity.sentence_index][max(0, b - span):b]):
|
|
|
+ time = timeFormat(entity.entity_text, default_first_day=False) if k in ['time_completion'] else timeFormat(entity.entity_text)
|
|
|
+ if time == "":
|
|
|
+ continue
|
|
|
if rs_dic[k] == '':
|
|
|
- rs_dic[k] = entity.entity_text
|
|
|
- multi_project[k] = entity.entity_text
|
|
|
+ rs_dic[k] = time
|
|
|
+ multi_project[k] = time
|
|
|
found_key = 1
|
|
|
elif entity.entity_type in ['location']:
|
|
|
for k, v in self.addr_type.items():
|
|
@@ -7288,6 +7348,16 @@ class ApprovalPredictor():
|
|
|
multi_project[k] = iter.group('main')
|
|
|
found_key = 1
|
|
|
break
|
|
|
+ for k, v in self.date_type.items():
|
|
|
+ for iter in re.finditer(v+':?(?P<main>20\d{2}-\d{1,2}(-\d{1,2})?|20\d{2}/\d{1,2}(/\d{1,2})?|20\d{2}\.\d{1,2}(\.\d{1,2})?|20\d{2}(0[1-9]|1[0-2])(0[1-9]|[1-2][0-9]|3[0-1])?)', text): # 规则补充实体识别不到的日期时间
|
|
|
+ time = timeFormat(iter.group('main'), default_first_day=False) if k in ['time_completion'] else timeFormat(iter.group('main'))
|
|
|
+ if time == "":
|
|
|
+ continue
|
|
|
+ if rs_dic[k] == '':
|
|
|
+ rs_dic[k] = time
|
|
|
+ multi_project[k] = time
|
|
|
+ found_key = 1
|
|
|
+ break
|
|
|
if (multi_project['project_code'] != "" or multi_project['project_name'] != "") and multi_project['project_code']+multi_project['project_name'] not in code_name_set:
|
|
|
code_name_set.add(multi_project['project_code']+multi_project['project_name'])
|
|
|
district = getPredictor('district').get_area(
|
|
@@ -7309,6 +7379,8 @@ class ApprovalPredictor():
|
|
|
rs_dic['province'] = district['district']['province']
|
|
|
rs_dic['city'] = district['district']['city']
|
|
|
rs_dic['district'] = district['district']['district']
|
|
|
# Fallback: when no approver was found by keyword and exactly one candidate
# organisation was collected in org_set, use that organisation as the approver.
# The original line used `==` here, which only compared and discarded the
# result, so the fallback value was never stored.
if len(org_set) == 1 and rs_dic['approver'] == "":
    rs_dic['approver'] = org_set.pop()
|
|
|
rs_dic = {k: v for k, v in rs_dic.items() if v != ''}
|
|
|
return [rs_dic]
|
|
|
return []
|