|
@@ -806,7 +806,7 @@ class PREMPredict():
|
|
|
elif re.search('(发布(人|方|单位|机构|组织|用户|业主|主体|部门|公司|企业)|组织(单位|人|方|机构)?|(采购|招标|发布)机构)(名称)?[是为:]+', front) and is_agency(entity.entity_text):
|
|
|
label = 1
|
|
|
values[label] = 0.501
|
|
|
- elif re.search('采用$|异议受理部门', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统-
|
|
|
+ elif re.search('采用$|异议受理部门|本次招标有:$', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统- 标公告,本次招标有:内黄县汇融钢材有限公司、安阳正元建筑工程有限公司、内黄县鸿业贸易有限责任公司三家合格供应商进行报名投标。
|
|
|
label = 5
|
|
|
elif re.search(',单位名称:$', front) and re.search('^,(中标|中选)价格', behind):
|
|
|
label = 2
|
|
@@ -858,7 +858,7 @@ class PREMPredict():
|
|
|
elif re.search('发布机构', front) and not is_agency(entity.entity_text):
|
|
|
label = 0
|
|
|
values[label] = 0.501
|
|
|
- elif re.search('开户银行:$', front): # 368214232 法定代表人:委托代理人:开户银行:鸡东建行
|
|
|
+ elif re.search('开户银行:$|环境影响评价机构|环评机构|评价机构', front): # 368214232 法定代表人:委托代理人:开户银行:鸡东建行
|
|
|
label = 5
|
|
|
elif re.search('委托$', front) and re.search('^(抽样|送检|看样)', behind):
|
|
|
label = 5
|
|
@@ -1606,8 +1606,8 @@ class RoleRulePredictor():
|
|
|
find_flag = True
|
|
|
_label = 0
|
|
|
p_entity.label = _label
|
|
|
- p_entity.values[int(_label)] = on_value
|
|
|
- if 6<len(p_entity.entity_text) < 20: # 标题中角色长度在一定范围内的加分 优化类似367720967 标题中两个实体选择错误问题
|
|
|
+ p_entity.values[int(_label)] = on_value + p_entity.values[int(_label)] / 10
|
|
|
+ if 6<len(p_entity.entity_text) < 20 and p_entity.entity_type == 'org': # 标题中角色长度在一定范围内的加分 优化类似367720967 标题中两个实体选择错误问题
|
|
|
p_entity.values[int(_label)] += 0.005
|
|
|
break
|
|
|
if p_entity.sentence_index >= 4:
|
|
@@ -2244,7 +2244,7 @@ class RoleGrade():
|
|
|
entity.label = 0 if entity.entity_type == 'org' else 2
|
|
|
entity.values[entity.label] = 0.55
|
|
|
continue
|
|
|
- elif re.search('(采购|招标)人(?或(采购|招标)?代理机构)?:$', text[max(0, b-span):b]):
|
|
|
+ elif re.search('(采购|招标)人(?或其?(采购|招标)?代理机构)?', text[max(0, b-span-2):b]): # 修复 275206588 招标人或其招标代理机构:(盖章)
|
|
|
entity.label = 1 if is_agency(entity.entity_text) else 0
|
|
|
entity.values[entity.label] = 0.8
|
|
|
continue
|
|
@@ -5679,7 +5679,7 @@ class DistrictPredictor():
|
|
|
return addr
|
|
|
|
|
|
def get_pro_city_dis_score(text, text_weight=1):
|
|
|
- text = re.sub('复合肥|海南岛|兴业银行|双河口', '', text)
|
|
|
+ text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光', ' ', text)
|
|
|
text = re.sub('珠海城市', '珠海', text) # 修复 426624023 珠海城市 预测为海城市
|
|
|
text = re.sub('怒江州', '怒江傈僳族自治州', text) # 修复 423589589 所属地域:怒江州 识别为广西 - 崇左 - 江州
|
|
|
province_l = find_areas(p_pro, text)
|
|
@@ -5842,7 +5842,7 @@ class DistrictPredictor():
|
|
|
area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
|
|
|
|
|
|
pro_ids, city_ids, dis_ids = get_pro_city_dis_score(text)
|
|
|
- pro_ids1, city_ids1, dis_ids1 = get_pro_city_dis_score(web_name, text_weight=0.2)
|
|
|
+ pro_ids1, city_ids1, dis_ids1 = get_pro_city_dis_score(web_name[:3], text_weight=0.2) # 20240422 修改为站源名称只取前三字,避免类似 459056219 中金岭南阳光采购平台 错提取阳光
|
|
|
for k in pro_ids1:
|
|
|
if k in pro_ids:
|
|
|
pro_ids[k] += pro_ids1[k]
|
|
@@ -5911,14 +5911,18 @@ class DistrictPredictor():
|
|
|
# print('招标人地址',role_addr, tenderee_address)
|
|
|
|
|
|
project_name = project_name + title if project_name not in title else title
|
|
|
- project_name = project_name.replace(tenderee, '')
|
|
|
+ # project_name = project_name.replace(tenderee, '')
|
|
|
+ entity_list = getNers([project_name],useselffool=False) # 2024/4/26 修改为去重项目名称中所有公司名称
|
|
|
+ for tup in entity_list[0]:
|
|
|
+ if tup[2] in ['org', 'company']:
|
|
|
+ project_name = project_name.replace(tup[3], '')
|
|
|
|
|
|
text1 = "{0} {1} {2}".format(tenderee, tenderee_address, project_name)
|
|
|
|
|
|
web_source_name = str(web_source_name) # 修复某些不是字符串类型造成报错
|
|
|
text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1) # 预防提取错 合肥 路南 新会 等地区
|
|
|
|
|
|
- if pro_addr:
|
|
|
+ if pro_addr and re.search('\w{2,}([省市县旗盟]|自治[区州县旗])', pro_addr):
|
|
|
msc += '## 使用项目地址输入:%s ##;' % pro_addr
|
|
|
rs = get_area(pro_addr, '')
|
|
|
msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % (
|
|
@@ -5929,7 +5933,7 @@ class DistrictPredictor():
|
|
|
|
|
|
# print('text1:', text1)
|
|
|
msc += '## 第一次预测输入:%s ##;' % text1
|
|
|
- rs = get_area(text1, web_source_name)
|
|
|
+ rs = get_area(text1, '') # 2024/4/22 调整第一次输入不带站源名称,避免出错
|
|
|
msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % (
|
|
|
rs['district']['province'], rs['district']['city'], rs['district']['district'])
|
|
|
# self.f.write('%s %s \n' % (list_articles[0].id, msc))
|
|
@@ -5942,7 +5946,7 @@ class DistrictPredictor():
|
|
|
# text2 += title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
|
|
|
text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
|
|
|
# print('text2:', text2)
|
|
|
- msc += '## 第二次预测输入:%s ##' % text2
|
|
|
+ msc += '## 第二次预测输入:%s %s##' % (text2,web_source_name)
|
|
|
rs2 = get_area(text2, web_source_name, in_content=True)
|
|
|
# rs2['district']['is_in_text'] = True
|
|
|
if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
|
|
@@ -6152,6 +6156,8 @@ class TablePremExtractor(object):
|
|
|
text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
|
|
|
, ',', text)
|
|
|
text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
|
|
|
+ text = re.sub('[一二三四五六七八九十]+标段:|标段[一二三四五六七八九十]+:', '', text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
|
|
|
+ text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text) # 2024/4/23 去除电话
|
|
|
if text in nlp_enterprise:
|
|
|
return text
|
|
|
if len(text) > 50 or len(text)<4:
|
|
@@ -6160,9 +6166,11 @@ class TablePremExtractor(object):
|
|
|
roles = []
|
|
|
if ners:
|
|
|
for ner in ners[0]:
|
|
|
- if ner[2] in ['org', 'company', 'location']:
|
|
|
+ if ner[2] in ['org', 'company']:
|
|
|
+ roles.append(ner[3])
|
|
|
+ elif ner[2] in ['location'] and re.search('^\w{3,10}(海关|殡仪馆|店|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场)$', ner[3]):
|
|
|
roles.append(ner[3])
|
|
|
- if roles and (len(''.join(roles)) > len(text)*0.8 or text.startswith(roles[0])):
|
|
|
+ if roles and len(''.join(roles)) > len(text)*0.8:
|
|
|
return roles[0]
|
|
|
else:
|
|
|
return ''
|
|
@@ -6206,10 +6214,10 @@ class TablePremExtractor(object):
|
|
|
package_code = package_code_raw
|
|
|
if re.search('合计|总计', package_code+project_code):
|
|
|
continue
|
|
|
- if package_code != '' and package_code == previous_package: # 处理 208162730 一个包采购多种东西情况
|
|
|
+ if package_code != '' and package_code + project_code == previous_package: # 处理 208162730 一个包采购多种东西情况
|
|
|
same_package = True
|
|
|
project_name = ''
|
|
|
- previous_package = package_code
|
|
|
+ previous_package = package_code + project_code
|
|
|
|
|
|
if win_sort != "" and re.search('排名|排序|名次|推荐顺序', headers['win_sort'][1]): # 此类型表由 CandidateExtractor类提取 防止类似 328485591 作为多包
|
|
|
break
|
|
@@ -6225,6 +6233,16 @@ class TablePremExtractor(object):
|
|
|
# tenderee = tenderee if self.is_role(tenderee) else ""
|
|
|
# tenderer = tenderer if self.is_role(tenderer) else ""
|
|
|
|
|
|
+ package = uniform_package_name(package_code) if package_code else '自增'+str(len(prem_dic)+1) # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
|
|
|
+ if project_name != "" and package.startswith('自增'):
|
|
|
+ pk_l = find_package(project_name)
|
|
|
+ if len(pk_l)==1:
|
|
|
+ package = uniform_package_name(pk_l[0].group(0))
|
|
|
+ elif re.search('[一二三四五六七八九十]+标段:|标段[一二三四五六七八九十]+:', tenderer) and package.startswith('自增'):
|
|
|
+ pk_l = find_package(tenderer)
|
|
|
+ if len(pk_l) == 1:
|
|
|
+ package = uniform_package_name(pk_l[0].group(0))
|
|
|
+
|
|
|
tenderee = self.get_role(tenderee, self.nlp_enterprise) if tenderee!="" else tenderee
|
|
|
tenderer = self.get_role(tenderer, self.nlp_enterprise) if tenderer!='' else tenderer
|
|
|
|
|
@@ -6239,7 +6257,6 @@ class TablePremExtractor(object):
|
|
|
continue
|
|
|
link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_))
|
|
|
|
|
|
- package = uniform_package_name(package_code) if package_code else str(len(prem_dic)+1) # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
|
|
|
if project_code != "":
|
|
|
uni_project_code= uniform_package_name(project_code)
|
|
|
if uni_project_code != "" and package != "":
|
|
@@ -6339,17 +6356,20 @@ class TablePremExtractor(object):
|
|
|
:param tmp_dic: 待合并结果
|
|
|
:return:
|
|
|
'''
|
|
|
- for pack in tmp_dic:
|
|
|
- if pack in rs_dic:
|
|
|
- for k in tmp_dic[pack]:
|
|
|
- if rs_dic[pack][k] in ['', 0]:
|
|
|
- rs_dic[pack][k] = tmp_dic[pack][k]
|
|
|
- elif rs_dic[pack][k] == []:
|
|
|
- rs_dic[pack][k] = tmp_dic[pack][k]
|
|
|
- elif k == 'roleList' and len(rs_dic[pack][k])>0 and rs_dic[pack][k][0].get('role_money', {}).get('money', 0) == 0:
|
|
|
- rs_dic[pack][k] = tmp_dic[pack][k]
|
|
|
- else:
|
|
|
- rs_dic[pack] = tmp_dic[pack]
|
|
|
+ if '自增1' in tmp_dic and '自增1' not in rs_dic and len(tmp_dic)==len(rs_dic):
|
|
|
+ pass
|
|
|
+ else:
|
|
|
+ for pack in tmp_dic:
|
|
|
+ if pack in rs_dic:
|
|
|
+ for k in tmp_dic[pack]:
|
|
|
+ if rs_dic[pack][k] in ['', 0]:
|
|
|
+ rs_dic[pack][k] = tmp_dic[pack][k]
|
|
|
+ elif rs_dic[pack][k] == []:
|
|
|
+ rs_dic[pack][k] = tmp_dic[pack][k]
|
|
|
+ elif k == 'roleList' and len(rs_dic[pack][k])>0 and rs_dic[pack][k][0].get('role_money', {}).get('money', 0) == 0:
|
|
|
+ rs_dic[pack][k] = tmp_dic[pack][k]
|
|
|
+ else:
|
|
|
+ rs_dic[pack] = tmp_dic[pack]
|
|
|
|
|
|
def get_prem(self, soup, web_source_name=''):
|
|
|
tables = soup.find_all('table')
|
|
@@ -6403,7 +6423,7 @@ class TablePremExtractor(object):
|
|
|
if sib.name in ['p', 'div'] and len(sib_text)<30 and ser_sib:
|
|
|
package_sib = ser_sib.group(0)
|
|
|
package_sib = uniform_package_name(package_sib)
|
|
|
- table_prem[package_sib] = table_prem.pop('1')
|
|
|
+ table_prem[package_sib] = table_prem.pop('自增1')
|
|
|
if table_prem:
|
|
|
# rs_dic.update(table_prem)
|
|
|
self.update_prem(rs_dic, table_prem)
|
|
@@ -6569,9 +6589,9 @@ class CandidateExtractor(object):
|
|
|
|
|
|
# if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
|
|
|
# break
|
|
|
- if(candidate_,win_tenderer, second_tenderer,third_tenderer, bid_amount_) in link_set:
|
|
|
+ if(candidate_,win_tenderer, second_tenderer,third_tenderer, bid_amount_,package_code) in link_set:
|
|
|
continue
|
|
|
- link_set.add((candidate_, win_tenderer, second_tenderer, third_tenderer, bid_amount_))
|
|
|
+ link_set.add((candidate_, win_tenderer, second_tenderer, third_tenderer, bid_amount_,package_code))
|
|
|
package = package_code
|
|
|
package = uniform_package_name(package) if package !="" else "Project"
|
|
|
if candidate:
|