|
@@ -4266,6 +4266,7 @@ class DistrictPredictor():
|
|
self.full_name = full_name
|
|
self.full_name = full_name
|
|
self.short2id = short2id
|
|
self.short2id = short2id
|
|
self.full2id = full2id
|
|
self.full2id = full2id
|
|
|
|
+ # self.f = open(os.path.dirname(__file__)+'/../test/data/district_predict.txt', 'w', encoding='utf-8')
|
|
|
|
|
|
def predict(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
|
|
def predict(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
|
|
'''
|
|
'''
|
|
@@ -4330,7 +4331,7 @@ class DistrictPredictor():
|
|
if it.end()<len(text) and text[it.end()] == '市': # 简称后面 有市字 改为市级
|
|
if it.end()<len(text) and text[it.end()] == '市': # 简称后面 有市字 改为市级
|
|
type_w = 2
|
|
type_w = 2
|
|
else:
|
|
else:
|
|
- type_w = 1
|
|
|
|
|
|
+ type_w = 0.5
|
|
id_set.add(_id)
|
|
id_set.add(_id)
|
|
score2 += w * type_w
|
|
score2 += w * type_w
|
|
score_l.append([_id, score * w + score2] + area)
|
|
score_l.append([_id, score * w + score2] + area)
|
|
@@ -4409,17 +4410,14 @@ class DistrictPredictor():
|
|
|
|
|
|
def get_all_addr(list_entitys):
|
|
def get_all_addr(list_entitys):
|
|
tenderee_l = []
|
|
tenderee_l = []
|
|
- other_roles = []
|
|
|
|
addr_l = []
|
|
addr_l = []
|
|
for ent in list_entitys[0]:
|
|
for ent in list_entitys[0]:
|
|
- if ent.entity_type == 'location':
|
|
|
|
|
|
+ if ent.entity_type == 'location' and len(ent.entity_text)>2:
|
|
addr_l.append(ent.entity_text)
|
|
addr_l.append(ent.entity_text)
|
|
elif ent.entity_type in ['org', 'company']:
|
|
elif ent.entity_type in ['org', 'company']:
|
|
- if ent.label == 0:
|
|
|
|
|
|
+ if ent.label in [0, 1]: # 加招标或代理
|
|
tenderee_l.append(ent.entity_text)
|
|
tenderee_l.append(ent.entity_text)
|
|
- else:
|
|
|
|
- other_roles.append(ent.entity_text)
|
|
|
|
- return ' '.join(addr_l), ' '.join(tenderee_l), ' '.join(other_roles)
|
|
|
|
|
|
+ return ' '.join(addr_l), ' '.join(tenderee_l)
|
|
|
|
|
|
def get_title_addr(text):
|
|
def get_title_addr(text):
|
|
p1 = '(\w{2,8}[省市州区县][^\w]*)+'
|
|
p1 = '(\w{2,8}[省市州区县][^\w]*)+'
|
|
@@ -4436,21 +4434,26 @@ class DistrictPredictor():
|
|
content = list_articles[0].content
|
|
content = list_articles[0].content
|
|
|
|
|
|
tenderee, tenderee_address = get_ree_addr(prem)
|
|
tenderee, tenderee_address = get_ree_addr(prem)
|
|
|
|
+ msc = ""
|
|
pro_addr = get_project_addr(content)
|
|
pro_addr = get_project_addr(content)
|
|
if pro_addr != "":
|
|
if pro_addr != "":
|
|
|
|
+ msc += '使用规则提取的项目地址;'
|
|
tenderee_address = pro_addr
|
|
tenderee_address = pro_addr
|
|
else:
|
|
else:
|
|
role_addr = get_role_address(content)
|
|
role_addr = get_role_address(content)
|
|
if role_addr != "":
|
|
if role_addr != "":
|
|
|
|
+ msc += '使用规则提取的联系人地址;'
|
|
tenderee_address = role_addr
|
|
tenderee_address = role_addr
|
|
|
|
|
|
if tenderee_address == "":
|
|
if tenderee_address == "":
|
|
title_addr = get_title_addr(title)
|
|
title_addr = get_title_addr(title)
|
|
if title_addr != "":
|
|
if title_addr != "":
|
|
|
|
+ msc += '使用规则提取的标题地址;'
|
|
tenderee_address = title_addr
|
|
tenderee_address = title_addr
|
|
else:
|
|
else:
|
|
bid_addr = get_bid_addr(content)
|
|
bid_addr = get_bid_addr(content)
|
|
if bid_addr != "":
|
|
if bid_addr != "":
|
|
|
|
+ msc += '使用规则提取的开标地址;'
|
|
tenderee_address = bid_addr
|
|
tenderee_address = bid_addr
|
|
|
|
|
|
project_name = str(project_name)
|
|
project_name = str(project_name)
|
|
@@ -4466,24 +4469,29 @@ class DistrictPredictor():
|
|
web_source_name = str(web_source_name) # 修复某些不是字符串类型造成报错
|
|
web_source_name = str(web_source_name) # 修复某些不是字符串类型造成报错
|
|
text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1) #预防提取错 合肥 路南 新会 等地区
|
|
text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1) #预防提取错 合肥 路南 新会 等地区
|
|
# print('text1:', text1)
|
|
# print('text1:', text1)
|
|
|
|
+ msc += '## 第一次预测输入:%s ##;'%text1
|
|
rs = get_area(text1, web_source_name)
|
|
rs = get_area(text1, web_source_name)
|
|
-
|
|
|
|
|
|
+ msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % (
|
|
|
|
+ rs['district']['province'], rs['district']['city'], rs['district']['district'])
|
|
|
|
+ # self.f.write('%s %s \n' % (list_articles[0].id, msc))
|
|
if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
|
|
if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
|
|
- all_addr, tenderees, other_roles = get_all_addr(list_entitys)
|
|
|
|
- if tenderees != "":
|
|
|
|
- text2 = tenderees + " " + all_addr
|
|
|
|
- # print('所有地址:', all_addr)
|
|
|
|
- else:
|
|
|
|
- text2 = other_roles + " " + all_addr
|
|
|
|
- # text2 = title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
|
|
|
|
|
|
+ msc = ""
|
|
|
|
+ all_addr, tenderees = get_all_addr(list_entitys)
|
|
|
|
+ text2 = tenderees + " " + all_addr + ' ' + title
|
|
|
|
+ msc += '使用实体列表所有招标人+所有地址;'
|
|
|
|
+ # text2 += title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
|
|
text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
|
|
text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
|
|
# print('text2:', text2)
|
|
# print('text2:', text2)
|
|
|
|
+ msc += '## 第二次预测输入:%s ##'%text2
|
|
rs2 = get_area(text2, web_source_name, not_in_content=False)
|
|
rs2 = get_area(text2, web_source_name, not_in_content=False)
|
|
rs2['district']['is_in_text'] = True
|
|
rs2['district']['is_in_text'] = True
|
|
if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
|
|
if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
|
|
rs = rs2
|
|
rs = rs2
|
|
elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
|
|
elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
|
|
rs = rs2
|
|
rs = rs2
|
|
|
|
+ msc += '预测结果:省份:%s, 城市:%s,区县:%s'%(
|
|
|
|
+ rs['district']['province'],rs['district']['city'],rs['district']['district'])
|
|
|
|
+ # self.f.write('%s %s \n'%(list_articles[0].id, msc))
|
|
return rs
|
|
return rs
|
|
|
|
|
|
class TableTag2List():
|
|
class TableTag2List():
|