#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: bidikeji
@time: 2024/5/9 14:40
"""
import os
import pickle
import re
class DistrictExtractor():
    """Rule-based extractor of province / city / district names from text.

    The lookup tables are read from ``district_tuple.pkl`` located next to
    this module.  As used by the code below:

    * ``p_pro`` / ``p_city`` / ``p_dis`` are regex patterns handed to
      ``re.finditer`` to locate province / city / district names.
    * ``idx_dic`` maps an area id to a record read through the keys
      ``'返回名称'`` (name to return), ``'大区'`` (macro region), ``'省'``
      (parent province id), ``'市'`` (parent city id) and ``'权重'`` (weight).
    * ``full_dic`` / ``short_dic`` map full / abbreviated names to ids under
      the keys ``'province'`` (name -> id) and ``'city'`` / ``'district'``
      (name -> collection of ids, because names can be ambiguous).
    """

    # A matched name that already ends with an administrative suffix.
    _ADMIN_SUFFIX_RE = re.compile('[省市区县旗盟]$')
    # Text following an un-suffixed match that marks it as part of a
    # village / street / road / hotel name rather than a real area mention.
    _STREET_TAIL_RE = re.compile(
        '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆)')
    # Text following a match (branch office, section, station, project ...)
    # that makes the match count twice.
    _BRANCH_TAIL_RE = re.compile(
        r'^([分支](公司|局|行|校|院|干?线)|\w{,3}段|地铁|(火车|高铁)?站|\w{,3}项目)')
    # District names written with their full administrative suffix.
    _FULL_DISTRICT_RE = re.compile('[市县旗区]$')
    # Words that contain area names but are not area mentions.
    _NOISE_RE = re.compile('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾')
    # For these cities the district is reported in the ``city`` slot.
    _MUNICIPALITIES = ('北京', '天津', '上海', '重庆')

    def __init__(self):
        """Load the six lookup tables from the bundled pickle file."""
        pkl_path = os.path.join(os.path.dirname(__file__), 'district_tuple.pkl')
        # NOTE(security): pickle.load can execute arbitrary code; this file
        # must only ever be a trusted artefact shipped with the package.
        with open(pkl_path, 'rb') as f:
            district_tuple = pickle.load(f)
        (self.p_pro, self.p_city, self.p_dis,
         self.idx_dic, self.full_dic, self.short_dic) = district_tuple

    @staticmethod
    def _add_to_parent(scores, key, gain):
        """Add ``gain`` to an already scored parent area, or seed it at half."""
        if key in scores:
            scores[key] += gain
        else:
            scores[key] = gain * 0.5

    @classmethod
    def _find_areas(cls, pattern, text):
        """Return ``(name, start, end)`` for every accepted match of ``pattern``.

        :param pattern: area regex, e.g. an alternation of area names
        :param text: text to search
        :return: list of ``(name, start, end)``; a match directly followed by
            a branch / section / station / project marker is listed twice so
            that it is scored twice
        """
        found = []
        for m in re.finditer(pattern, text):
            name = m.group(0)
            tail = text[m.end():]
            # An abbreviated name directly followed by a street-like word is
            # part of an address detail, not an area mention.
            if cls._ADMIN_SUFFIX_RE.search(name) is None and cls._STREET_TAIL_RE.search(tail):
                continue
            # 20240314: avoid texts such as
            # '中铁二局新建沪苏湖铁路工程站前VI标项目' being recognised as
            # Liaoning / Yingkou / Zhanqian.
            if name == '站前':
                continue
            hit = (name, m.start(), m.end())
            found.append(hit)
            if cls._BRANCH_TAIL_RE.search(tail):
                found.append(hit)
        return found

    @staticmethod
    def _areas_to_scores(group_list, max_len):
        """Turn ``(name, start, end)`` matches into ``(name, score)`` pairs.

        The score is ``(match length + end offset) / max_len / 2``.  An empty
        ``group_list`` yields an empty list without touching ``max_len``.
        """
        return [(name, (e - b + e) / max_len / 2) for name, b, e in group_list]

    def _score_text(self, text, text_weight=1):
        """Score every province / city / district id mentioned in ``text``.

        :param text: text to analyse
        :param text_weight: factor applied to all resulting scores
        :return: ``(pro_ids, city_ids, dis_ids)`` dicts mapping id -> score
        :raises AssertionError: if the province pattern matches a name that is
            in neither the full nor the short province table
        """
        idx_dic, full_dic, short_dic = self.idx_dic, self.full_dic, self.short_dic

        text = self._NOISE_RE.sub(' ', text)
        # Fix for 426624023: '珠海城市' was predicted as '海城市'.
        text = re.sub('珠海城市', '珠海', text)
        # Fix for 423589589: '所属地域:怒江州' was recognised as
        # Guangxi - Chongzuo - Jiangzhou.
        text = re.sub('怒江州', '怒江傈僳族自治州', text)

        province_l = self._find_areas(self.p_pro, text)
        city_l = self._find_areas(self.p_city, text)
        district_l = self._find_areas(self.p_dis, text)
        if not province_l and not city_l:
            # 20240428: with no province or city in the text, keep only
            # districts written with their full suffix; otherwise e.g.
            # '凌云工业股份有限公司' is extracted as Guangxi / Baise / Lingyun.
            district_l = [it for it in district_l if self._FULL_DISTRICT_RE.search(it[0])]

        text_len = len(text)
        province_l = self._areas_to_scores(province_l, max_len=text_len)
        city_l = self._areas_to_scores(city_l, max_len=text_len)
        district_l = self._areas_to_scores(district_l, max_len=text_len)

        pro_ids = dict()
        city_ids = dict()
        dis_ids = dict()

        # Provinces: a full name earns a bonus of 2, an abbreviation 1.
        for name, score in province_l:
            if name in full_dic['province']:
                idx, bonus = full_dic['province'][name], 2
            elif name in short_dic['province']:
                idx, bonus = short_dic['province'][name], 1
            else:
                # Explicit raise so the invariant also holds under ``python -O``.
                raise AssertionError('province name %r missing from lookup tables' % (name,))
            pro_ids[idx] = pro_ids.get(idx, 0) + (score + bonus)

        # Cities: ambiguous names (several ids) are damped to 0.1; only
        # abbreviations are additionally scaled by the area weight.
        for name, score in city_l:
            if name in full_dic['city']:
                ids, bonus, is_short = full_dic['city'][name], 2, False
            elif name in short_dic['city']:
                ids, bonus, is_short = short_dic['city'][name], 1, True
            else:
                continue
            w = 0.1 if len(ids) > 1 else 1
            for idx in ids:
                gain = (score + bonus) * w
                if is_short:
                    gain = gain * idx_dic[idx]['权重']
                city_ids[idx] = city_ids.get(idx, 0) + gain
                self._add_to_parent(pro_ids, idx_dic[idx]['省'], gain)

        # Districts: a full name earns a bonus of 1, an abbreviation none.
        # For abbreviations the area weight is applied only to the amount
        # propagated to the parent city / province, not to the district itself.
        for name, score in district_l:
            if name in full_dic['district']:
                ids, bonus, is_short = full_dic['district'][name], 1, False
            elif name in short_dic['district']:
                ids, bonus, is_short = short_dic['district'][name], 0, True
            else:
                continue
            w = 0.1 if len(ids) > 1 else 1
            for idx in ids:
                own_gain = (score + bonus) * w
                parent_gain = own_gain * idx_dic[idx]['权重'] if is_short else own_gain
                dis_ids[idx] = dis_ids.get(idx, 0) + own_gain
                self._add_to_parent(pro_ids, idx_dic[idx]['省'], parent_gain)
                self._add_to_parent(city_ids, idx_dic[idx]['市'], parent_gain)

        pro_ids = {k: v * text_weight for k, v in pro_ids.items()}
        city_ids = {k: v * text_weight for k, v in city_ids.items()}
        dis_ids = {k: v * text_weight for k, v in dis_ids.items()}
        return pro_ids, city_ids, dis_ids

    def _get_final_addr(self, pro_ids, city_ids, dis_ids):
        """Pick the best consistent province / city / district from the scores.

        The top-scored province is taken if its score is at least 0.01; then
        the top-scored city belonging to that province; then the top-scored
        district belonging to that city.

        :param pro_ids: province id -> score
        :param city_ids: city id -> score
        :param dis_ids: district id -> score
        :return: ``(big_area, pred_pro, pred_city, pred_dis)``; an empty
            string marks a level that could not be determined
        """
        idx_dic = self.idx_dic
        big_area = pred_pro = pred_city = pred_dis = ""
        final_pro = final_city = ""

        def by_score(item):
            return item[1]

        if pro_ids:
            final_pro, score = max(pro_ids.items(), key=by_score)
            if score >= 0.01:  # lower scores are treated as noise
                pred_pro = idx_dic[final_pro]['返回名称']
                big_area = idx_dic[final_pro]['大区']

        if pred_pro != "" and city_ids:
            for city_idx, _ in sorted(city_ids.items(), key=by_score, reverse=True):
                if idx_dic[city_idx]['省'] == final_pro:
                    final_city = city_idx
                    pred_city = idx_dic[final_city]['返回名称']
                    break

        if final_city != "" and dis_ids:
            for dis_idx, _ in sorted(dis_ids.items(), key=by_score, reverse=True):
                if idx_dic[dis_idx]['市'] == final_city:
                    pred_dis = idx_dic[dis_idx]['返回名称']
                    # Stop at the best match.  Without this break the loop
                    # kept overwriting the result and ended on the
                    # lowest-scored district of the city.
                    break

        if pred_city in self._MUNICIPALITIES:
            # For these four cities the district is reported as the city.
            pred_city = pred_dis
            pred_dis = ""
        return big_area, pred_pro, pred_city, pred_dis

    def get_area(self, text, web_name, in_content=False):
        """Extract the area mentioned in ``text``, assisted by the site name.

        :param text: text to analyse
        :param web_name: name of the source site; only its first three
            characters are scored, with weight 0.2.  ``None`` is treated as
            an empty name.
        :param in_content: stored as ``is_in_text`` when truthy
        :return: ``{'district': {'area', 'province', 'city', 'district',
            'is_in_text'}}``; undetermined levels keep the defaults
            ``'全国'`` (area, province) and ``'未知'`` (city, district)
        """
        area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}

        text_scores = self._score_text(text)
        # 20240422: only the first three characters of the site name are
        # used, to avoid e.g. 459056219 '中金岭南阳光采购平台' wrongly
        # yielding '阳光'.
        site_scores = self._score_text((web_name or '')[:3], text_weight=0.2)
        for merged, extra in zip(text_scores, site_scores):
            for k, v in extra.items():
                if k in merged:
                    merged[k] += v
                else:
                    merged[k] = v

        big_area, pred_pro, pred_city, pred_dis = self._get_final_addr(*text_scores)
        if big_area != "":
            area_dic['area'] = big_area
        if pred_pro != "":
            area_dic['province'] = pred_pro
        if pred_city != "":
            area_dic['city'] = pred_city
        if pred_dis != "":
            area_dic['district'] = pred_dis
        if in_content:
            area_dic['is_in_text'] = True
        return {'district': area_dic}

    def predict(self, text):
        """Run :meth:`get_area` on ``text`` with an empty site name."""
        return self.get_area(text=text, web_name='')
if __name__ == '__main__':
    # Smoke test: extract the area from a sample street address and print it.
    dist = DistrictExtractor()
    text = '浙江省杭州市余杭区崇贤街道运河路5-4号13幢312室'
    print(dist.predict(text=text))