#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# district_extractor.py
"""
@author: bidikeji
@time: 2024/5/9 14:40
"""
import os
import pickle
import re
  10. class DistrictExtractor():
  11. def __init__(self):
  12. with open(os.path.join(os.path.dirname(__file__),'district_tuple.pkl'), 'rb') as f:
  13. district_tuple = pickle.load(f)
  14. self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic = district_tuple
  15. def get_area(self, text, web_name, in_content=False):
  16. p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
  17. def get_final_addr(pro_ids, city_ids, dis_ids):
  18. '''
  19. 先把所有匹配的全称、简称转为id,如果省份不为空,城市不为空且有城市属于省份的取该城市
  20. :param province_l: 匹配到的所有省份
  21. :param city_l: 匹配到的所有城市
  22. :param district_l: 匹配到的所有区县
  23. :return:
  24. '''
  25. big_area = ""
  26. pred_pro = ""
  27. pred_city = ""
  28. pred_dis = ""
  29. final_pro = ""
  30. final_city = ""
  31. if len(pro_ids) >= 1:
  32. pro_l = sorted([(k, v) for k, v in pro_ids.items()], key=lambda x: x[1], reverse=True)
  33. final_pro, score = pro_l[0]
  34. if score >= 0.01:
  35. pred_pro = idx_dic[final_pro]['返回名称']
  36. big_area = idx_dic[final_pro]['大区']
  37. # else:
  38. # print("得分过低,过滤掉", idx_dic[final_pro]['返回名称'], score)
  39. if pred_pro != "" and len(city_ids) >= 1:
  40. city_l = sorted([(k, v) for k, v in city_ids.items()], key=lambda x: x[1], reverse=True)
  41. for it in city_l:
  42. if idx_dic[it[0]]['省'] == final_pro:
  43. final_city = it[0]
  44. pred_city = idx_dic[final_city]['返回名称']
  45. break
  46. if final_city != "" and len(set(dis_ids)) >= 1:
  47. dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
  48. for it in dis_l:
  49. if idx_dic[it[0]]['市'] == final_city:
  50. pred_dis = idx_dic[it[0]]['返回名称']
  51. if pred_city in ['北京', '天津', '上海', '重庆']:
  52. pred_city = pred_dis
  53. pred_dis = ""
  54. return big_area, pred_pro, pred_city, pred_dis
  55. def find_areas(pettern, text):
  56. '''
  57. 通过正则匹配字符串返回地址
  58. :param pettern: 地址正则 广东省|广西省|...
  59. :param text: 待匹配文本
  60. :return:
  61. '''
  62. addr = []
  63. for it in re.finditer(pettern, text):
  64. if re.search('[省市区县旗盟]$', it.group(0)) == None and re.search(
  65. '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆)', text[it.end():]):
  66. continue
  67. if it.group(0) == '站前': # 20240314 修复类似 中铁二局新建沪苏湖铁路工程站前VI标项目 错识别为 省份:辽宁, 城市:营口,区县:站前
  68. continue
  69. addr.append((it.group(0), it.start(), it.end()))
  70. if re.search('^([分支](公司|局|行|校|院|干?线)|\w{,3}段|地铁|(火车|高铁)?站|\w{,3}项目)', text[it.end():]):
  71. addr.append((it.group(0), it.start(), it.end()))
  72. return addr
  73. def chage_area2score(group_list, max_len):
  74. '''
  75. 把匹配的的地址转为分数
  76. :param group_list: [('name', b, e)]
  77. :return:
  78. '''
  79. area_list = []
  80. if group_list != []:
  81. for it in group_list:
  82. name, b, e = it
  83. area_list.append((name, (e - b + e) / max_len / 2))
  84. return area_list
  85. def get_pro_city_dis_score(text, text_weight=1):
  86. text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾', ' ', text)
  87. text = re.sub('珠海城市', '珠海', text) # 修复 426624023 珠海城市 预测为海城市
  88. text = re.sub('怒江州', '怒江傈僳族自治州', text) # 修复 423589589 所属地域:怒江州 识别为广西 - 崇左 - 江州
  89. province_l = find_areas(p_pro, text)
  90. city_l = find_areas(p_city, text)
  91. district_l = find_areas(p_dis, text)
  92. if len(province_l) == len(city_l) == 0:
  93. district_l = [it for it in district_l if
  94. re.search('[市县旗区]$', it[0])] # 20240428去掉只有区县地址且不是全称的匹配,避免错误 例 凌云工业股份有限公司 提取地区为广西白色凌云
  95. province_l = chage_area2score(province_l, max_len=len(text))
  96. city_l = chage_area2score(city_l, max_len=len(text))
  97. district_l = chage_area2score(district_l, max_len=len(text))
  98. pro_ids = dict()
  99. city_ids = dict()
  100. dis_ids = dict()
  101. for pro in province_l:
  102. name, score = pro
  103. assert (name in full_dic['province'] or name in short_dic['province'])
  104. if name in full_dic['province']:
  105. idx = full_dic['province'][name]
  106. if idx not in pro_ids:
  107. pro_ids[idx] = 0
  108. pro_ids[idx] += (score + 2)
  109. else:
  110. idx = short_dic['province'][name]
  111. if idx not in pro_ids:
  112. pro_ids[idx] = 0
  113. pro_ids[idx] += (score + 1)
  114. for city in city_l:
  115. name, score = city
  116. if name in full_dic['city']:
  117. w = 0.1 if len(full_dic['city'][name]) > 1 else 1
  118. for idx in full_dic['city'][name]:
  119. if idx not in city_ids:
  120. city_ids[idx] = 0
  121. # weight = idx_dic[idx]['权重']
  122. city_ids[idx] += (score + 2) * w
  123. pro_idx = idx_dic[idx]['省']
  124. if pro_idx in pro_ids:
  125. pro_ids[pro_idx] += (score + 2) * w
  126. else:
  127. pro_ids[pro_idx] = (score + 2) * w * 0.5
  128. elif name in short_dic['city']:
  129. w = 0.1 if len(short_dic['city'][name]) > 1 else 1
  130. for idx in short_dic['city'][name]:
  131. if idx not in city_ids:
  132. city_ids[idx] = 0
  133. weight = idx_dic[idx]['权重']
  134. city_ids[idx] += (score + 1) * w * weight
  135. pro_idx = idx_dic[idx]['省']
  136. if pro_idx in pro_ids:
  137. pro_ids[pro_idx] += (score + 1) * w * weight
  138. else:
  139. pro_ids[pro_idx] = (score + 1) * w * weight * 0.5
  140. for dis in district_l:
  141. name, score = dis
  142. if name in full_dic['district']:
  143. w = 0.1 if len(full_dic['district'][name]) > 1 else 1
  144. for idx in full_dic['district'][name]:
  145. if idx not in dis_ids:
  146. dis_ids[idx] = 0
  147. # weight = idx_dic[idx]['权重']
  148. dis_ids[idx] += (score + 1) * w
  149. pro_idx = idx_dic[idx]['省']
  150. if pro_idx in pro_ids:
  151. pro_ids[pro_idx] += (score + 1) * w
  152. else:
  153. pro_ids[pro_idx] = (score + 1) * w * 0.5
  154. city_idx = idx_dic[idx]['市']
  155. if city_idx in city_ids:
  156. city_ids[city_idx] += (score + 1) * w
  157. else:
  158. city_ids[city_idx] = (score + 1) * w * 0.5
  159. elif name in short_dic['district']:
  160. w = 0.1 if len(short_dic['district'][name]) > 1 else 1
  161. for idx in short_dic['district'][name]:
  162. if idx not in dis_ids:
  163. dis_ids[idx] = 0
  164. weight = idx_dic[idx]['权重']
  165. dis_ids[idx] += (score + 0) * w
  166. pro_idx = idx_dic[idx]['省']
  167. if pro_idx in pro_ids:
  168. pro_ids[pro_idx] += (score + 0) * w * weight
  169. else:
  170. pro_ids[pro_idx] = (score + 0) * w * weight * 0.5
  171. city_idx = idx_dic[idx]['市']
  172. if city_idx in city_ids:
  173. city_ids[city_idx] += (score + 0) * w * weight
  174. else:
  175. city_ids[city_idx] = (score + 0) * w * weight * 0.5
  176. for k, v in pro_ids.items():
  177. pro_ids[k] = v * text_weight
  178. for k, v in city_ids.items():
  179. city_ids[k] = v * text_weight
  180. for k, v in dis_ids.items():
  181. dis_ids[k] = v * text_weight
  182. return pro_ids, city_ids, dis_ids
  183. area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
  184. pro_ids, city_ids, dis_ids = get_pro_city_dis_score(text)
  185. pro_ids1, city_ids1, dis_ids1 = get_pro_city_dis_score(web_name[:3],
  186. text_weight=0.2) # 20240422 修改为站源名称只取前三字,避免类似 459056219 中金岭南阳光采购平台 错提取阳光
  187. for k in pro_ids1:
  188. if k in pro_ids:
  189. pro_ids[k] += pro_ids1[k]
  190. else:
  191. pro_ids[k] = pro_ids1[k]
  192. for k in city_ids1:
  193. if k in city_ids:
  194. city_ids[k] += city_ids1[k]
  195. else:
  196. city_ids[k] = city_ids1[k]
  197. for k in dis_ids1:
  198. if k in dis_ids:
  199. dis_ids[k] += dis_ids1[k]
  200. else:
  201. dis_ids[k] = dis_ids1[k]
  202. big_area, pred_pro, pred_city, pred_dis = get_final_addr(pro_ids, city_ids, dis_ids)
  203. if big_area != "":
  204. area_dic['area'] = big_area
  205. if pred_pro != "":
  206. area_dic['province'] = pred_pro
  207. if pred_city != "":
  208. area_dic['city'] = pred_city
  209. if pred_dis != "":
  210. area_dic['district'] = pred_dis
  211. if in_content:
  212. area_dic['is_in_text'] = True
  213. return {'district': area_dic}
  214. def predict(self, text):
  215. return self.get_area(text=text, web_name='')
  216. if __name__ == '__main__':
  217. dist = DistrictExtractor()
  218. text = '浙江省杭州市余杭区崇贤街道运河路5-4号13幢312室'
  219. print(dist.predict(text=text))