|
@@ -546,9 +546,11 @@ class CodeNamePredict():
|
|
if _name not in dict_name_freq_score:
|
|
if _name not in dict_name_freq_score:
|
|
# dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
|
|
# dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
|
|
len_name = len(_name) if len(_name) <50 else 100-len(_name) # 2023/03/02 超出50长度的逐渐递减
|
|
len_name = len(_name) if len(_name) <50 else 100-len(_name) # 2023/03/02 超出50长度的逐渐递减
|
|
- dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len_name * 0.05)*w+(5-sentence.sentence_index)*0.2]
|
|
|
|
|
|
+ dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len_name * 0.05), w]
|
|
else:
|
|
else:
|
|
dict_name_freq_score[_name][0] += 1
|
|
dict_name_freq_score[_name][0] += 1
|
|
|
|
+ if w > dict_name_freq_score[_name][2]:
|
|
|
|
+ dict_name_freq_score[_name][2] = w
|
|
'''
|
|
'''
|
|
for iter in re.finditer(self.PN_pattern,join_predict):
|
|
for iter in re.finditer(self.PN_pattern,join_predict):
|
|
print("name-",self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]))
|
|
print("name-",self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]))
|
|
@@ -593,7 +595,7 @@ class CodeNamePredict():
|
|
w = 1
|
|
w = 1
|
|
if _name not in dict_name_freq_score:
|
|
if _name not in dict_name_freq_score:
|
|
# dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
|
|
# dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
|
|
- dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w+(5-sentence.sentence_index)*0.2]
|
|
|
|
|
|
+ dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05), w]
|
|
else:
|
|
else:
|
|
dict_name_freq_score[_name][0] += 1
|
|
dict_name_freq_score[_name][0] += 1
|
|
# othername = re.search(name_re1, sentence.sentence_text)
|
|
# othername = re.search(name_re1, sentence.sentence_text)
|
|
@@ -608,7 +610,7 @@ class CodeNamePredict():
|
|
list_name_freq_score.append([_name,dict_name_freq_score[_name]])
|
|
list_name_freq_score.append([_name,dict_name_freq_score[_name]])
|
|
# print(list_name_freq_score)
|
|
# print(list_name_freq_score)
|
|
if len(list_name_freq_score)>0:
|
|
if len(list_name_freq_score)>0:
|
|
- list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1],reverse=True)
|
|
|
|
|
|
+ list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1]*x[1][2],reverse=True)
|
|
item['name'] = list_name_freq_score[0][0]
|
|
item['name'] = list_name_freq_score[0][0]
|
|
# for it in list_name_freq_score:
|
|
# for it in list_name_freq_score:
|
|
# print('项目名称及分值:',it[0],it[1], it[1][0]*it[1][1])
|
|
# print('项目名称及分值:',it[0],it[1], it[1][0]*it[1][1])
|
|
@@ -5836,44 +5838,44 @@ class DistrictPredictor():
|
|
with open(os.path.dirname(__file__)+'/district_tuple.pkl', 'rb') as f:
|
|
with open(os.path.dirname(__file__)+'/district_tuple.pkl', 'rb') as f:
|
|
district_tuple = pickle.load(f)
|
|
district_tuple = pickle.load(f)
|
|
self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic = district_tuple
|
|
self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic = district_tuple
|
|
|
|
+ # self.pettern = "((?P<prov>%s)(?P<city>%s)?(?P<dist>%s)?)|((?P<city1>%s)(?P<dist1>%s)?)|(?P<dist2>%s)" % (
|
|
|
|
+ # self.p_pro, self.p_city, self.p_dis, self.p_city, self.p_dis, self.p_dis)
|
|
|
|
+ self.pettern = "(?P<prov>%s)##(?P<city>%s)##(?P<dist>%s)" % (
|
|
|
|
+ self.p_pro, self.p_city, self.p_dis)
|
|
|
|
|
|
with open(os.path.dirname(__file__) + "/area_variance_dic.pkl", 'rb') as f: # 20241113 地区变更新旧名称对照字典
|
|
with open(os.path.dirname(__file__) + "/area_variance_dic.pkl", 'rb') as f: # 20241113 地区变更新旧名称对照字典
|
|
self.area_variance_dic = pickle.load(f)
|
|
self.area_variance_dic = pickle.load(f)
|
|
|
|
+ @staticmethod
|
|
|
|
+ def find_whole_areas(text, pettern, area_variance_dic, full_dic, weight=1):
|
|
|
|
+ '''
|
|
|
|
+ 通过正则匹配字符串返回地址
|
|
|
|
+ :param pettern: 地址正则 广东省|广西省|...
|
|
|
|
+ :param text: 待匹配文本
|
|
|
|
+ :return:
|
|
|
|
+ '''
|
|
|
|
+ province_l, city_l, district_l = [], [], []
|
|
|
|
|
|
- def predict_area(self, title, content, web_source_name, prem={}, addr_dic={}):
|
|
|
|
- p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
|
|
|
|
-
|
|
|
|
- def find_whole_areas(text, weight=1):
|
|
|
|
- '''
|
|
|
|
- 通过正则匹配字符串返回地址
|
|
|
|
- :param pettern: 地址正则 广东省|广西省|...
|
|
|
|
- :param text: 待匹配文本
|
|
|
|
- :return:
|
|
|
|
- '''
|
|
|
|
- province_l, city_l, district_l = [], [], []
|
|
|
|
-
|
|
|
|
- text = str(text).replace('(', '(').replace(')', ')')
|
|
|
|
- text = re.sub('\d{2,4}年度?|[\d/-]{1,5}[月日]|\d+|[a-zA-Z0-9]+', ' ', text)
|
|
|
|
- text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县|珠江城',
|
|
|
|
- ' ', text) # 544151395 赤壁市老城区燃气管道老化更新改造
|
|
|
|
- text = re.sub('珠海城市', '珠海', text) # 修复 426624023 珠海城市 预测为海城市
|
|
|
|
- text = re.sub('怒江州', '怒江傈僳族自治州', text) # 修复 423589589 所属地域:怒江州 识别为广西 - 崇左 - 江州
|
|
|
|
- text = re.sub('茂名滨海新区', '茂名市', text)
|
|
|
|
- text = re.sub('中山([东南西][部区环]|黄圃|南头|东凤|小榄|石岐|翠亨|南朗)', '中山市', text)
|
|
|
|
- text = re.sub('横州市', '横县', text) # 例:547363890 修复广西南宁横州 不在地区表问题
|
|
|
|
- ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
|
|
|
|
- if ser and '黎族' not in ser.group(0):
|
|
|
|
- text = text.replace(ser.group(0), ser.group(0) + '黎族')
|
|
|
|
- for k, v in self.area_variance_dic.items(): # 20241113 根据地区变更信息替换文本
|
|
|
|
- text = text.replace(k, v)
|
|
|
|
- text = re.sub('\s+', '', text)
|
|
|
|
-
|
|
|
|
- if re.search('[\u4e00-\u9fa5]', text) == None:
|
|
|
|
- return province_l, city_l, district_l
|
|
|
|
-
|
|
|
|
- pettern = "((?P<prov>%s)(?P<city>%s)?(?P<dist>%s)?)|((?P<city1>%s)(?P<dist1>%s)?)|(?P<dist2>%s)" % (
|
|
|
|
- p_pro, p_city, p_dis, p_city, p_dis, p_dis)
|
|
|
|
|
|
+ text = str(text).replace('(', '(').replace(')', ')')
|
|
|
|
+ text = re.sub('\d{2,4}年度?|[\d/-]{1,5}[月日]|\d+|[a-zA-Z0-9]+', ' ', text)
|
|
|
|
+ text = re.sub(
|
|
|
|
+ '复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县|珠江城',
|
|
|
|
+ ' ', text) # 544151395 赤壁市老城区燃气管道老化更新改造
|
|
|
|
+ text = re.sub('珠海城市', '珠海', text) # 修复 426624023 珠海城市 预测为海城市
|
|
|
|
+ text = re.sub('怒江州', '怒江傈僳族自治州', text) # 修复 423589589 所属地域:怒江州 识别为广西 - 崇左 - 江州
|
|
|
|
+ text = re.sub('茂名滨海新区', '茂名市', text)
|
|
|
|
+ text = re.sub('中山([东南西][部区环]|黄圃|南头|东凤|小榄|石岐|翠亨|南朗)', '中山市', text)
|
|
|
|
+ text = re.sub('横州市', '横县', text) # 例:547363890 修复广西南宁横州 不在地区表问题
|
|
|
|
+ ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
|
|
|
|
+ if ser and '黎族' not in ser.group(0):
|
|
|
|
+ text = text.replace(ser.group(0), ser.group(0) + '黎族')
|
|
|
|
+ for k, v in area_variance_dic.items(): # 20241113 根据地区变更信息替换文本
|
|
|
|
+ text = text.replace(k, v)
|
|
|
|
+ text = re.sub('\s+', ' ', text)
|
|
|
|
+
|
|
|
|
+ if re.search('[\u4e00-\u9fa5]', text) == None:
|
|
|
|
+ return province_l, city_l, district_l
|
|
|
|
|
|
|
|
+ for pettern in pettern.split('##'):
|
|
for it in re.finditer(pettern, text):
|
|
for it in re.finditer(pettern, text):
|
|
if it.group(0) == '站前': # 20240314 修复类似 中铁二局新建沪苏湖铁路工程站前VI标项目 错识别为 省份:辽宁, 城市:营口,区县:站前
|
|
if it.group(0) == '站前': # 20240314 修复类似 中铁二局新建沪苏湖铁路工程站前VI标项目 错识别为 省份:辽宁, 城市:营口,区县:站前
|
|
continue
|
|
continue
|
|
@@ -5890,9 +5892,10 @@ class DistrictPredictor():
|
|
else:
|
|
else:
|
|
score = 1
|
|
score = 1
|
|
if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
|
|
if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
|
|
- , text[it.end(k):]) or re.search('^((%s)|\-%s)'%(v, v), text[max(0, it.start(k)-1):]):
|
|
|
|
|
|
+ , text[it.end(k):]) or re.search('^((%s)|\-%s)' % (v, v),
|
|
|
|
+ text[max(0, it.start(k) - 1):]):
|
|
score += 1
|
|
score += 1
|
|
- score += it.end(k) / len(text) / 10
|
|
|
|
|
|
+ # score += it.end(k) / len(text) / 10
|
|
province_l.append((v, score * weight))
|
|
province_l.append((v, score * weight))
|
|
elif k in ['city', 'city1']:
|
|
elif k in ['city', 'city1']:
|
|
if v in full_dic['city']:
|
|
if v in full_dic['city']:
|
|
@@ -5900,253 +5903,272 @@ class DistrictPredictor():
|
|
else:
|
|
else:
|
|
score = 1
|
|
score = 1
|
|
if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
|
|
if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
|
|
- , text[it.end(k):]) or re.search('^((%s)|\-%s)'%(v, v), text[max(0, it.start(k)-1):]):
|
|
|
|
|
|
+ , text[it.end(k):]) or re.search('^((%s)|\-%s)' % (v, v),
|
|
|
|
+ text[max(0, it.start(k) - 1):]):
|
|
score += 1
|
|
score += 1
|
|
- score += it.end(k) / len(text) / 10
|
|
|
|
|
|
+ score += it.end(k) / len(text) / 10 # 优化 572840045 上海铁路公安局合肥公安处 这种表达
|
|
city_l.append((v, score * weight))
|
|
city_l.append((v, score * weight))
|
|
elif k in ['dist', 'dist1', 'dist2']:
|
|
elif k in ['dist', 'dist1', 'dist2']:
|
|
- if v in ['东区', '西区', '城区', '郊区', '矿区']:
|
|
|
|
|
|
+ if v in ['东区', '西区', '城区', '郊区', '矿区', '东至']:
|
|
continue
|
|
continue
|
|
- if v in full_dic['district'] and len(v)>2:
|
|
|
|
|
|
+ if v in full_dic['district'] and len(v) > 2:
|
|
score = 2
|
|
score = 2
|
|
else:
|
|
else:
|
|
score = 0.5
|
|
score = 0.5
|
|
if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
|
|
if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
|
|
- , text[it.end(k):]) or (re.match('\s*%s'%v, text) and it.start(k)<2) or re.search(
|
|
|
|
- '^((%s)|\-%s)'%(v, v), text[max(0, it.start(k)-1):]):
|
|
|
|
|
|
+ , text[it.end(k):]) or (
|
|
|
|
+ re.match('\s*%s' % v, text) and it.start(k) < 2) or re.search(
|
|
|
|
+ '^((%s)|\-%s)' % (v, v), text[max(0, it.start(k) - 1):]):
|
|
score += 0.5
|
|
score += 0.5
|
|
# score += it.end(k) / len(text) / 10
|
|
# score += it.end(k) / len(text) / 10
|
|
if v == '昌江' and '景德镇' not in it.group(0):
|
|
if v == '昌江' and '景德镇' not in it.group(0):
|
|
district_l.append(('昌江黎族', score * weight))
|
|
district_l.append(('昌江黎族', score * weight))
|
|
else:
|
|
else:
|
|
district_l.append((v, score * weight))
|
|
district_l.append((v, score * weight))
|
|
- return province_l, city_l, district_l
|
|
|
|
-
|
|
|
|
- def merge_score(province_l, city_l, district_l, filter_short_dist=True):
|
|
|
|
- '''
|
|
|
|
- 合并分数,下级地区分数加到上级
|
|
|
|
- :param province_l: 提取到的省份列表 [(name, score)]
|
|
|
|
- :param city_l: 提取到的城市列表 [(name, score)]
|
|
|
|
- :param district_l: 提取到的区县列表 [(name, score)]
|
|
|
|
- :param filter_short_dist: 是否过滤不在省份下的区县简称权重
|
|
|
|
- :return:
|
|
|
|
- '''
|
|
|
|
- pro_ids = dict()
|
|
|
|
- city_ids = dict()
|
|
|
|
- dis_ids = dict()
|
|
|
|
- for pro in province_l:
|
|
|
|
- name, score = pro
|
|
|
|
- idx = full_dic['province'][name] if name in full_dic['province'] else short_dic['province'][name]
|
|
|
|
- if idx not in pro_ids:
|
|
|
|
- pro_ids[idx] = 0
|
|
|
|
- pro_ids[idx] += score
|
|
|
|
-
|
|
|
|
- tmp_pro = {}
|
|
|
|
- for city in city_l:
|
|
|
|
- name, score = city
|
|
|
|
- if name in full_dic['city']:
|
|
|
|
- for idx in full_dic['city'][name]:
|
|
|
|
- if idx not in city_ids:
|
|
|
|
- city_ids[idx] = 0
|
|
|
|
- city_ids[idx] += score
|
|
|
|
- pro_idx = idx_dic[idx]['省']
|
|
|
|
- if pro_idx in tmp_pro:
|
|
|
|
- tmp_pro[pro_idx] += score
|
|
|
|
- else:
|
|
|
|
- tmp_pro[pro_idx] = score
|
|
|
|
- elif name in short_dic['city']:
|
|
|
|
- for idx in short_dic['city'][name]:
|
|
|
|
- if idx not in city_ids:
|
|
|
|
- city_ids[idx] = 0
|
|
|
|
- city_ids[idx] += score
|
|
|
|
- pro_idx = idx_dic[idx]['省']
|
|
|
|
- if pro_idx in tmp_pro:
|
|
|
|
- tmp_pro[pro_idx] += score
|
|
|
|
- else:
|
|
|
|
- tmp_pro[pro_idx] = score
|
|
|
|
- if set(tmp_pro) & set(pro_ids) != set():
|
|
|
|
- for k, v in tmp_pro.items():
|
|
|
|
- if k in pro_ids:
|
|
|
|
- pro_ids[k] += v
|
|
|
|
- else:
|
|
|
|
- pro_ids.update(tmp_pro)
|
|
|
|
- tmp_pro = {}
|
|
|
|
- tmp_city = {}
|
|
|
|
- for dis in district_l:
|
|
|
|
- name, score = dis
|
|
|
|
- if name in full_dic['district']:
|
|
|
|
- for idx in full_dic['district'][name]:
|
|
|
|
- if idx not in dis_ids:
|
|
|
|
- dis_ids[idx] = 0
|
|
|
|
- dis_ids[idx] += score
|
|
|
|
- pro_idx = idx_dic[idx]['省']
|
|
|
|
- if pro_idx in tmp_pro:
|
|
|
|
- tmp_pro[pro_idx] += score
|
|
|
|
- else:
|
|
|
|
- tmp_pro[pro_idx] = score
|
|
|
|
- city_idx = idx_dic[idx]['市']
|
|
|
|
- if city_idx in tmp_city:
|
|
|
|
- tmp_city[city_idx] += score
|
|
|
|
- else:
|
|
|
|
- tmp_city[city_idx] = score
|
|
|
|
- elif name in short_dic['district']:
|
|
|
|
- for idx in short_dic['district'][name]:
|
|
|
|
- if idx not in dis_ids:
|
|
|
|
- dis_ids[idx] = 0
|
|
|
|
- dis_ids[idx] += score
|
|
|
|
- pro_idx = idx_dic[idx]['省']
|
|
|
|
- if filter_short_dist and score < 1: # pro_idx not in pro_ids
|
|
|
|
- continue
|
|
|
|
- if pro_idx in tmp_pro:
|
|
|
|
- tmp_pro[pro_idx] += score
|
|
|
|
- else:
|
|
|
|
- tmp_pro[pro_idx] = score
|
|
|
|
- city_idx = idx_dic[idx]['市']
|
|
|
|
- if city_idx in tmp_city:
|
|
|
|
- tmp_city[city_idx] += score
|
|
|
|
- else:
|
|
|
|
- tmp_city[city_idx] = score
|
|
|
|
- if set(tmp_pro) & set(pro_ids) != set():
|
|
|
|
- for k, v in tmp_pro.items():
|
|
|
|
- if k in pro_ids:
|
|
|
|
- pro_ids[k] += v
|
|
|
|
- else:
|
|
|
|
- pro_ids.update(tmp_pro)
|
|
|
|
- if set(tmp_city) & set(city_ids) != set():
|
|
|
|
- for k, v in tmp_city.items():
|
|
|
|
- if k in city_ids:
|
|
|
|
- city_ids[k] += v
|
|
|
|
- else:
|
|
|
|
- city_ids.update(tmp_city)
|
|
|
|
- return pro_ids, city_ids, dis_ids
|
|
|
|
-
|
|
|
|
- def get_final_addr(pro_ids, city_ids, dis_ids):
|
|
|
|
- '''
|
|
|
|
- 先把所有匹配的全称、简称转为id,如果省份不为空,城市不为空且有城市属于省份的取该城市
|
|
|
|
- :param province_l: 匹配到的所有省份
|
|
|
|
- :param city_l: 匹配到的所有城市
|
|
|
|
- :param district_l: 匹配到的所有区县
|
|
|
|
- :return:
|
|
|
|
- '''
|
|
|
|
- big_area = ""
|
|
|
|
- pred_pro = ""
|
|
|
|
- pred_city = ""
|
|
|
|
- pred_dis = ""
|
|
|
|
-
|
|
|
|
- final_pro = ""
|
|
|
|
- final_city = ""
|
|
|
|
- prob = 0
|
|
|
|
- max_score = 0
|
|
|
|
- if len(pro_ids) >= 1:
|
|
|
|
- pro_l = sorted([(k, v) for k, v in pro_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
|
- scores = [it[1] for it in pro_l]
|
|
|
|
- prob = max(scores)/sum(scores)
|
|
|
|
- max_score = max(scores)
|
|
|
|
- final_pro, score = pro_l[0]
|
|
|
|
- if score >= 0.01:
|
|
|
|
- pred_pro = idx_dic[final_pro]['返回名称']
|
|
|
|
- big_area = idx_dic[final_pro]['大区']
|
|
|
|
- if pred_pro != "" and len(city_ids) >= 1:
|
|
|
|
- city_l = sorted([(k, v) for k, v in city_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
|
- for it in city_l:
|
|
|
|
- if idx_dic[it[0]]['省'] == final_pro:
|
|
|
|
- final_city = it[0]
|
|
|
|
- pred_city = idx_dic[final_city]['返回名称']
|
|
|
|
- break
|
|
|
|
- if final_city != "" and len(set(dis_ids)) >= 1:
|
|
|
|
- dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
|
- for it in dis_l:
|
|
|
|
- if idx_dic[it[0]]['市'] == final_city:
|
|
|
|
- pred_dis = idx_dic[it[0]]['返回名称']
|
|
|
|
- elif pred_pro != "" and pred_city == "" and len(set(dis_ids)) >= 1: # 20241111 省份不为空,市为空,如果区县在省份下,补充对应的市县
|
|
|
|
- dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
|
- for it in dis_l:
|
|
|
|
- if idx_dic[it[0]]['省'] == final_pro:
|
|
|
|
- pred_city = idx_dic[idx_dic[it[0]]['市']]['返回名称']
|
|
|
|
- pred_dis = idx_dic[it[0]]['返回名称']
|
|
|
|
- if pred_city in ['北京', '天津', '上海', '重庆']:
|
|
|
|
- pred_city = pred_dis
|
|
|
|
- pred_dis = ""
|
|
|
|
- return big_area, pred_pro, pred_city, pred_dis, prob, max_score
|
|
|
|
-
|
|
|
|
- def get_ree_addr(prem):
|
|
|
|
- tenderee = ""
|
|
|
|
- tenderee_address = ""
|
|
|
|
- try:
|
|
|
|
- for v in prem.values():
|
|
|
|
- for link in v['roleList']:
|
|
|
|
- if link['role_name'] == 'tenderee' and tenderee == "":
|
|
|
|
- tenderee = link['role_text']
|
|
|
|
- tenderee_address = link['address']
|
|
|
|
- except Exception as e:
|
|
|
|
- print('解析prem 获取招标人、及地址出错')
|
|
|
|
- return tenderee, tenderee_address
|
|
|
|
-
|
|
|
|
- def get_role_address(text):
|
|
|
|
- '''正则匹配获取招标人地址
|
|
|
|
- 3:地址直接在招标人后面 招标人:xxx,地址:xxx
|
|
|
|
- 4:招标、代理一起,两个地址一起 招标人:xxx, 代理人:xxx, 地址:xxx, 地址:xxx.
|
|
|
|
- '''
|
|
|
|
- p3 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
|
|
- p4 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
|
|
- p5 = '(采购|招标)(人|单位)(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
|
|
- if re.search(p3, text):
|
|
|
|
- return re.search(p3, text).group('addr')
|
|
|
|
- elif re.search(p4, text):
|
|
|
|
- return re.search(p4, text).group('addr')
|
|
|
|
- elif re.search(p5, text):
|
|
|
|
- return re.search(p5, text).group('addr')
|
|
|
|
- else:
|
|
|
|
- return ''
|
|
|
|
-
|
|
|
|
- def get_all_addr(list_entitys):
|
|
|
|
- tenderee_l = []
|
|
|
|
- addr_l = []
|
|
|
|
- for ent in list_entitys[0]:
|
|
|
|
- if ent.entity_type == 'location' and len(ent.entity_text) > 2:
|
|
|
|
- addr_l.append(ent.entity_text)
|
|
|
|
- elif ent.entity_type in ['org', 'company']:
|
|
|
|
- if ent.label in [0, 1]: # 加招标或代理
|
|
|
|
- tenderee_l.append(ent.entity_text)
|
|
|
|
- return ' '.join(addr_l), ' '.join(tenderee_l)
|
|
|
|
-
|
|
|
|
|
|
+ return province_l, city_l, district_l
|
|
|
|
+ @staticmethod
|
|
|
|
+ def merge_score(province_l, city_l, district_l, full_dic, short_dic, idx_dic, filter_short_dist=True):
|
|
|
|
+ '''
|
|
|
|
+ 合并分数,下级地区分数加到上级
|
|
|
|
+ :param province_l: 提取到的省份列表 [(name, score)]
|
|
|
|
+ :param city_l: 提取到的城市列表 [(name, score)]
|
|
|
|
+ :param district_l: 提取到的区县列表 [(name, score)]
|
|
|
|
+ :param filter_short_dist: 是否过滤不在省份下的区县简称权重
|
|
|
|
+ :return:
|
|
|
|
+ '''
|
|
|
|
+ pro_ids = dict()
|
|
|
|
+ city_ids = dict()
|
|
|
|
+ dis_ids = dict()
|
|
|
|
+ for pro in province_l:
|
|
|
|
+ name, score = pro
|
|
|
|
+ idx = full_dic['province'][name] if name in full_dic['province'] else short_dic['province'][name]
|
|
|
|
+ if idx not in pro_ids:
|
|
|
|
+ pro_ids[idx] = 0
|
|
|
|
+ pro_ids[idx] += score
|
|
|
|
+
|
|
|
|
+ tmp_pro = {}
|
|
|
|
+ for city in city_l:
|
|
|
|
+ name, score = city
|
|
|
|
+ if name in full_dic['city']:
|
|
|
|
+ for idx in full_dic['city'][name]:
|
|
|
|
+ if idx not in city_ids:
|
|
|
|
+ city_ids[idx] = 0
|
|
|
|
+ city_ids[idx] += score
|
|
|
|
+ pro_idx = idx_dic[idx]['省']
|
|
|
|
+ if pro_idx in tmp_pro:
|
|
|
|
+ tmp_pro[pro_idx] += score
|
|
|
|
+ else:
|
|
|
|
+ tmp_pro[pro_idx] = score
|
|
|
|
+ elif name in short_dic['city']:
|
|
|
|
+ for idx in short_dic['city'][name]:
|
|
|
|
+ if idx not in city_ids:
|
|
|
|
+ city_ids[idx] = 0
|
|
|
|
+ city_ids[idx] += score
|
|
|
|
+ pro_idx = idx_dic[idx]['省']
|
|
|
|
+ if pro_idx in tmp_pro:
|
|
|
|
+ tmp_pro[pro_idx] += score
|
|
|
|
+ else:
|
|
|
|
+ tmp_pro[pro_idx] = score
|
|
|
|
+ if set(tmp_pro) & set(pro_ids) != set():
|
|
|
|
+ for k, v in tmp_pro.items():
|
|
|
|
+ if k in pro_ids:
|
|
|
|
+ pro_ids[k] += v
|
|
|
|
+ else:
|
|
|
|
+ pro_ids.update(tmp_pro)
|
|
|
|
+ tmp_pro = {}
|
|
|
|
+ tmp_city = {}
|
|
|
|
+ for dis in district_l:
|
|
|
|
+ name, score = dis
|
|
|
|
+ if name in full_dic['district']:
|
|
|
|
+ for idx in full_dic['district'][name]:
|
|
|
|
+ if idx not in dis_ids:
|
|
|
|
+ dis_ids[idx] = 0
|
|
|
|
+ dis_ids[idx] += score
|
|
|
|
+ pro_idx = idx_dic[idx]['省']
|
|
|
|
+ if pro_idx in tmp_pro:
|
|
|
|
+ tmp_pro[pro_idx] += score
|
|
|
|
+ else:
|
|
|
|
+ tmp_pro[pro_idx] = score
|
|
|
|
+ city_idx = idx_dic[idx]['市']
|
|
|
|
+ if city_idx in tmp_city:
|
|
|
|
+ tmp_city[city_idx] += score
|
|
|
|
+ else:
|
|
|
|
+ tmp_city[city_idx] = score
|
|
|
|
+ elif name in short_dic['district']:
|
|
|
|
+ for idx in short_dic['district'][name]:
|
|
|
|
+ if idx not in dis_ids:
|
|
|
|
+ dis_ids[idx] = 0
|
|
|
|
+ dis_ids[idx] += score
|
|
|
|
+ pro_idx = idx_dic[idx]['省']
|
|
|
|
+ if filter_short_dist and score < 1: # pro_idx not in pro_ids
|
|
|
|
+ continue
|
|
|
|
+ if pro_idx in tmp_pro:
|
|
|
|
+ tmp_pro[pro_idx] += score
|
|
|
|
+ else:
|
|
|
|
+ tmp_pro[pro_idx] = score
|
|
|
|
+ city_idx = idx_dic[idx]['市']
|
|
|
|
+ if city_idx in tmp_city:
|
|
|
|
+ tmp_city[city_idx] += score
|
|
|
|
+ else:
|
|
|
|
+ tmp_city[city_idx] = score
|
|
|
|
+ if set(tmp_pro) & set(pro_ids) != set():
|
|
|
|
+ for k, v in tmp_pro.items():
|
|
|
|
+ if k in pro_ids:
|
|
|
|
+ pro_ids[k] += v
|
|
|
|
+ else:
|
|
|
|
+ pro_ids.update(tmp_pro)
|
|
|
|
+ if set(tmp_city) & set(city_ids) != set():
|
|
|
|
+ for k, v in tmp_city.items():
|
|
|
|
+ if k in city_ids:
|
|
|
|
+ city_ids[k] += v
|
|
|
|
+ else:
|
|
|
|
+ city_ids.update(tmp_city)
|
|
|
|
+ return pro_ids, city_ids, dis_ids
|
|
|
|
+ @staticmethod
|
|
|
|
+ def get_final_addr(pro_ids, city_ids, dis_ids, idx_dic):
|
|
|
|
+ '''
|
|
|
|
+ 先把所有匹配的全称、简称转为id,如果省份不为空,城市不为空且有城市属于省份的取该城市
|
|
|
|
+ :param province_l: 匹配到的所有省份
|
|
|
|
+ :param city_l: 匹配到的所有城市
|
|
|
|
+ :param district_l: 匹配到的所有区县
|
|
|
|
+ :return:
|
|
|
|
+ '''
|
|
|
|
+ big_area = ""
|
|
|
|
+ pred_pro = ""
|
|
|
|
+ pred_city = ""
|
|
|
|
+ pred_dis = ""
|
|
|
|
+
|
|
|
|
+ final_pro = ""
|
|
|
|
+ final_city = ""
|
|
|
|
+ prob = 0
|
|
|
|
+ max_score = 0
|
|
|
|
+ if len(pro_ids) >= 1:
|
|
|
|
+ pro_l = sorted([(k, v) for k, v in pro_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
|
+ scores = [it[1] for it in pro_l]
|
|
|
|
+ prob = max(scores) / sum(scores)
|
|
|
|
+ max_score = max(scores)
|
|
|
|
+ final_pro, score = pro_l[0]
|
|
|
|
+ if score >= 0.01:
|
|
|
|
+ pred_pro = idx_dic[final_pro]['返回名称']
|
|
|
|
+ big_area = idx_dic[final_pro]['大区']
|
|
|
|
+ if pred_pro != "" and len(city_ids) >= 1:
|
|
|
|
+ city_l = sorted([(k, v) for k, v in city_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
|
+ for it in city_l:
|
|
|
|
+ if idx_dic[it[0]]['省'] == final_pro:
|
|
|
|
+ final_city = it[0]
|
|
|
|
+ pred_city = idx_dic[final_city]['返回名称']
|
|
|
|
+ break
|
|
|
|
+ if final_city != "" and len(set(dis_ids)) >= 1:
|
|
|
|
+ dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
|
+ for it in dis_l:
|
|
|
|
+ if idx_dic[it[0]]['市'] == final_city:
|
|
|
|
+ pred_dis = idx_dic[it[0]]['返回名称']
|
|
|
|
+ elif pred_pro != "" and pred_city == "" and len(set(dis_ids)) >= 1: # 20241111 省份不为空,市为空,如果区县在省份下,补充对应的市县
|
|
|
|
+ dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
|
+ for it in dis_l:
|
|
|
|
+ if idx_dic[it[0]]['省'] == final_pro:
|
|
|
|
+ pred_city = idx_dic[idx_dic[it[0]]['市']]['返回名称']
|
|
|
|
+ pred_dis = idx_dic[it[0]]['返回名称']
|
|
|
|
+ return big_area, pred_pro, pred_city, pred_dis, prob, max_score
|
|
|
|
+ @staticmethod
|
|
|
|
+ def get_ree_addr(prem):
|
|
|
|
+ tenderee = ""
|
|
|
|
+ tenderee_address = ""
|
|
|
|
+ try:
|
|
|
|
+ for v in prem.values():
|
|
|
|
+ for link in v['roleList']:
|
|
|
|
+ if link['role_name'] == 'tenderee' and tenderee == "":
|
|
|
|
+ tenderee = link['role_text']
|
|
|
|
+ tenderee_address = link['address']
|
|
|
|
+ except Exception as e:
|
|
|
|
+ print('解析prem 获取招标人、及地址出错')
|
|
|
|
+ return tenderee, tenderee_address
|
|
|
|
+ @staticmethod
|
|
|
|
+ def get_role_address(text):
|
|
|
|
+ '''正则匹配获取招标人地址
|
|
|
|
+ 3:地址直接在招标人后面 招标人:xxx,地址:xxx
|
|
|
|
+ 4:招标、代理一起,两个地址一起 招标人:xxx, 代理人:xxx, 地址:xxx, 地址:xxx.
|
|
|
|
+ '''
|
|
|
|
+ p3 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
|
|
+ p4 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
|
|
+ p5 = '(采购|招标)(人|单位)(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
|
|
+ if re.search(p3, text):
|
|
|
|
+ return re.search(p3, text).group('addr')
|
|
|
|
+ elif re.search(p4, text):
|
|
|
|
+ return re.search(p4, text).group('addr')
|
|
|
|
+ elif re.search(p5, text):
|
|
|
|
+ return re.search(p5, text).group('addr')
|
|
|
|
+ else:
|
|
|
|
+ return ''
|
|
|
|
+ @staticmethod
|
|
|
|
+ def get_all_addr(list_entity):
|
|
|
|
+ tenderee_l = []
|
|
|
|
+ addr_l = []
|
|
|
|
+ for ent in list_entity:
|
|
|
|
+ if ent.entity_type == 'location' and len(ent.entity_text) > 2:
|
|
|
|
+ addr_l.append(ent.entity_text)
|
|
|
|
+ elif ent.entity_type in ['org', 'company']:
|
|
|
|
+ if ent.label in [0, 1]: # 加招标或代理
|
|
|
|
+ tenderee_l.append(ent.entity_text)
|
|
|
|
+ return ' '.join(addr_l), ' '.join(tenderee_l)
|
|
|
|
+
|
|
|
|
+ def predict_area(self, title, content, web_source_name, prem={}, addr_dic={}, list_entity=[]):
|
|
area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
|
|
area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
|
|
addr_project = addr_dic.get('addr_project', '')
|
|
addr_project = addr_dic.get('addr_project', '')
|
|
addr_delivery = addr_dic.get('addr_delivery', '')
|
|
addr_delivery = addr_dic.get('addr_delivery', '')
|
|
addr_bidopen = addr_dic.get('addr_bidopen', '')
|
|
addr_bidopen = addr_dic.get('addr_bidopen', '')
|
|
addr_bidsend = addr_dic.get('addr_bidsend', '')
|
|
addr_bidsend = addr_dic.get('addr_bidsend', '')
|
|
- province_l, city_l, district_l = find_whole_areas('%s %s %s'%(title, addr_delivery, addr_project))
|
|
|
|
- pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
|
|
|
|
- big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
|
|
|
|
|
|
+ addr_contact = addr_dic.get('addr_contact', '')
|
|
|
|
+ in_content = False
|
|
|
|
+ province_l, city_l, district_l = self.find_whole_areas('%s %s'%(title, addr_project), self.pettern, self.area_variance_dic, self.full_dic)
|
|
|
|
+ pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
|
|
|
|
+ big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
|
|
# print('关键词1:', province_l, city_l, district_l)
|
|
# print('关键词1:', province_l, city_l, district_l)
|
|
# print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
# print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
if pred_city == "" or prob < 0.7 or max_score<2:
|
|
if pred_city == "" or prob < 0.7 or max_score<2:
|
|
- ree, addr = get_ree_addr(prem)
|
|
|
|
- rule_ree_addr = get_role_address(content)
|
|
|
|
|
|
+ ree, addr = self.get_ree_addr(prem)
|
|
|
|
+ if ree in title:
|
|
|
|
+ ree = '##'
|
|
|
|
+ rule_ree_addr = self.get_role_address(content)
|
|
if rule_ree_addr:
|
|
if rule_ree_addr:
|
|
addr = rule_ree_addr
|
|
addr = rule_ree_addr
|
|
|
|
|
|
# addr = content
|
|
# addr = content
|
|
# ree = ''
|
|
# ree = ''
|
|
- province_l2, city_l2, district_l2 = find_whole_areas('%s %s' % (ree, addr), weight=0.8)
|
|
|
|
|
|
+ province_l2, city_l2, district_l2 = self.find_whole_areas('%s %s %s %s' % (ree, addr, addr_contact, addr_delivery), self.pettern, self.area_variance_dic, self.full_dic, weight=0.8)
|
|
province_l.extend(province_l2)
|
|
province_l.extend(province_l2)
|
|
city_l.extend(city_l2)
|
|
city_l.extend(city_l2)
|
|
district_l.extend(district_l2)
|
|
district_l.extend(district_l2)
|
|
- pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
|
|
|
|
- big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
|
|
|
|
|
|
+ pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
|
|
|
|
+ big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
|
|
# print('关键词2:', province_l, city_l, district_l)
|
|
# print('关键词2:', province_l, city_l, district_l)
|
|
# print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
# print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
if pred_city == "" or prob < 0.7 or max_score<2:
|
|
if pred_city == "" or prob < 0.7 or max_score<2:
|
|
- province_l3, city_l3, district_l3 = find_whole_areas('%s %s %s'%(web_source_name, addr_bidopen, addr_bidsend), weight=0.6)
|
|
|
|
|
|
+ province_l3, city_l3, district_l3 = self.find_whole_areas('%s %s'%(addr_bidopen, addr_bidsend), self.pettern, self.area_variance_dic, self.full_dic, weight=0.6)
|
|
province_l.extend(province_l3)
|
|
province_l.extend(province_l3)
|
|
city_l.extend(city_l3)
|
|
city_l.extend(city_l3)
|
|
district_l.extend(district_l3)
|
|
district_l.extend(district_l3)
|
|
- pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
|
|
|
|
- big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
|
|
|
|
|
|
+ pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
|
|
|
|
+ big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
|
|
# print('关键词3:', province_l, city_l, district_l)
|
|
# print('关键词3:', province_l, city_l, district_l)
|
|
# print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
# print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
|
|
+ if pred_city == "" or prob < 0.6 or max_score < 2:
|
|
|
|
+ all_addr, tenderees = self.get_all_addr(list_entity)
|
|
|
|
+ province_l4, city_l4, district_l4 = self.find_whole_areas('%s %s %s' % (web_source_name, tenderees, all_addr), self.pettern, self.area_variance_dic, self.full_dic, weight=0.3)
|
|
|
|
+ province_l.extend(province_l4)
|
|
|
|
+ city_l.extend(city_l4)
|
|
|
|
+ district_l.extend(district_l4)
|
|
|
|
+ pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
|
|
|
|
+ big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids,dis_ids, self.idx_dic)
|
|
|
|
+ if prob < 0.6 or max_score < 4:
|
|
|
|
+ in_content = True
|
|
|
|
+ # print('关键词4:', province_l, city_l, district_l)
|
|
|
|
+ # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
|
|
+
|
|
|
|
+ if pred_city in ['北京', '天津', '上海', '重庆']:
|
|
|
|
+ pred_city = pred_dis
|
|
|
|
+ pred_dis = ""
|
|
|
|
|
|
- in_content = False
|
|
|
|
if big_area != "":
|
|
if big_area != "":
|
|
area_dic['area'] = big_area
|
|
area_dic['area'] = big_area
|
|
if pred_pro != "":
|
|
if pred_pro != "":
|
|
@@ -6155,8 +6177,9 @@ class DistrictPredictor():
|
|
area_dic['city'] = pred_city
|
|
area_dic['city'] = pred_city
|
|
if pred_dis != "":
|
|
if pred_dis != "":
|
|
area_dic['district'] = pred_dis
|
|
area_dic['district'] = pred_dis
|
|
- if in_content:
|
|
|
|
- area_dic['is_in_text'] = True
|
|
|
|
|
|
+ area_dic['is_in_text'] = in_content
|
|
|
|
+ # area_dic['prob'] = prob
|
|
|
|
+ # area_dic['max_score'] = max_score
|
|
return {'district': area_dic}
|
|
return {'district': area_dic}
|
|
|
|
|
|
def get_area(self, text, web_name, in_content=False):
|
|
def get_area(self, text, web_name, in_content=False):
|
|
@@ -6607,6 +6630,14 @@ class DistrictPredictor():
|
|
class TableTag2List():
|
|
class TableTag2List():
|
|
'''把soup table 转化为表格补全后的文本列表[[td, td, td], [td, td, td]]'''
|
|
'''把soup table 转化为表格补全后的文本列表[[td, td, td], [td, td, td]]'''
|
|
def table2list(self, table, text_process=None, return_html_table=False,return_kv=False):
|
|
def table2list(self, table, text_process=None, return_html_table=False,return_kv=False):
|
|
|
|
+ '''
|
|
|
|
+ 表格补全及把表格内容列表返回
|
|
|
|
+ :param table:
|
|
|
|
+ :param text_process: 预处理方法,segment(),不为None 时把td内容做预处理,结果返回加标签,适配表头识别 [[[text, 0], [text, 0]] ], 否则只返回文本[[text, text], [text, text]]
|
|
|
|
+ :param return_html_table:
|
|
|
|
+ :param return_kv:
|
|
|
|
+ :return:
|
|
|
|
+ '''
|
|
self._output = []
|
|
self._output = []
|
|
row_ind = 0
|
|
row_ind = 0
|
|
col_ind = 0
|
|
col_ind = 0
|
|
@@ -6618,6 +6649,8 @@ class TableTag2List():
|
|
|
|
|
|
if len(row.find_all(['td', 'th'], recursive=False)) > 20:
|
|
if len(row.find_all(['td', 'th'], recursive=False)) > 20:
|
|
log('未补全前表格列数大于20的不做表格处理')
|
|
log('未补全前表格列数大于20的不做表格处理')
|
|
|
|
+ if return_html_table:
|
|
|
|
+ return [], []
|
|
return []
|
|
return []
|
|
|
|
|
|
for cell in row.children:
|
|
for cell in row.children:
|
|
@@ -6676,6 +6709,8 @@ class TableTag2List():
|
|
# update col_ind
|
|
# update col_ind
|
|
col_ind += col_span
|
|
col_ind += col_span
|
|
if col_ind > 50 and text_process == None: # 表格要素提取及候选人提取的 表格列数大于50的去掉
|
|
if col_ind > 50 and text_process == None: # 表格要素提取及候选人提取的 表格列数大于50的去掉
|
|
|
|
+ if return_html_table:
|
|
|
|
+ return [], []
|
|
return []
|
|
return []
|
|
|
|
|
|
# update row_ind
|
|
# update row_ind
|
|
@@ -6773,7 +6808,7 @@ class TablePremExtractor(object):
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$|^品目$",
|
|
'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$|^品目$",
|
|
"project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
|
|
"project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
- 'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因',
|
|
|
|
|
|
+ 'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因|中标情况',
|
|
"tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)(名称|$)|^(拟定|单一来源|邀请|拟?推荐(入选|入围)?)?供应商(名称)?$",
|
|
"tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)(名称|$)|^(拟定|单一来源|邀请|拟?推荐(入选|入围)?)?供应商(名称)?$",
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
"tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
|
|
"budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
|
|
"budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
|
|
@@ -6814,6 +6849,7 @@ class TablePremExtractor(object):
|
|
for i in range(len(td_list)) :
|
|
for i in range(len(td_list)) :
|
|
text = td_list[i]
|
|
text = td_list[i]
|
|
text = re.sub('\s|[((]排名不分先后[))]', '', text)
|
|
text = re.sub('\s|[((]排名不分先后[))]', '', text)
|
|
|
|
+ text = re.sub('排名价', '', text) # 20241225 修复 252208201 排名价(元)错误为排名
|
|
text = re.sub('^人选', '入选', text)
|
|
text = re.sub('^人选', '入选', text)
|
|
if text == '备选中标人':
|
|
if text == '备选中标人':
|
|
text = '第二候选人'
|
|
text = '第二候选人'
|
|
@@ -6949,6 +6985,8 @@ class TablePremExtractor(object):
|
|
break
|
|
break
|
|
if win_or_not != "" and (re.search('(建议|推荐)(中标|成交|中选)|是|^(中标|成交|中选)', win_or_not)==None or re.search('\w', win_or_not)==None): # 2024/04/2 修复 252208201 为空的不中标
|
|
if win_or_not != "" and (re.search('(建议|推荐)(中标|成交|中选)|是|^(中标|成交|中选)', win_or_not)==None or re.search('\w', win_or_not)==None): # 2024/04/2 修复 252208201 为空的不中标
|
|
continue
|
|
continue
|
|
|
|
+ elif 'win_or_not' in headers and win_or_not == '': # 2024/12/25 修复 334753545 中标情况为空的不中标
|
|
|
|
+ continue
|
|
if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
|
|
if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
|
|
continue
|
|
continue
|
|
if win_sort == "" and "tenderer" in headers and re.search('候选|入围|入选', headers['tenderer'][1]) and re.search('推荐的?((中标|成交|中选)候选人|(候选|入围|入选)供应商)', headers['tenderer'][1])==None and all_winner == False:
|
|
if win_sort == "" and "tenderer" in headers and re.search('候选|入围|入选', headers['tenderer'][1]) and re.search('推荐的?((中标|成交|中选)候选人|(候选|入围|入选)供应商)', headers['tenderer'][1])==None and all_winner == False:
|
|
@@ -7288,8 +7326,8 @@ class CandidateExtractor(object):
|
|
'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
|
|
'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
|
|
"project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)|^标的$",
|
|
"project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)|^标的$",
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
"win_sort": "排名|排序|名次|推荐顺序",
|
|
- 'win_or_not': '是否(建议|推荐)?(中标|成交)|是否入围|是否入库|入围结论',
|
|
|
|
- "candidate": "((候选|入围|入选|投标|应答|响应)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位", #补充 368295593 投标个人/单位 提取
|
|
|
|
|
|
+ 'win_or_not': '是否(建议|推荐)?(中标|成交)|是否入围|是否入库|入围结论|^选择设备$', # 补充站源特别表达:例:577351909 选择设备 1 为中标 0 非中标
|
|
|
|
+ "candidate": "((候选|入围|入选|投标|应答|响应)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位|^公司名称$", #补充 368295593 投标个人/单位 提取
|
|
"bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格",
|
|
"bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格",
|
|
"win_tenderer": "第一名|第一(中标|成交)?候选人",
|
|
"win_tenderer": "第一名|第一(中标|成交)?候选人",
|
|
"second_tenderer": "第二名|第二(中标|成交)?候选人",
|
|
"second_tenderer": "第二名|第二(中标|成交)?候选人",
|
|
@@ -7519,8 +7557,12 @@ class CandidateExtractor(object):
|
|
role_type = ""
|
|
role_type = ""
|
|
if re.search('第[一1]|^[一1]$', win_sort):
|
|
if re.search('第[一1]|^[一1]$', win_sort):
|
|
role_type = "win_tenderer"
|
|
role_type = "win_tenderer"
|
|
|
|
+ if win_or_not in ['否', '未中标', '0']: # 修复特别站源表达 577351909 选择设备:0 不是中标
|
|
|
|
+ role_type = ''
|
|
elif re.search('第[二2]|^[二2]$', win_sort):
|
|
elif re.search('第[二2]|^[二2]$', win_sort):
|
|
role_type = "second_tenderer"
|
|
role_type = "second_tenderer"
|
|
|
|
+ if win_or_not in ['是', '1']:
|
|
|
|
+ role_type = "win_tenderer"
|
|
elif re.search('第[三3]|^[三3]$', win_sort):
|
|
elif re.search('第[三3]|^[三3]$', win_sort):
|
|
role_type = "third_tenderer"
|
|
role_type = "third_tenderer"
|
|
if role_type != "":
|
|
if role_type != "":
|
|
@@ -8298,12 +8340,18 @@ class BiddingScore():
|
|
|
|
|
|
class EntityTypeRulePredictor():
|
|
class EntityTypeRulePredictor():
|
|
def __init__(self):
|
|
def __init__(self):
|
|
- self.pattern_addr_bidopen = '([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选))?(会议)?地[点址]([((]网址[))])?[:为]'
|
|
|
|
- self.pattern_addr_bidsend = '((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)地[点址]([((]网址[))])?[:为]'
|
|
|
|
- self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|卸货)((期|时间)[及和、])?)?地[点址]?[:为]'
|
|
|
|
- self.pattern_addr_project = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(实施|服务)?(地址|地点|位置|所在地区?)(位于)?[:为]|项目位于'
|
|
|
|
|
|
+ self.pattern_addr_bidopen = '([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选))?(会议)?地[点址区]([((]网址[))])?[:为]'
|
|
|
|
+ self.pattern_addr_bidsend = '((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)地[点址区]([((]网址[))])?[:为]'
|
|
|
|
+ self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|供货|卸货)((期|时间)[及和、])?)?地[点址区]?[:为]'
|
|
|
|
+ self.pattern_addr_project = '(项目|施工|实施|建设|工程|服务|展示|看样|拍卖)(实施|服务)?(地[点址区]|位置|所在地区?)(位于)?[:为]|项目位于|所在(区域|地区):|存放地[点址]?[:为]'
|
|
|
|
+ self.pattern_addr_contact = '(联系|收件人?|邮寄)地[点址区][:为]|行政区:'
|
|
self.pattern_time_planned = '(计划|预计|预期)(采购|招标|发包)时间|招标(公告|文件)(预计|预期|计划)发布时间'
|
|
self.pattern_time_planned = '(计划|预计|预期)(采购|招标|发包)时间|招标(公告|文件)(预计|预期|计划)发布时间'
|
|
self.pattern_code_investment = '投资(审批)?项目[编代]码[:为]'
|
|
self.pattern_code_investment = '投资(审批)?项目[编代]码[:为]'
|
|
|
|
+ self.pattern_addr_dic = {'addr_bidopen': self.pattern_addr_bidopen,
|
|
|
|
+ 'addr_bidsend': self.pattern_addr_bidsend,
|
|
|
|
+ 'addr_delivery': self.pattern_addr_delivery,
|
|
|
|
+ 'addr_project': self.pattern_addr_project,
|
|
|
|
+ 'addr_contact': self.pattern_addr_contact}
|
|
def predict(self, list_entitys, list_sentences, list_articles):
|
|
def predict(self, list_entitys, list_sentences, list_articles):
|
|
addr_dic = {}
|
|
addr_dic = {}
|
|
time_dic = {}
|
|
time_dic = {}
|
|
@@ -8313,14 +8361,10 @@ class EntityTypeRulePredictor():
|
|
b = entity.wordOffset_begin
|
|
b = entity.wordOffset_begin
|
|
s_index = entity.sentence_index
|
|
s_index = entity.sentence_index
|
|
sentance_text = list_sentences[0][s_index].sentence_text
|
|
sentance_text = list_sentences[0][s_index].sentence_text
|
|
- if re.search(self.pattern_addr_bidopen, sentance_text[max(0, b-10): b]):
|
|
|
|
- addr_dic['addr_bidopen'] = entity.entity_text
|
|
|
|
- elif re.search(self.pattern_addr_bidsend, sentance_text[max(0, b-10): b]):
|
|
|
|
- addr_dic['addr_bidsend'] = entity.entity_text
|
|
|
|
- elif re.search(self.pattern_addr_delivery, sentance_text[max(0, b-10): b]):
|
|
|
|
- addr_dic['addr_delivery'] = entity.entity_text
|
|
|
|
- elif re.search(self.pattern_addr_project, sentance_text[max(0, b-10): b]):
|
|
|
|
- addr_dic['addr_project'] = entity.entity_text
|
|
|
|
|
|
+ for k, v in self.pattern_addr_dic.items():
|
|
|
|
+ v = v.replace('[:为]', '')
|
|
|
|
+ if re.search(v, sentance_text[max(0, b-10): b]) and len(entity.entity_text)>2:
|
|
|
|
+ addr_dic[k] = entity.entity_text
|
|
elif entity.entity_type == 'time':
|
|
elif entity.entity_type == 'time':
|
|
b = entity.wordOffset_begin
|
|
b = entity.wordOffset_begin
|
|
s_index = entity.sentence_index
|
|
s_index = entity.sentence_index
|
|
@@ -8334,14 +8378,14 @@ class EntityTypeRulePredictor():
|
|
if code_investment == '' and re.search(self.pattern_code_investment, sentance_text[max(0, b-12): b]):
|
|
if code_investment == '' and re.search(self.pattern_code_investment, sentance_text[max(0, b-12): b]):
|
|
code_investment = entity.entity_text
|
|
code_investment = entity.entity_text
|
|
|
|
|
|
- ser1 = re.search('(%s)(?P<addr>[\w()-]{5,100})[,。]'%self.pattern_addr_bidopen, list_articles[0].content)
|
|
|
|
- ser2 = re.search('(%s)(?P<addr>[\w()-]{5,100})[,。]'%self.pattern_addr_bidsend, list_articles[0].content)
|
|
|
|
|
|
+ ser1 = re.search('(%s)(?P<addr>[\w():\.-]{5,100})[,。]'%self.pattern_addr_bidopen, list_articles[0].content)
|
|
|
|
+ ser2 = re.search('(%s)(?P<addr>[\w():\.-]{5,100})[,。]'%self.pattern_addr_bidsend, list_articles[0].content)
|
|
ser3 = re.search('(%s)(?P<addr>[\w()-]{5,100})[,。]'%self.pattern_addr_delivery, list_articles[0].content)
|
|
ser3 = re.search('(%s)(?P<addr>[\w()-]{5,100})[,。]'%self.pattern_addr_delivery, list_articles[0].content)
|
|
ser4 = re.search('(%s)(?P<addr>[\w()-]{5,100})[,。]'%self.pattern_addr_project, list_articles[0].content)
|
|
ser4 = re.search('(%s)(?P<addr>[\w()-]{5,100})[,。]'%self.pattern_addr_project, list_articles[0].content)
|
|
ser5 = re.search('(%s)(?P<code>[\da-zA-Z()-]{5,30})[,。]'%self.pattern_code_investment, list_articles[0].content)
|
|
ser5 = re.search('(%s)(?P<code>[\da-zA-Z()-]{5,30})[,。]'%self.pattern_code_investment, list_articles[0].content)
|
|
- if ser1 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|http', ser1.group('addr')) and addr_dic.get('addr_bidopen', '') in ser1.group('addr'):
|
|
|
|
|
|
+ if ser1 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|平台|公司', ser1.group('addr')) and addr_dic.get('addr_bidopen', '') in ser1.group('addr'):
|
|
addr_dic['addr_bidopen'] = ser1.group('addr')
|
|
addr_dic['addr_bidopen'] = ser1.group('addr')
|
|
- if ser2 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|http', ser2.group('addr')) and addr_dic.get('addr_bidsend', '') in ser2.group('addr'):
|
|
|
|
|
|
+ if ser2 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|平台|公司', ser2.group('addr')) and addr_dic.get('addr_bidsend', '') in ser2.group('addr'):
|
|
addr_dic['addr_bidsend'] = ser2.group('addr')
|
|
addr_dic['addr_bidsend'] = ser2.group('addr')
|
|
if ser3 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]', ser3.group('addr')) and addr_dic.get('addr_delivery', '') in ser3.group('addr'):
|
|
if ser3 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]', ser3.group('addr')) and addr_dic.get('addr_delivery', '') in ser3.group('addr'):
|
|
addr_dic['addr_delivery'] = ser3.group('addr')
|
|
addr_dic['addr_delivery'] = ser3.group('addr')
|
|
@@ -8709,20 +8753,20 @@ if __name__=="__main__":
|
|
# print(rs)
|
|
# print(rs)
|
|
|
|
|
|
docid = ""
|
|
docid = ""
|
|
- title = '甘肃省妇幼保健院(甘肃省中心医院)2024年度大额资金定期存款竞争性存放项目(第二期)采购结果公告'
|
|
|
|
|
|
+ title = '甘肃省妇幼保健院(甘肃省中心医院)(第二期)采购结果公告'
|
|
with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
html = f.read()
|
|
html = f.read()
|
|
- # tb_extract = TablePremExtractor()
|
|
|
|
- # rs = tb_extract.predict(html, [
|
|
|
|
- # "江苏中联铸本混凝土有限公司",
|
|
|
|
- # "鼓楼区协荣机械设备经销部"
|
|
|
|
- # ], web_source_name = '', all_winner=False)
|
|
|
|
- # print('标段数:',len(rs[0]))
|
|
|
|
- # print(rs)
|
|
|
|
- bdscore = BiddingScore()
|
|
|
|
- rs = bdscore.predict(html)
|
|
|
|
- print(type(rs), len(rs))
|
|
|
|
|
|
+ tb_extract = TablePremExtractor()
|
|
|
|
+ rs = tb_extract.predict(html, [
|
|
|
|
+ "江苏中联铸本混凝土有限公司",
|
|
|
|
+ "鼓楼区协荣机械设备经销部"
|
|
|
|
+ ], web_source_name = '', all_winner=False)
|
|
|
|
+ print('标段数:',len(rs[0]))
|
|
print(rs)
|
|
print(rs)
|
|
|
|
+ # bdscore = BiddingScore()
|
|
|
|
+ # rs = bdscore.predict(html)
|
|
|
|
+ # print(type(rs), len(rs))
|
|
|
|
+ # print(rs)
|
|
|
|
|
|
# # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
|
|
# # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
|
|
# # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]
|
|
# # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]
|