|
@@ -3116,7 +3116,7 @@ class ProductAttributesPredictor():
|
|
|
'''
|
|
|
items = [re.sub('\s', '', it) for it in items]
|
|
|
flag = False
|
|
|
- header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': '', '采购人':'', '备注':'','发布日期':''}
|
|
|
+ header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': '', '采购人':'', '备注':'','发布日期':'', '品目号':'', '品目名':''}
|
|
|
product = "" # 产品
|
|
|
quantity = "" # 数量
|
|
|
quantity_unit = "" # 数量单位
|
|
@@ -3132,6 +3132,8 @@ class ProductAttributesPredictor():
|
|
|
tenderee = "" # 采购人
|
|
|
notes = "" # 备注 2024/3/27 达仁 需求
|
|
|
issue_date = "" # 发布日期 2024/3/27 达仁 需求
|
|
|
+ pinmu_no = "" # 品目号
|
|
|
+ pinmu_name = "" # 品目名称
|
|
|
|
|
|
# for i in range(min(6, len(items))):
|
|
|
for i in range(len(items)):
|
|
@@ -3168,6 +3170,12 @@ class ProductAttributesPredictor():
|
|
|
if flag:
|
|
|
# for j in range(i + 1, len(items)):
|
|
|
for j in range(len(items)):
|
|
|
+ if header_dic['品目号'] == "" and re.search('(品目|品类)(编?号|编码|序号)', items[j]):
|
|
|
+ header_dic['品目号'] = j
|
|
|
+ pinmu_no = items[j]
|
|
|
+ elif header_dic['品目名'] == "" and re.search('(品目|品类)名称|采购(品目|品类)$', items[j]):
|
|
|
+ header_dic['品目名'] = j
|
|
|
+ pinmu_name = items[j]
|
|
|
if items[j] in [product, category]:
|
|
|
continue
|
|
|
if len(items[j]) > 20 and len(re.sub('[\((].*[)\)]|[^\u4e00-\u9fa5]', '', items[j])) > 10:
|
|
@@ -3220,9 +3228,9 @@ class ProductAttributesPredictor():
|
|
|
# if num >=2:
|
|
|
# return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
|
|
|
if set([quantity, brand, specs, unitPrice, total_price])!=set([""]) or set([demand, budget])!=set([""]):
|
|
|
- return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee, notes,issue_date)
|
|
|
+ return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter, pinmu_no, pinmu_name), (product, demand, budget, order_time,tenderee, notes,issue_date)
|
|
|
flag = False
|
|
|
- return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee,notes,issue_date)
|
|
|
+ return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter, pinmu_no, pinmu_name), (product, demand, budget, order_time,tenderee,notes,issue_date)
|
|
|
|
|
|
def predict(self, docid='', html='', page_time=""):
|
|
|
'''
|
|
@@ -3390,12 +3398,13 @@ class ProductAttributesPredictor():
|
|
|
tenderee = "" # 采购人
|
|
|
notes = '' # 备注
|
|
|
issue_date = '' # 发布日期
|
|
|
+ pinmu_no = '' # 品目号
|
|
|
+ pinmu_name = '' # 品目名称
|
|
|
if len(set([re.sub('[::\s]','',td) for td in tds]) & self.header_set) > len(tds) * 0.4:
|
|
|
# if len(set(tds) & self.header_set) > len(tds) * 0.2:
|
|
|
header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p0, self.p1, self.p2)
|
|
|
if found_header:
|
|
|
header_colnum = len(tds) # 保存表头所在行列数
|
|
|
- # print('发现表头:', header_colnum, header_dic)
|
|
|
if found_header and isinstance(header_list, tuple) and len(header_list) > 2: # 获取表头中的 数量单位
|
|
|
quantity_header = header_list[1].replace('单位:', '')
|
|
|
if re.search('(([\w/]{,5}))', quantity_header):
|
|
@@ -3433,6 +3442,8 @@ class ProductAttributesPredictor():
|
|
|
|
|
|
id12 = header_dic.get('备注', "")
|
|
|
id13 = header_dic.get('发布日期', "")
|
|
|
+ id14 = header_dic.get('品目号', "")
|
|
|
+ id15 = header_dic.get('品目名', "")
|
|
|
|
|
|
not_attr = 0
|
|
|
for k, v in header_dic.items():
|
|
@@ -3528,6 +3539,10 @@ class ProductAttributesPredictor():
|
|
|
notes = tds[id12].strip()
|
|
|
if id13 != "":
|
|
|
issue_date = self.fix_time(tds[id13].strip(), '', '')[0]
|
|
|
+ if id14 != "":
|
|
|
+ pinmu_no = tds[id14].strip()
|
|
|
+ if id15 != "":
|
|
|
+ pinmu_name = tds[id15].strip()
|
|
|
# print('数量:{0}, 单价:{1}, 品牌:{2}, 规格:{3},总价:{4}'.format(quantity ,unitPrice, brand, specs, total_price))
|
|
|
if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price or '单价' in header_dic or '总价' in header_dic:
|
|
|
if id1!="" and id2 != "" and id3 != "" and len(re.split('[;;、,\n]+', tds[id2])) > 1 and len(re.split('[;;、,\n]+', tds[id1])) == len(re.split('[;;、,\n]+', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
|
|
@@ -3608,7 +3623,8 @@ class ProductAttributesPredictor():
|
|
|
total_price_list.append(total_price)
|
|
|
total_price = str(total_price) if total_price != 0 and total_price<50000000000 else ""
|
|
|
link = {'product': product, 'quantity': quantity, 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
|
|
|
- 'brand': brand[:50], 'specs':specs, 'total_price': total_price, 'parameter': parameter}
|
|
|
+ 'brand': brand[:50], 'specs':specs, 'total_price': total_price, 'parameter': parameter,
|
|
|
+ 'pinmu_no': pinmu_no, 'pinmu_name': pinmu_name}
|
|
|
|
|
|
# if link not in product_link:
|
|
|
# product_link.append(link)
|
|
@@ -5744,6 +5760,9 @@ class DistrictPredictor():
|
|
|
district_tuple = pickle.load(f)
|
|
|
self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic = district_tuple
|
|
|
|
|
|
+ with open(os.path.dirname(__file__) + "area_variance_dic.pkl", 'rb') as f: # 20241113 地区变更新旧名称对照字典
|
|
|
+ self.area_variance_dic = pickle.load(f)
|
|
|
+
|
|
|
def predict_backup(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
|
|
|
'''
|
|
|
先匹配 project_name+tenderee+tenderee_address, 如果缺少省或市 再匹配 title+content
|
|
@@ -6000,8 +6019,12 @@ class DistrictPredictor():
|
|
|
|
|
|
final_pro = ""
|
|
|
final_city = ""
|
|
|
+ pro_prob = 0
|
|
|
+ city_prob = 0
|
|
|
if len(pro_ids) >= 1:
|
|
|
pro_l = sorted([(k, v) for k, v in pro_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
+ scores = [it[1] for it in pro_l]
|
|
|
+ pro_prob = max(scores)/sum(scores)
|
|
|
final_pro, score = pro_l[0]
|
|
|
if score >= 0.01:
|
|
|
pred_pro = idx_dic[final_pro]['返回名称']
|
|
@@ -6011,6 +6034,8 @@ class DistrictPredictor():
|
|
|
|
|
|
if pred_pro != "" and len(city_ids) >= 1:
|
|
|
city_l = sorted([(k, v) for k, v in city_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
+ scores = [it[1] for it in city_l]
|
|
|
+ city_prob = max(scores) / sum(scores)
|
|
|
for it in city_l:
|
|
|
if idx_dic[it[0]]['省'] == final_pro:
|
|
|
final_city = it[0]
|
|
@@ -6021,6 +6046,13 @@ class DistrictPredictor():
|
|
|
for it in dis_l:
|
|
|
if idx_dic[it[0]]['市'] == final_city:
|
|
|
pred_dis = idx_dic[it[0]]['返回名称']
|
|
|
+ elif pred_pro != "" and pred_city == "" and len(set(dis_ids)) >= 1: # 20241111 省份不为空,市为空,如果区县在省份下,补充对应的市县
|
|
|
+ dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
+ for it in dis_l:
|
|
|
+ if idx_dic[it[0]]['省'] == final_pro:
|
|
|
+ pred_city = idx_dic[idx_dic[it[0]]['市']]['返回名称']
|
|
|
+ pred_dis = idx_dic[it[0]]['返回名称']
|
|
|
+ # print('20241111 省份不为空,市为空,如果区县在省份下,补充对应的市县: ', pred_city, pred_dis)
|
|
|
|
|
|
if pred_city in ['北京', '天津', '上海', '重庆']:
|
|
|
pred_city = pred_dis
|
|
@@ -6073,7 +6105,7 @@ class DistrictPredictor():
|
|
|
p_pro, p_city, p_dis, p_city, p_dis, p_dis)
|
|
|
province_l, city_l, district_l = [], [], []
|
|
|
for it in re.finditer(pettern, text):
|
|
|
- if re.search('[省市区县旗盟]$', it.group(0)) == None and re.search(
|
|
|
+ if re.search('[省市区县旗盟]', it.group(0)) == None and re.search(
|
|
|
'^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆)', text[it.end():]):
|
|
|
continue
|
|
|
if it.group(0) == '站前': # 20240314 修复类似 中铁二局新建沪苏湖铁路工程站前VI标项目 错识别为 省份:辽宁, 城市:营口,区县:站前
|
|
@@ -6096,7 +6128,7 @@ class DistrictPredictor():
|
|
|
return province_l, city_l, district_l
|
|
|
|
|
|
def get_pro_city_dis_score(text, text_weight=1):
|
|
|
- text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)', ' ', text) # 544151395 赤壁市老城区燃气管道老化更新改造
|
|
|
+ text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间', ' ', text) # 544151395 赤壁市老城区燃气管道老化更新改造
|
|
|
text = re.sub('珠海城市', '珠海', text) # 修复 426624023 珠海城市 预测为海城市
|
|
|
text = re.sub('怒江州', '怒江傈僳族自治州', text) # 修复 423589589 所属地域:怒江州 识别为广西 - 崇左 - 江州
|
|
|
text = re.sub('茂名滨海新区', '茂名市', text)
|
|
@@ -6105,15 +6137,17 @@ class DistrictPredictor():
|
|
|
ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
|
|
|
if ser and '黎族' not in ser.group(0):
|
|
|
text = text.replace(ser.group(0), ser.group(0)+'黎族')
|
|
|
+ for k, v in self.area_variance_dic.items(): # 20241113 根据地区变更信息替换文本
|
|
|
+ text = text.replace(k, v)
|
|
|
# province_l = find_areas(p_pro, text)
|
|
|
# city_l = find_areas(p_city, text)
|
|
|
# district_l = find_areas(p_dis, text)
|
|
|
|
|
|
province_l, city_l, district_l = find_whole_areas(text) # 20240703 优化地址提取,解决类似 海南昌江 得到 海南 南昌 结果
|
|
|
|
|
|
- if len(province_l) == len(city_l) == 0:
|
|
|
- district_l = [it for it in district_l if
|
|
|
- re.search('[市县旗区]$', it[0])] # 20240428去掉只有区县地址且不是全称的匹配,避免错误 例 凌云工业股份有限公司 提取地区为广西白色凌云
|
|
|
+ # if len(province_l) == len(city_l) == 0:
|
|
|
+ # district_l = [it for it in district_l if
|
|
|
+ # re.search('[市县旗区]$', it[0])] # 20240428去掉只有区县地址且不是全称的匹配,避免错误 例 凌云工业股份有限公司 提取地区为广西白色凌云
|
|
|
|
|
|
province_l = chage_area2score(province_l, max_len=len(text))
|
|
|
city_l = chage_area2score(city_l, max_len=len(text))
|
|
@@ -6192,7 +6226,8 @@ class DistrictPredictor():
|
|
|
dis_ids[idx] = 0
|
|
|
weight = idx_dic[idx]['权重']
|
|
|
dis_ids[idx] += (score + 0) * w
|
|
|
-
|
|
|
+ if idx_dic[idx]['市'] not in city_ids and idx_dic[idx]['省'] not in pro_ids: # 20241111 区县简称不在获取到的省、市范围内的过滤掉
|
|
|
+ continue
|
|
|
pro_idx = idx_dic[idx]['省']
|
|
|
if pro_idx in pro_ids:
|
|
|
pro_ids[pro_idx] += (score + 0) * w * weight
|
|
@@ -6203,6 +6238,8 @@ class DistrictPredictor():
|
|
|
city_ids[city_idx] += (score + 0) * w * weight
|
|
|
# else: # 20241015 注销 区县简称且不在提取的省市下面,不加分,避免提取错误 例:536550843
|
|
|
# city_ids[city_idx] = (score + 0) * w * weight * 0.1
|
|
|
+ elif pro_idx in pro_ids:
|
|
|
+ city_ids[city_idx] = (score + 0) * w * weight * 0.1
|
|
|
|
|
|
for k, v in pro_ids.items():
|
|
|
pro_ids[k] = v * text_weight
|
|
@@ -6215,7 +6252,7 @@ class DistrictPredictor():
|
|
|
area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
|
|
|
|
|
|
pro_ids, city_ids, dis_ids = get_pro_city_dis_score(text)
|
|
|
- pro_ids1, city_ids1, dis_ids1 = get_pro_city_dis_score(web_name[:3], text_weight=0.2) # 20240422 修改为站源名称只取前三字,避免类似 459056219 中金岭南阳光采购平台 错提取阳光
|
|
|
+ pro_ids1, city_ids1, dis_ids1 = get_pro_city_dis_score(web_name, text_weight=0.01) # 20240422 修改为站源名称只取前三字,避免类似 459056219 中金岭南阳光采购平台 错提取阳光
|
|
|
for k in pro_ids1:
|
|
|
if k in pro_ids:
|
|
|
pro_ids[k] += pro_ids1[k]
|
|
@@ -6288,7 +6325,7 @@ class DistrictPredictor():
|
|
|
return ''
|
|
|
|
|
|
def get_project_addr(text):
|
|
|
- p1 = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)(位于)?:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
|
+ p1 = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)(位于)?:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+([\w()]{,20}[,。])?|\w{2,15}[,。])'
|
|
|
p2 = '项目位于(?P<addr>\w{2}市\w{2,4}区)'
|
|
|
if re.search(p1, text):
|
|
|
return re.search(p1, text).group('addr')
|
|
@@ -6332,12 +6369,12 @@ class DistrictPredictor():
|
|
|
tenderee, tenderee_address = get_ree_addr(prem)
|
|
|
msc = ""
|
|
|
pro_addr = get_project_addr(content)
|
|
|
- if pro_addr != "":
|
|
|
+ if pro_addr != "" and re.search('(采购人|招标人)?指定地点', pro_addr)==None: # 排除错误项目地址 例:554024168 1.5服务地点:采购人指定地点。
|
|
|
msc += '使用规则提取的项目地址;'
|
|
|
tenderee_address = pro_addr
|
|
|
else:
|
|
|
role_addr = get_role_address(content)
|
|
|
- if role_addr != "":
|
|
|
+ if role_addr != "" and re.search('(采购人|招标人)?指定地点', role_addr)==None:
|
|
|
msc += '使用规则提取的联系人地址;'
|
|
|
tenderee_address = role_addr
|
|
|
|
|
@@ -6370,14 +6407,14 @@ class DistrictPredictor():
|
|
|
web_source_name = str(web_source_name) # 修复某些不是字符串类型造成报错
|
|
|
text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1) # 预防提取错 合肥 路南 新会 等地区
|
|
|
|
|
|
- if pro_addr and re.search('\w{2,}([省市县旗盟]|自治[区州县旗])', pro_addr):
|
|
|
+ if pro_addr and re.search('\w{2,}([市县旗盟]|自治[区州县旗])', pro_addr):
|
|
|
if re.search('[市县旗盟]', pro_addr)==None: # 修复 486623506 项目地址不完整
|
|
|
pro_addr = text1 + ' '+ pro_addr
|
|
|
msc += '## 使用项目地址输入:%s ##;' % pro_addr
|
|
|
rs = self.get_area(pro_addr, '')
|
|
|
msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % (
|
|
|
rs['district']['province'], rs['district']['city'], rs['district']['district'])
|
|
|
- if rs['district']['province'] != '全国':
|
|
|
+ if rs['district']['province'] != '全国' and rs['district']['city'] != '未知':
|
|
|
# print('地区匹配:', msc)
|
|
|
return rs
|
|
|
|
|
@@ -6389,7 +6426,7 @@ class DistrictPredictor():
|
|
|
# self.f.write('%s %s \n' % (list_articles[0].id, msc))
|
|
|
# print('地区匹配:', msc)
|
|
|
if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
|
|
|
- msc = ""
|
|
|
+ # msc = ""
|
|
|
all_addr, tenderees = get_all_addr(list_entitys)
|
|
|
text2 = tenderees + " " + all_addr + ' ' + title
|
|
|
msc += '使用实体列表所有招标人+所有地址;'
|