|
@@ -2930,7 +2930,7 @@ class ProductPredictor():
|
|
|
class ProductAttributesPredictor():
|
|
|
def __init__(self,):
|
|
|
self.p0 = '(类别|类型|物类|目录|类目|分类)(名称|$)|^品名|^品类|^品目|(标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)(名称|内容|描述)'
|
|
|
- self.p1 = '(标的|维修|系统|报价构成|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名|气体)[\))的]?([、\w]{,4}名称|内容|描述)'
|
|
|
+ self.p1 = '(标的|维修|系统|报价构成|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|配件|资产|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名|气体)[\))的]?(名称|内容|描述)' # [、\w]{,4} 避免提取采购人名称 等
|
|
|
self.p2 = '标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称|^内容$|(标项|分项|项目|计划|包组|标段|[分子]?包|子目|服务|招标|中标|成交|工程|招标内容)(名称|内容|描述)'
|
|
|
# self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)'
|
|
|
# self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
|
|
@@ -6052,7 +6052,7 @@ class DistrictPredictor():
|
|
|
text = str(text).replace('(', '(').replace(')', ')')
|
|
|
text = re.sub('\d{2,4}年度?|[\d/-]{1,5}[月日]|\d+|[a-zA-Z0-9]+', ' ', text)
|
|
|
text = re.sub(
|
|
|
- '复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县|珠江城|西九龙站|广州路北|安阳山村|电信|联通|北京现代', # 570445994 广州路北侧 预测为 广州 路北
|
|
|
+ '复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县|珠江城|西九龙站|广州路北|安阳山村|电信|联通|北京现代|祁连山', # 570445994 广州路北侧 预测为 广州 路北
|
|
|
' ', text) # 544151395 赤壁市老城区燃气管道老化更新改造
|
|
|
text = re.sub('珠海城市', '珠海', text) # 修复 426624023 珠海城市 预测为海城市
|
|
|
text = re.sub('怒江州', '怒江傈僳族自治州', text) # 修复 423589589 所属地域:怒江州 识别为广西 - 崇左 - 江州
|
|
@@ -6107,7 +6107,11 @@ class DistrictPredictor():
|
|
|
elif k in ['dist', 'dist1', 'dist2']:
|
|
|
if v in ['东区', '西区', '城区', '郊区', '矿区', '东至']:
|
|
|
continue
|
|
|
- if v in full_dic['district'] and len(v) > 2:
|
|
|
+ if v in ['向阳区', '宝山区', '南沙区', '和平区', '新城区', '鼓楼区', '南山区', '白云区', '朝阳区',
|
|
|
+ '江北区', '城关区', '永定区', '普陀区', '长安区', '市中区', '西安区', '通州区', '西湖区',
|
|
|
+ '龙华区', '城中区', '河东区', '桥西区', '青山区', '新华区', '铁西区', '铁东区', '海州区']: # 多个城市有的区概率降低
|
|
|
+ score = 0.5
|
|
|
+ elif v in full_dic['district'] and (len(v) > 2 or v.endswith('县')): # 20250709 修复 萧县 等概率过低
|
|
|
score = 2
|
|
|
else:
|
|
|
score = 0.5
|
|
@@ -6240,6 +6244,11 @@ class DistrictPredictor():
|
|
|
final_city = ""
|
|
|
prob = 0
|
|
|
max_score = 0
|
|
|
+ code_dic = {
|
|
|
+ 'province_code': '',
|
|
|
+ 'city_code': '',
|
|
|
+ 'district_code': ''
|
|
|
+ }
|
|
|
if len(pro_ids) >= 1:
|
|
|
pro_l = sorted([(k, v) for k, v in pro_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
scores = [it[1] for it in pro_l]
|
|
@@ -6249,25 +6258,30 @@ class DistrictPredictor():
|
|
|
if score >= 0.01:
|
|
|
pred_pro = idx_dic[final_pro]['返回名称']
|
|
|
big_area = idx_dic[final_pro]['大区']
|
|
|
+ code_dic['province_code'] = idx_dic[final_pro]['编码']
|
|
|
if pred_pro != "" and len(city_ids) >= 1:
|
|
|
city_l = sorted([(k, v) for k, v in city_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
for it in city_l:
|
|
|
if idx_dic[it[0]]['省'] == final_pro:
|
|
|
final_city = it[0]
|
|
|
pred_city = idx_dic[final_city]['返回名称']
|
|
|
+ code_dic['city_code'] = idx_dic[final_city]['编码']
|
|
|
break
|
|
|
if final_city != "" and len(set(dis_ids)) >= 1:
|
|
|
dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
for it in dis_l:
|
|
|
if idx_dic[it[0]]['市'] == final_city:
|
|
|
pred_dis = idx_dic[it[0]]['返回名称']
|
|
|
+ code_dic['district_code'] = idx_dic[it[0]]['编码']
|
|
|
elif pred_pro != "" and pred_city == "" and len(set(dis_ids)) >= 1: # 20241111 省份不为空,市为空,如果区县在省份下,补充对应的市县
|
|
|
dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
|
|
|
for it in dis_l:
|
|
|
if idx_dic[it[0]]['省'] == final_pro:
|
|
|
pred_city = idx_dic[idx_dic[it[0]]['市']]['返回名称']
|
|
|
pred_dis = idx_dic[it[0]]['返回名称']
|
|
|
- return big_area, pred_pro, pred_city, pred_dis, prob, max_score
|
|
|
+ code_dic['city_code'] = idx_dic[idx_dic[it[0]]['市']]['编码']
|
|
|
+ code_dic['district_code'] = idx_dic[it[0]]['编码']
|
|
|
+ return big_area, pred_pro, pred_city, pred_dis, prob, max_score, code_dic
|
|
|
@staticmethod
|
|
|
def get_ree_addr(prem):
|
|
|
tenderee = ""
|
|
@@ -6288,7 +6302,7 @@ class DistrictPredictor():
|
|
|
4:招标、代理一起,两个地址一起 招标人:xxx, 代理人:xxx, 地址:xxx, 地址:xxx.
|
|
|
'''
|
|
|
p3 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
|
- p4 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
|
+ p4 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,35}),(联系)?地址:'
|
|
|
p5 = '(采购|招标)(人|单位)(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
|
if re.search(p3, text):
|
|
|
return re.search(p3, text).group('addr')
|
|
@@ -6319,10 +6333,14 @@ class DistrictPredictor():
|
|
|
addr_contact = addr_dic.get('addr_contact', '')
|
|
|
in_content = False
|
|
|
not_sure = True # 是否不确定地区
|
|
|
+
|
|
|
province_l, city_l, district_l = self.find_whole_areas('%s %s'%(title, addr_project), self.pettern, self.area_variance_dic, self.full_dic)
|
|
|
pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
|
|
|
- big_area_1, pred_pro_1, pred_city_1, pred_dis_1, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
|
|
|
- big_area, pred_pro, pred_city, pred_dis = big_area_1, pred_pro_1, pred_city_1, pred_dis_1
|
|
|
+ big_area_1, pred_pro_1, pred_city_1, pred_dis_1, prob, max_score, code_dic_1 = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
|
|
|
+ big_area, pred_pro, pred_city, pred_dis, code_dic = big_area_1, pred_pro_1, pred_city_1, pred_dis_1, code_dic_1
|
|
|
+ # print('关键词1:', province_l, city_l, district_l)
|
|
|
+ # print('输入:', '标题:%s; 项目地址:%s'%(title, addr_project))
|
|
|
+ # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
|
if pred_city_1 == "" or prob < 0.7 or max_score<2:
|
|
|
ree, addr = self.get_ree_addr(prem)
|
|
|
if ree in title:
|
|
@@ -6333,28 +6351,33 @@ class DistrictPredictor():
|
|
|
|
|
|
# addr = content
|
|
|
# ree = ''
|
|
|
- province_l2, city_l2, district_l2 = self.find_whole_areas('%s %s %s' % (ree, addr, addr_delivery), self.pettern, self.area_variance_dic, self.full_dic, weight=0.8)
|
|
|
+ province_l2, city_l2, district_l2 = self.find_whole_areas('%s %s %s' % (ree, addr, addr_delivery), self.pettern, self.area_variance_dic, self.full_dic, weight=1)
|
|
|
province_l.extend(province_l2)
|
|
|
city_l.extend(city_l2)
|
|
|
district_l.extend(district_l2)
|
|
|
pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
|
|
|
- big_area_2, pred_pro_2, pred_city_2, pred_dis_2, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
|
|
|
- big_area, pred_pro, pred_city, pred_dis = big_area_2, pred_pro_2, pred_city_2, pred_dis_2
|
|
|
+ big_area_2, pred_pro_2, pred_city_2, pred_dis_2, prob, max_score, code_dic_2 = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
|
|
|
+ big_area, pred_pro, pred_city, pred_dis, code_dic = big_area_2, pred_pro_2, pred_city_2, pred_dis_2, code_dic_2
|
|
|
# print('关键词2:', province_l, city_l, district_l)
|
|
|
+ # print('输入:', '招标人:%s; 招标人地址:%s; 收货地址:%s' % (ree, addr, addr_delivery))
|
|
|
# print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
|
- if re.search('省|市|自治', addr_project) and pred_pro_1 != '' and pred_pro_1 != pred_pro_2: # 如果有项目地址使用项目地址
|
|
|
+ if re.search('省|市|县|自治', addr_project) and pred_pro_1 != '' and pred_pro_1 != pred_pro_2: # 如果有项目地址使用项目地址 要有省市县等 275127622 工程地点为狮山镇颜峰综合区岐山至人和段道路, 提错 岐山
|
|
|
not_sure = False
|
|
|
- big_area, pred_pro, pred_city, pred_dis = big_area_1, pred_pro_1, pred_city_1, pred_dis_1
|
|
|
+ big_area, pred_pro, pred_city, pred_dis, code_dic = big_area_1, pred_pro_1, pred_city_1, pred_dis_1, code_dic_1
|
|
|
if not_sure and (pred_city_2 == "" or prob < 0.7 or max_score<2):
|
|
|
- province_l3, city_l3, district_l3 = self.find_whole_areas('%s %s %s'%(addr_contact, addr_bidopen, addr_bidsend), self.pettern, self.area_variance_dic, self.full_dic, weight=0.6)
|
|
|
+ province_l3, city_l3, district_l3 = self.find_whole_areas('%s; %s; %s'%(addr_contact, addr_bidopen, addr_bidsend), self.pettern, self.area_variance_dic, self.full_dic, weight=0.6)
|
|
|
province_l.extend(province_l3)
|
|
|
city_l.extend(city_l3)
|
|
|
district_l.extend(district_l3)
|
|
|
pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
|
|
|
- big_area_3, pred_pro_3, pred_city_3, pred_dis_3, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
|
|
|
- big_area, pred_pro, pred_city, pred_dis = big_area_3, pred_pro_3, pred_city_3, pred_dis_3
|
|
|
+ big_area_3, pred_pro_3, pred_city_3, pred_dis_3, prob, max_score, code_dic_3 = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
|
|
|
+ big_area, pred_pro, pred_city, pred_dis, code_dic = big_area_3, pred_pro_3, pred_city_3, pred_dis_3, code_dic_3
|
|
|
# print('关键词3:', province_l, city_l, district_l)
|
|
|
+ # print('输入:', '联系:%s, 开标:%s, 邮寄:%s'%(addr_contact, addr_bidopen, addr_bidsend))
|
|
|
# print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
|
+ if pred_city_2 != "" and pred_city_2 != pred_city_3:
|
|
|
+ not_sure = False
|
|
|
+ big_area, pred_pro, pred_city, pred_dis, code_dic = big_area_2, pred_pro_2, pred_city_2, pred_dis_2, code_dic_2 # 如果招标人、招标人地址、收货地址与开标地址、联系地址等不一致,取招标人地址
|
|
|
if not_sure and (pred_city_3 == "" or prob < 0.6 or max_score < 2):
|
|
|
all_addr, tenderees = self.get_all_addr(list_entity)
|
|
|
province_l4, city_l4, district_l4 = self.find_whole_areas('%s %s %s' % (web_source_name, tenderees, all_addr), self.pettern, self.area_variance_dic, self.full_dic, weight=0.3)
|
|
@@ -6362,11 +6385,15 @@ class DistrictPredictor():
|
|
|
city_l.extend(city_l4)
|
|
|
district_l.extend(district_l4)
|
|
|
pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
|
|
|
- big_area_4, pred_pro_4, pred_city_4, pred_dis_4, prob, max_score = self.get_final_addr(pro_ids, city_ids,dis_ids, self.idx_dic)
|
|
|
- big_area, pred_pro, pred_city, pred_dis = big_area_4, pred_pro_4, pred_city_4, pred_dis_4
|
|
|
+ big_area_4, pred_pro_4, pred_city_4, pred_dis_4, prob, max_score, code_dic_4 = self.get_final_addr(pro_ids, city_ids,dis_ids, self.idx_dic)
|
|
|
+ big_area, pred_pro, pred_city, pred_dis, code_dic = big_area_4, pred_pro_4, pred_city_4, pred_dis_4, code_dic_4
|
|
|
+ if pred_city_3 != "" and pred_city_3 != pred_city_4:
|
|
|
+ not_sure = False
|
|
|
+ big_area, pred_pro, pred_city, pred_dis, code_dic = big_area_3, pred_pro_3, pred_city_3, pred_dis_3, code_dic_3 # 如果开标地址等提取的城市与所有地址提取的城市不一致,取开标地址等
|
|
|
if pred_pro_3 != pred_pro_4 and (prob < 0.6 or max_score < 2):
|
|
|
in_content = True
|
|
|
# print('关键词4:', province_l, city_l, district_l)
|
|
|
+ # print('输入:', '站源:%s, 角色:%s, 地址:%s' % (web_source_name, tenderees, all_addr))
|
|
|
# print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
|
|
|
|
if pred_city in ['北京', '天津', '上海', '重庆']:
|
|
@@ -6381,9 +6408,13 @@ class DistrictPredictor():
|
|
|
area_dic['city'] = pred_city
|
|
|
if pred_dis != "":
|
|
|
area_dic['district'] = pred_dis
|
|
|
+ for k, v in code_dic.items():
|
|
|
+ if v != '':
|
|
|
+ area_dic[k] = v
|
|
|
area_dic['is_in_text'] = in_content
|
|
|
# area_dic['prob'] = prob
|
|
|
# area_dic['max_score'] = max_score
|
|
|
+ # print('最终地址:', pred_pro, pred_city, pred_dis)
|
|
|
return {'district': area_dic}
|
|
|
|
|
|
def get_area(self, text, web_name, in_content=False):
|
|
@@ -9187,29 +9218,29 @@ if __name__=="__main__":
|
|
|
# # print("cost_time:", json.loads(requests_result.text)['cost_time'])
|
|
|
# # print(MAX_LEN, len(sentence), len(list_sentence))
|
|
|
|
|
|
- # docid = ""
|
|
|
- # title = ''
|
|
|
- # with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
|
- # html = f.read()
|
|
|
- # product_attr = ProductAttributesPredictor()
|
|
|
- # rs = product_attr.predict(docid='', html=html, page_time="")
|
|
|
- # print(rs)
|
|
|
-
|
|
|
docid = ""
|
|
|
- title = '甘肃省妇幼保健院(甘肃省中心医院)(第二期)采购结果公告'
|
|
|
+ title = ''
|
|
|
with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
|
html = f.read()
|
|
|
- tb_extract = TablePremExtractor()
|
|
|
- rs = tb_extract.predict(html, [
|
|
|
- "江苏中联铸本混凝土有限公司",
|
|
|
- "鼓楼区协荣机械设备经销部"
|
|
|
- ], web_source_name = '', all_winner=False)
|
|
|
- print('标段数:',len(rs[0]))
|
|
|
+ product_attr = ProductAttributesPredictor()
|
|
|
+ rs = product_attr.predict(docid='', html=html, page_time="")
|
|
|
print(rs)
|
|
|
- # bdscore = BiddingScore()
|
|
|
- # rs = bdscore.predict(html)
|
|
|
- # print(type(rs), len(rs))
|
|
|
+
|
|
|
+ # docid = ""
|
|
|
+ # title = '甘肃省妇幼保健院(甘肃省中心医院)(第二期)采购结果公告'
|
|
|
+ # with open('d:/html/2.html', 'r', encoding='utf-8') as f:
|
|
|
+ # html = f.read()
|
|
|
+ # tb_extract = TablePremExtractor()
|
|
|
+ # rs = tb_extract.predict(html, [
|
|
|
+ # "江苏中联铸本混凝土有限公司",
|
|
|
+ # "鼓楼区协荣机械设备经销部"
|
|
|
+ # ], web_source_name = '', all_winner=False)
|
|
|
+ # print('标段数:',len(rs[0]))
|
|
|
# print(rs)
|
|
|
+ # # bdscore = BiddingScore()
|
|
|
+ # # rs = bdscore.predict(html)
|
|
|
+ # # print(type(rs), len(rs))
|
|
|
+ # # print(rs)
|
|
|
|
|
|
# # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
|
|
|
# # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]
|