|
@@ -3609,6 +3609,8 @@ class ProductAttributesPredictor():
|
|
# print('产品拆分:', len(products),len(quantitys) , len(unitPrices),len(brands),len(specses))
|
|
# print('产品拆分:', len(products),len(quantitys) , len(unitPrices),len(brands),len(specses))
|
|
if len(products) == len(quantitys) == len(unitPrices) == len(brands) == len(specses):
|
|
if len(products) == len(quantitys) == len(unitPrices) == len(brands) == len(specses):
|
|
for product, quantity, unitPrice, brand, specs, total_price, parameter in zip(products,quantitys,unitPrices, brands, specses, total_prices, parameters):
|
|
for product, quantity, unitPrice, brand, specs, total_price, parameter in zip(products,quantitys,unitPrices, brands, specses, total_prices, parameters):
|
|
|
|
+ if product.strip() == '': # 20241219修复 572876124 最后一个符号分割产品所有要素为空问题
|
|
|
|
+ continue
|
|
if quantity != "":
|
|
if quantity != "":
|
|
quantity, quantity_unit_ = self.fix_quantity(quantity, header_quan_unit)
|
|
quantity, quantity_unit_ = self.fix_quantity(quantity, header_quan_unit)
|
|
quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
|
|
quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
|
|
@@ -5811,7 +5813,7 @@ class DistrictPredictor():
|
|
with open(os.path.dirname(__file__) + "/area_variance_dic.pkl", 'rb') as f: # 20241113 地区变更新旧名称对照字典
|
|
with open(os.path.dirname(__file__) + "/area_variance_dic.pkl", 'rb') as f: # 20241113 地区变更新旧名称对照字典
|
|
self.area_variance_dic = pickle.load(f)
|
|
self.area_variance_dic = pickle.load(f)
|
|
|
|
|
|
- def predict_area(self, title, ree, addr, web_source_name):
|
|
|
|
|
|
+ def predict_area(self, title, content, web_source_name, prem={}, addr_dic={}):
|
|
p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
|
|
p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
|
|
|
|
|
|
def find_whole_areas(text, weight=1):
|
|
def find_whole_areas(text, weight=1):
|
|
@@ -5823,8 +5825,9 @@ class DistrictPredictor():
|
|
'''
|
|
'''
|
|
province_l, city_l, district_l = [], [], []
|
|
province_l, city_l, district_l = [], [], []
|
|
|
|
|
|
- text = str(text)
|
|
|
|
- text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县',
|
|
|
|
|
|
+ text = str(text).replace('(', '(').replace(')', ')')
|
|
|
|
+ text = re.sub('\d{2,4}年度?|[\d/-]{1,5}[月日]|\d+|[a-zA-Z0-9]+', ' ', text)
|
|
|
|
+ text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县|珠江城',
|
|
' ', text) # 544151395 赤壁市老城区燃气管道老化更新改造
|
|
' ', text) # 544151395 赤壁市老城区燃气管道老化更新改造
|
|
text = re.sub('珠海城市', '珠海', text) # 修复 426624023 珠海城市 预测为海城市
|
|
text = re.sub('珠海城市', '珠海', text) # 修复 426624023 珠海城市 预测为海城市
|
|
text = re.sub('怒江州', '怒江傈僳族自治州', text) # 修复 423589589 所属地域:怒江州 识别为广西 - 崇左 - 江州
|
|
text = re.sub('怒江州', '怒江傈僳族自治州', text) # 修复 423589589 所属地域:怒江州 识别为广西 - 崇左 - 江州
|
|
@@ -5836,6 +5839,7 @@ class DistrictPredictor():
|
|
text = text.replace(ser.group(0), ser.group(0) + '黎族')
|
|
text = text.replace(ser.group(0), ser.group(0) + '黎族')
|
|
for k, v in self.area_variance_dic.items(): # 20241113 根据地区变更信息替换文本
|
|
for k, v in self.area_variance_dic.items(): # 20241113 根据地区变更信息替换文本
|
|
text = text.replace(k, v)
|
|
text = text.replace(k, v)
|
|
|
|
+ text = re.sub('\s+', '', text)
|
|
|
|
|
|
if re.search('[\u4e00-\u9fa5]', text) == None:
|
|
if re.search('[\u4e00-\u9fa5]', text) == None:
|
|
return province_l, city_l, district_l
|
|
return province_l, city_l, district_l
|
|
@@ -5858,8 +5862,8 @@ class DistrictPredictor():
|
|
score = 2
|
|
score = 2
|
|
else:
|
|
else:
|
|
score = 1
|
|
score = 1
|
|
- if it.start(k)==0 or re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
|
|
|
|
- , text[it.end(k):]) or (it.start(k)>0 and it.end(k)<len(text) and text[it.start(k)-1]=='(' and text[it.end(k)]==')'):
|
|
|
|
|
|
+ if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
|
|
|
|
+ , text[it.end(k):]) or re.search('^((%s)|\-%s)'%(v, v), text[max(0, it.start(k)-1):]):
|
|
score += 1
|
|
score += 1
|
|
score += it.end(k) / len(text) / 10
|
|
score += it.end(k) / len(text) / 10
|
|
province_l.append((v, score * weight))
|
|
province_l.append((v, score * weight))
|
|
@@ -5868,8 +5872,8 @@ class DistrictPredictor():
|
|
score = 2
|
|
score = 2
|
|
else:
|
|
else:
|
|
score = 1
|
|
score = 1
|
|
- if it.start(k)==0 or re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
|
|
|
|
- , text[it.end(k):]) or (it.start(k)>0 and it.end(k)<len(text) and text[it.start(k)-1]=='(' and text[it.end(k)]==')'):
|
|
|
|
|
|
+ if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
|
|
|
|
+ , text[it.end(k):]) or re.search('^((%s)|\-%s)'%(v, v), text[max(0, it.start(k)-1):]):
|
|
score += 1
|
|
score += 1
|
|
score += it.end(k) / len(text) / 10
|
|
score += it.end(k) / len(text) / 10
|
|
city_l.append((v, score * weight))
|
|
city_l.append((v, score * weight))
|
|
@@ -5880,11 +5884,11 @@ class DistrictPredictor():
|
|
score = 2
|
|
score = 2
|
|
else:
|
|
else:
|
|
score = 0.5
|
|
score = 0.5
|
|
- if it.start(k)==0 or re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
|
|
|
|
- , text[it.end(k):]) or (it.start(k)>0 and it.end(k)<len(text) and text[it.start(k)-1]=='(' and text[it.end(k)]==')'):
|
|
|
|
- score += 1
|
|
|
|
- # print('县区加分:', v, text)
|
|
|
|
- score += it.end(k) / len(text) / 10
|
|
|
|
|
|
+ if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
|
|
|
|
+ , text[it.end(k):]) or (re.match('\s*%s'%v, text) and it.start(k)<2) or re.search(
|
|
|
|
+ '^((%s)|\-%s)'%(v, v), text[max(0, it.start(k)-1):]):
|
|
|
|
+ score += 0.5
|
|
|
|
+ # score += it.end(k) / len(text) / 10
|
|
if v == '昌江' and '景德镇' not in it.group(0):
|
|
if v == '昌江' and '景德镇' not in it.group(0):
|
|
district_l.append(('昌江黎族', score * weight))
|
|
district_l.append(('昌江黎族', score * weight))
|
|
else:
|
|
else:
|
|
@@ -5964,7 +5968,7 @@ class DistrictPredictor():
|
|
dis_ids[idx] = 0
|
|
dis_ids[idx] = 0
|
|
dis_ids[idx] += score
|
|
dis_ids[idx] += score
|
|
pro_idx = idx_dic[idx]['省']
|
|
pro_idx = idx_dic[idx]['省']
|
|
- if filter_short_dist and pro_idx not in pro_ids:
|
|
|
|
|
|
+ if filter_short_dist and score < 1: # pro_idx not in pro_ids
|
|
continue
|
|
continue
|
|
if pro_idx in tmp_pro:
|
|
if pro_idx in tmp_pro:
|
|
tmp_pro[pro_idx] += score
|
|
tmp_pro[pro_idx] += score
|
|
@@ -6042,7 +6046,7 @@ class DistrictPredictor():
|
|
tenderee = ""
|
|
tenderee = ""
|
|
tenderee_address = ""
|
|
tenderee_address = ""
|
|
try:
|
|
try:
|
|
- for v in prem[0]['prem'].values():
|
|
|
|
|
|
+ for v in prem.values():
|
|
for link in v['roleList']:
|
|
for link in v['roleList']:
|
|
if link['role_name'] == 'tenderee' and tenderee == "":
|
|
if link['role_name'] == 'tenderee' and tenderee == "":
|
|
tenderee = link['role_text']
|
|
tenderee = link['role_text']
|
|
@@ -6068,23 +6072,6 @@ class DistrictPredictor():
|
|
else:
|
|
else:
|
|
return ''
|
|
return ''
|
|
|
|
|
|
- def get_project_addr(text):
|
|
|
|
- p1 = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)(位于)?:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+([\w()]{,20}[,。])?|\w{2,15}[,。])'
|
|
|
|
- p2 = '项目位于(?P<addr>\w{2}市\w{2,4}区)'
|
|
|
|
- if re.search(p1, text):
|
|
|
|
- return re.search(p1, text).group('addr')
|
|
|
|
- elif re.search(p2, text):
|
|
|
|
- return re.search(p2, text).group('addr')
|
|
|
|
- else:
|
|
|
|
- return ''
|
|
|
|
-
|
|
|
|
- def get_bid_addr(text):
|
|
|
|
- p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售|所属)(地址|地点|所在地区?|地域):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
|
|
- if re.search(p2, text):
|
|
|
|
- return re.search(p2, text).group('addr')
|
|
|
|
- else:
|
|
|
|
- return ''
|
|
|
|
-
|
|
|
|
def get_all_addr(list_entitys):
|
|
def get_all_addr(list_entitys):
|
|
tenderee_l = []
|
|
tenderee_l = []
|
|
addr_l = []
|
|
addr_l = []
|
|
@@ -6096,20 +6083,24 @@ class DistrictPredictor():
|
|
tenderee_l.append(ent.entity_text)
|
|
tenderee_l.append(ent.entity_text)
|
|
return ' '.join(addr_l), ' '.join(tenderee_l)
|
|
return ' '.join(addr_l), ' '.join(tenderee_l)
|
|
|
|
|
|
- def get_title_addr(text):
|
|
|
|
- p1 = '(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
|
|
- if re.search(p1, text):
|
|
|
|
- return re.search(p1, text).group('addr')
|
|
|
|
- else:
|
|
|
|
- return ''
|
|
|
|
-
|
|
|
|
area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
|
|
area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
|
|
- province_l, city_l, district_l = find_whole_areas(title)
|
|
|
|
|
|
+ addr_project = addr_dic.get('addr_project', '')
|
|
|
|
+ addr_delivery = addr_dic.get('addr_delivery', '')
|
|
|
|
+ addr_bidopen = addr_dic.get('addr_bidopen', '')
|
|
|
|
+ addr_bidsend = addr_dic.get('addr_bidsend', '')
|
|
|
|
+ province_l, city_l, district_l = find_whole_areas('%s %s %s'%(title, addr_delivery, addr_project))
|
|
pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
|
|
pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
|
|
big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
|
|
big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
|
|
# print('关键词1:', province_l, city_l, district_l)
|
|
# print('关键词1:', province_l, city_l, district_l)
|
|
# print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
# print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
if pred_city == "" or prob < 0.7 or max_score<2:
|
|
if pred_city == "" or prob < 0.7 or max_score<2:
|
|
|
|
+ ree, addr = get_ree_addr(prem)
|
|
|
|
+ rule_ree_addr = get_role_address(content)
|
|
|
|
+ if rule_ree_addr:
|
|
|
|
+ addr = rule_ree_addr
|
|
|
|
+
|
|
|
|
+ # addr = content
|
|
|
|
+ # ree = ''
|
|
province_l2, city_l2, district_l2 = find_whole_areas('%s %s' % (ree, addr), weight=0.8)
|
|
province_l2, city_l2, district_l2 = find_whole_areas('%s %s' % (ree, addr), weight=0.8)
|
|
province_l.extend(province_l2)
|
|
province_l.extend(province_l2)
|
|
city_l.extend(city_l2)
|
|
city_l.extend(city_l2)
|
|
@@ -6119,7 +6110,7 @@ class DistrictPredictor():
|
|
# print('关键词2:', province_l, city_l, district_l)
|
|
# print('关键词2:', province_l, city_l, district_l)
|
|
# print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
# print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
|
|
if pred_city == "" or prob < 0.7 or max_score<2:
|
|
if pred_city == "" or prob < 0.7 or max_score<2:
|
|
- province_l3, city_l3, district_l3 = find_whole_areas(web_source_name, weight=0.6)
|
|
|
|
|
|
+ province_l3, city_l3, district_l3 = find_whole_areas('%s %s %s'%(web_source_name, addr_bidopen, addr_bidsend), weight=0.6)
|
|
province_l.extend(province_l3)
|
|
province_l.extend(province_l3)
|
|
city_l.extend(city_l3)
|
|
city_l.extend(city_l3)
|
|
district_l.extend(district_l3)
|
|
district_l.extend(district_l3)
|
|
@@ -8249,7 +8240,7 @@ class EntityTypeRulePredictor():
|
|
def __init__(self):
|
|
def __init__(self):
|
|
self.pattern_addr_bidopen = '([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选))?(会议)?地[点址]([((]网址[))])?[:为]'
|
|
self.pattern_addr_bidopen = '([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选))?(会议)?地[点址]([((]网址[))])?[:为]'
|
|
self.pattern_addr_bidsend = '((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)地[点址]([((]网址[))])?[:为]'
|
|
self.pattern_addr_bidsend = '((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)地[点址]([((]网址[))])?[:为]'
|
|
- self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|卸货)((期|时间)[及和、])?)?地[点址][:为]'
|
|
|
|
|
|
+ self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|卸货)((期|时间)[及和、])?)?地[点址]?[:为]'
|
|
self.pattern_addr_project = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(实施|服务)?(地址|地点|位置|所在地区?)(位于)?[:为]|项目位于'
|
|
self.pattern_addr_project = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(实施|服务)?(地址|地点|位置|所在地区?)(位于)?[:为]|项目位于'
|
|
self.pattern_time_planned = '(计划|预计|预期)(采购|招标|发包)时间|招标(公告|文件)(预计|预期|计划)发布时间'
|
|
self.pattern_time_planned = '(计划|预计|预期)(采购|招标|发包)时间|招标(公告|文件)(预计|预期|计划)发布时间'
|
|
self.pattern_code_investment = '投资(审批)?项目[编代]码[:为]'
|
|
self.pattern_code_investment = '投资(审批)?项目[编代]码[:为]'
|
|
@@ -8288,13 +8279,13 @@ class EntityTypeRulePredictor():
|
|
ser3 = re.search('(%s)(?P<addr>[\w()-]{5,100})[,。]'%self.pattern_addr_delivery, list_articles[0].content)
|
|
ser3 = re.search('(%s)(?P<addr>[\w()-]{5,100})[,。]'%self.pattern_addr_delivery, list_articles[0].content)
|
|
ser4 = re.search('(%s)(?P<addr>[\w()-]{5,100})[,。]'%self.pattern_addr_project, list_articles[0].content)
|
|
ser4 = re.search('(%s)(?P<addr>[\w()-]{5,100})[,。]'%self.pattern_addr_project, list_articles[0].content)
|
|
ser5 = re.search('(%s)(?P<code>[\da-zA-Z()-]{5,30})[,。]'%self.pattern_code_investment, list_articles[0].content)
|
|
ser5 = re.search('(%s)(?P<code>[\da-zA-Z()-]{5,30})[,。]'%self.pattern_code_investment, list_articles[0].content)
|
|
- if ser1 and re.search('\w{2,5}[省市区]|\d号|采购网|http', ser1.group('addr')) and addr_dic.get('addr_bidopen', '') in ser1.group('addr'):
|
|
|
|
|
|
+ if ser1 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|http', ser1.group('addr')) and addr_dic.get('addr_bidopen', '') in ser1.group('addr'):
|
|
addr_dic['addr_bidopen'] = ser1.group('addr')
|
|
addr_dic['addr_bidopen'] = ser1.group('addr')
|
|
- if ser2 and re.search('\w{2,5}[省市区]|\d号|采购网|http', ser2.group('addr')) and addr_dic.get('addr_bidsend', '') in ser2.group('addr'):
|
|
|
|
|
|
+ if ser2 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|http', ser2.group('addr')) and addr_dic.get('addr_bidsend', '') in ser2.group('addr'):
|
|
addr_dic['addr_bidsend'] = ser2.group('addr')
|
|
addr_dic['addr_bidsend'] = ser2.group('addr')
|
|
- if ser3 and re.search('\w{2,5}[省市区]|\d号', ser3.group('addr')) and addr_dic.get('addr_delivery', '') in ser3.group('addr'):
|
|
|
|
|
|
+ if ser3 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]', ser3.group('addr')) and addr_dic.get('addr_delivery', '') in ser3.group('addr'):
|
|
addr_dic['addr_delivery'] = ser3.group('addr')
|
|
addr_dic['addr_delivery'] = ser3.group('addr')
|
|
- if ser4 and re.search('\w{2,5}[省市区]|\d号', ser4.group('addr')) and addr_dic.get('addr_project', '') in ser4.group('addr'):
|
|
|
|
|
|
+ if ser4 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]', ser4.group('addr')) and addr_dic.get('addr_project', '') in ser4.group('addr'):
|
|
addr_dic['addr_project'] = ser4.group('addr')
|
|
addr_dic['addr_project'] = ser4.group('addr')
|
|
if ser5 and code_investment == '':
|
|
if ser5 and code_investment == '':
|
|
code_investment = ser5.group('code')
|
|
code_investment = ser5.group('code')
|