|
@@ -2890,7 +2890,7 @@ class ProductAttributesPredictor():
|
|
|
'''
|
|
|
items = [re.sub('\s', '', it) for it in items]
|
|
|
flag = False
|
|
|
- header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': '', '采购人':''}
|
|
|
+ header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': '', '采购人':'', '备注':'','发布日期':''}
|
|
|
product = "" # 产品
|
|
|
quantity = "" # 数量
|
|
|
quantity_unit = "" # 数量单位
|
|
@@ -2904,6 +2904,8 @@ class ProductAttributesPredictor():
|
|
|
category = "" # 品目
|
|
|
parameter = "" # 参数
|
|
|
tenderee = "" # 采购人
|
|
|
+ notes = "" # 备注 2024/3/27 达仁 需求
|
|
|
+ issue_date = "" # 发布日期 2024/3/27 达仁 需求
|
|
|
|
|
|
# for i in range(min(6, len(items))):
|
|
|
for i in range(len(items)):
|
|
@@ -2977,6 +2979,12 @@ class ProductAttributesPredictor():
|
|
|
elif re.search('总价|(成交|中标|验收|合同|预算|控制|总|合计))?([金总]额|价格?)|最高限价|价格|金额', items[j]) and re.search('数量|规格|型号|品牌|供应商', items[j])==None:
|
|
|
header_dic['总价'] = j
|
|
|
total_price = items[j]
|
|
|
+ elif re.search('^备\s*注$|资质要求|预留面向中小企业|是否适宜中小企业采购预算预留|公开征集信息', items[j]):
|
|
|
+ header_dic['备注'] = j
|
|
|
+ notes = items[j]
|
|
|
+ elif re.search('^\w{,4}发布(时间|日期)$', items[j]):
|
|
|
+ header_dic['发布日期'] = j
|
|
|
+ issue_date = items[j]
|
|
|
|
|
|
if header_dic.get('名称', "") != "" or header_dic.get('品目', "") != "":
|
|
|
# num = 0
|
|
@@ -2986,9 +2994,9 @@ class ProductAttributesPredictor():
|
|
|
# if num >=2:
|
|
|
# return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
|
|
|
if set([quantity, brand, specs, unitPrice, total_price])!=set([""]) or set([demand, budget])!=set([""]):
|
|
|
- return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee)
|
|
|
+ return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee, notes,issue_date)
|
|
|
flag = False
|
|
|
- return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee)
|
|
|
+ return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee,notes,issue_date)
|
|
|
|
|
|
def predict(self, docid='', html='', page_time=""):
|
|
|
'''
|
|
@@ -3053,6 +3061,8 @@ class ProductAttributesPredictor():
|
|
|
header_list2 = []
|
|
|
product = demand = budget = order_begin = order_end = ""
|
|
|
tenderee = ""
|
|
|
+ notes = ''
|
|
|
+ issue_date = ''
|
|
|
for i in range(len(col0_l)):
|
|
|
if re.search('项目名称', col0_l[i]):
|
|
|
header_list2.append(col0_l[i])
|
|
@@ -3078,6 +3088,12 @@ class ProductAttributesPredictor():
|
|
|
header_list2.append(col0_l[i])
|
|
|
order_time = col1_l[i].strip()
|
|
|
order_begin, order_end = self.fix_time(order_time, html, page_time)
|
|
|
+ elif re.search('^备\s*注$|资质要求|预留面向中小企业|是否适宜中小企业采购预算预留|公开征集信息', col0_l[i]):
|
|
|
+ header_list2.append(col0_l[i])
|
|
|
+ notes = col1_l[i].strip()
|
|
|
+ elif re.search('^\w{,4}发布(时间|日期)$', col0_l[i]):
|
|
|
+ header_list2.append(col0_l[i])
|
|
|
+ issue_date = self.fix_time(col1_l[i].strip(), '', '')[0]
|
|
|
if order_begin != "" and order_end!="":
|
|
|
order_begin_year = int(order_begin.split("-")[0])
|
|
|
order_end_year = int(order_end.split("-")[0])
|
|
@@ -3087,7 +3103,7 @@ class ProductAttributesPredictor():
|
|
|
# print(product,demand,budget,order_begin)
|
|
|
if product!= "" and demand != "" and budget!="" and order_begin != "" and len(budget)<15: # 限制金额小于15位数的才要
|
|
|
link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
|
|
|
- 'order_begin': order_begin, 'order_end': order_end ,'tenderee':tenderee}
|
|
|
+ 'order_begin': order_begin, 'order_end': order_end ,'tenderee':tenderee, 'notes':notes, 'issue_date':issue_date}
|
|
|
if link not in demand_link:
|
|
|
demand_link.append(link)
|
|
|
headers_demand.append('_'.join(header_list2))
|
|
@@ -3140,6 +3156,8 @@ class ProductAttributesPredictor():
|
|
|
total_price = "" # 总金额
|
|
|
parameter = "" # 参数
|
|
|
tenderee = "" # 采购人
|
|
|
+ notes = '' # 备注
|
|
|
+ issue_date = '' # 发布日期
|
|
|
if len(set([re.sub('[::\s]','',td) for td in tds]) & self.header_set) > len(tds) * 0.4:
|
|
|
# if len(set(tds) & self.header_set) > len(tds) * 0.2:
|
|
|
header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p0, self.p1, self.p2)
|
|
@@ -3181,6 +3199,9 @@ class ProductAttributesPredictor():
|
|
|
id10 = header_dic.get('参数', "")
|
|
|
id11 = header_dic.get('采购人', "")
|
|
|
|
|
|
+ id12 = header_dic.get('备注', "")
|
|
|
+ id13 = header_dic.get('发布日期', "")
|
|
|
+
|
|
|
not_attr = 0
|
|
|
for k, v in header_dic.items():
|
|
|
if isinstance(v, int):
|
|
@@ -3271,6 +3292,10 @@ class ProductAttributesPredictor():
|
|
|
tenderee = re.sub("\s","",tds[id11])
|
|
|
if len(tenderee) > 30:
|
|
|
tenderee = ""
|
|
|
+ if id12 != "":
|
|
|
+ notes = tds[id12].strip()
|
|
|
+ if id13 != "":
|
|
|
+ issue_date = self.fix_time(tds[id13].strip(), '', '')[0]
|
|
|
# print('数量:{0}, 单价:{1}, 品牌:{2}, 规格:{3},总价:{4}'.format(quantity ,unitPrice, brand, specs, total_price))
|
|
|
if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price or '单价' in header_dic or '总价' in header_dic:
|
|
|
if id1!="" and id2 != "" and id3 != "" and len(re.split('[;;、,\n]+', tds[id2])) > 1 and len(re.split('[;;、,\n]+', tds[id1])) == len(re.split('[;;、,\n]+', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
|
|
@@ -3384,7 +3409,7 @@ class ProductAttributesPredictor():
|
|
|
order_begin = order_end = ""
|
|
|
# print(budget,order_time)
|
|
|
if budget != "" and order_time != "":
|
|
|
- link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end, 'tenderee':tenderee}
|
|
|
+ link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end, 'tenderee':tenderee,'notes':notes,'issue_date':issue_date}
|
|
|
if link not in demand_link:
|
|
|
demand_link.append(link)
|
|
|
i += 1
|
|
@@ -5489,7 +5514,7 @@ class DistrictPredictor():
|
|
|
return ''
|
|
|
|
|
|
def get_bid_addr(text):
|
|
|
- p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
|
+ p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售|所属)(地址|地点|所在地区?|地域):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
|
|
|
if re.search(p2, text):
|
|
|
return re.search(p2, text).group('addr')
|
|
|
else:
|
|
@@ -5525,6 +5550,8 @@ class DistrictPredictor():
|
|
|
if re.search('[省市区县旗盟]$', it.group(0)) == None and re.search(
|
|
|
'^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆)', text[it.end():]):
|
|
|
continue
|
|
|
+ if it.group(0) == '站前': # 20240314 修复类似 中铁二局新建沪苏湖铁路工程站前VI标项目 错识别为 省份:辽宁, 城市:营口,区县:站前
|
|
|
+ continue
|
|
|
addr.append((it.group(0), it.start(), it.end()))
|
|
|
if re.search('^([分支](公司|局|行|校|院|干?线)|\w{,3}段|地铁|(火车|高铁)?站|\w{,3}项目)', text[it.end():]):
|
|
|
addr.append((it.group(0), it.start(), it.end()))
|
|
@@ -5532,6 +5559,8 @@ class DistrictPredictor():
|
|
|
|
|
|
def get_pro_city_dis_score(text, text_weight=1):
|
|
|
text = re.sub('复合肥|海南岛|兴业银行|双河口', '', text)
|
|
|
+ text = re.sub('珠海城市', '珠海', text) # 修复 426624023 珠海城市 预测为海城市
|
|
|
+ text = re.sub('怒江州', '怒江傈僳族自治州', text) # 修复 423589589 所属地域:怒江州 识别为广西 - 崇左 - 江州
|
|
|
province_l = find_areas(p_pro, text)
|
|
|
city_l = find_areas(p_city, text)
|
|
|
district_l = find_areas(p_dis, text)
|
|
@@ -5760,7 +5789,7 @@ class DistrictPredictor():
|
|
|
|
|
|
# print('招标人地址',role_addr, tenderee_address)
|
|
|
|
|
|
- project_name = project_name + title if project_name not in title else project_name
|
|
|
+ project_name = project_name + title if project_name not in title else title
|
|
|
project_name = project_name.replace(tenderee, '')
|
|
|
|
|
|
text1 = "{0} {1} {2}".format(tenderee, tenderee_address, project_name)
|