|
@@ -2883,7 +2883,7 @@ class ProductAttributesPredictor():
|
|
'''
|
|
'''
|
|
items = [re.sub('\s', '', it) for it in items]
|
|
items = [re.sub('\s', '', it) for it in items]
|
|
flag = False
|
|
flag = False
|
|
- header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': ''}
|
|
|
|
|
|
+ header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': '', '采购人':''}
|
|
product = "" # 产品
|
|
product = "" # 产品
|
|
quantity = "" # 数量
|
|
quantity = "" # 数量
|
|
quantity_unit = "" # 数量单位
|
|
quantity_unit = "" # 数量单位
|
|
@@ -2896,6 +2896,7 @@ class ProductAttributesPredictor():
|
|
total_price = "" # 总价
|
|
total_price = "" # 总价
|
|
category = "" # 品目
|
|
category = "" # 品目
|
|
parameter = "" # 参数
|
|
parameter = "" # 参数
|
|
|
|
+ tenderee = "" # 采购人
|
|
|
|
|
|
# for i in range(min(6, len(items))):
|
|
# for i in range(min(6, len(items))):
|
|
for i in range(len(items)):
|
|
for i in range(len(items)):
|
|
@@ -2954,11 +2955,13 @@ class ProductAttributesPredictor():
|
|
elif re.search('参数', items[j]):
|
|
elif re.search('参数', items[j]):
|
|
header_dic['参数'] = j
|
|
header_dic['参数'] = j
|
|
parameter = items[j]
|
|
parameter = items[j]
|
|
-
|
|
|
|
|
|
+ elif re.search('预算单位|(采购|招标|购买)(单位|人|方|主体)|项目业主|采购商|申购单位|需求单位|业主单位',items[j]) and len(items[j])<=8:
|
|
|
|
+ header_dic['采购人'] = j
|
|
|
|
+ tenderee = items[j]
|
|
elif re.search('需求|服务要求|服务标准', items[j]):
|
|
elif re.search('需求|服务要求|服务标准', items[j]):
|
|
header_dic['需求'] = j
|
|
header_dic['需求'] = j
|
|
demand = items[j]
|
|
demand = items[j]
|
|
- elif re.search('预算|控制金额', items[j]):
|
|
|
|
|
|
+ elif re.search('预算|控制金额', items[j]) and not re.search('预算单位',items[j]):
|
|
header_dic['预算'] = j
|
|
header_dic['预算'] = j
|
|
budget = items[j]
|
|
budget = items[j]
|
|
elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]):
|
|
elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]):
|
|
@@ -2976,9 +2979,9 @@ class ProductAttributesPredictor():
|
|
# if num >=2:
|
|
# if num >=2:
|
|
# return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
|
|
# return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
|
|
if set([quantity, brand, specs, unitPrice, total_price])!=set([""]) or set([demand, budget])!=set([""]):
|
|
if set([quantity, brand, specs, unitPrice, total_price])!=set([""]) or set([demand, budget])!=set([""]):
|
|
- return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
|
|
|
|
|
|
+ return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee)
|
|
flag = False
|
|
flag = False
|
|
- return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
|
|
|
|
|
|
+ return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time,tenderee)
|
|
|
|
|
|
def predict(self, docid='', html='', page_time=""):
|
|
def predict(self, docid='', html='', page_time=""):
|
|
'''
|
|
'''
|
|
@@ -3058,7 +3061,9 @@ class ProductAttributesPredictor():
|
|
if '万元' in col0_l[i] and '万' not in _budget:
|
|
if '万元' in col0_l[i] and '万' not in _budget:
|
|
_budget += '万元'
|
|
_budget += '万元'
|
|
budget = str(getUnifyMoney(_budget))
|
|
budget = str(getUnifyMoney(_budget))
|
|
-
|
|
|
|
|
|
+ elif re.search('预算单位|(采购|招标|购买)(单位|人|方|主体)|项目业主|采购商|申购单位|需求单位|业主单位', col0_l[i]):
|
|
|
|
+ header_list2.append(col0_l[i])
|
|
|
|
+ tenderee = re.sub("\s","",col1_l[i])
|
|
elif re.search('采购时间|采购实施月份|采购月份|采购日期', col0_l[i]):
|
|
elif re.search('采购时间|采购实施月份|采购月份|采购日期', col0_l[i]):
|
|
header_list2.append(col0_l[i])
|
|
header_list2.append(col0_l[i])
|
|
order_time = col1_l[i].strip()
|
|
order_time = col1_l[i].strip()
|
|
@@ -3072,7 +3077,7 @@ class ProductAttributesPredictor():
|
|
# print(product,demand,budget,order_begin)
|
|
# print(product,demand,budget,order_begin)
|
|
if product!= "" and demand != "" and budget!="" and order_begin != "" and len(budget)<15: # 限制金额小于15位数的才要
|
|
if product!= "" and demand != "" and budget!="" and order_begin != "" and len(budget)<15: # 限制金额小于15位数的才要
|
|
link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
|
|
link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
|
|
- 'order_begin': order_begin, 'order_end': order_end}
|
|
|
|
|
|
+ 'order_begin': order_begin, 'order_end': order_end ,'tenderee':tenderee}
|
|
if link not in demand_link:
|
|
if link not in demand_link:
|
|
demand_link.append(link)
|
|
demand_link.append(link)
|
|
headers_demand.append('_'.join(header_list2))
|
|
headers_demand.append('_'.join(header_list2))
|
|
@@ -3124,6 +3129,7 @@ class ProductAttributesPredictor():
|
|
order_end = ""
|
|
order_end = ""
|
|
total_price = "" # 总金额
|
|
total_price = "" # 总金额
|
|
parameter = "" # 参数
|
|
parameter = "" # 参数
|
|
|
|
+ tenderee = "" # 采购人
|
|
if len(set([re.sub('[::\s]','',td) for td in tds]) & self.header_set) > len(tds) * 0.4:
|
|
if len(set([re.sub('[::\s]','',td) for td in tds]) & self.header_set) > len(tds) * 0.4:
|
|
# if len(set(tds) & self.header_set) > len(tds) * 0.2:
|
|
# if len(set(tds) & self.header_set) > len(tds) * 0.2:
|
|
header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p0, self.p1, self.p2)
|
|
header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p0, self.p1, self.p2)
|
|
@@ -3163,6 +3169,7 @@ class ProductAttributesPredictor():
|
|
|
|
|
|
id9 = header_dic.get("总价", "")
|
|
id9 = header_dic.get("总价", "")
|
|
id10 = header_dic.get('参数', "")
|
|
id10 = header_dic.get('参数', "")
|
|
|
|
+ id11 = header_dic.get('采购人', "")
|
|
|
|
|
|
not_attr = 0
|
|
not_attr = 0
|
|
for k, v in header_dic.items():
|
|
for k, v in header_dic.items():
|
|
@@ -3250,6 +3257,10 @@ class ProductAttributesPredictor():
|
|
parameter = tds[id10][:500]
|
|
parameter = tds[id10][:500]
|
|
if re.match('^详见|^详情', parameter.strip()):
|
|
if re.match('^详见|^详情', parameter.strip()):
|
|
parameter = ""
|
|
parameter = ""
|
|
|
|
+ if id11 != "":
|
|
|
|
+ tenderee = re.sub("\s","",tds[id11])
|
|
|
|
+ if len(tenderee) > 30:
|
|
|
|
+ tenderee = ""
|
|
# print('数量:{0}, 单价:{1}, 品牌:{2}, 规格:{3},总价:{4}'.format(quantity ,unitPrice, brand, specs, total_price))
|
|
# print('数量:{0}, 单价:{1}, 品牌:{2}, 规格:{3},总价:{4}'.format(quantity ,unitPrice, brand, specs, total_price))
|
|
if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price or '单价' in header_dic or '总价' in header_dic:
|
|
if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price or '单价' in header_dic or '总价' in header_dic:
|
|
if id1!="" and id2 != "" and id3 != "" and len(re.split('[;;、,\n]+', tds[id2])) > 1 and len(re.split('[;;、,\n]+', tds[id1])) == len(re.split('[;;、,\n]+', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
|
|
if id1!="" and id2 != "" and id3 != "" and len(re.split('[;;、,\n]+', tds[id2])) > 1 and len(re.split('[;;、,\n]+', tds[id1])) == len(re.split('[;;、,\n]+', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
|
|
@@ -3363,7 +3374,7 @@ class ProductAttributesPredictor():
|
|
order_begin = order_end = ""
|
|
order_begin = order_end = ""
|
|
# print(budget,order_time)
|
|
# print(budget,order_time)
|
|
if budget != "" and order_time != "":
|
|
if budget != "" and order_time != "":
|
|
- link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end}
|
|
|
|
|
|
+ link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end, 'tenderee':tenderee}
|
|
if link not in demand_link:
|
|
if link not in demand_link:
|
|
demand_link.append(link)
|
|
demand_link.append(link)
|
|
i += 1
|
|
i += 1
|