|
@@ -3303,6 +3303,7 @@ class ProductAttributesPredictor():
|
|
header_col = []
|
|
header_col = []
|
|
product_link = []
|
|
product_link = []
|
|
demand_link = []
|
|
demand_link = []
|
|
|
|
+ product_set = set()
|
|
for idx in range(len(begin_list)):
|
|
for idx in range(len(begin_list)):
|
|
if idx==len(begin_list)-1:
|
|
if idx==len(begin_list)-1:
|
|
deal_list = head_value_list[begin_list[idx]:]
|
|
deal_list = head_value_list[begin_list[idx]:]
|
|
@@ -3321,6 +3322,8 @@ class ProductAttributesPredictor():
|
|
order_time = "" # 采购时间
|
|
order_time = "" # 采购时间
|
|
order_begin = ""
|
|
order_begin = ""
|
|
order_end = ""
|
|
order_end = ""
|
|
|
|
+ total_price = "" # 总金额
|
|
|
|
+ parameter = "" # 参数
|
|
|
|
|
|
header_dic, found_header, header_list, header_list2 = self.find_header(tmp_head_list, self.p0, self.p1,self.p2)
|
|
header_dic, found_header, header_list, header_list2 = self.find_header(tmp_head_list, self.p0, self.p1,self.p2)
|
|
if found_header:
|
|
if found_header:
|
|
@@ -3328,6 +3331,7 @@ class ProductAttributesPredictor():
|
|
headers_demand.append('_'.join(header_list2))
|
|
headers_demand.append('_'.join(header_list2))
|
|
header_col.append('_'.join(tmp_head_list))
|
|
header_col.append('_'.join(tmp_head_list))
|
|
# print('header_dic: ',header_dic)
|
|
# print('header_dic: ',header_dic)
|
|
|
|
+ id0 = header_dic.get('品目', "")
|
|
id1 = header_dic.get('名称', "")
|
|
id1 = header_dic.get('名称', "")
|
|
id2 = header_dic.get('数量', "")
|
|
id2 = header_dic.get('数量', "")
|
|
id2_2 = header_dic.get('单位', "")
|
|
id2_2 = header_dic.get('单位', "")
|
|
@@ -3338,9 +3342,18 @@ class ProductAttributesPredictor():
|
|
id6 = header_dic.get('需求', "")
|
|
id6 = header_dic.get('需求', "")
|
|
id7 = header_dic.get('预算', "")
|
|
id7 = header_dic.get('预算', "")
|
|
id8 = header_dic.get('时间', "")
|
|
id8 = header_dic.get('时间', "")
|
|
|
|
+
|
|
|
|
+ id9 = header_dic.get("总价", "")
|
|
|
|
+ id10 = header_dic.get('参数', "")
|
|
if id1!='' and re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id1]) and deal_list[id1] not in self.header_set and \
|
|
if id1!='' and re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id1]) and deal_list[id1] not in self.header_set and \
|
|
re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id1]) == None:
|
|
re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id1]) == None:
|
|
product = deal_list[id1]
|
|
product = deal_list[id1]
|
|
|
|
+ if id0 != "" and re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id0]) and deal_list[id0] not in self.header_set and \
|
|
|
|
+ re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id0]) == None:
|
|
|
|
+ category = deal_list[id0]
|
|
|
|
+ product = "%s_%s" % (category, product) if product != "" else category
|
|
|
|
+
|
|
|
|
+ if product != "":
|
|
if id2 != "":
|
|
if id2 != "":
|
|
if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
|
|
if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
|
|
quantity = deal_list[id2]
|
|
quantity = deal_list[id2]
|
|
@@ -3381,11 +3394,15 @@ class ProductAttributesPredictor():
|
|
if id4 != "":
|
|
if id4 != "":
|
|
if re.search('\w', deal_list[id4]):
|
|
if re.search('\w', deal_list[id4]):
|
|
brand = deal_list[id4]
|
|
brand = deal_list[id4]
|
|
|
|
+ if re.match('^详见|^略$', brand.strip()):
|
|
|
|
+ brand = ""
|
|
else:
|
|
else:
|
|
brand = ""
|
|
brand = ""
|
|
if id5 != "":
|
|
if id5 != "":
|
|
if re.search('\w', deal_list[id5]):
|
|
if re.search('\w', deal_list[id5]):
|
|
- specs = deal_list[id5]
|
|
|
|
|
|
+ specs = deal_list[id5][:500]
|
|
|
|
+ if re.match('^详见|^略$', specs.strip()):
|
|
|
|
+ brand = ""
|
|
else:
|
|
else:
|
|
specs = ""
|
|
specs = ""
|
|
if id6 != "":
|
|
if id6 != "":
|
|
@@ -3408,25 +3425,104 @@ class ProductAttributesPredictor():
|
|
if re.search('\w', deal_list[id8]) and re.search("采购(实施)?(时间|月份|日期)",header_list2[3]):
|
|
if re.search('\w', deal_list[id8]) and re.search("采购(实施)?(时间|月份|日期)",header_list2[3]):
|
|
order_time = deal_list[id8].strip()
|
|
order_time = deal_list[id8].strip()
|
|
order_begin, order_end = self.fix_time(order_time, html, page_time)
|
|
order_begin, order_end = self.fix_time(order_time, html, page_time)
|
|
- # print(quantity,unitPrice,brand,specs)
|
|
|
|
- if quantity != "" or unitPrice != "" or brand != "" or specs != "":
|
|
|
|
- link = {'product': product, 'quantity': quantity, 'quantity_unit':quantity_unit,'unitPrice': unitPrice,
|
|
|
|
- 'brand': brand[:50], 'specs': specs}
|
|
|
|
- if link not in product_link:
|
|
|
|
- product_link.append(link)
|
|
|
|
- # mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
|
|
|
|
- # if link['unitPrice'] != "" and mat:
|
|
|
|
- # try:
|
|
|
|
- # total_product_money += float(link['unitPrice']) * float(
|
|
|
|
- # mat.group(1).replace(',', ''))
|
|
|
|
- # except:
|
|
|
|
- # log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
|
|
|
|
- # link['unitPrice'], link['quantity']))
|
|
|
|
|
|
+ if id9 != "":
|
|
|
|
+ if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id9]):
|
|
|
|
+ total_price = deal_list[id9]
|
|
|
|
+ elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$', deal_list[id9].strip()):
|
|
|
|
+ total_price = deal_list[id9]
|
|
|
|
+ if id10 != "":
|
|
|
|
+ parameter = deal_list[id10][:500]
|
|
|
|
+ if re.match('^详见|^略$', parameter.strip()):
|
|
|
|
+ parameter = ""
|
|
|
|
+ if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price:
|
|
|
|
+ if id2 != "" and id3 != "" and len(re.split('[;;、,\n]', deal_list[id2])) > 1 and len(
|
|
|
|
+ re.split('[;;、,\n]', deal_list[id1])) == len(re.split('[;;、,\n]', deal_list[
|
|
|
|
+ id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
|
|
|
|
+ products = re.split('[;;、,\n]', deal_list[id1])
|
|
|
|
+ quantitys = re.split('[;;、,\n]', deal_list[id2])
|
|
|
|
+ unitPrices = re.split('[;;、,\n]', deal_list[id3])
|
|
|
|
+ total_prices = re.split('[;;、,\n]', total_price)
|
|
|
|
+ brands = re.split('[;;、,\n]', brand) if re.search('等$', brand) == None else [brand]
|
|
|
|
+ specses = re.split('[;;、,\n]', specs) if re.search('等$', specs) == None else [specs]
|
|
|
|
+ parameters = re.split('[;;、,\n]', parameter) if re.search('等$', parameter) == None else [parameter]
|
|
|
|
+ unitPrices = [""] * len(products) if len(unitPrices) == 1 else unitPrices
|
|
|
|
+ total_prices = [""] * len(products) if len(total_prices) == 1 else total_prices
|
|
|
|
+ brands = brands * len(products) if len(brands) == 1 else brands
|
|
|
|
+ specses = specses * len(products) if len(specses) == 1 else specses
|
|
|
|
+ parameters = parameters * len(products) if len(parameters) == 1 else parameters
|
|
|
|
+ if len(products) == len(quantitys) == len(unitPrices) == len(brands) == len(
|
|
|
|
+ specses):
|
|
|
|
+ for product, quantity, unitPrice, brand, specs, total_price, parameter in zip(
|
|
|
|
+ products, quantitys, unitPrices, brands, specses, total_prices,
|
|
|
|
+ parameters):
|
|
|
|
+ # if quantity != "":
|
|
|
|
+ # quantity, quantity_unit_ = self.fix_quantity(quantity,
|
|
|
|
+ # header_quan_unit)
|
|
|
|
+ # quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
|
|
|
|
+ if unitPrice != "":
|
|
|
|
+ unitPrice, _money_unit = money_process(unitPrice, header_list[3])
|
|
|
|
+ unitPrice = str(unitPrice) if unitPrice != 0 else ""
|
|
|
|
+ if budget != "":
|
|
|
|
+ budget, _money_unit = money_process(budget, header_list2[2])
|
|
|
|
+ budget = str(budget) if budget != 0 else ''
|
|
|
|
+ if total_price != "":
|
|
|
|
+ total_price, _money_unit = money_process(total_price,
|
|
|
|
+ header_list[6])
|
|
|
|
+ total_price = str(total_price) if unitPrice != 0 else ""
|
|
|
|
+ link = {'product': product, 'quantity': quantity,
|
|
|
|
+ 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
|
|
|
|
+ 'brand': brand[:50], 'specs': specs, 'total_price': total_price,
|
|
|
|
+ 'parameter': parameter}
|
|
|
|
+
|
|
|
|
+ if (product, specs, unitPrice, quantity) not in product_set:
|
|
|
|
+ product_set.add((product, specs, unitPrice, quantity))
|
|
|
|
+ product_link.append(link)
|
|
|
|
+ # if link['unitPrice'] != "" and link['quantity'] != '':
|
|
|
|
+ # try:
|
|
|
|
+ # total_product_money += float(link['unitPrice']) * float(
|
|
|
|
+ # link['quantity']) if float(
|
|
|
|
+ # link['quantity']) < 50000 else 0
|
|
|
|
+ # except:
|
|
|
|
+ # log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
|
|
|
|
+ # link['unitPrice'], link['quantity']))
|
|
|
|
+
|
|
|
|
+ elif len(unitPrice) > 15 or len(product) > 100: # 单价大于15位数或 产品名称长于100字
|
|
|
|
+ # i += 1
|
|
|
|
+ continue
|
|
|
|
+ else:
|
|
|
|
+ # if quantity != "":
|
|
|
|
+ # quantity, quantity_unit_ = self.fix_quantity(quantity, header_quan_unit)
|
|
|
|
+ # quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
|
|
|
|
+ if unitPrice != "":
|
|
|
|
+ unitPrice, _money_unit = money_process(unitPrice, header_list[3])
|
|
|
|
+ unitPrice = str(unitPrice) if unitPrice != 0 else ""
|
|
|
|
+ if budget != "":
|
|
|
|
+ budget, _money_unit = money_process(budget, header_list2[2])
|
|
|
|
+ budget = str(budget) if budget != 0 else ''
|
|
|
|
+ if total_price != "":
|
|
|
|
+ total_price, _money_unit = money_process(total_price, header_list[6])
|
|
|
|
+ total_price = str(total_price) if unitPrice != 0 else ""
|
|
|
|
+ link = {'product': product, 'quantity': quantity,
|
|
|
|
+ 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
|
|
|
|
+ 'brand': brand[:50], 'specs': specs, 'total_price': total_price,
|
|
|
|
+ 'parameter': parameter}
|
|
|
|
+
|
|
|
|
+ if (product, specs, unitPrice, quantity) not in product_set:
|
|
|
|
+ product_set.add((product, specs, unitPrice, quantity))
|
|
|
|
+ product_link.append(link)
|
|
|
|
+ # if link['unitPrice'] != "" and link['quantity'] != '':
|
|
|
|
+ # try:
|
|
|
|
+ # total_product_money += float(link['unitPrice']) * float(
|
|
|
|
+ # link['quantity']) if float(link['quantity']) < 50000 else 0
|
|
|
|
+ # except:
|
|
|
|
+ # log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
|
|
|
|
+ # link['unitPrice'], link['quantity']))
|
|
|
|
+
|
|
if order_begin != "" and order_end != "":
|
|
if order_begin != "" and order_end != "":
|
|
order_begin_year = int(order_begin.split("-")[0])
|
|
order_begin_year = int(order_begin.split("-")[0])
|
|
order_end_year = int(order_end.split("-")[0])
|
|
order_end_year = int(order_end.split("-")[0])
|
|
# 限制附件错误识别时间
|
|
# 限制附件错误识别时间
|
|
- if order_begin_year >= 2050 or order_end_year >= 2050:
|
|
|
|
|
|
+ if order_begin_year >= 2050 or order_begin_year < 2000 or order_end_year >= 2050 or order_end_year < 2000:
|
|
order_begin = order_end = ""
|
|
order_begin = order_end = ""
|
|
# print(budget, order_time)
|
|
# print(budget, order_time)
|
|
if budget != "" and order_time != "":
|
|
if budget != "" and order_time != "":
|