|
@@ -2536,8 +2536,11 @@ class ProductPredictor():
|
|
|
# 产品数量单价品牌规格提取 #2021/11/10 添加表格中的项目、需求、预算、时间要素提取
|
|
|
class ProductAttributesPredictor():
|
|
|
def __init__(self,):
|
|
|
- self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)'
|
|
|
- self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
|
|
|
+ self.p0 = '(品目|类别|类型|物类|目录|^品名|^品类)(名称|$)'
|
|
|
+ self.p1 = '(标的|标项|项目|计划|标段|[分子]?包|子目|维修|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品?|采购|物装|服务|工程|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|招标|标的物|物件|药品|药材|药械|货品|食品|食材|品目|^品名)[\))的]?(名称|内容|描述)'
|
|
|
+ self.p2 = '标的|标项|项目$|商品|产品|物料|物资|货物|设备|采购品|采购条目|物品|材料|印刷品|物装|配件|资产|招标内容|耗材|清单|器材|仪器|器械|备件|拍卖物|标的物|物件|药品|药材|药械|货品|食品|食材|菜名|^品目$|^品名$|^名称'
|
|
|
+ # self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)'
|
|
|
+ # self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
|
|
|
with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
|
|
|
self.header_set = pickle.load(f)
|
|
|
def isTrueTable(self, table):
|
|
@@ -2580,7 +2583,8 @@ class ProductAttributesPredictor():
|
|
|
if len(tds) < 2:
|
|
|
continue
|
|
|
for td in tds:
|
|
|
- td_text = re.sub('\s+|…', ' ', td.get_text()).strip()
|
|
|
+ # td_text = re.sub('\s+|…', ' ', td.get_text()).strip()
|
|
|
+ td_text = re.sub('…', '', td.get_text()).strip()
|
|
|
td_text = td_text.replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '/').replace('"', '') # 修复272144312 # 产品单价数量提取结果有特殊符号\ 气动执行装置备件\密封组件\NBR+PT
|
|
|
td_text = td_text.replace("(", "(").replace(")", ")").replace(':', ':')
|
|
|
tr_line.append(td_text)
|
|
@@ -2806,7 +2810,7 @@ class ProductAttributesPredictor():
|
|
|
quantity_unit = ""
|
|
|
return quantity, quantity_unit
|
|
|
|
|
|
- def find_header(self, items, p1, p2):
|
|
|
+ def find_header(self, items,p0, p1, p2):
|
|
|
'''
|
|
|
inner_table 每行正则检查是否为表头,是则返回表头所在列序号,及表头内容
|
|
|
:param items: 列表,内容为每个td 文本内容
|
|
@@ -2815,7 +2819,7 @@ class ProductAttributesPredictor():
|
|
|
:return: 表头所在列序号,是否表头,表头内容
|
|
|
'''
|
|
|
flag = False
|
|
|
- header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''}
|
|
|
+ header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': '', '总价': '', '品目': '', '参数': ''}
|
|
|
product = "" # 产品
|
|
|
quantity = "" # 数量
|
|
|
quantity_unit = "" # 数量单位
|
|
@@ -2825,25 +2829,36 @@ class ProductAttributesPredictor():
|
|
|
demand = "" # 采购需求
|
|
|
budget = "" # 预算金额
|
|
|
order_time = "" # 采购时间
|
|
|
+ total_price = "" # 总价
|
|
|
+ category = "" # 品目
|
|
|
+ parameter = "" # 参数
|
|
|
|
|
|
- for i in range(min(4, len(items))):
|
|
|
+ for i in range(min(6, len(items))):
|
|
|
it = items[i]
|
|
|
- if len(it) < 15 and re.search(p1, it) != None:
|
|
|
+ if len(it) < 15 and re.search(p0, it) != None:
|
|
|
+ flag = True
|
|
|
+ category = it
|
|
|
+ header_dic['品目'] = i
|
|
|
+ elif len(it) < 15 and re.search(p1, it) != None:
|
|
|
flag = True
|
|
|
product = it
|
|
|
header_dic['名称'] = i
|
|
|
break
|
|
|
- if not flag:
|
|
|
+ # if not flag:
|
|
|
+ if product == "":
|
|
|
for i in range(min(4, len(items))):
|
|
|
it = items[i]
|
|
|
- if len(it) < 15 and re.search(p2, it) and re.search(
|
|
|
+ if len(it) < 15 and it != category and re.search(p2, it) and re.search(
|
|
|
'编号|编码|号|情况|报名|单位|位置|地址|数量|单价|价格|金额|品牌|规格类型|型号|公司|中标人|企业|供应商|候选人', it) == None:
|
|
|
flag = True
|
|
|
product = it
|
|
|
header_dic['名称'] = i
|
|
|
break
|
|
|
if flag:
|
|
|
- for j in range(i + 1, len(items)):
|
|
|
+ # for j in range(i + 1, len(items)):
|
|
|
+ for j in range(len(items)):
|
|
|
+ if items[j] in [product, category]:
|
|
|
+ continue
|
|
|
if len(items[j]) > 20 and len(re.sub('[\((].*[)\)]|[^\u4e00-\u9fa5]', '', items[j])) > 10:
|
|
|
continue
|
|
|
if header_dic['数量']=="" and re.search('数量|采购量', items[j]) and re.search('单价|用途|要求|规格|型号|运输|承运', items[j])==None:
|
|
@@ -2861,6 +2876,9 @@ class ProductAttributesPredictor():
|
|
|
elif re.search('规格|型号', items[j]):
|
|
|
header_dic['规格'] = j
|
|
|
specs = items[j]
|
|
|
+ elif re.search('参数', items[j]):
|
|
|
+ header_dic['参数'] = j
|
|
|
+ parameter = items[j]
|
|
|
|
|
|
elif re.search('需求|服务要求|服务标准', items[j]):
|
|
|
header_dic['需求'] = j
|
|
@@ -2871,16 +2889,21 @@ class ProductAttributesPredictor():
|
|
|
elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]):
|
|
|
header_dic['时间'] = j
|
|
|
order_time = items[j]
|
|
|
-
|
|
|
- if header_dic.get('名称', "") != "" :
|
|
|
- num = 0
|
|
|
- for it in (quantity, unitPrice, brand, specs, product, demand, budget, order_time):
|
|
|
- if it != "":
|
|
|
- num += 1
|
|
|
- if num >=2:
|
|
|
- return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs), (product, demand, budget, order_time)
|
|
|
+ elif re.search('总价|^金额|(成交|中标|验收|合同|预算|控制|总|合计))?(金额|价格?)', items[j]):
|
|
|
+ header_dic['总价'] = j
|
|
|
+ total_price = items[j]
|
|
|
+
|
|
|
+ if header_dic.get('名称', "") != "" or header_dic.get('品目', "") != "":
|
|
|
+ # num = 0
|
|
|
+ # for it in (quantity, unitPrice, brand, specs, product, demand, budget, order_time, total_price):
|
|
|
+ # if it != "":
|
|
|
+ # num += 1
|
|
|
+ # if num >=2:
|
|
|
+ # return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
|
|
|
+ if set([quantity, brand, specs, unitPrice, total_price])!=set([""]) or set([demand, budget])!=set([""]):
|
|
|
+ return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
|
|
|
flag = False
|
|
|
- return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs), (product, demand, budget, order_time)
|
|
|
+ return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs, total_price, category, parameter), (product, demand, budget, order_time)
|
|
|
|
|
|
def predict(self, docid='', html='', page_time=""):
|
|
|
'''
|
|
@@ -2899,6 +2922,7 @@ class ProductAttributesPredictor():
|
|
|
header_col = []
|
|
|
product_link = []
|
|
|
demand_link = []
|
|
|
+ product_set = set()
|
|
|
total_product_money = 0
|
|
|
for i in range(len(tables)-1, -1, -1):
|
|
|
table = tables[i]
|
|
@@ -2983,10 +3007,11 @@ class ProductAttributesPredictor():
|
|
|
order_time = "" # 采购时间
|
|
|
order_begin = ""
|
|
|
order_end = ""
|
|
|
- # print(tds,set(tds) & self.header_set)
|
|
|
+ total_price = "" # 总金额
|
|
|
+ parameter = "" # 参数
|
|
|
if len(set([re.sub('[::]','',td) for td in tds]) & self.header_set) > len(tds) * 0.2:
|
|
|
# if len(set(tds) & self.header_set) > len(tds) * 0.2:
|
|
|
- header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
|
|
|
+ header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p0, self.p1, self.p2)
|
|
|
if found_header:
|
|
|
header_colnum = len(tds) # 保存表头所在行列数
|
|
|
if found_header and isinstance(header_list, tuple) and len(header_list) > 2: # 获取表头中的 数量单位
|
|
@@ -3006,6 +3031,7 @@ class ProductAttributesPredictor():
|
|
|
if len(tds) != header_colnum: # 表头、属性列数不一致跳过
|
|
|
i += 1
|
|
|
continue
|
|
|
+ id0 = header_dic.get('品目', "")
|
|
|
id1 = header_dic.get('名称', "")
|
|
|
id2 = header_dic.get('数量', "")
|
|
|
id2_2 = header_dic.get('单位', "")
|
|
@@ -3017,6 +3043,9 @@ class ProductAttributesPredictor():
|
|
|
id7 = header_dic.get('预算', "")
|
|
|
id8 = header_dic.get('时间', "")
|
|
|
|
|
|
+ id9 = header_dic.get("总价", "")
|
|
|
+ id10 = header_dic.get('参数', "")
|
|
|
+
|
|
|
not_attr = 0
|
|
|
for k, v in header_dic.items():
|
|
|
if isinstance(v, int):
|
|
@@ -3028,48 +3057,39 @@ class ProductAttributesPredictor():
|
|
|
found_header = False
|
|
|
continue
|
|
|
|
|
|
- if re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
|
|
|
+ if id1!="" and re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
|
|
|
re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id1]) == None:
|
|
|
product = tds[id1]
|
|
|
+
|
|
|
+ if id0!="" and re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id0]) and tds[id0] not in self.header_set and \
|
|
|
+ re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id0]) == None:
|
|
|
+ category = tds[id0]
|
|
|
+ product = "%s_%s"%(category, product) if product!="" else category
|
|
|
+
|
|
|
+ if product != "":
|
|
|
if id2 != "":
|
|
|
if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
|
|
|
quantity = tds[id2]
|
|
|
- # quantity = re.sub('[()(),,约]', '', quantity)
|
|
|
- # quantity = re.sub('[一壹]', '1', quantity)
|
|
|
- # ser = re.search('^(\d+\.?\d*)([㎡\w/]{,5})', quantity)
|
|
|
- # if ser:
|
|
|
- # quantity = str(ser.group(1))
|
|
|
- # quantity_unit = ser.group(2)
|
|
|
- # if quantity_unit == "" and header_quan_unit != "":
|
|
|
- # quantity_unit = header_quan_unit
|
|
|
- # else:
|
|
|
- # quantity = ""
|
|
|
- # quantity_unit = ""
|
|
|
if id2_2 != "":
|
|
|
- if re.search('^\w{1,4}$', tds[id2_2]):
|
|
|
+ if re.search('^\w{1,4}$', tds[id2_2]) and re.search('元', tds[id2_2])==None:
|
|
|
quantity_unit = tds[id2_2]
|
|
|
if id3 != "":
|
|
|
if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
|
|
|
unitPrice = tds[id3]
|
|
|
- elif re.search('^[\d,.亿万元人民币欧美日金额:()()]+$', tds[id3].strip()):
|
|
|
+ elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$', tds[id3].strip()):
|
|
|
unitPrice = tds[id3]
|
|
|
- # _unitPrice = tds[id3]
|
|
|
- # re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_unitPrice)
|
|
|
- # if re_price:
|
|
|
- # _unitPrice = re_price[0]
|
|
|
- # if '万元' in header_list[2] and '万' not in _unitPrice:
|
|
|
- # _unitPrice += '万元'
|
|
|
- # # unitPrice = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", unitPrice)
|
|
|
- # unitPrice = str(getUnifyMoney(_unitPrice))
|
|
|
-
|
|
|
if id4 != "":
|
|
|
if re.search('\w', tds[id4]):
|
|
|
brand = tds[id4]
|
|
|
+ if re.match('^详见|^略$', brand.strip()):
|
|
|
+ brand = ""
|
|
|
else:
|
|
|
brand = ""
|
|
|
if id5 != "":
|
|
|
if re.search('\w', tds[id5]):
|
|
|
specs = tds[id5][:500] # 限制最多500字
|
|
|
+ if re.match('^详见|^略$', specs.strip()):
|
|
|
+ specs = ""
|
|
|
else:
|
|
|
specs = ""
|
|
|
if id6 != "":
|
|
@@ -3080,46 +3100,69 @@ class ProductAttributesPredictor():
|
|
|
if id7 != "":
|
|
|
if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id7]):
|
|
|
budget = tds[id7]
|
|
|
- # _budget = tds[id7]
|
|
|
- # re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", _budget)
|
|
|
- # if re_price:
|
|
|
- # _budget = re_price[0]
|
|
|
- # if '万元' in header_list2[2] and '万' not in _budget:
|
|
|
- # _budget += '万元'
|
|
|
- # budget = str(getUnifyMoney(_budget))
|
|
|
if id8 != "":
|
|
|
if re.search('\w', tds[id8]):
|
|
|
order_time = tds[id8].strip()
|
|
|
order_begin, order_end = self.fix_time(order_time, html, page_time)
|
|
|
- if quantity != "" or unitPrice != "" or brand != "" or specs != "":
|
|
|
- if quantity != "":
|
|
|
- quantity, quantity_unit = self.fix_quantity(quantity, header_quan_unit)
|
|
|
- if unitPrice != "":
|
|
|
- unitPrice, _money_unit = money_process(unitPrice, header_list[2])
|
|
|
- unitPrice = str(unitPrice) if unitPrice != 0 else ""
|
|
|
- if budget != "":
|
|
|
- budget, _money_unit = money_process(budget, header_list2[2])
|
|
|
- budget = str(budget) if budget != 0 else ''
|
|
|
-
|
|
|
- if id2 != "" and id3 != "" and len(re.split('[,,\s]', tds[id2])) > 1 and len(re.split('[,,\s]', tds[id1])) == len(re.split('[,,\s]', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
|
|
|
- products = re.split('[,,\s]', tds[id1])
|
|
|
- quantitys = re.split('[,,\s]', tds[id2])
|
|
|
- unitPrices = re.split('[,,\s]', tds[id3])
|
|
|
- brands = re.split('[,,\s]', brand)
|
|
|
- specses = re.split('[,,\s]', specs)
|
|
|
+ if id9 != "":
|
|
|
+ if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id9]):
|
|
|
+ total_price = tds[id9]
|
|
|
+ elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$', tds[id9].strip()):
|
|
|
+ total_price = tds[id9]
|
|
|
+ if id10 != "":
|
|
|
+ parameter = tds[id10][:500]
|
|
|
+ if re.match('^详见|^略$', parameter.strip()):
|
|
|
+ parameter = ""
|
|
|
+ if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price:
|
|
|
+ if id2 != "" and id3 != "" and len(re.split('[;;、,\n]', tds[id2])) > 1 and len(re.split('[;;、,\n]', tds[id1])) == len(re.split('[;;、,\n]', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
|
|
|
+ products = re.split('[;;、,\n]', tds[id1])
|
|
|
+ quantitys = re.split('[;;、,\n]', tds[id2])
|
|
|
+ unitPrices = re.split('[;;、,\n]', tds[id3])
|
|
|
+ total_prices = re.split('[;;、,\n]', total_price)
|
|
|
+ brands = re.split('[;;、,\n]', brand) if re.search('等$', brand)==None else [brand]
|
|
|
+ specses = re.split('[;;、,\n]', specs) if re.search('等$', specs)==None else [specs]
|
|
|
+ parameters = re.split('[;;、,\n]', parameter) if re.search('等$', parameter)==None else [parameter]
|
|
|
+ unitPrices = [""]*len(products) if len(unitPrices)==1 else unitPrices
|
|
|
+ total_prices = [""]*len(products) if len(total_prices)==1 else total_prices
|
|
|
+ brands = brands*len(products) if len(brands)==1 else brands
|
|
|
+ specses = specses*len(products) if len(specses)==1 else specses
|
|
|
+ parameters = parameters*len(products) if len(parameters)==1 else parameters
|
|
|
if len(products) == len(quantitys) == len(unitPrices) == len(brands) == len(specses):
|
|
|
- for product, quantity, unitPrice, brand, specs in zip(products,quantitys,unitPrices, brands, specses):
|
|
|
+ for product, quantity, unitPrice, brand, specs, total_price, parameter in zip(products,quantitys,unitPrices, brands, specses, total_prices, parameters):
|
|
|
+ if quantity != "":
|
|
|
+ quantity, quantity_unit_ = self.fix_quantity(quantity, header_quan_unit)
|
|
|
+ quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
|
|
|
+ if unitPrice != "":
|
|
|
+ unitPrice, _money_unit = money_process(unitPrice, header_list[3])
|
|
|
+ unitPrice = str(unitPrice) if unitPrice != 0 else ""
|
|
|
+ if budget != "":
|
|
|
+ budget, _money_unit = money_process(budget, header_list2[2])
|
|
|
+ budget = str(budget) if budget != 0 else ''
|
|
|
+ if total_price != "":
|
|
|
+ total_price, _money_unit = money_process(total_price, header_list[6])
|
|
|
+ total_price = str(total_price) if unitPrice != 0 else ""
|
|
|
link = {'product': product, 'quantity': quantity,
|
|
|
'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
|
|
|
- 'brand': brand[:50], 'specs': specs}
|
|
|
- if link not in product_link:
|
|
|
+ 'brand': brand[:50], 'specs': specs, 'total_price': total_price, 'parameter': parameter}
|
|
|
+ # if link not in product_link:
|
|
|
+ # product_link.append(link)
|
|
|
+ # mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
|
|
|
+ # if link['unitPrice'] != "" and mat:
|
|
|
+ # try:
|
|
|
+ # total_product_money += float(link['unitPrice']) * float(
|
|
|
+ # mat.group(1).replace(',', '')) if float(
|
|
|
+ # mat.group(1).replace(',', '')) < 50000 else 0
|
|
|
+ # except:
|
|
|
+ # log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
|
|
|
+ # link['unitPrice'], link['quantity']))
|
|
|
+
|
|
|
+ if (product, specs, unitPrice, quantity) not in product_set:
|
|
|
+ product_set.add((product, specs, unitPrice, quantity))
|
|
|
product_link.append(link)
|
|
|
- mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
|
|
|
- if link['unitPrice'] != "" and mat:
|
|
|
+ if link['unitPrice'] != "" and link['quantity'] != '':
|
|
|
try:
|
|
|
total_product_money += float(link['unitPrice']) * float(
|
|
|
- mat.group(1).replace(',', '')) if float(
|
|
|
- mat.group(1).replace(',', '')) < 50000 else 0
|
|
|
+ link['quantity']) if float(link['quantity']) < 50000 else 0
|
|
|
except:
|
|
|
log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
|
|
|
link['unitPrice'], link['quantity']))
|
|
@@ -3128,16 +3171,40 @@ class ProductAttributesPredictor():
|
|
|
i += 1
|
|
|
continue
|
|
|
else:
|
|
|
+ if quantity != "":
|
|
|
+ quantity, quantity_unit_ = self.fix_quantity(quantity, header_quan_unit)
|
|
|
+ quantity_unit = quantity_unit_ if quantity_unit_ != "" else quantity_unit
|
|
|
+ if unitPrice != "":
|
|
|
+ unitPrice, _money_unit = money_process(unitPrice, header_list[3])
|
|
|
+ unitPrice = str(unitPrice) if unitPrice != 0 else ""
|
|
|
+ if budget != "":
|
|
|
+ budget, _money_unit = money_process(budget, header_list2[2])
|
|
|
+ budget = str(budget) if budget != 0 else ''
|
|
|
+ if total_price != "":
|
|
|
+ total_price, _money_unit = money_process(total_price, header_list[6])
|
|
|
+ total_price = str(total_price) if unitPrice != 0 else ""
|
|
|
link = {'product': product, 'quantity': quantity, 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
|
|
|
- 'brand': brand[:50], 'specs':specs}
|
|
|
- if link not in product_link:
|
|
|
+ 'brand': brand[:50], 'specs':specs, 'total_price': total_price, 'parameter': parameter}
|
|
|
+
|
|
|
+ # if link not in product_link:
|
|
|
+ # product_link.append(link)
|
|
|
+ # mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
|
|
|
+ # if link['unitPrice'] != "" and mat:
|
|
|
+ # try:
|
|
|
+ # total_product_money += float(link['unitPrice'])*float(mat.group(1).replace(',', '')) if float(mat.group(1).replace(',', ''))<50000 else 0
|
|
|
+ # except:
|
|
|
+ # log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
|
|
|
+
|
|
|
+ if (product, specs, unitPrice, quantity) not in product_set:
|
|
|
+ product_set.add((product, specs, unitPrice, quantity))
|
|
|
product_link.append(link)
|
|
|
- mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
|
|
|
- if link['unitPrice'] != "" and mat:
|
|
|
+ if link['unitPrice'] != "" and link['quantity'] != '':
|
|
|
try:
|
|
|
- total_product_money += float(link['unitPrice'])*float(mat.group(1).replace(',', '')) if float(mat.group(1).replace(',', ''))<50000 else 0
|
|
|
+ total_product_money += float(link['unitPrice'])*float(link['quantity']) if float(link['quantity'])<50000 else 0
|
|
|
except:
|
|
|
log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
|
|
|
+
|
|
|
+
|
|
|
if order_begin != "" and order_end != "":
|
|
|
order_begin_year = int(order_begin.split("-")[0])
|
|
|
order_end_year = int(order_end.split("-")[0])
|
|
@@ -3255,7 +3322,7 @@ class ProductAttributesPredictor():
|
|
|
order_begin = ""
|
|
|
order_end = ""
|
|
|
|
|
|
- header_dic, found_header, header_list, header_list2 = self.find_header(tmp_head_list, self.p1,self.p2)
|
|
|
+ header_dic, found_header, header_list, header_list2 = self.find_header(tmp_head_list, self.p0, self.p1,self.p2)
|
|
|
if found_header:
|
|
|
headers.append('_'.join(header_list))
|
|
|
headers_demand.append('_'.join(header_list2))
|
|
@@ -3271,7 +3338,7 @@ class ProductAttributesPredictor():
|
|
|
id6 = header_dic.get('需求', "")
|
|
|
id7 = header_dic.get('预算', "")
|
|
|
id8 = header_dic.get('时间', "")
|
|
|
- if re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id1]) and deal_list[id1] not in self.header_set and \
|
|
|
+ if id1!='' and re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id1]) and deal_list[id1] not in self.header_set and \
|
|
|
re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id1]) == None:
|
|
|
product = deal_list[id1]
|
|
|
if id2 != "":
|