|
@@ -3072,11 +3072,11 @@ class ProductAttributesPredictor():
|
|
if product != "":
|
|
if product != "":
|
|
if id2 != "":
|
|
if id2 != "":
|
|
if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
|
|
if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
|
|
- if re.search('(^\d{,3}(,?\d{3})+(\.\d{2,7},?)$)|万?元', tds[id2]):
|
|
|
|
|
|
+ if re.search('(^\d{,3}(,?\d{3}){2,}(\.\d{2,7},?)$)|万?元', tds[id2]):
|
|
i += 1
|
|
i += 1
|
|
continue
|
|
continue
|
|
quantity = tds[id2]
|
|
quantity = tds[id2]
|
|
- elif re.search('\w', tds[id2]) and re.search('^详见|^略', tds[id2])==None:
|
|
|
|
|
|
+ elif re.search('\w{5,}', tds[id2]) and re.search('^详见|^略', tds[id2])==None:
|
|
i += 1
|
|
i += 1
|
|
continue
|
|
continue
|
|
if id2_2 != "":
|
|
if id2_2 != "":
|
|
@@ -3085,9 +3085,9 @@ class ProductAttributesPredictor():
|
|
if id3 != "":
|
|
if id3 != "":
|
|
if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
|
|
if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
|
|
unitPrice = tds[id3]
|
|
unitPrice = tds[id3]
|
|
- elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$', tds[id3].strip()):
|
|
|
|
|
|
+ elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$|¥|¥|RMB|USD|EUR|JPY|CNY|元$', tds[id3].strip()):
|
|
unitPrice = tds[id3]
|
|
unitPrice = tds[id3]
|
|
- elif re.search('\w', tds[id3]) and re.search('^详见|^略', tds[id3])==None:
|
|
|
|
|
|
+ elif len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', tds[id3])) > 5:
|
|
i += 1
|
|
i += 1
|
|
continue
|
|
continue
|
|
if id4 != "":
|
|
if id4 != "":
|
|
@@ -3119,9 +3119,9 @@ class ProductAttributesPredictor():
|
|
if id9 != "":
|
|
if id9 != "":
|
|
if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id9]):
|
|
if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id9]):
|
|
total_price = tds[id9]
|
|
total_price = tds[id9]
|
|
- elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$', tds[id9].strip()):
|
|
|
|
|
|
+ elif re.search('^[\d,.亿万元人民币欧美日金额:()();;、,\n]+$|¥|¥|RMB|USD|EUR|JPY|CNY|元$', tds[id9].strip()):
|
|
total_price = tds[id9]
|
|
total_price = tds[id9]
|
|
- elif re.search('\w', tds[id9]) and re.search('^详见|^略', tds[id9])==None:
|
|
|
|
|
|
+ elif len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', tds[id9])) > 5:
|
|
i += 1
|
|
i += 1
|
|
continue
|
|
continue
|
|
if id10 != "":
|
|
if id10 != "":
|
|
@@ -3129,7 +3129,7 @@ class ProductAttributesPredictor():
|
|
if re.match('^详见|^略$', parameter.strip()):
|
|
if re.match('^详见|^略$', parameter.strip()):
|
|
parameter = ""
|
|
parameter = ""
|
|
if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price:
|
|
if quantity != "" or unitPrice != "" or brand != "" or specs != "" or total_price:
|
|
- if id2 != "" and id3 != "" and len(re.split('[;;、,\n]', tds[id2])) > 1 and len(re.split('[;;、,\n]', tds[id1])) == len(re.split('[;;、,\n]', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
|
|
|
|
|
|
+ if id1!="" and id2 != "" and id3 != "" and len(re.split('[;;、,\n]', tds[id2])) > 1 and len(re.split('[;;、,\n]', tds[id1])) == len(re.split('[;;、,\n]', tds[id2])): # 处理一个空格包含多个产品,逗号或空格分割情况 例子 292846806 292650743
|
|
products = re.split('[;;、,\n]', tds[id1])
|
|
products = re.split('[;;、,\n]', tds[id1])
|
|
quantitys = re.split('[;;、,\n]', tds[id2])
|
|
quantitys = re.split('[;;、,\n]', tds[id2])
|
|
unitPrices = re.split('[;;、,\n]', tds[id3])
|
|
unitPrices = re.split('[;;、,\n]', tds[id3])
|