|
@@ -2206,7 +2206,7 @@ class ProductAttributesPredictor():
|
|
|
elif re.search('预算', items[j]):
|
|
|
header_dic['预算'] = j
|
|
|
budget = items[j]
|
|
|
- elif re.search('时间|采购实施月份|采购月份', items[j]):
|
|
|
+ elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]):
|
|
|
header_dic['时间'] = j
|
|
|
order_time = items[j]
|
|
|
|
|
@@ -2250,7 +2250,6 @@ class ProductAttributesPredictor():
|
|
|
i = 0
|
|
|
found_header = False
|
|
|
header_colnum = 0
|
|
|
-
|
|
|
if flag_yx:
|
|
|
col0_l = []
|
|
|
col1_l = []
|
|
@@ -2271,11 +2270,15 @@ class ProductAttributesPredictor():
|
|
|
elif re.search('采购预算|预算金额', col0_l[i]):
|
|
|
header_list2.append(col0_l[i])
|
|
|
budget = col1_l[i]
|
|
|
- if '万元' in col0_l[i] and '万' not in budget:
|
|
|
- budget += '万元'
|
|
|
- budget = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", budget)
|
|
|
- budget = str(getUnifyMoney(budget))
|
|
|
- elif re.search('采购时间|采购实施月份|采购月份', col0_l[i]):
|
|
|
+ re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?", budget)
|
|
|
+ if re_price:
|
|
|
+ budget = re_price[0]
|
|
|
+ if '万元' in col0_l[i] and '万' not in budget:
|
|
|
+ budget += '万元'
|
|
|
+ budget = str(getUnifyMoney(budget))
|
|
|
+ else:
|
|
|
+ budget = ""
|
|
|
+ elif re.search('采购时间|采购实施月份|采购月份|采购日期', col0_l[i]):
|
|
|
header_list2.append(col0_l[i])
|
|
|
order_time = col1_l[i].strip()
|
|
|
order_begin, order_end = self.fix_time(order_time, html, page_time)
|
|
@@ -2292,7 +2295,6 @@ class ProductAttributesPredictor():
|
|
|
demand_link.append(link)
|
|
|
headers_demand.append('_'.join(header_list2))
|
|
|
continue
|
|
|
-
|
|
|
while i < (len(inner_table)):
|
|
|
tds = inner_table[i]
|
|
|
not_empty = [it for it in tds if it != ""]
|
|
@@ -2309,7 +2311,6 @@ class ProductAttributesPredictor():
|
|
|
order_time = "" # 采购时间
|
|
|
order_begin = ""
|
|
|
order_end = ""
|
|
|
-
|
|
|
if len(set(tds) & self.header_set) > len(tds) * 0.2:
|
|
|
header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
|
|
|
if found_header:
|
|
@@ -2343,10 +2344,15 @@ class ProductAttributesPredictor():
|
|
|
if id3 != "":
|
|
|
if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
|
|
|
unitPrice = tds[id3]
|
|
|
- if '万元' in header_list[2] and '万' not in unitPrice:
|
|
|
- unitPrice += '万元'
|
|
|
- unitPrice = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", unitPrice)
|
|
|
- unitPrice = str(getUnifyMoney(unitPrice))
|
|
|
+ re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?",unitPrice)
|
|
|
+ if re_price:
|
|
|
+ unitPrice = re_price[0]
|
|
|
+ if '万元' in header_list[2] and '万' not in unitPrice:
|
|
|
+ unitPrice += '万元'
|
|
|
+ # unitPrice = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", unitPrice)
|
|
|
+ unitPrice = str(getUnifyMoney(unitPrice))
|
|
|
+ else:
|
|
|
+ unitPrice = ""
|
|
|
else:
|
|
|
unitPrice = ""
|
|
|
if id4 != "":
|
|
@@ -2367,10 +2373,14 @@ class ProductAttributesPredictor():
|
|
|
if id7 != "":
|
|
|
if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id7]):
|
|
|
budget = tds[id7]
|
|
|
- if '万元' in header_list2[2] and '万' not in budget:
|
|
|
- budget += '万元'
|
|
|
- budget = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", budget)
|
|
|
- budget = str(getUnifyMoney(budget))
|
|
|
+ re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?", budget)
|
|
|
+ if re_price:
|
|
|
+ budget = re_price[0]
|
|
|
+ if '万元' in header_list[2] and '万' not in budget:
|
|
|
+ budget += '万元'
|
|
|
+ budget = str(getUnifyMoney(budget))
|
|
|
+ else:
|
|
|
+ budget = ""
|
|
|
else:
|
|
|
budget = ""
|
|
|
if id8 != "":
|
|
@@ -2416,7 +2426,7 @@ class ProductAttributesPredictor():
|
|
|
list_sentence = list_sentences[0]
|
|
|
list_entity = list_entitys[0]
|
|
|
_data = product_attrs[1]['demand_info']['data']
|
|
|
- re_bidding_time = re.compile("(采购时间|采购实施月份|采购月份)[::,].{0,2}$")
|
|
|
+ re_bidding_time = re.compile("(采购时间|采购实施月份|采购月份|采购日期)[::,].{0,2}$")
|
|
|
order_times = []
|
|
|
for entity in list_entity:
|
|
|
if entity.entity_type=='time':
|