|
@@ -1664,6 +1664,10 @@ class ProductAttributesPredictor():
|
|
|
return str(num)
|
|
|
def fix_time(self, text, html, page_time):
|
|
|
'''输入日期字段返回格式化日期'''
|
|
|
+ for it in [('十二', '12'),('十一', '11'),('十','10'),('九','9'),('八','8'),('七','7'),
|
|
|
+ ('六','6'),('五','5'),('四','4'),('三','3'),('二','2'),('一','1')]:
|
|
|
+ if it[0] in text:
|
|
|
+ text = text.replace(it[0], it[1])
|
|
|
if re.search('^\d{1,2}月$', text):
|
|
|
m = re.search('^(\d{1,2})月$', text).group(1)
|
|
|
if len(m) < 2:
|
|
@@ -1804,7 +1808,7 @@ class ProductAttributesPredictor():
|
|
|
elif re.search('预算', items[j]):
|
|
|
header_dic['预算'] = j
|
|
|
budget = items[j]
|
|
|
- elif re.search('时间', items[j]):
|
|
|
+ elif re.search('时间|采购实施月份|采购月份', items[j]):
|
|
|
header_dic['时间'] = j
|
|
|
order_time = items[j]
|
|
|
|
|
@@ -1836,7 +1840,7 @@ class ProductAttributesPredictor():
|
|
|
demand_link = []
|
|
|
for i in range(len(tables)-1, -1, -1):
|
|
|
table = tables[i]
|
|
|
- if table.parent.name == 'td' and len(table.find_all('td')) == 1:
|
|
|
+ if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
|
|
|
table.string = table.get_text()
|
|
|
table.name = 'turntable'
|
|
|
continue
|
|
@@ -1872,7 +1876,7 @@ class ProductAttributesPredictor():
|
|
|
budget += '万元'
|
|
|
budget = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", budget)
|
|
|
budget = str(getUnifyMoney(budget))
|
|
|
- elif re.search('采购时间', col0_l[i]):
|
|
|
+ elif re.search('采购时间|采购实施月份|采购月份', col0_l[i]):
|
|
|
header_list2.append(col0_l[i])
|
|
|
order_time = col1_l[i].strip()
|
|
|
order_begin, order_end = self.fix_time(order_time, html, page_time)
|