|
@@ -1578,7 +1578,7 @@ class ProductAttributesPredictor():
|
|
|
for tr in trs:
|
|
|
tr_line = []
|
|
|
tds = tr.findChildren(['td', 'th'], recursive=False)
|
|
|
- if len(tds) < 3:
|
|
|
+ if len(tds) < 2:
|
|
|
continue
|
|
|
for td in tds:
|
|
|
td_text = re.sub('\s', '', td.get_text())
|
|
@@ -1662,8 +1662,38 @@ class ProductAttributesPredictor():
|
|
|
except:
|
|
|
num = 30
|
|
|
return str(num)
|
|
|
- def fix_time(self, text):
|
|
|
+ def fix_time(self, text, html, page_time):
|
|
|
'''输入日期字段返回格式化日期'''
|
|
|
+ if re.search('^\d{1,2}月$', text):
|
|
|
+ m = re.search('^(\d{1,2})月$', text).group(1)
|
|
|
+ if len(m) < 2:
|
|
|
+ m = '0' + m
|
|
|
+ year = re.search('(\d{4})年(.{,12}采购意向)?', html)
|
|
|
+ if year:
|
|
|
+ y = year.group(1)
|
|
|
+ num = self.get_monthlen(y, m)
|
|
|
+ if len(num) < 2:
|
|
|
+ num = '0' + num
|
|
|
+ order_begin = "%s.%s.01" % (y, m)
|
|
|
+ order_end = "%s.%s.%s" % (y, m, num)
|
|
|
+ elif page_time != "":
|
|
|
+ year = re.search('\d{4}', page_time)
|
|
|
+ if year:
|
|
|
+ y = year.group(0)
|
|
|
+ num = self.get_monthlen(y, m)
|
|
|
+ if len(num) < 2:
|
|
|
+ num = '0' + num
|
|
|
+ order_begin = "%s.%s.01" % (y, m)
|
|
|
+ order_end = "%s.%s.%s" % (y, m, num)
|
|
|
+ else:
|
|
|
+ y = str(datetime.datetime.now().year)
|
|
|
+ num = self.get_monthlen(y, m)
|
|
|
+ if len(num) < 2:
|
|
|
+ num = '0' + num
|
|
|
+ order_begin = "%s.%s.01" % (y, m)
|
|
|
+ order_end = "%s.%s.%s" % (y, m, num)
|
|
|
+ return order_begin, order_end
|
|
|
+
|
|
|
t1 = re.search('^(\d{4})(年|/|.|-)(\d{1,2})月?$', text)
|
|
|
if t1:
|
|
|
year = t1.group(1)
|
|
@@ -1797,6 +1827,7 @@ class ProductAttributesPredictor():
|
|
|
|
|
|
|
|
|
soup = BeautifulSoup(html, 'lxml')
|
|
|
+ flag_yx = True if re.search('采购意向', html) else False
|
|
|
tables = soup.find_all(['table'])
|
|
|
headers = []
|
|
|
headers_demand = []
|
|
@@ -1816,10 +1847,47 @@ class ProductAttributesPredictor():
|
|
|
i = 0
|
|
|
found_header = False
|
|
|
header_colnum = 0
|
|
|
+
|
|
|
+ if flag_yx:
|
|
|
+ col0_l = []
|
|
|
+ col1_l = []
|
|
|
+ for tds in inner_table:
|
|
|
+ if len(tds) == 2:
|
|
|
+ col0_l.append(re.sub(':', '', tds[0]))
|
|
|
+ col1_l.append(tds[1])
|
|
|
+ if len(set(col0_l) & self.header_set) > len(col0_l) * 0.2:
|
|
|
+ header_list2 = []
|
|
|
+ product = demand = budget = order_begin = order_end = ""
|
|
|
+ for i in range(len(col0_l)):
|
|
|
+ if re.search('项目名称', col0_l[i]):
|
|
|
+ header_list2.append(col0_l[i])
|
|
|
+ product = col1_l[i]
|
|
|
+ elif re.search('采购需求|需求概况', col0_l[i]):
|
|
|
+ header_list2.append(col0_l[i])
|
|
|
+ demand = col1_l[i]
|
|
|
+ elif re.search('采购预算|预算金额', col0_l[i]):
|
|
|
+ header_list2.append(col0_l[i])
|
|
|
+ budget = col1_l[i]
|
|
|
+ if '万元' in col0_l[i] and '万' not in budget:
|
|
|
+ budget += '万元'
|
|
|
+ budget = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", budget)
|
|
|
+ budget = str(getUnifyMoney(budget))
|
|
|
+ elif re.search('采购时间', col0_l[i]):
|
|
|
+ header_list2.append(col0_l[i])
|
|
|
+ order_time = col1_l[i].strip()
|
|
|
+ order_begin, order_end = self.fix_time(order_time, html, page_time)
|
|
|
+ if product!= "" and demand != "" and budget!="" and order_begin != "":
|
|
|
+ link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
|
|
|
+ 'order_begin': order_begin, 'order_end': order_end}
|
|
|
+ if link not in demand_link:
|
|
|
+ demand_link.append(link)
|
|
|
+ headers_demand.append('_'.join(header_list2))
|
|
|
+ continue
|
|
|
+
|
|
|
while i < (len(inner_table)):
|
|
|
tds = inner_table[i]
|
|
|
not_empty = [it for it in tds if it != ""]
|
|
|
- if len(set(not_empty)) < len(not_empty) * 0.5:
|
|
|
+ if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds)<2:
|
|
|
i += 1
|
|
|
continue
|
|
|
product = "" # 产品
|
|
@@ -1899,36 +1967,7 @@ class ProductAttributesPredictor():
|
|
|
if id8 != "":
|
|
|
if re.search('\w', tds[id8]):
|
|
|
order_time = tds[id8].strip()
|
|
|
- if re.search('^\d{1,2}月$', order_time):
|
|
|
- m = re.search('^(\d{1,2})月$', order_time).group(1)
|
|
|
- if len(m) < 2:
|
|
|
- m = '0'+m
|
|
|
- year = re.search('(\d{4})年(.{,12}采购意向)?', html)
|
|
|
- if year:
|
|
|
- y = year.group(1)
|
|
|
- num = self.get_monthlen(y, m)
|
|
|
- if len(num)<2:
|
|
|
- num = '0'+num
|
|
|
- order_begin = "%s.%s.01" % (y, m)
|
|
|
- order_end = "%s.%s.%s" % (y, m, num)
|
|
|
- elif page_time!="":
|
|
|
- year = re.search('\d{4}', page_time)
|
|
|
- if year:
|
|
|
- y = year.group(0)
|
|
|
- num = self.get_monthlen(y, m)
|
|
|
- if len(num) < 2:
|
|
|
- num = '0'+num
|
|
|
- order_begin = "%s.%s.01" % (y, m)
|
|
|
- order_end = "%s.%s.%s" % (y, m, num)
|
|
|
- else:
|
|
|
- y = str(datetime.datetime.now().year)
|
|
|
- num = self.get_monthlen(y, m)
|
|
|
- if len(num) < 2:
|
|
|
- num = '0'+num
|
|
|
- order_begin = "%s.%s.01" % (y, m)
|
|
|
- order_end = "%s.%s.%s" % (y, m, num)
|
|
|
- else:
|
|
|
- order_begin, order_end = self.fix_time(order_time)
|
|
|
+ order_begin, order_end = self.fix_time(order_time, html, page_time)
|
|
|
if quantity != "" or unitPrice != "" or brand != "" or specs != "":
|
|
|
link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
|
|
|
'brand': brand[:50], 'specs':specs}
|