|
@@ -71,6 +71,27 @@ def str_to_num(s):
|
|
num = int(num)
|
|
num = int(num)
|
|
return num
|
|
return num
|
|
|
|
|
|
|
|
# A 4-digit year, optionally followed by a separator and a 1-2 digit month,
# optionally followed by a separator and a 1-2 digit day (with an optional
# trailing '日'). Compiled once at import time; raw string so that `\d` is
# not treated as a (invalid) string escape.
_DATE_PATTERN = re.compile(
    r'(?P<year>\d{4})([-年/.](?P<month>\d{1,2})([-月/.](?P<day>\d{1,2})日?)?)?'
)


def format_date(date_str):
    """Extract the first date found in *date_str* and join its parts with '-'.

    The year is mandatory; month and day are included only when present in
    the text. Digits are copied as written (no zero padding), so
    '2019年1月5日' becomes '2019-1-5' and '2019/03' becomes '2019-03'.

    Args:
        date_str: Text that may contain a date. Falsy values (None, '')
            are accepted and treated as "no date".

    Returns:
        'YYYY', 'YYYY-M' or 'YYYY-M-D' for the first match, or '' when the
        input is falsy or contains no 4-digit year.
    """
    if not date_str:
        return ''

    # Only the first date in the text is of interest.
    match = _DATE_PATTERN.search(date_str)
    if match is None:
        return ''

    # Groups that did not participate in the match are None; a day can only
    # be present when a month is, so dropping None keeps the parts in order.
    parts = (match.group('year'), match.group('month'), match.group('day'))
    return '-'.join(part for part in parts if part is not None)
|
|
|
|
+
|
|
|
|
def split_date(date_str):
    """Split a date-range string into normalized start and end dates.

    The text is split on runs of the range separators '—', '–', '~' and
    '至'. A run of consecutive separators (for example the doubled em dash
    '——') counts as a single delimiter, so such ranges are not discarded.

    Args:
        date_str: Text expected to hold two dates joined by a separator.

    Returns:
        A ``(start_date, end_date)`` tuple, each side passed through
        ``format_date``. Both are '' when the text does not split into
        exactly two parts.
    """
    # '+' collapses adjacent separator characters into one split point;
    # without it 'a——b' would produce three parts and be rejected below.
    parts = re.split(r"[—–~至]+", date_str)
    if len(parts) != 2:
        return '', ''

    start_str, end_str = parts
    return format_date(start_str), format_date(end_str)
|
|
|
|
|
|
def get_debt_info(html):
|
|
def get_debt_info(html):
|
|
_pd = Html2KVTree(html)
|
|
_pd = Html2KVTree(html)
|
|
@@ -85,6 +106,10 @@ def get_debt_info(html):
|
|
result_dic[k] = vl[0]
|
|
result_dic[k] = vl[0]
|
|
if k == 'district':
|
|
if k == 'district':
|
|
result_dic[k] = ''.join(vl)
|
|
result_dic[k] = ''.join(vl)
|
|
|
|
+ elif k == 'construction_period':
|
|
|
|
+ result_dic['construction_start'] , result_dic['construction_end'] = split_date(vl[0])
|
|
|
|
+ elif k == 'operation_period':
|
|
|
|
+ result_dic['operation_start'] , result_dic['operation_end'] = split_date(vl[0])
|
|
|
|
|
|
detail_dic = {}
|
|
detail_dic = {}
|
|
for k, v in release_details.items():
|
|
for k, v in release_details.items():
|
|
@@ -98,6 +123,8 @@ def get_debt_info(html):
|
|
|
|
|
|
for i in range(len(detail_dic['time_release'])):
|
|
for i in range(len(detail_dic['time_release'])):
|
|
dic = {k:detail_dic[k][i] for k in detail_dic if i < len(detail_dic[k]) and detail_dic[k][i] not in ['', '/', '—', 0]}
|
|
dic = {k:detail_dic[k][i] for k in detail_dic if i < len(detail_dic[k]) and detail_dic[k][i] not in ['', '/', '—', 0]}
|
|
|
|
+ if 'time_release' in dic:
|
|
|
|
+ dic['time_release'] = format_date(dic['time_release'])
|
|
detail_list.append(dic)
|
|
detail_list.append(dic)
|
|
|
|
|
|
for k, v in interest.items():
|
|
for k, v in interest.items():
|
|
@@ -107,6 +134,8 @@ def get_debt_info(html):
|
|
vl = [str_to_num(x) for x in vl]
|
|
vl = [str_to_num(x) for x in vl]
|
|
if vl and vl[0] not in ['', '/', '—', 0]:
|
|
if vl and vl[0] not in ['', '/', '—', 0]:
|
|
result_dic[k] = vl[0]
|
|
result_dic[k] = vl[0]
|
|
|
|
+ if k in ['recent_interest_date', 'value_date', 'date_due']:
|
|
|
|
+ result_dic[k] = format_date(vl[0])
|
|
|
|
|
|
result_dic['issue_details'] = detail_list
|
|
result_dic['issue_details'] = detail_list
|
|
# print('detail_dic: ', detail_dic)
|
|
# print('detail_dic: ', detail_dic)
|