Эх сурвалжийг харах

专项债提取日期格式化

lsm 2 сар өмнө
parent
commit
4a43058304

+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -512,7 +512,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2025-03-31'}
+    version_date = {'version_date': '2025-04-01'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:

+ 29 - 0
BiddingKG/dl/interface/special_debt_extract.py

@@ -71,6 +71,27 @@ def str_to_num(s):
         num = int(num)
     return num
 
# A 4-digit year, optionally followed by a month and then a day. Components may
# be separated by '-', '/', '.' or the Chinese markers 年 / 月; a trailing 日
# after the day is consumed but not required.
_DATE_PATTERN = re.compile(
    r'(?P<year>\d{4})([-年/.](?P<month>\d{1,2})([-月/.](?P<day>\d{1,2})日?)?)?'
)


def format_date(date_str):
    """Normalise the first date found in ``date_str`` to ``YYYY[-MM[-DD]]``.

    Only the first match in the string is used. The precision of the result
    follows the input: a bare year yields ``'2025'``, year+month yields
    ``'2025-04'`` and a full date yields ``'2025-04-01'``. Month and day are
    zero-padded to two digits so that results are uniform regardless of how
    the source text wrote them.

    :param date_str: text that may contain a date such as ``'2025年4月1日'``,
        ``'2025-04-01'``, ``'2025/4'`` or ``'2025.4.1'``.
    :return: the normalised date string, or ``''`` when ``date_str`` is empty,
        ``None`` or contains no 4-digit year.
    """
    if not date_str:
        return ''

    match = _DATE_PATTERN.search(date_str)
    if match is None:
        return ''

    # Going through int() pads single-digit components and also converts
    # non-ASCII decimal digits (e.g. full-width '４'), which ``\d`` accepts in
    # str patterns, into plain ASCII digits.
    parts = ['%04d' % int(match.group('year'))]
    month = match.group('month')
    if month is not None:
        parts.append('%02d' % int(month))
        # The day group is nested inside the month group in the pattern, so a
        # day can only be present when a month was matched.
        day = match.group('day')
        if day is not None:
            parts.append('%02d' % int(day))
    return '-'.join(parts)
+
def split_date(date_str):
    """Split a date-range string into normalised start and end dates.

    The range separator is a run of one or more of ``—``, ``–``, ``~``, ``～``
    or ``至``. A run is treated as a single separator so that the doubled em
    dash ``——`` still yields exactly two sides. The ASCII hyphen is
    deliberately not a separator because it is used inside dates themselves
    (``2025-04-01``).

    :param date_str: text such as ``'2020年1月至2025年12月'``.
    :return: ``(start_date, end_date)``, each produced by ``format_date``.
        Both are ``''`` when ``date_str`` is empty/``None`` or does not split
        into exactly two sides.
    """
    if not date_str:
        return '', ''

    parts = re.split(r'[—–~～至]+', date_str)
    if len(parts) != 2:
        # No separator, or more than one range in the text: ambiguous, so
        # report nothing rather than guess.
        return '', ''

    start_str, end_str = parts
    return format_date(start_str), format_date(end_str)
 
 def get_debt_info(html):
     _pd = Html2KVTree(html)
@@ -85,6 +106,10 @@ def get_debt_info(html):
             result_dic[k] = vl[0]
             if k == 'district':
                 result_dic[k] = ''.join(vl)
+            elif k == 'construction_period':
+                result_dic['construction_start'] , result_dic['construction_end'] = split_date(vl[0])
+            elif k == 'operation_period':
+                result_dic['operation_start'] , result_dic['operation_end'] = split_date(vl[0])
 
     detail_dic = {}
     for k, v in release_details.items():
@@ -98,6 +123,8 @@ def get_debt_info(html):
 
     for i in range(len(detail_dic['time_release'])):
         dic = {k:detail_dic[k][i] for k in detail_dic if i < len(detail_dic[k]) and detail_dic[k][i] not in ['', '/', '—', 0]}
+        if 'time_release' in dic:
+            dic['time_release'] = format_date(dic['time_release'])
         detail_list.append(dic)
 
     for k, v in interest.items():
@@ -107,6 +134,8 @@ def get_debt_info(html):
             vl = [str_to_num(x) for x in vl]
         if vl and vl[0] not in ['', '/', '—', 0]:
             result_dic[k] = vl[0]
+            if k in ['recent_interest_date', 'value_date', 'date_due']:
+                result_dic[k] = format_date(vl[0])
 
     result_dic['issue_details'] = detail_list
     # print('detail_dic: ', detail_dic)