Ver Fonte

时间提取规则优化

znj há 3 semanas
pai
commit
c0e36d81ae
1 ficheiro alterado com 26 adições e 15 exclusões
  1. 26 15
      BiddingKG/dl/interface/getAttributes.py

+ 26 - 15
BiddingKG/dl/interface/getAttributes.py

@@ -3337,7 +3337,7 @@ def turnMoneySource(moneysource):
     else:
         return "其他资金"
 
-my_time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
+my_time_format_pattern = re.compile("((?:(?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*)?(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
 from BiddingKG.dl.ratio.re_ratio import getUnifyNum
 def my_timeFormat(_time,page_time):
     if page_time:
@@ -3346,8 +3346,11 @@ def my_timeFormat(_time,page_time):
         current_year = time.strftime("%Y",time.localtime())
     all_match = re.finditer(my_time_format_pattern,_time)
     time_list = []
+    idx = 0
+    global_year = ""
     for _match in all_match:
         if len(_match.group())>0:
+            idx += 1
             legal = True
             year = ""
             month = ""
@@ -3360,22 +3363,28 @@ def my_timeFormat(_time,page_time):
                 if k=="day":
                     day = v
             if year!="":
-                if re.search("^\d+$", year):
-                    if len(year) == 2:
-                        year = "20" + year
-                        if int(year) - int(current_year) > 5 or int(year) - int(current_year) < -1:
-                            legal = False
+                if year==None: # 例:5月18日
+                    if idx==2 and global_year: # 例:2025年5月14日-5月18日,第二个时间没年份
+                        year = global_year
                     else:
-                        if int(year) - int(current_year)>10 or int(year) - int(current_year) < -1:
-                            legal = False
+                        legal = False
                 else:
-                    _year = ""
-                    for word in year:
-                        if word == '0':
-                            _year += word
+                    if re.search("^\d+$", year):
+                        if len(year) == 2:
+                            year = "20" + year
+                            if int(year) - int(current_year) > 5 or int(year) - int(current_year) < -1:
+                                legal = False
                         else:
-                            _year += str(getDigitsDic(word))
-                    year = _year
+                            if int(year) - int(current_year)>10 or int(year) - int(current_year) < -1:
+                                legal = False
+                    else:
+                        _year = ""
+                        for word in year:
+                            if word == '0':
+                                _year += word
+                            else:
+                                _year += str(getDigitsDic(word))
+                        year = _year
             else:
                 legal = False
             if month!="":
@@ -3402,7 +3411,7 @@ def my_timeFormat(_time,page_time):
                         legal = False
             else:
                 legal = False
-            if not isValidDate(int(year),int(month),int(day)):
+            if legal and not isValidDate(int(year),int(month),int(day)):
                 legal = False
             if legal:
                 # 数字字符格式化
@@ -3410,6 +3419,8 @@ def my_timeFormat(_time,page_time):
                 month = str(int(month))
                 day = str(int(day))
                 time_list.append("%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0")))
+                if idx==1 and not global_year:
+                    global_year = year
     return time_list
 
 def getTimeAttributes(list_entity,list_sentence,page_time):