Selaa lähdekoodia

时间提取规则优化

znj 5 päivää sitten
vanhempi
commit
4edb2b0689
1 muutettua tiedostoa jossa 26 lisäystä ja 15 poistoa
  1. + 26 - 15
      BiddingKG/dl/interface/getAttributes.py

+ 26 - 15
BiddingKG/dl/interface/getAttributes.py

@@ -3342,10 +3342,10 @@ def turnMoneySource(moneysource):
 my_time_format_pattern = re.compile("((?:(?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*)?(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))(?:.?\d{1,2}:\d{2}(?::\d{2})?)?")
 from BiddingKG.dl.ratio.re_ratio import getUnifyNum
 def my_timeFormat(_time,page_time):
-    if page_time:
-        current_year = time.strftime("%Y",time.localtime(int(datetime.strptime(page_time, '%Y-%m-%d').timestamp())))
-    else:
-        current_year = time.strftime("%Y",time.localtime())
+    # if page_time:
+    #     current_year = time.strftime("%Y",time.localtime(int(datetime.strptime(page_time, '%Y-%m-%d').timestamp())))
+    # else:
+    #     current_year = time.strftime("%Y",time.localtime())
     all_match = re.finditer(my_time_format_pattern,_time)
     time_list = []
     idx = 0
@@ -3374,11 +3374,11 @@ def my_timeFormat(_time,page_time):
                     if re.search("^\d+$", year):
                         if len(year) == 2:
                             year = "20" + year
-                            if int(year) - int(current_year) > 5 or int(year) - int(current_year) < -1:
-                                legal = False
-                        else:
-                            if int(year) - int(current_year)>10 or int(year) - int(current_year) < -1:
-                                legal = False
+                            # if int(year) - int(current_year) > 5 or int(year) - int(current_year) < -1:
+                            #     legal = False
+                        # else:
+                        #     if int(year) - int(current_year)>10 or int(year) - int(current_year) < -1:
+                        #         legal = False
                     else:
                         _year = ""
                         for word in year:
@@ -3423,7 +3423,7 @@ def my_timeFormat(_time,page_time):
                 time_list.append("%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0")))
                 if idx==1 and not global_year:
                     global_year = year
-    return time_list
+    return time_list,global_year
 
 def getTimeAttributes(list_entity,list_sentence,page_time):
     # from BiddingKG.dl.interface.htmlparser import get_childs
@@ -3501,8 +3501,21 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
         'time_listingStart':"time_listingEnd",
         'time_contractStart':"time_contractEnd"
     }
-    time_entitys = [[_entity,my_timeFormat(_entity.entity_text,page_time)] for _entity in time_entitys]
-    time_entitys = [item for item in time_entitys if item[1]]
+    # time_entitys = [[_entity,my_timeFormat(_entity.entity_text,page_time)] for _entity in time_entitys]
+    new_time_entitys = []
+    year_list = []
+    if page_time:
+        year_list.append(page_time[:4])
+    for _entity in time_entitys:
+        _time_list,_year = my_timeFormat(_entity.entity_text,page_time)
+        if _time_list:
+            new_time_entitys.append([_entity,_time_list,_year])
+            year_list.append(_year)
+    year_list = [(y,year_list.count(y)) for y in year_list]
+    year_list.sort(key=lambda x:x[1],reverse=True)
+    most_year = year_list[0][0]
+    time_entitys = [item for item in new_time_entitys if int(item[2])-int(most_year)<=10 and int(item[2])-int(most_year)>=-1]
+
     # print(time_entitys)
     for entity_idx in range(len(time_entitys)):
         entity = time_entitys[entity_idx][0]
@@ -3529,8 +3542,6 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
         label_prob = entity.values[entity.label]
         entity_text = entity.entity_text
         in_attachment = entity.in_attachment
-        # extract_time = my_timeFormat(entity_text,page_time)
-        # print(entity_text,entity_left2)
         if extract_time:
             definite_time_list = []
             t = re.compile("(北京时间)?(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{1,2})?[::分]?(?P<second>\d{2})?秒?")
@@ -4367,7 +4378,7 @@ def getOtherAttributes(list_entity,page_time,prem,channel_dic):
                 for _serviceTime in list_time:
                     # 优先取具体时间(20XX年x月x日-20XX年x月x日)
                     if re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾;;]{,4}20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",_serviceTime.entity_text):
-                        _extract_time = my_timeFormat(_serviceTime.entity_text,page_time)
+                        _extract_time,_ = my_timeFormat(_serviceTime.entity_text,page_time)
                         if _extract_time and len(_extract_time)==2:
                             # 排除开始和结束时间一样的错误模板,例:“履约期限:2023年02月15日至2023年02月15日”
                             if _extract_time[0]!=_extract_time[1]: