|
@@ -3205,6 +3205,7 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
}
|
|
|
time_entitys = [[_entity,my_timeFormat(_entity.entity_text,page_time)] for _entity in time_entitys]
|
|
|
time_entitys = [item for item in time_entitys if item[1]]
|
|
|
+ # print(time_entitys)
|
|
|
for entity_idx in range(len(time_entitys)):
|
|
|
entity = time_entitys[entity_idx][0]
|
|
|
extract_time = time_entitys[entity_idx][1]
|
|
@@ -3241,14 +3242,16 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
t_in_word_num = len(re.findall(t,_entity_text))
|
|
|
# t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,re.sub(" (?=[^\d])|(?<=[^\d]) ","",sentence_text[entity.wordOffset_end:]))
|
|
|
begin_index = 0
|
|
|
+ definite_time_idx_list = []
|
|
|
for _num in range(t_in_word_num):
|
|
|
if begin_index> _entity_text_len + 8:
|
|
|
break
|
|
|
t_in_word = re.search(t, _entity_text[begin_index:])
|
|
|
+ # print(_entity_text[begin_index:])
|
|
|
if t_in_word:
|
|
|
if _num==0 and t_in_word.start() > _entity_text_len + 8:
|
|
|
break
|
|
|
- begin_index = t_in_word.end()
|
|
|
+ begin_index += t_in_word.end()
|
|
|
# print('t_in_word',entity_text,t_in_word.groupdict())
|
|
|
day = t_in_word.groupdict().get('day',"")
|
|
|
hour = t_in_word.groupdict().get('hour',"")
|
|
@@ -3276,13 +3279,30 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
definite_time = "%s:%s:%s"%(hour.rjust(2,"0"),minute.rjust(2,"0"),second.rjust(2,"0"))
|
|
|
# print(definite_time)
|
|
|
definite_time_list.append(definite_time)
|
|
|
-
|
|
|
- min_len = min(len(extract_time),len(definite_time_list))
|
|
|
- for i in range(min_len):
|
|
|
- if definite_time_list[i] == "24:00:00": # 修正不规范时间表述
|
|
|
- definite_time_list[i] = "23:59:59"
|
|
|
- if definite_time_list[i] != "00:00:00":
|
|
|
- extract_time[i] = extract_time[i] + " " + definite_time_list[i]
|
|
|
+ definite_time_idx_list.append([begin_index-len(t_in_word.group()),begin_index])
|
|
|
+
|
|
|
+ if len(extract_time)==1 and len(definite_time_list)>=2: # 实体只包含一个时间,"2024-12-09 09:00~16:00" 考虑单个时间对应两个详细时间段的识别
|
|
|
+ # 前两个详细时间的间隔
|
|
|
+ distance = definite_time_idx_list[1][0] - definite_time_idx_list[0][1]
|
|
|
+ if distance<=8 and int(definite_time_list[1][:2])>=int(definite_time_list[0][:2]): # 判断详细时间都‘小时’顺序从小到大
|
|
|
+ new_extract_time = []
|
|
|
+ for d_time in definite_time_list[:2]:
|
|
|
+ if d_time == "24:00:00": # 修正不规范时间表述
|
|
|
+ d_time = "23:59:59"
|
|
|
+ new_extract_time.append(extract_time[0] + " " + d_time)
|
|
|
+ extract_time = new_extract_time
|
|
|
+ else:
|
|
|
+ if definite_time_list[0] == "24:00:00": # 修正不规范时间表述
|
|
|
+ definite_time_list[0] = "23:59:59"
|
|
|
+ if definite_time_list[0] != "00:00:00":
|
|
|
+ extract_time[0] = extract_time[0] + " " + definite_time_list[0]
|
|
|
+ else:
|
|
|
+ min_len = min(len(extract_time),len(definite_time_list))
|
|
|
+ for i in range(min_len):
|
|
|
+ if definite_time_list[i] == "24:00:00": # 修正不规范时间表述
|
|
|
+ definite_time_list[i] = "23:59:59"
|
|
|
+ if definite_time_list[i] != "00:00:00":
|
|
|
+ extract_time[i] = extract_time[i] + " " + definite_time_list[i]
|
|
|
|
|
|
if extract_time:
|
|
|
# 时间变更prob优化
|