|
@@ -3342,10 +3342,10 @@ def turnMoneySource(moneysource):
|
|
|
my_time_format_pattern = re.compile("((?:(?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*)?(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))(?:.?\d{1,2}:\d{2}(?::\d{2})?)?")
|
|
|
from BiddingKG.dl.ratio.re_ratio import getUnifyNum
|
|
|
def my_timeFormat(_time,page_time):
|
|
|
- if page_time:
|
|
|
- current_year = time.strftime("%Y",time.localtime(int(datetime.strptime(page_time, '%Y-%m-%d').timestamp())))
|
|
|
- else:
|
|
|
- current_year = time.strftime("%Y",time.localtime())
|
|
|
+ # if page_time:
|
|
|
+ # current_year = time.strftime("%Y",time.localtime(int(datetime.strptime(page_time, '%Y-%m-%d').timestamp())))
|
|
|
+ # else:
|
|
|
+ # current_year = time.strftime("%Y",time.localtime())
|
|
|
all_match = re.finditer(my_time_format_pattern,_time)
|
|
|
time_list = []
|
|
|
idx = 0
|
|
@@ -3374,11 +3374,11 @@ def my_timeFormat(_time,page_time):
|
|
|
if re.search("^\d+$", year):
|
|
|
if len(year) == 2:
|
|
|
year = "20" + year
|
|
|
- if int(year) - int(current_year) > 5 or int(year) - int(current_year) < -1:
|
|
|
- legal = False
|
|
|
- else:
|
|
|
- if int(year) - int(current_year)>10 or int(year) - int(current_year) < -1:
|
|
|
- legal = False
|
|
|
+ # if int(year) - int(current_year) > 5 or int(year) - int(current_year) < -1:
|
|
|
+ # legal = False
|
|
|
+ # else:
|
|
|
+ # if int(year) - int(current_year)>10 or int(year) - int(current_year) < -1:
|
|
|
+ # legal = False
|
|
|
else:
|
|
|
_year = ""
|
|
|
for word in year:
|
|
@@ -3423,7 +3423,7 @@ def my_timeFormat(_time,page_time):
|
|
|
time_list.append("%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0")))
|
|
|
if idx==1 and not global_year:
|
|
|
global_year = year
|
|
|
- return time_list
|
|
|
+ return time_list,global_year
|
|
|
|
|
|
def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
# from BiddingKG.dl.interface.htmlparser import get_childs
|
|
@@ -3501,8 +3501,21 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
'time_listingStart':"time_listingEnd",
|
|
|
'time_contractStart':"time_contractEnd"
|
|
|
}
|
|
|
- time_entitys = [[_entity,my_timeFormat(_entity.entity_text,page_time)] for _entity in time_entitys]
|
|
|
- time_entitys = [item for item in time_entitys if item[1]]
|
|
|
+ # time_entitys = [[_entity,my_timeFormat(_entity.entity_text,page_time)] for _entity in time_entitys]
|
|
|
+ new_time_entitys = []
|
|
|
+ year_list = []
|
|
|
+ if page_time:
|
|
|
+ year_list.append(page_time[:4])
|
|
|
+ for _entity in time_entitys:
|
|
|
+ _time_list,_year = my_timeFormat(_entity.entity_text,page_time)
|
|
|
+ if _time_list:
|
|
|
+ new_time_entitys.append([_entity,_time_list,_year])
|
|
|
+ year_list.append(_year)
|
|
|
+ year_list = [(y,year_list.count(y)) for y in year_list]
|
|
|
+ year_list.sort(key=lambda x:x[1],reverse=True)
|
|
|
+ most_year = year_list[0][0]
|
|
|
+ time_entitys = [item for item in new_time_entitys if int(item[2])-int(most_year)<=10 and int(item[2])-int(most_year)>=-1]
|
|
|
+
|
|
|
# print(time_entitys)
|
|
|
for entity_idx in range(len(time_entitys)):
|
|
|
entity = time_entitys[entity_idx][0]
|
|
@@ -3529,8 +3542,6 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
label_prob = entity.values[entity.label]
|
|
|
entity_text = entity.entity_text
|
|
|
in_attachment = entity.in_attachment
|
|
|
- # extract_time = my_timeFormat(entity_text,page_time)
|
|
|
- # print(entity_text,entity_left2)
|
|
|
if extract_time:
|
|
|
definite_time_list = []
|
|
|
t = re.compile("(北京时间)?(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{1,2})?[::分]?(?P<second>\d{2})?秒?")
|
|
@@ -4367,7 +4378,7 @@ def getOtherAttributes(list_entity,page_time,prem,channel_dic):
|
|
|
for _serviceTime in list_time:
|
|
|
# 优先取具体时间(20XX年x月x日-20XX年x月x日)
|
|
|
if re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾;;]{,4}20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",_serviceTime.entity_text):
|
|
|
- _extract_time = my_timeFormat(_serviceTime.entity_text,page_time)
|
|
|
+ _extract_time,_ = my_timeFormat(_serviceTime.entity_text,page_time)
|
|
|
if _extract_time and len(_extract_time)==2:
|
|
|
# 排除开始和结束时间一样的错误模板,例:“履约期限:2023年02月15日至2023年02月15日”
|
|
|
if _extract_time[0]!=_extract_time[1]:
|