|
@@ -2978,7 +2978,7 @@ def my_timeFormat(_time):
|
|
if re.search("^\d+$", year):
|
|
if re.search("^\d+$", year):
|
|
if len(year) == 2:
|
|
if len(year) == 2:
|
|
year = "20" + year
|
|
year = "20" + year
|
|
- if int(year) > int(current_year):
|
|
|
|
|
|
+ if int(year) - int(current_year) > 5:
|
|
legal = False
|
|
legal = False
|
|
else:
|
|
else:
|
|
if int(year) - int(current_year)>10:
|
|
if int(year) - int(current_year)>10:
|
|
@@ -3053,6 +3053,30 @@ def getTimeAttributes(list_entity,list_sentence):
|
|
'time_contractStart': [], # 18 合同开始时间
|
|
'time_contractStart': [], # 18 合同开始时间
|
|
'time_contractEnd': [] # 19 合同结束时间
|
|
'time_contractEnd': [] # 19 合同结束时间
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+ dict_time2label = {
|
|
|
|
+ "time_release": 1, # 1 发布时间
|
|
|
|
+ "time_bidopen": 2, # 2 开标时间
|
|
|
|
+ "time_bidclose": 3, # 3 截标时间
|
|
|
|
+ 'time_bidstart': 12, # 12 投标(开始)时间、响应文件接收(开始)时间
|
|
|
|
+
|
|
|
|
+ 'time_publicityStart': 4, # 4 公示开始时间(公示时间、公示期)
|
|
|
|
+ 'time_publicityEnd': 5, # 5 公示截止时间
|
|
|
|
+ 'time_getFileStart': 6, # 6 文件获取开始时间(文件获取时间)
|
|
|
|
+ 'time_getFileEnd': 7, # 7 文件获取截止时间
|
|
|
|
+ 'time_registrationStart': 8, # 8 报名开始时间(报名时间)
|
|
|
|
+ 'time_registrationEnd': 9, # 9 报名截止时间
|
|
|
|
+ 'time_earnestMoneyStart': 10, # 10 保证金递交开始时间(保证金递交时间)
|
|
|
|
+ 'time_earnestMoneyEnd': 11, # 11 保证金递交截止时间
|
|
|
|
+ 'time_commencement': 13, # 13 开工日期
|
|
|
|
+ 'time_completion': 14, # 14 竣工日期
|
|
|
|
+ 'time_listingStart': 15, # 15 挂牌开始日期(挂牌时间)
|
|
|
|
+ 'time_listingEnd': 16, # 16 挂牌结束日期、挂牌截止日期
|
|
|
|
+ 'time_signContract': 17, # 17 合同签订时间
|
|
|
|
+ 'time_contractStart': 18, # 18 合同开始时间
|
|
|
|
+ 'time_contractEnd': 19 # 19 合同结束时间
|
|
|
|
+ }
|
|
|
|
+
|
|
last_sentence_index = 0
|
|
last_sentence_index = 0
|
|
last_time_type = ""
|
|
last_time_type = ""
|
|
last_time_index = {
|
|
last_time_index = {
|
|
@@ -3067,6 +3091,9 @@ def getTimeAttributes(list_entity,list_sentence):
|
|
}
|
|
}
|
|
for entity in time_entitys:
|
|
for entity in time_entitys:
|
|
sentence_text = list_sentence[entity.sentence_index].sentence_text
|
|
sentence_text = list_sentence[entity.sentence_index].sentence_text
|
|
|
|
+ if entity.sentence_index!=last_sentence_index:
|
|
|
|
+ # sentence_index 不同句子重置last_time_type
|
|
|
|
+ last_time_type = ""
|
|
entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
|
|
entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
|
|
entity_left2 = sentence_text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
|
|
entity_left2 = sentence_text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
|
|
entity_left3 = sentence_text[max(0, entity.wordOffset_begin - 20):entity.wordOffset_begin]
|
|
entity_left3 = sentence_text[max(0, entity.wordOffset_begin - 20):entity.wordOffset_begin]
|
|
@@ -3079,37 +3106,42 @@ def getTimeAttributes(list_entity,list_sentence):
|
|
# definite_time = "00:00:00"
|
|
# definite_time = "00:00:00"
|
|
if extract_time:
|
|
if extract_time:
|
|
definite_time_list = []
|
|
definite_time_list = []
|
|
- t = re.compile("(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{2})?[::分]?(?P<second>\d{2})?秒?")
|
|
|
|
- t_in_word = re.search(t,entity_text.replace(" ",""))
|
|
|
|
- t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,sentence_text[entity.wordOffset_end:])
|
|
|
|
- if t_in_word:
|
|
|
|
- # print('t_in_word',entity_text,t_in_word.groupdict())
|
|
|
|
- day = t_in_word.groupdict().get('day',"")
|
|
|
|
- hour = t_in_word.groupdict().get('hour',"")
|
|
|
|
- half_hour = t_in_word.groupdict().get('half_hour',"")
|
|
|
|
- minute = t_in_word.groupdict().get('minute',"")
|
|
|
|
- second = t_in_word.groupdict().get('second',"")
|
|
|
|
- if hour:
|
|
|
|
- if day=='下午' and int(hour)<12:
|
|
|
|
- hour = str(int(hour)+12)
|
|
|
|
- if int(hour)>24:
|
|
|
|
- continue
|
|
|
|
- else:
|
|
|
|
- hour = "00"
|
|
|
|
- if not minute:
|
|
|
|
- if half_hour:
|
|
|
|
- minute = "30"
|
|
|
|
|
|
+ t = re.compile("(北京时间)?(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{2})?[::分]?(?P<second>\d{2})?秒?")
|
|
|
|
+ _entity_text = re.sub(" (?=[^\d])|(?<=[^\d]) ","",entity_text)
|
|
|
|
+ t_in_word_num = len(re.findall(t,_entity_text))
|
|
|
|
+ t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,re.sub(" (?=[^\d])|(?<=[^\d]) ","",sentence_text[entity.wordOffset_end:]))
|
|
|
|
+ begin_index = 0
|
|
|
|
+ for _num in range(t_in_word_num):
|
|
|
|
+ t_in_word = re.search(t, _entity_text[begin_index:])
|
|
|
|
+ if t_in_word:
|
|
|
|
+ begin_index = t_in_word.end()
|
|
|
|
+ # print('t_in_word',entity_text,t_in_word.groupdict())
|
|
|
|
+ day = t_in_word.groupdict().get('day',"")
|
|
|
|
+ hour = t_in_word.groupdict().get('hour',"")
|
|
|
|
+ half_hour = t_in_word.groupdict().get('half_hour',"")
|
|
|
|
+ minute = t_in_word.groupdict().get('minute',"")
|
|
|
|
+ second = t_in_word.groupdict().get('second',"")
|
|
|
|
+ if hour:
|
|
|
|
+ if day=='下午' and int(hour)<12:
|
|
|
|
+ hour = str(int(hour)+12)
|
|
|
|
+ if int(hour)>24:
|
|
|
|
+ continue
|
|
else:
|
|
else:
|
|
- minute = "00"
|
|
|
|
- if int(minute)>60:
|
|
|
|
- continue
|
|
|
|
- if not second:
|
|
|
|
- second = "00"
|
|
|
|
- if int(second)>60:
|
|
|
|
- continue
|
|
|
|
- definite_time = "%s:%s:%s"%(hour.rjust(2,"0"),minute.rjust(2,"0"),second.rjust(2,"0"))
|
|
|
|
- # print(definite_time)
|
|
|
|
- definite_time_list.append(definite_time)
|
|
|
|
|
|
+ hour = "00"
|
|
|
|
+ if not minute:
|
|
|
|
+ if half_hour:
|
|
|
|
+ minute = "30"
|
|
|
|
+ else:
|
|
|
|
+ minute = "00"
|
|
|
|
+ if int(minute)>60:
|
|
|
|
+ continue
|
|
|
|
+ if not second:
|
|
|
|
+ second = "00"
|
|
|
|
+ if int(second)>60:
|
|
|
|
+ continue
|
|
|
|
+ definite_time = "%s:%s:%s"%(hour.rjust(2,"0"),minute.rjust(2,"0"),second.rjust(2,"0"))
|
|
|
|
+ # print(definite_time)
|
|
|
|
+ definite_time_list.append(definite_time)
|
|
|
|
|
|
if t_out_of_word:
|
|
if t_out_of_word:
|
|
# print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
|
|
# print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
|
|
@@ -3154,16 +3186,34 @@ def getTimeAttributes(list_entity,list_sentence):
|
|
last_index = item.start() + 1
|
|
last_index = item.start() + 1
|
|
label_prob = label_prob - 0.2 * last_index / len(entity_left2)
|
|
label_prob = label_prob - 0.2 * last_index / len(entity_left2)
|
|
# print('prob优化',label_prob,extract_time)
|
|
# print('prob优化',label_prob,extract_time)
|
|
|
|
+ elif re.search("改正|更正|修正|更改|延期",entity_left2):
|
|
|
|
+ new_label = dict_time2label.get(last_time_type,None)
|
|
|
|
+ if new_label and entity.label==0:
|
|
|
|
+ entity.label = new_label
|
|
|
|
+ label_prob = 1
|
|
|
|
+
|
|
# 优化多个并列的时间,如:开标时间和截标时间,截标时间和报名结束时间
|
|
# 优化多个并列的时间,如:开标时间和截标时间,截标时间和报名结束时间
|
|
if entity.label in [2,3,9]:
|
|
if entity.label in [2,3,9]:
|
|
- if entity.label==2 and re.search("截标|投标.{,2}截止|递交(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
|
|
|
|
|
|
+ if entity.label==2 and re.search("截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止|文件.{,2}([递提]交|接收)",entity_left3):
|
|
dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
|
|
dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
|
|
if entity.label==3 and re.search("开标|评审.{,2}(?:开始)?时间|选取.{,2}时间",entity_left3):
|
|
if entity.label==3 and re.search("开标|评审.{,2}(?:开始)?时间|选取.{,2}时间",entity_left3):
|
|
dict_time['time_bidopen'].append((extract_time[0], 0.5, in_attachment))
|
|
dict_time['time_bidopen'].append((extract_time[0], 0.5, in_attachment))
|
|
if entity.label==3 and re.search("报名",entity_left3):
|
|
if entity.label==3 and re.search("报名",entity_left3):
|
|
dict_time['time_registrationEnd'].append((extract_time[0], 0.5, in_attachment))
|
|
dict_time['time_registrationEnd'].append((extract_time[0], 0.5, in_attachment))
|
|
- if entity.label==9 and re.search("截标|投标.{,2}截止|递交(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
|
|
|
|
|
|
+ if entity.label==9 and re.search("截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止|文件.{,2}([递提]交|接收)",entity_left3):
|
|
|
|
+ dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
|
|
|
|
+ if entity.label in [11, 3]:
|
|
|
|
+ if entity.label==11 and re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
|
|
dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
|
|
dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
|
|
|
|
+ if entity.label==3 and re.search("保证金.{,2}(接受|收取)|(接受|收取).{,2}保证金",entity_left3):
|
|
|
|
+ dict_time['time_earnestMoneyEnd'].append((extract_time[0], 0.5, in_attachment))
|
|
|
|
+ if entity.label in [6, 7]:
|
|
|
|
+ if re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
|
|
|
|
+ dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
|
|
|
|
+ if entity.label==0:
|
|
|
|
+ if re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
|
|
|
|
+ dict_time['time_bidclose'].append((extract_time[0], 0.45, in_attachment))
|
|
|
|
+
|
|
# 补充公告末尾处的发布时间
|
|
# 补充公告末尾处的发布时间
|
|
if entity.label==0:
|
|
if entity.label==0:
|
|
if entity.is_tail:
|
|
if entity.is_tail:
|
|
@@ -3237,6 +3287,24 @@ def getTimeAttributes(list_entity,list_sentence):
|
|
dict_time['time_contractStart'].append((extract_time[0], 0.5, in_attachment))
|
|
dict_time['time_contractStart'].append((extract_time[0], 0.5, in_attachment))
|
|
dict_time['time_contractEnd'].append((extract_time[1], 0.5, in_attachment))
|
|
dict_time['time_contractEnd'].append((extract_time[1], 0.5, in_attachment))
|
|
last_time_type = ''
|
|
last_time_type = ''
|
|
|
|
+ # 报价/投标时间补充
|
|
|
|
+ if entity.label == 0:
|
|
|
|
+ if re.search("[报竞]价.{,2}(开始|起始).{,2}时间",entity_left2):
|
|
|
|
+ entity.label = 12
|
|
|
|
+ label_prob = 0.8
|
|
|
|
+ elif re.search("[报竞]价.{,2}起止.{,2}时间",entity_left2):
|
|
|
|
+ entity.label = 12
|
|
|
|
+ label_prob = 0.6
|
|
|
|
+ elif re.search("响应.{,2}文件([递提]交|接收).{,2}时间[::]|([递提]交|接收).{,2}响应.{,2}文件.{,2}时间[::]",entity_left2):
|
|
|
|
+ entity.label = 3
|
|
|
|
+ label_prob = 0.501
|
|
|
|
+ elif re.search("响应.{,2}文件([递提]交|接收).{,2}时间|([递提]交|接收).{,2}响应.{,2}文件.{,2}时间",entity_left2) and not re.search("截[止至]",entity_left2):
|
|
|
|
+ entity.label = 12
|
|
|
|
+ label_prob = 0.51
|
|
|
|
+ elif re.search("[报竞]价.{,2}截[止至].{,2}时间",entity_left2):
|
|
|
|
+ entity.label = 3
|
|
|
|
+ label_prob = 0.8
|
|
|
|
+
|
|
|
|
|
|
if re.search("至|到|[日\d][-—]$|[~~]", entity_left):
|
|
if re.search("至|到|[日\d][-—]$|[~~]", entity_left):
|
|
if entity.sentence_index == last_sentence_index:
|
|
if entity.sentence_index == last_sentence_index:
|
|
@@ -3254,8 +3322,13 @@ def getTimeAttributes(list_entity,list_sentence):
|
|
dict_time['time_bidopen'].append((extract_time[0],label_prob,in_attachment))
|
|
dict_time['time_bidopen'].append((extract_time[0],label_prob,in_attachment))
|
|
last_time_type = 'time_bidopen'
|
|
last_time_type = 'time_bidopen'
|
|
elif entity.label==3 and label_prob>0.5:
|
|
elif entity.label==3 and label_prob>0.5:
|
|
- dict_time['time_bidclose'].append((extract_time[0],label_prob,in_attachment))
|
|
|
|
- last_time_type = 'time_bidclose'
|
|
|
|
|
|
+ if len(extract_time)==1:
|
|
|
|
+ dict_time['time_bidclose'].append((extract_time[0],label_prob,in_attachment))
|
|
|
|
+ last_time_type = 'time_bidclose'
|
|
|
|
+ elif len(extract_time)==2:
|
|
|
|
+ dict_time['time_bidstart'].append((extract_time[0], 0.5, in_attachment))
|
|
|
|
+ dict_time['time_bidclose'].append((extract_time[1], label_prob, in_attachment))
|
|
|
|
+ last_time_type = 'time_bidclose'
|
|
elif entity.label==12 and label_prob>0.5:
|
|
elif entity.label==12 and label_prob>0.5:
|
|
if len(extract_time)==1:
|
|
if len(extract_time)==1:
|
|
if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
|
|
if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
|
|
@@ -3388,6 +3461,23 @@ def getTimeAttributes(list_entity,list_sentence):
|
|
if in_attachment==True and len(result_dict[time_type])>0:
|
|
if in_attachment==True and len(result_dict[time_type])>0:
|
|
break
|
|
break
|
|
result_dict[time_type] = _list_time[0][0]
|
|
result_dict[time_type] = _list_time[0][0]
|
|
|
|
+ # result_dict 纠错
|
|
|
|
+ if not result_dict['time_bidclose']:
|
|
|
|
+ if result_dict['time_bidstart']: # 无截标时间,投标开始和开标时间一样
|
|
|
|
+ if result_dict['time_bidstart'][:10] in result_dict['time_bidopen']:
|
|
|
|
+ result_dict['time_bidstart'] = ""
|
|
|
|
+ result_dict['time_bidclose'] = result_dict['time_bidopen']
|
|
|
|
+ if not result_dict['time_bidclose']:
|
|
|
|
+ if result_dict['time_getFileEnd']: # 无截标时间,获取文件截止时间和开标时间一样
|
|
|
|
+ if result_dict['time_getFileEnd'][:10] in result_dict['time_bidopen']:
|
|
|
|
+ result_dict['time_bidclose'] = result_dict['time_bidopen']
|
|
|
|
+ else:
|
|
|
|
+ if result_dict['time_bidopen']: # 截标时间 和 开标时间 时分秒互补
|
|
|
|
+ if len(result_dict['time_bidclose'])<len(result_dict['time_bidopen']) and result_dict['time_bidclose'] in result_dict['time_bidopen']:
|
|
|
|
+ result_dict['time_bidclose'] = result_dict['time_bidopen']
|
|
|
|
+ elif len(result_dict['time_bidclose'])>len(result_dict['time_bidopen']) and result_dict['time_bidopen'] in result_dict['time_bidclose']:
|
|
|
|
+ result_dict['time_bidopen'] = result_dict['time_bidclose']
|
|
|
|
+
|
|
return result_dict
|
|
return result_dict
|
|
|
|
|
|
|
|
|