|
@@ -1417,7 +1417,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
last_phone_mask = False
|
|
|
continue
|
|
|
# 排除号码实体为时间格式 ,例如:20150515
|
|
|
- if re.search("^20(1[0-9]|2[0-2])(0[1-9]|1[012])(0[1-9]|[1-2][0-9]|3[01])$",item[0]):
|
|
|
+ if re.search("^20(1[0-9]|2[0-5])(0[1-9]|1[012])(0[1-9]|[1-2][0-9]|3[01])$",item[0]):
|
|
|
error_numStr_index.append(numStr_index)
|
|
|
last_phone_mask = False
|
|
|
continue
|
|
@@ -2964,9 +2964,12 @@ def turnMoneySource(moneysource):
|
|
|
|
|
|
my_time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
|
|
|
from BiddingKG.dl.ratio.re_ratio import getUnifyNum
|
|
|
-import time
|
|
|
-def my_timeFormat(_time):
|
|
|
- current_year = time.strftime("%Y",time.localtime())
|
|
|
+import time,datetime
|
|
|
+def my_timeFormat(_time,page_time):
|
|
|
+ if page_time:
|
|
|
+ current_year = time.strftime("%Y",time.localtime(int(datetime.datetime.strptime(page_time, '%Y-%m-%d').timestamp())))
|
|
|
+ else:
|
|
|
+ current_year = time.strftime("%Y",time.localtime())
|
|
|
all_match = re.finditer(my_time_format_pattern,_time)
|
|
|
time_list = []
|
|
|
for _match in all_match:
|
|
@@ -2986,10 +2989,10 @@ def my_timeFormat(_time):
|
|
|
if re.search("^\d+$", year):
|
|
|
if len(year) == 2:
|
|
|
year = "20" + year
|
|
|
- if int(year) - int(current_year) > 5:
|
|
|
+ if int(year) - int(current_year) > 5 or int(year) - int(current_year) < -1:
|
|
|
legal = False
|
|
|
else:
|
|
|
- if int(year) - int(current_year)>10:
|
|
|
+ if int(year) - int(current_year)>10 or int(year) - int(current_year) < -1:
|
|
|
legal = False
|
|
|
else:
|
|
|
_year = ""
|
|
@@ -3035,7 +3038,7 @@ def my_timeFormat(_time):
|
|
|
time_list.append("%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0")))
|
|
|
return time_list
|
|
|
|
|
|
-def getTimeAttributes(list_entity,list_sentence):
|
|
|
+def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
time_entitys = [i for i in list_entity if i.entity_type=='time']
|
|
|
time_entitys = sorted(time_entitys,key=lambda x:(x.sentence_index, x.begin_index))
|
|
|
list_sentence = sorted(list_sentence,key=lambda x:x.sentence_index)
|
|
@@ -3104,24 +3107,29 @@ def getTimeAttributes(list_entity,list_sentence):
|
|
|
last_time_type = ""
|
|
|
entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
|
|
|
entity_left2 = sentence_text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
|
|
|
- entity_left3 = sentence_text[max(0, entity.wordOffset_begin - 20):entity.wordOffset_begin]
|
|
|
+ entity_left3 = sentence_text[max(0, entity.wordOffset_begin - 25):entity.wordOffset_begin]
|
|
|
entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
|
|
|
label_prob = entity.values[entity.label]
|
|
|
entity_text = entity.entity_text
|
|
|
in_attachment = entity.in_attachment
|
|
|
- extract_time = my_timeFormat(entity_text)
|
|
|
+ extract_time = my_timeFormat(entity_text,page_time)
|
|
|
# print(entity_text,entity_left2)
|
|
|
- # definite_time = "00:00:00"
|
|
|
if extract_time:
|
|
|
definite_time_list = []
|
|
|
- t = re.compile("(北京时间)?(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{2})?[::分]?(?P<second>\d{2})?秒?")
|
|
|
+ t = re.compile("(北京时间)?(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{1,2})?[::分]?(?P<second>\d{2})?秒?")
|
|
|
_entity_text = re.sub(" (?=[^\d])|(?<=[^\d]) ","",entity_text)
|
|
|
+ _entity_text_len = len(_entity_text)
|
|
|
+ _entity_text = _entity_text + sentence_text[entity.wordOffset_end:entity.wordOffset_end+20]
|
|
|
t_in_word_num = len(re.findall(t,_entity_text))
|
|
|
- t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,re.sub(" (?=[^\d])|(?<=[^\d]) ","",sentence_text[entity.wordOffset_end:]))
|
|
|
+ # t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,re.sub(" (?=[^\d])|(?<=[^\d]) ","",sentence_text[entity.wordOffset_end:]))
|
|
|
begin_index = 0
|
|
|
for _num in range(t_in_word_num):
|
|
|
+ if begin_index> _entity_text_len + 8:
|
|
|
+ break
|
|
|
t_in_word = re.search(t, _entity_text[begin_index:])
|
|
|
if t_in_word:
|
|
|
+ if _num==0 and t_in_word.start() > _entity_text_len + 8:
|
|
|
+ break
|
|
|
begin_index = t_in_word.end()
|
|
|
# print('t_in_word',entity_text,t_in_word.groupdict())
|
|
|
day = t_in_word.groupdict().get('day',"")
|
|
@@ -3151,35 +3159,35 @@ def getTimeAttributes(list_entity,list_sentence):
|
|
|
# print(definite_time)
|
|
|
definite_time_list.append(definite_time)
|
|
|
|
|
|
- if t_out_of_word:
|
|
|
- # print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
|
|
|
- day = t_out_of_word.groupdict().get('day', "")
|
|
|
- hour = t_out_of_word.groupdict().get('hour', "")
|
|
|
- half_hour = t_out_of_word.groupdict().get('half_hour', "")
|
|
|
- minute = t_out_of_word.groupdict().get('minute', "")
|
|
|
- second = t_out_of_word.groupdict().get('second', "")
|
|
|
- if hour:
|
|
|
- if day == '下午' and int(hour) < 12:
|
|
|
- hour = str(int(hour) + 12)
|
|
|
- if int(hour) > 24:
|
|
|
- continue
|
|
|
- else:
|
|
|
- hour = "00"
|
|
|
- if not minute:
|
|
|
- if half_hour:
|
|
|
- minute = "30"
|
|
|
- else:
|
|
|
- minute = "00"
|
|
|
- if int(minute) > 60:
|
|
|
- continue
|
|
|
- if not second:
|
|
|
- second = "00"
|
|
|
- if int(second) > 60:
|
|
|
- continue
|
|
|
- definite_time = "%s:%s:%s" % (hour.rjust(2, "0"), minute.rjust(2, "0"), second.rjust(2, "0"))
|
|
|
- # print(definite_time)
|
|
|
- definite_time_list.append(definite_time)
|
|
|
-
|
|
|
+ # if t_out_of_word:
|
|
|
+ # # print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
|
|
|
+ # day = t_out_of_word.groupdict().get('day', "")
|
|
|
+ # hour = t_out_of_word.groupdict().get('hour', "")
|
|
|
+ # half_hour = t_out_of_word.groupdict().get('half_hour', "")
|
|
|
+ # minute = t_out_of_word.groupdict().get('minute', "")
|
|
|
+ # second = t_out_of_word.groupdict().get('second', "")
|
|
|
+ # if hour:
|
|
|
+ # if day == '下午' and int(hour) < 12:
|
|
|
+ # hour = str(int(hour) + 12)
|
|
|
+ # if int(hour) > 24:
|
|
|
+ # continue
|
|
|
+ # else:
|
|
|
+ # hour = "00"
|
|
|
+ # if not minute:
|
|
|
+ # if half_hour:
|
|
|
+ # minute = "30"
|
|
|
+ # else:
|
|
|
+ # minute = "00"
|
|
|
+ # if int(minute) > 60:
|
|
|
+ # continue
|
|
|
+ # if not second:
|
|
|
+ # second = "00"
|
|
|
+ # if int(second) > 60:
|
|
|
+ # continue
|
|
|
+ # definite_time = "%s:%s:%s" % (hour.rjust(2, "0"), minute.rjust(2, "0"), second.rjust(2, "0"))
|
|
|
+ # # print(definite_time)
|
|
|
+ # definite_time_list.append(definite_time)
|
|
|
+ #
|
|
|
|
|
|
min_len = min(len(extract_time),len(definite_time_list))
|
|
|
for i in range(min_len):
|
|
@@ -3489,7 +3497,7 @@ def getTimeAttributes(list_entity,list_sentence):
|
|
|
return result_dict
|
|
|
|
|
|
|
|
|
-def getOtherAttributes(list_entity):
|
|
|
+def getOtherAttributes(list_entity,page_time):
|
|
|
dict_other = {"moneysource":"",
|
|
|
"person_review":[],
|
|
|
"serviceTime":"",
|
|
@@ -3535,7 +3543,7 @@ def getOtherAttributes(list_entity):
|
|
|
for _serviceTime in list_time:
|
|
|
# 优先取具体时间(20XX年x月x日-20XX年x月x日)
|
|
|
if re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾;;]{,4}20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",_serviceTime.entity_text):
|
|
|
- _extract_time = my_timeFormat(_serviceTime.entity_text)
|
|
|
+ _extract_time = my_timeFormat(_serviceTime.entity_text,page_time)
|
|
|
if _extract_time and len(_extract_time)==2:
|
|
|
# 排除开始和结束时间一样的错误模板,例:“履约期限:2023年02月15日至2023年02月15日”
|
|
|
if _extract_time[0]!=_extract_time[1]:
|
|
@@ -3570,7 +3578,7 @@ def getOtherAttributes(list_entity):
|
|
|
def getMoneyRange(RoleList):
|
|
|
pass
|
|
|
|
|
|
-def getPREMs(list_sentences,list_entitys,list_articles,list_outlines):
|
|
|
+def getPREMs(list_sentences,list_entitys,list_articles,list_outlines,page_time):
|
|
|
'''
|
|
|
@param:
|
|
|
list_sentence:所有文章的句子list
|
|
@@ -3581,7 +3589,7 @@ def getPREMs(list_sentences,list_entitys,list_articles,list_outlines):
|
|
|
for list_sentence,list_entity,list_article,list_outline in zip(list_sentences,list_entitys,list_articles,list_outlines):
|
|
|
RoleList = getPackageRoleMoney(list_sentence,list_entity,list_outline)
|
|
|
result.append(dict({"prem": RoleList, "docid": list_article.doc_id},
|
|
|
- **getTimeAttributes(list_entity, list_sentence),
|
|
|
+ **getTimeAttributes(list_entity, list_sentence,page_time),
|
|
|
**{"fingerprint": list_article.fingerprint,
|
|
|
"match_enterprise": list_article.match_enterprise,
|
|
|
"match_enterprise_type": list_article.match_enterprise_type,
|