|
@@ -12,6 +12,8 @@ import os
|
|
|
from scipy.optimize import linear_sum_assignment
|
|
|
from BiddingKG.dl.interface.Entitys import Match
|
|
|
import numpy as np
|
|
|
+import time,calendar
|
|
|
+from datetime import datetime
|
|
|
|
|
|
def getTheRole(entity,role_list):
|
|
|
'''
|
|
@@ -3011,10 +3013,9 @@ def turnMoneySource(moneysource):
|
|
|
|
|
|
my_time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
|
|
|
from BiddingKG.dl.ratio.re_ratio import getUnifyNum
|
|
|
-import time,datetime
|
|
|
def my_timeFormat(_time,page_time):
|
|
|
if page_time:
|
|
|
- current_year = time.strftime("%Y",time.localtime(int(datetime.datetime.strptime(page_time, '%Y-%m-%d').timestamp())))
|
|
|
+ current_year = time.strftime("%Y",time.localtime(int(datetime.strptime(page_time, '%Y-%m-%d').timestamp())))
|
|
|
else:
|
|
|
current_year = time.strftime("%Y",time.localtime())
|
|
|
all_match = re.finditer(my_time_format_pattern,_time)
|
|
@@ -3086,6 +3087,20 @@ def my_timeFormat(_time,page_time):
|
|
|
return time_list
|
|
|
|
|
|
def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
+ # from BiddingKG.dl.interface.htmlparser import get_childs
|
|
|
+ # document_tree = parse_document.tree
|
|
|
+ # new_document_tree = []
|
|
|
+ # _data_i = -1
|
|
|
+ # while _data_i < len(document_tree) - 1:
|
|
|
+ # _data_i += 1
|
|
|
+ # _data = document_tree[_data_i]
|
|
|
+ # _type = _data["type"]
|
|
|
+ # if _type == "sentence":
|
|
|
+ # if _data["sentence_title"] is not None:
|
|
|
+ # new_document_tree.append(_data)
|
|
|
+ # document_tree = new_document_tree
|
|
|
+
|
|
|
+
|
|
|
time_entitys = [i for i in list_entity if i.entity_type=='time']
|
|
|
time_entitys = sorted(time_entitys,key=lambda x:(x.sentence_index, x.begin_index))
|
|
|
list_sentence = sorted(list_sentence,key=lambda x:x.sentence_index)
|
|
@@ -3147,19 +3162,34 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
'time_listingStart':"time_listingEnd",
|
|
|
'time_contractStart':"time_contractEnd"
|
|
|
}
|
|
|
- for entity in time_entitys:
|
|
|
+ time_entitys = [[_entity,my_timeFormat(_entity.entity_text,page_time)] for _entity in time_entitys]
|
|
|
+ time_entitys = [item for item in time_entitys if item[1]]
|
|
|
+ for entity_idx in range(len(time_entitys)):
|
|
|
+ entity = time_entitys[entity_idx][0]
|
|
|
+ extract_time = time_entitys[entity_idx][1]
|
|
|
sentence_text = list_sentence[entity.sentence_index].sentence_text
|
|
|
+ previous_entity = time_entitys[entity_idx-1][0] if entity_idx!=0 else None
|
|
|
+ previous_extract_time = time_entitys[entity_idx-1][1] if entity_idx!=0 else None
|
|
|
+ next_entity = time_entitys[entity_idx+1][0] if entity_idx!=len(time_entitys)-1 else None
|
|
|
+ next_extract_time = time_entitys[entity_idx+1][1] if entity_idx!=len(time_entitys)-1 else None
|
|
|
+ # 实体有效上下文
|
|
|
+ entity_context_begin = previous_entity.wordOffset_end if previous_entity and previous_entity.sentence_index==entity.sentence_index else 0
|
|
|
+ entity_context_end = next_entity.wordOffset_begin if next_entity and next_entity.sentence_index==entity.sentence_index else len(sentence_text)
|
|
|
+
|
|
|
if entity.sentence_index!=last_sentence_index:
|
|
|
# sentence_index 不同句子重置last_time_type
|
|
|
last_time_type = ""
|
|
|
- entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
|
|
|
- entity_left2 = sentence_text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
|
|
|
- entity_left3 = sentence_text[max(0, entity.wordOffset_begin - 25):entity.wordOffset_begin]
|
|
|
- entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
|
|
|
+ entity_left = sentence_text[max(entity_context_begin, entity.wordOffset_begin - 2):entity.wordOffset_begin]
|
|
|
+ entity_left2 = sentence_text[max(entity_context_begin, entity.wordOffset_begin - 10):entity.wordOffset_begin]
|
|
|
+ entity_left3 = sentence_text[max(entity_context_begin, entity.wordOffset_begin - 30):entity.wordOffset_begin]
|
|
|
+ entity_right = sentence_text[entity.wordOffset_end:min(entity.wordOffset_end + 3,entity_context_end)]
|
|
|
+ entity_right2 = sentence_text[entity.wordOffset_end:entity_context_end]
|
|
|
+ entity_right2 = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",'',entity_right2)[:60] # 去除网址
|
|
|
+ # print(entity.entity_text,entity_right2)
|
|
|
label_prob = entity.values[entity.label]
|
|
|
entity_text = entity.entity_text
|
|
|
in_attachment = entity.in_attachment
|
|
|
- extract_time = my_timeFormat(entity_text,page_time)
|
|
|
+ # extract_time = my_timeFormat(entity_text,page_time)
|
|
|
# print(entity_text,entity_left2)
|
|
|
if extract_time:
|
|
|
definite_time_list = []
|
|
@@ -3208,6 +3238,8 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
|
|
|
min_len = min(len(extract_time),len(definite_time_list))
|
|
|
for i in range(min_len):
|
|
|
+ if definite_time_list[i] == "24:00:00": # 修正不规范时间表述
|
|
|
+ definite_time_list[i] = "23:59:59"
|
|
|
if definite_time_list[i] != "00:00:00":
|
|
|
extract_time[i] = extract_time[i] + " " + definite_time_list[i]
|
|
|
|
|
@@ -3228,13 +3260,13 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
# 优化多个并列的时间,如:开标时间和截标时间,截标时间和报名结束时间
|
|
|
if entity.label in [2,3,9]:
|
|
|
if entity.label==2 and re.search("截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止|文件.{,2}([递提]交|接收)",entity_left3):
|
|
|
- dict_time['time_bidclose'].append((extract_time[0], label_prob, in_attachment))
|
|
|
+ dict_time['time_bidclose'].append((extract_time[0], label_prob-0.1, in_attachment))
|
|
|
if entity.label==3 and re.search("开标|(评审|比选).{,2}(?:开始)?(时间|日期)|选取.{,2}(时间|日期)",entity_left3):
|
|
|
- dict_time['time_bidopen'].append((extract_time[0], label_prob, in_attachment))
|
|
|
+ dict_time['time_bidopen'].append((extract_time[0], label_prob-0.1, in_attachment))
|
|
|
if entity.label==3 and re.search("报名",entity_left3):
|
|
|
dict_time['time_registrationEnd'].append((extract_time[0], 0.5, in_attachment))
|
|
|
if entity.label==9 and re.search("截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止|文件.{,2}([递提]交|接收)",entity_left3):
|
|
|
- dict_time['time_bidclose'].append((extract_time[0], label_prob, in_attachment))
|
|
|
+ dict_time['time_bidclose'].append((extract_time[0], label_prob-0.1, in_attachment))
|
|
|
if entity.label in [11, 3]:
|
|
|
if entity.label==11 and re.search("文件.{,2}([递提]交|接收)|截标|投标.{,2}截止|([递提]交|接收)(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
|
|
|
dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
|
|
@@ -3254,6 +3286,73 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
else:
|
|
|
dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
|
|
|
dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment))
|
|
|
+ # 获取文件/报名/报价 时间补充(上下文表达过长无法通过模型识别)
|
|
|
+ # if entity.label == 0:
|
|
|
+ # if re.search("(获取|领取|售卖|出售|购买|下载).{,4}(招标|投标|采购)?(文件|标书)|(文件|标书).{,4}(获取|售卖|出售|发售|购买)", entity_left3):
|
|
|
+ # if len(extract_time)==2:
|
|
|
+ # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ # dict_time['time_getFileEnd'].append((extract_time[1], 0.51, in_attachment))
|
|
|
+ # else:
|
|
|
+ # if next_entity and next_entity.sentence_index==entity.sentence_index:
|
|
|
+ # mid_text = sentence_text[entity.wordOffset_end:next_entity.wordOffset_begin]
|
|
|
+ # if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(next_extract_time)==1:
|
|
|
+ # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ # dict_time['time_getFileEnd'].append((next_extract_time[0], 0.51, in_attachment))
|
|
|
+ # if not dict_time['time_getFileEnd']:
|
|
|
+ # if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
|
|
|
+ # dict_time['time_getFileEnd'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ # elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
|
|
|
+ # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ # if re.search("(进行|在线|线下|线上|网上).{,2}报名|报名.{,2}(开始)?(时间|日期)", entity_left3):
|
|
|
+ # if len(extract_time)==2:
|
|
|
+ # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ # dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment))
|
|
|
+ # else:
|
|
|
+ # if next_entity and next_entity.sentence_index==entity.sentence_index:
|
|
|
+ # mid_text = sentence_text[entity.wordOffset_end:next_entity.wordOffset_begin]
|
|
|
+ # if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(next_extract_time)==1:
|
|
|
+ # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ # dict_time['time_registrationEnd'].append((next_extract_time[0], 0.51, in_attachment))
|
|
|
+ # if not dict_time['time_registrationEnd']:
|
|
|
+ # if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
|
|
|
+ # dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ # elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
|
|
|
+ # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ #
|
|
|
+ # if re.search("(获取|售卖|出售|购买).{,4}(招标|投标|采购)?(文件|标书)|(文件|标书).{,4}(获取|售卖|出售|发售|购买)", entity_right2):
|
|
|
+ # if len(extract_time)==2:
|
|
|
+ # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ # dict_time['time_getFileEnd'].append((extract_time[1], 0.51, in_attachment))
|
|
|
+ # else:
|
|
|
+ # if previous_entity and previous_entity.sentence_index==entity.sentence_index:
|
|
|
+ # mid_text = sentence_text[previous_entity.wordOffset_end:entity.wordOffset_begin]
|
|
|
+ # if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(previous_extract_time)==1:
|
|
|
+ # dict_time['time_getFileStart'].append((previous_extract_time[0], 0.51, in_attachment))
|
|
|
+ # dict_time['time_getFileEnd'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ # if not dict_time['time_getFileEnd']:
|
|
|
+ # if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
|
|
|
+ # dict_time['time_getFileEnd'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ # elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
|
|
|
+ # dict_time['time_getFileStart'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ # if re.search("(进行|在线|线下).{,2}报名", entity_right2):
|
|
|
+ # if len(extract_time) == 2:
|
|
|
+ # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ # dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment))
|
|
|
+ # else:
|
|
|
+ # if previous_entity and previous_entity.sentence_index==entity.sentence_index:
|
|
|
+ # mid_text = sentence_text[previous_entity.wordOffset_end:entity.wordOffset_begin]
|
|
|
+ # if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(previous_extract_time)==1:
|
|
|
+ # dict_time['time_registrationStart'].append((previous_extract_time[0], 0.51, in_attachment))
|
|
|
+ # dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ # if not dict_time['time_registrationEnd']:
|
|
|
+ # if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
|
|
|
+ # dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ # elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
|
|
|
+ # dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ # if re.search("(进行|开始).{,4}(报价|投标|竞价)", entity_right2):
|
|
|
+ # if len(extract_time) == 2:
|
|
|
+ # dict_time['time_bidstart'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ # # dict_time['time_bidclose'].append((extract_time[1], 0.51, in_attachment))
|
|
|
|
|
|
# 补充公告末尾处的发布时间
|
|
|
if entity.label==0:
|
|
@@ -3304,6 +3403,8 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
else:
|
|
|
dict_time['time_contractStart'].append((extract_time[0], 0.5, in_attachment))
|
|
|
last_time_type = 'time_contractStart'
|
|
|
+ last_sentence_index = entity.sentence_index
|
|
|
+ continue
|
|
|
else:
|
|
|
if re.search("(?:合同|服务|履约|(合同|服务)履行)(?:期限?|有效期)|(?:服务|履约|(合同|服务)履行)(?:时间|日期|周期)|服务[时年]限|合同周期", entity_left2):
|
|
|
# 排除开始和结束时间一样的错误模板,例:“履约期限:2023年02月15日至2023年02月15日”
|
|
@@ -3311,6 +3412,8 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
dict_time['time_contractStart'].append((extract_time[0], 0.6, in_attachment))
|
|
|
dict_time['time_contractEnd'].append((extract_time[1], 0.6, in_attachment))
|
|
|
last_time_type = ''
|
|
|
+ last_sentence_index = entity.sentence_index
|
|
|
+ continue
|
|
|
# 服务期限表达补充
|
|
|
if entity.label==0:
|
|
|
re_service = '合同期限|工期/交货期/服务期|工期\(交货期\)|合格工期|服务期限|工期' \
|
|
@@ -3328,7 +3431,7 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
dict_time['time_contractStart'].append((extract_time[0], 0.5, in_attachment))
|
|
|
dict_time['time_contractEnd'].append((extract_time[1], 0.5, in_attachment))
|
|
|
last_time_type = ''
|
|
|
- # 报价/投标时间补充
|
|
|
+ # 报价/投标时间补充(规则补充)
|
|
|
if entity.label == 0:
|
|
|
if re.search("[报竞]价.{,2}(开始|起始).{,2}(时间|日期)",entity_left2):
|
|
|
entity.label = 12
|
|
@@ -3351,6 +3454,26 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
elif re.search("(竞价|报价).?(时间|日期)",entity_left3) and re.search("参与|报价|有意",entity_left2):
|
|
|
entity.label = 12
|
|
|
label_prob = 0.501
|
|
|
+ # 文档结构补充
|
|
|
+ # if entity.label == 0:
|
|
|
+ # re_registration = re.compile("报名|(文件|标书)[\u4e00-\u9fa5、]{,4}(获取|出售|售卖|购买|下载)|"
|
|
|
+ # "(获取|出售|售卖|购买|下载)[\u4e00-\u9fa5、]{,4}(文件|标书)")
|
|
|
+ # _data_i = -1
|
|
|
+ # while _data_i < len(document_tree) - 1:
|
|
|
+ # _data_i += 1
|
|
|
+ # _data = document_tree[_data_i]
|
|
|
+ # _type = _data["type"]
|
|
|
+ # _text = _data["text"].strip()
|
|
|
+ # childs = get_childs([_data])
|
|
|
+ # last_child = childs[-1]
|
|
|
+ # if entity.sentence_index>=_data.sentence_index and entity.wordOffset_begin>=_data.wordOffset_begin and
|
|
|
+ # ():
|
|
|
+ # if re.search(re_registration, re.split("[::;;,]", _text)[0][:20]) is not None:
|
|
|
+ #
|
|
|
+ # content_text = ""
|
|
|
+ # for c in childs:
|
|
|
+ # content_text += c["text"] + ""
|
|
|
+ # print('concat_text', content_text)
|
|
|
|
|
|
|
|
|
if re.search("至|到|[日\d][-—]$|[~~]", entity_left):
|
|
@@ -3496,6 +3619,88 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
last_time_type = ""
|
|
|
last_sentence_index = entity.sentence_index
|
|
|
|
|
|
+ # 通过文档分析树形结构补充部分时间实体
|
|
|
+ def add_time_by_parseDocument(dict_time,parse_document):
|
|
|
+ from BiddingKG.dl.interface.htmlparser import get_childs
|
|
|
+ document_tree = parse_document.tree
|
|
|
+ # if not dict_time['time_getFileStart'] or not dict_time['time_getFileEnd']:
|
|
|
+ # time_pattern = re.compile("")
|
|
|
+
|
|
|
+ concat_text_list = []
|
|
|
+ if not dict_time['time_registrationStart'] or not dict_time['time_registrationEnd']:
|
|
|
+ re_registration = re.compile("报名|(文件|标书)[\u4e00-\u9fa5、]{,4}(获取|出售|售卖|购买|下载)|"
|
|
|
+ "(获取|出售|售卖|购买|下载)[\u4e00-\u9fa5、]{,4}(文件|标书)")
|
|
|
+ _data_i = -1
|
|
|
+ while _data_i < len(document_tree) - 1:
|
|
|
+ _data_i += 1
|
|
|
+ _data = document_tree[_data_i]
|
|
|
+ _type = _data["type"]
|
|
|
+ _text = _data["text"].strip()
|
|
|
+ # print(_data.keys())
|
|
|
+ if _type == "sentence":
|
|
|
+ print('_text:',_text,_data["sentence_title"])
|
|
|
+ if _data["sentence_title"] is not None:
|
|
|
+ print("aptitude_pattern", _text)
|
|
|
+ print(_data['sentence_index'],_data['wordOffset_begin'],_data['wordOffset_end'])
|
|
|
+ if re.search(re_registration, re.split("[::;;。]",_text)[0][:15]) is not None:
|
|
|
+ childs = get_childs([_data])
|
|
|
+ concat_text = ""
|
|
|
+ for c in childs:
|
|
|
+ concat_text += c["text"] + ""
|
|
|
+ print('concat_text',concat_text)
|
|
|
+ concat_text_list.append(concat_text)
|
|
|
+ _data_i += len(childs)-1
|
|
|
+ # if _type == "table":
|
|
|
+ # list_table = _data["list_table"]
|
|
|
+ # parent_title = _data["parent_title"]
|
|
|
+ # if list_table is not None:
|
|
|
+ # for line in list_table[:2]:
|
|
|
+ # for cell_i in range(len(line)):
|
|
|
+ # cell = line[cell_i]
|
|
|
+ # cell_text = cell[0]
|
|
|
+ # if len(cell_text) > 120 and re.search(re_registration, cell_text) is not None:
|
|
|
+ # concat_text += cell_text + "\n"
|
|
|
+ print('_text',concat_text_list)
|
|
|
+ for text in concat_text_list:
|
|
|
+ time_list = re.finditer(my_time_format_pattern,text)
|
|
|
+ time_list = [(i,my_timeFormat(i.group(),page_time)) for i in time_list]
|
|
|
+ for time_idx in range(len(time_list)):
|
|
|
+ _time = time_list[time_idx][0]
|
|
|
+ extract_time = time_list[time_idx][1]
|
|
|
+ entity_left = text[:_time.start()]
|
|
|
+ entity_left = re.split("[。;;!??]",entity_left)[-1]
|
|
|
+ # entity_left2 = sentence_text[
|
|
|
+ # max(entity_context_begin, entity.wordOffset_begin - 10):entity.wordOffset_begin]
|
|
|
+ # entity_left3 = sentence_text[
|
|
|
+ # max(entity_context_begin, entity.wordOffset_begin - 30):entity.wordOffset_begin]
|
|
|
+ entity_right = text[_time.end():]
|
|
|
+ entity_right = re.split("[。;;!??]",entity_right)[0]
|
|
|
+ # entity_right2 = sentence_text[entity.wordOffset_end:entity_context_end]
|
|
|
+ entity_right2 = re.sub(r"(http[s]?://)?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F])){6,}",
|
|
|
+ '', entity_right)[:60] # 去除网址
|
|
|
+ print('entity_right2',entity_right2)
|
|
|
+ if re.search("(进行|在线|线下).{,2}报名", entity_right2):
|
|
|
+ print('报名text',entity_right2)
|
|
|
+ if len(extract_time) == 2:
|
|
|
+ dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ dict_time['time_registrationEnd'].append((extract_time[1], 0.51, in_attachment))
|
|
|
+ else:
|
|
|
+ if previous_entity and previous_entity.sentence_index==entity.sentence_index:
|
|
|
+ mid_text = sentence_text[previous_entity.wordOffset_end:entity.wordOffset_begin]
|
|
|
+ if len(mid_text)<=10 and re.search("至|到|[-—]|[~~]",mid_text) and len(previous_extract_time)==1:
|
|
|
+ dict_time['time_registrationStart'].append((previous_extract_time[0], 0.51, in_attachment))
|
|
|
+ dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ if not dict_time['time_registrationEnd']:
|
|
|
+ if re.search("前|止|截止", entity_right) or re.search("前",entity_text[-2:]):
|
|
|
+ dict_time['time_registrationEnd'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+ elif re.search("起|开?始", entity_right) or re.search("起",entity_text[-2:]):
|
|
|
+ dict_time['time_registrationStart'].append((extract_time[0], 0.51, in_attachment))
|
|
|
+
|
|
|
+
|
|
|
+ return dict_time
|
|
|
+
|
|
|
+ # dict_time = add_time_by_parseDocument(dict_time,parse_document)
|
|
|
+
|
|
|
# print(dict_time)
|
|
|
result_dict = dict((key,"") for key in dict_time.keys())
|
|
|
for time_type,value in dict_time.items():
|
|
@@ -3504,7 +3709,7 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
for in_attachment in [False,True]:
|
|
|
_list_time = [_time for _time in list_time if _time[2]==in_attachment]
|
|
|
if _list_time:
|
|
|
- _list_time.sort(key=lambda x:x[1],reverse=True)
|
|
|
+ _list_time.sort(key=lambda x:(x[1],len(x[0])),reverse=True) # sort_key: label_prob,时间文本长度(优先有具体时分秒的)
|
|
|
if in_attachment==True and len(result_dict[time_type])>0:
|
|
|
break
|
|
|
result_dict[time_type] = _list_time[0][0]
|
|
@@ -3527,8 +3732,199 @@ def getTimeAttributes(list_entity,list_sentence,page_time):
|
|
|
|
|
|
return result_dict
|
|
|
|
|
|
def get_days_between(day1,day2,get_abs=0):
    """Return the number of days from ``day1`` to ``day2``.

    :param day1: start date as a ``'%Y-%m-%d'`` string (normally the earlier one)
    :param day2: end date as a ``'%Y-%m-%d'`` string (normally the later one)
    :param get_abs: when truthy, return the absolute value of the difference
    :return: ``day2 - day1`` in whole days; negative when ``day2`` precedes
        ``day1`` unless ``get_abs`` is set
    """
    date_format = '%Y-%m-%d'
    # Parse both strings (day1 first, then day2) into datetime objects.
    start, end = (datetime.strptime(day, date_format) for day in (day1, day2))
    gap = (end - start).days
    return abs(gap) if get_abs else gap
|
|
|
+
|
|
|
def extract_serviceTime(service_time,page_time):
    """Parse a free-text service period into start/end dates or a day count.

    Chinese numerals in ``service_time`` are first rewritten as Arabic digits.
    The text is then matched, in order of priority, against:

    1. full dates (``YYYY年M月D日`` and the ``-`` / ``.`` / ``/`` variants);
    2. year-month dates, each taken as the last day of that month;
    3. durations such as ``3个月`` or ``天数:90``, converted to days
       (month = 30, year = 365, week = 7; multiples of 360 days are rescaled
       to the same multiple of 365);
    4. the literal ``半年`` (180 days).

    For the date forms only the first two accepted dates are used. An end date
    is kept only when it lies more than one day after ``page_time``; with two
    dates the second must also be later than the first.

    :param service_time: raw text describing the service period
    :param page_time: text containing a ``20YY-MM-DD`` date; when it is empty,
        ``None`` or contains no such date, ``2000-01-01`` is used instead
    :return: dict with the keys ``service_start``, ``service_end`` and
        ``service_days``; each value is a ``'%Y-%m-%d'`` string, a ``"<n>天"``
        string, or ``""`` when nothing usable was found
    """
    pattern1 = re.compile(r"\d{4}[年\-\./]\d{1,2}[月\-\./]\d{1,2}日?")
    pattern2 = re.compile(r"\d+(?:\.\d+)?[\((]?个?[^\d]?[^\d]?(?:日|天|周年|整年|学?年|月|周|日历[天日]|工作[天日])")
    pattern3 = re.compile(r"\d{4}[年\-\./]\d{1,2}月?")
    pattern4 = re.compile(r"(?:日|天|周年|年|月|周|日历[天日]|工作[天日]|星期)[^\d]{1,3}\d+(?:\.\d+)?")
    DigitsDic = {"零":0, "壹":1, "贰":2, "叁":3, "肆":4, "伍":5, "陆":6, "柒":7, "捌":8, "玖":9,
                 "〇":0, "一":1, "二":2, "三":3, "四":4, "五":5, "六":6, "七":7, "八":8, "九":9,
                 "两":2, '貮': 2}

    def get_month_days(year, month):
        # calendar.monthrange(year, month) returns (weekday of the first day,
        # number of days in the month); only the day count is needed here.
        _, last_day = calendar.monthrange(year, month)
        return last_day

    def get_num(text):
        # Convert the first run of Chinese numerals in `text` (with 十/百/千
        # style units) to an int; returns "" when the run is malformed.
        CN_UNIT = {'十': 10,'拾': 10,'百': 100,
                   '佰': 100,'千': 1000,'仟': 1000}

        regex = re.compile(r'[〇一二三四五六七八九零壹贰叁肆伍陆柒捌玖貮两十拾百佰千仟]+')
        text = regex.search(text)
        if text:
            text = text.group()
        else:
            return ""
        result = 0
        result_list = []
        unit = 0
        control = 0
        for i, d in enumerate(text):
            # A numeral cannot start with zero or with a bare 百/千 unit.
            if d in '零百佰千仟' and i == 0:
                return ""
            if d in DigitsDic:
                result += DigitsDic[d]
            elif d in CN_UNIT:
                if unit == 0:
                    unit_1 = CN_UNIT[d]
                    # This handling mainly accounts for numbers shaped like
                    # 二十三亿五千万.
                    if result == 0:
                        result = CN_UNIT[d]
                    else:
                        result *= CN_UNIT[d]
                    unit = CN_UNIT[d]
                    result_1 = result
                elif unit > CN_UNIT[d]:
                    # Smaller unit than the previous one: scale only the digit
                    # that immediately precedes it.
                    result -= DigitsDic[text[i - 1]]
                    result += DigitsDic[text[i - 1]] * CN_UNIT[d]
                    unit = CN_UNIT[d]
                elif unit <= CN_UNIT[d]:
                    if (CN_UNIT[d] < unit_1) and (len(result_list) == control):
                        result_list.append(result_1)
                        result = (result - result_1) * CN_UNIT[d]
                        control += 1
                    else:
                        result *= CN_UNIT[d]
                    unit = CN_UNIT[d]
                    if len(result_list) == control:
                        unit_1 = unit
                        result_1 = result
            else:
                return ""

        return sum(result_list) + result

    serviceTime_dict = {"service_start": "", "service_end": "", "service_days": ""}

    def fill_range(dates, reference_day):
        # Store start/end from the accepted dates. An empty list is a no-op:
        # indexing dates[0] there used to raise IndexError.
        if len(dates) >= 2:
            if get_days_between(reference_day, dates[1]) > 1 and get_days_between(dates[0], dates[1]) > 0:
                serviceTime_dict['service_end'] = dates[1]
                serviceTime_dict['service_start'] = dates[0]
        elif dates:
            if get_days_between(reference_day, dates[0]) > 1:
                serviceTime_dict['service_end'] = dates[0]

    # Rewrite Chinese numerals as Arabic digits, one run at a time. Runs are
    # processed left to right, so the first remaining occurrence is the run.
    re_num = re.findall(r'[〇一二三四五六七八九零壹贰叁肆伍陆柒捌玖貮两十拾百佰千仟]+',service_time)
    for _num in re_num:
        if not re.search("[十拾百佰千仟]",_num):
            # No unit characters: map digit by digit (e.g. 二〇二三 -> 2023).
            num = "".join(str(DigitsDic.get(word,word)) for word in _num)
        else:
            num = str(get_num(_num))
        service_time = service_time.replace(_num,num,1)

    service_days = 0
    # `page_time or ""` keeps a None page_time from raising TypeError in
    # re.search; a missing page time falls back to 2000-01-01.
    re_page_time = re.search(r"20\d{2}-\d{2}-\d{2}", page_time or "")
    page_time = re_page_time.group() if re_page_time else "2000-01-01"

    if pattern1.search(service_time):
        time_list = []
        for _time in pattern1.findall(service_time):
            _time = re.sub("日","",_time)
            _time = re.sub(r"[年月\./]","-",_time)
            _year,_month,_day = _time.split("-")
            _month = int(_month)
            _day = int(_day)
            _year = int(_year)
            if _year>2050 or _year<=2000 or _month>12 or _month<=0 or _day<=0 or _day>31:
                continue
            # isValidDate is defined elsewhere in this file; presumably it
            # rejects impossible calendar dates such as 02-30 - TODO confirm.
            if isValidDate(_year,_month,_day):
                time_list.append("%d-%02d-%02d" % (_year,_month,_day))
        fill_range(time_list, page_time)
    elif pattern3.search(service_time):
        time_list = []
        for _time in pattern3.findall(service_time):
            _time = re.sub("月","",_time)
            _time = re.sub(r"[年\./]","-",_time)
            _year,_month = _time.split("-")
            _month = int(_month)
            _year = int(_year)
            if _year>2050 or _year<=2000 or _month>12 or _month<=0:
                continue
            # A year-month is taken as the last day of that month.
            _day = get_month_days(_year,_month)
            if isValidDate(_year, _month, _day):
                time_list.append("%d-%02d-%02d" % (_year,_month,_day))
        fill_range(time_list, page_time)
    elif pattern2.search(service_time) or pattern4.search(service_time):
        for pattern in (pattern2, pattern4):
            unit = 1
            match = pattern.findall(service_time)
            # Only trust the text when it states exactly one distinct duration.
            if len(set(match)) != 1:
                continue
            match_text = match[0]
            if "月" in match_text:
                unit = 30
            elif "年" in match_text:
                unit = 365
            elif "周" in match_text or "星期" in match_text:
                unit = 7
            # Read the decimal part too, so "1.5年" is 1.5 years and not 1.
            match_num = float(re.search(r"\d+(?:\.\d+)?",match_text).group())
            # A number divisible by 365 is taken to be a day count already.
            if match_num % 365 == 0:
                unit = 1
            if unit==365:
                if match_num>10:  # unit is years: drop implausibly large numbers
                    match_num = 0
            elif unit==30:
                if match_num>60:  # unit is months: drop implausibly large numbers
                    match_num = 0
            elif unit==1:
                if match_num>4000:  # unit is days: drop implausibly large numbers
                    match_num = 0
            service_days = match_num * unit
            # Multiples of 360 days (e.g. 12 x 30-day months) become the same
            # multiple of 365.
            if int(service_days) % 360==0:
                service_days = service_days / 360 * 365
            service_days = int(service_days)
            # Out-of-range totals are discarded. The original joined these two
            # tests with `and`, which can never be true.
            if service_days <= 1 or service_days > 4000:
                service_days = 0

            if service_days>0:
                serviceTime_dict['service_days'] = str(service_days) + "天"
                break
    elif "半年" in service_time:
        serviceTime_dict['service_days'] = str(180) + "天"

    return serviceTime_dict
|
|
|
|
|
|
-def getOtherAttributes(list_entity,page_time):
|
|
|
+def getOtherAttributes(list_entity,page_time,prem):
|
|
|
dict_other = {"moneysource":"",
|
|
|
"person_review":[],
|
|
|
"serviceTime":"",
|
|
@@ -3553,7 +3949,7 @@ def getOtherAttributes(list_entity,page_time):
|
|
|
# print(entity.entity_text)
|
|
|
# if list_serviceTime and entity.in_attachment:
|
|
|
# continue
|
|
|
- if re.search("[^之]日|天|年|月|周|星期", entity.entity_text) or re.search("\d{4}[\-\./]\d{1,2}", entity.entity_text):
|
|
|
+ if re.search("[^之]日|天|年|月|周|星期", entity.entity_text) or re.search("\d{4}[-./]\d{1,2}", entity.entity_text):
|
|
|
list_serviceTime.append(entity)
|
|
|
elif entity.entity_type=="person" and entity.label ==4:
|
|
|
dict_other["person_review"].append(entity.entity_text)
|
|
@@ -3562,14 +3958,25 @@ def getOtherAttributes(list_entity,page_time):
|
|
|
elif entity.entity_type=='money' and entity.notes=='总投资' and float(dict_other["total_tendereeMoney"])<float(entity.entity_text):
|
|
|
dict_other["total_tendereeMoney"] = str(Decimal(entity.entity_text))
|
|
|
dict_other["total_tendereeMoneyUnit"] = entity.money_unit
|
|
|
- if list_serviceTime:
|
|
|
+
|
|
|
+ time_contractEnd = prem[0].get("time_contractEnd","")[:10]
|
|
|
+ time_contractStart = prem[0].get("time_contractStart","")[:10]
|
|
|
+ serviceTime_dict = {"service_start":"", "service_end":"", "service_days": ""}
|
|
|
+ if time_contractEnd:
|
|
|
+ serviceTime_dict['service_end'] = time_contractEnd
|
|
|
+ if time_contractStart:
|
|
|
+ if get_days_between(time_contractStart,time_contractEnd)>0:
|
|
|
+ serviceTime_dict['service_start'] = time_contractStart
|
|
|
+ # print([i.entity_text for i in list_serviceTime])
|
|
|
+ if list_serviceTime and not serviceTime_dict['service_end']:
|
|
|
list_serviceTime_inAtt = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==1]
|
|
|
list_serviceTime = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==0]
|
|
|
# if not list_serviceTime:
|
|
|
# list_serviceTime = list_serviceTime_inAtt
|
|
|
error_serviceTime = []
|
|
|
for list_time in [list_serviceTime,list_serviceTime_inAtt]:
|
|
|
- if not dict_other["serviceTime"]:
|
|
|
+ # if not dict_other["serviceTime"]:
|
|
|
+ if not serviceTime_dict['service_end']:
|
|
|
list_time.sort(key=lambda x: (x.prob,-x.sentence_index,-x.begin_index), reverse=True)
|
|
|
for _serviceTime in list_time:
|
|
|
# 优先取具体时间(20XX年x月x日-20XX年x月x日)
|
|
@@ -3578,28 +3985,49 @@ def getOtherAttributes(list_entity,page_time):
|
|
|
if _extract_time and len(_extract_time)==2:
|
|
|
# 排除开始和结束时间一样的错误模板,例:“履约期限:2023年02月15日至2023年02月15日”
|
|
|
if _extract_time[0]!=_extract_time[1]:
|
|
|
- dict_other["serviceTime"] = _serviceTime.entity_text
|
|
|
+ # dict_other["serviceTime"] = _serviceTime.entity_text
|
|
|
+ # extract_time = extract_serviceTime(_serviceTime.entity_text)
|
|
|
+ # if extract_time['service_end']:
|
|
|
+ serviceTime_dict['service_start'] = _extract_time[0]
|
|
|
+ serviceTime_dict['service_end'] = _extract_time[1]
|
|
|
break
|
|
|
else:
|
|
|
error_serviceTime.append(_serviceTime.entity_text)
|
|
|
- if not dict_other["serviceTime"]:
|
|
|
+ # if not dict_other["serviceTime"]:
|
|
|
+ if not serviceTime_dict['service_end']:
|
|
|
for _serviceTime in list_time:
|
|
|
# 优先取具体时间(20XX年x月-20XX年x月)
|
|
|
if re.search("20\d{2}[年/.\-]\d{1,2}月?[^。\d半一二三四五六七八九十壹两叁贰肆伍陆柒捌玖拾;;]{,3}20\d{2}[年/.\-]\d{1,2}月?", _serviceTime.entity_text):
|
|
|
- dict_other["serviceTime"] = _serviceTime.entity_text
|
|
|
- break
|
|
|
- if not dict_other["serviceTime"]:
|
|
|
+ # dict_other["serviceTime"] = _serviceTime.entity_text
|
|
|
+ extract_time = extract_serviceTime(_serviceTime.entity_text,page_time)
|
|
|
+ if extract_time['service_end']:
|
|
|
+ serviceTime_dict = extract_time
|
|
|
+ break
|
|
|
+ # if not dict_other["serviceTime"]:
|
|
|
+ if not serviceTime_dict['service_end']:
|
|
|
for _serviceTime in list_time:
|
|
|
# 优先取具体时间(20XX年x月x日)
|
|
|
if re.search("20\d{2}[年/.\-]\d{1,2}[月/.\-]\d{1,2}日?",_serviceTime.entity_text):
|
|
|
if _serviceTime.entity_text not in error_serviceTime:
|
|
|
- dict_other["serviceTime"] = _serviceTime.entity_text
|
|
|
- break
|
|
|
- if not dict_other["serviceTime"]:
|
|
|
+ # dict_other["serviceTime"] = _serviceTime.entity_text
|
|
|
+ extract_time = extract_serviceTime(_serviceTime.entity_text,page_time)
|
|
|
+ if extract_time['service_end']:
|
|
|
+ serviceTime_dict = extract_time
|
|
|
+ break
|
|
|
+ # if not dict_other["serviceTime"]:
|
|
|
+ if not serviceTime_dict['service_end'] and not serviceTime_dict['service_days']:
|
|
|
for _serviceTime in list_time:
|
|
|
if _serviceTime.entity_text not in error_serviceTime:
|
|
|
- dict_other["serviceTime"] = _serviceTime.entity_text
|
|
|
- break
|
|
|
+ # dict_other["serviceTime"] = _serviceTime.entity_text
|
|
|
+ extract_time = extract_serviceTime(_serviceTime.entity_text,page_time)
|
|
|
+ if extract_time['service_end'] or extract_time['service_days']:
|
|
|
+ serviceTime_dict = extract_time
|
|
|
+ break
|
|
|
+ if serviceTime_dict['service_start'] and serviceTime_dict['service_end']:
|
|
|
+ service_days = get_days_between(serviceTime_dict['service_start'],serviceTime_dict['service_end'])
|
|
|
+ serviceTime_dict['service_days'] = str(service_days) + "天"
|
|
|
+ dict_other["serviceTime"] = serviceTime_dict
|
|
|
+
|
|
|
|
|
|
if dict_other['moneysource']:
|
|
|
dict_other['moneysource'] = turnMoneySource(dict_other['moneysource'])
|