|
@@ -1508,7 +1508,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
for one_phone in _phone:
|
|
|
PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
|
|
|
agency_phone.add(one_phone)
|
|
|
-
|
|
|
# 正则提取电话号码实体
|
|
|
# key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
|
|
|
phone = re.compile('1[3-9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
|
|
@@ -1530,6 +1529,9 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
code_entitys = [ent for ent in list_entity if ent.entity_type=='code']
|
|
|
for _sentence in list_sentence:
|
|
|
sentence_text = _sentence.sentence_text
|
|
|
+ # Mask over-long digit runs: every run of 50+ consecutive digits is replaced by an equal-length run of '#'
|
|
|
+ for _re in re.findall("\d{50,}",sentence_text):
|
|
|
+ sentence_text = sentence_text.replace(_re,"#"*len(_re))
|
|
|
in_attachment = _sentence.in_attachment
|
|
|
list_tokenbegin = []
|
|
|
begin = 0
|
|
@@ -1556,6 +1558,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
continue
|
|
|
res_set.add((i.group(), i.start(), i.end()))
|
|
|
res_set = sorted(list(res_set),key=lambda x:x[1])
|
|
|
+ # Keep only the first 200 matches (sorted by start offset) so abnormal input cannot make processing take too long
|
|
|
+ res_set = res_set[:200]
|
|
|
last_phone_mask = True
|
|
|
error_numStr_index = []
|
|
|
sentence_phone_list = []
|
|
@@ -2061,7 +2065,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
for _p in person_phone:
|
|
|
if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and per.entity_text not in winter_contact:
|
|
|
PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
|
|
|
-
|
|
|
re_split = re.compile("[^\u4e00-\u9fa5、](十一|十二|十三|十四|十五|一|二|三|四|五|六|七|八|九|十)、")
|
|
|
split_list = [0] * 16
|
|
|
split_dict = {
|
|
@@ -2418,7 +2421,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
|
prepare_link.append(after_entity)
|
|
|
last_person = after_entity
|
|
|
continue
|
|
|
-
|
|
|
# 统一同类角色的属性
|
|
|
for k in PackDict.keys():
|
|
|
for i in range(len(PackDict[k]["roleList"])):
|
|
@@ -3090,7 +3092,9 @@ def getTimeAttributes(list_entity,list_sentence):
|
|
|
'time_earnestMoneyStart': [], #10 保证金递交开始时间(保证金递交时间)
|
|
|
'time_earnestMoneyEnd': [] , # 11 保证金递交截止时间
|
|
|
'time_commencement':[] , #13 开工日期
|
|
|
- 'time_completion': [] # 14 竣工日期
|
|
|
+ 'time_completion': [], # 14 竣工日期
|
|
|
+ 'time_listingStart': [], # 15 挂牌开始日期(挂牌时间)
|
|
|
+ 'time_listingEnd': [] # 16 挂牌结束日期、挂牌截止日期
|
|
|
}
|
|
|
last_sentence_index = 0
|
|
|
last_time_type = ""
|
|
@@ -3101,22 +3105,49 @@ def getTimeAttributes(list_entity,list_sentence):
|
|
|
'time_registrationStart':"time_registrationEnd",
|
|
|
'time_earnestMoneyStart':"time_earnestMoneyEnd",
|
|
|
'time_commencement':"time_completion",
|
|
|
+ 'time_listingStart':"time_listingEnd"
|
|
|
}
|
|
|
for entity in time_entitys:
|
|
|
sentence_text = list_sentence[entity.sentence_index].sentence_text
|
|
|
entity_left = sentence_text[max(0, entity.wordOffset_begin - 2):entity.wordOffset_begin]
|
|
|
+ entity_left2 = sentence_text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
|
|
|
entity_right = sentence_text[entity.wordOffset_end:entity.wordOffset_end + 3]
|
|
|
label_prob = entity.values[entity.label]
|
|
|
entity_text = entity.entity_text
|
|
|
in_attachment = entity.in_attachment
|
|
|
extract_time = my_timeFormat(entity_text)
|
|
|
if extract_time:
|
|
|
+ # 2022/12/12: added regex rules that classify listing (挂牌) start / end times
|
|
|
+ if re.search("挂牌.{,4}(?:时间|日期)",entity_left2):
|
|
|
+ if re.search("挂牌.{,4}(?:时间|日期)",entity_left2).end()>len(entity_left2)/2:
|
|
|
+ if len(extract_time) == 1:
|
|
|
+ if re.search("挂牌.?(开始|起始).?(?:时间|日期)",entity_left2):
|
|
|
+ dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment))
|
|
|
+ last_time_type = 'time_listingStart'
|
|
|
+ elif re.search("挂牌.?(截[止至]|结束).?(?:时间|日期)",entity_left2):
|
|
|
+ dict_time['time_listingEnd'].append((extract_time[0], 0.5, in_attachment))
|
|
|
+ last_time_type = 'time_listingEnd'
|
|
|
+ elif re.search("挂牌.?(?:时间|日期)",entity_left2):
|
|
|
+ if re.search("前|止|截止",entity_right) or re.search("至|止|到",entity_left) or re.search("前",entity_text[-2:]):
|
|
|
+ dict_time['time_listingEnd'].append((extract_time[0], 0.5, in_attachment))
|
|
|
+ last_time_type = 'time_listingEnd'
|
|
|
+ else:
|
|
|
+ dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment))
|
|
|
+ last_time_type = 'time_listingStart'
|
|
|
+ else:
|
|
|
+ dict_time['time_listingStart'].append((extract_time[0], 0.5, in_attachment))
|
|
|
+ dict_time['time_listingEnd'].append((extract_time[1], 0.5, in_attachment))
|
|
|
+ last_time_type = ''
|
|
|
+ last_sentence_index = entity.sentence_index
|
|
|
+ continue
|
|
|
+
|
|
|
if re.search("至|到", entity_left):
|
|
|
if entity.sentence_index == last_sentence_index:
|
|
|
time_type = last_time_index.get(last_time_type)
|
|
|
if time_type:
|
|
|
dict_time[time_type].append((extract_time[0], 0.5 + label_prob / 10,in_attachment))
|
|
|
last_time_type = ""
|
|
|
+ last_sentence_index = entity.sentence_index
|
|
|
continue
|
|
|
if entity.label!=0:
|
|
|
if entity.label==1 and label_prob>0.5:
|