|
@@ -1384,7 +1384,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
error_numStr_index.append(numStr_index)
|
|
error_numStr_index.append(numStr_index)
|
|
last_phone_mask = False
|
|
last_phone_mask = False
|
|
continue
|
|
continue
|
|
- if re.search("身份证号?码?|注册[证号]|帐号|编[号码]|报价|标价|证号|价格|型号|附件|代码|列号|行号|税号|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", re.sub(",","",phone_left)):
|
|
|
|
|
|
+ if re.search("身份证号?码?|注册[证号]|帐号|编[号码]|报价|费率|标价|证号|价格|型号|附件|代码|列号|行号|税号|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", re.sub(",","",phone_left)):
|
|
error_numStr_index.append(numStr_index)
|
|
error_numStr_index.append(numStr_index)
|
|
last_phone_mask = False
|
|
last_phone_mask = False
|
|
continue
|
|
continue
|
|
@@ -1528,7 +1528,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
# print("预测数据:",len(temp_data))
|
|
# print("预测数据:",len(temp_data))
|
|
# 去重结果
|
|
# 去重结果
|
|
relation_list = list(set(relation_list))
|
|
relation_list = list(set(relation_list))
|
|
- # print(relation_list)
|
|
|
|
|
|
+ # print([(rel[0].entity_text,rel[2].entity_text) for rel in relation_list])
|
|
right_combination = [('org','person'),('company','person'),('company','location'),('org','location'),('person','phone')]
|
|
right_combination = [('org','person'),('company','person'),('company','location'),('org','location'),('person','phone')]
|
|
linked_company = set()
|
|
linked_company = set()
|
|
linked_person = set()
|
|
linked_person = set()
|
|
@@ -1542,14 +1542,17 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
_object = relation[2]
|
|
_object = relation[2]
|
|
if isinstance(_subject,Entity) and isinstance(_object,Entity) and (_subject.entity_type,_object.entity_type) in right_combination:
|
|
if isinstance(_subject,Entity) and isinstance(_object,Entity) and (_subject.entity_type,_object.entity_type) in right_combination:
|
|
if relation[1]==predicate:
|
|
if relation[1]==predicate:
|
|
|
|
+ distance = (tokens_num_dict[_object.sentence_index] + _object.begin_index) - (
|
|
|
|
+ tokens_num_dict[_subject.sentence_index] + _subject.end_index)
|
|
if predicate=="rel_person":
|
|
if predicate=="rel_person":
|
|
if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact):
|
|
if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact):
|
|
continue
|
|
continue
|
|
# 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
|
|
# 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
|
|
- if _subject.label in [2,3,4] and re.search("质疑|投诉|监督|受理",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
|
|
|
|
|
|
+ if _subject.label in [2,3,4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
|
|
|
|
+ continue
|
|
|
|
+ # 角色为中标候选人,排除距离过远的联系人
|
|
|
|
+ if _subject.label in [2, 3, 4] and distance>=40:
|
|
continue
|
|
continue
|
|
- distance = (tokens_num_dict[_object.sentence_index] + _object.begin_index) - (
|
|
|
|
- tokens_num_dict[_subject.sentence_index] + _subject.end_index)
|
|
|
|
if distance>0:
|
|
if distance>0:
|
|
value = (-1 / 2 * (distance ** 2))/10000
|
|
value = (-1 / 2 * (distance ** 2))/10000
|
|
else:
|
|
else:
|
|
@@ -1690,7 +1693,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
sentence_distance = after_entity.sentence_index - entity.sentence_index
|
|
sentence_distance = after_entity.sentence_index - entity.sentence_index
|
|
value = (-1 / 2 * (distance ** 2)) / 10000
|
|
value = (-1 / 2 * (distance ** 2)) / 10000
|
|
if sentence_distance == 0:
|
|
if sentence_distance == 0:
|
|
- if distance < 80:
|
|
|
|
|
|
+ if distance < 70:
|
|
# value = (-1 / 2 * (distance ** 2)) / 10000
|
|
# value = (-1 / 2 * (distance ** 2)) / 10000
|
|
t_match_list.append(Match(entity, after_entity, value))
|
|
t_match_list.append(Match(entity, after_entity, value))
|
|
match_nums += 1
|
|
match_nums += 1
|
|
@@ -1699,7 +1702,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
else:
|
|
else:
|
|
break
|
|
break
|
|
else:
|
|
else:
|
|
- if distance < 50:
|
|
|
|
|
|
+ if distance < 40:
|
|
# value = (-1 / 2 * (distance ** 2)) / 10000
|
|
# value = (-1 / 2 * (distance ** 2)) / 10000
|
|
t_match_list.append(Match(entity, after_entity, value))
|
|
t_match_list.append(Match(entity, after_entity, value))
|
|
match_nums += 1
|
|
match_nums += 1
|
|
@@ -1945,6 +1948,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
match_list2 = []
|
|
match_list2 = []
|
|
for split_index in range(len(new_temporary_list2)):
|
|
for split_index in range(len(new_temporary_list2)):
|
|
split_entitys = new_temporary_list2[split_index]
|
|
split_entitys = new_temporary_list2[split_index]
|
|
|
|
+ if len(split_entitys)<=1:
|
|
|
|
+ continue
|
|
is_skip = False
|
|
is_skip = False
|
|
for index in range(len(split_entitys)):
|
|
for index in range(len(split_entitys)):
|
|
entity = split_entitys[index]
|
|
entity = split_entitys[index]
|
|
@@ -1958,20 +1963,25 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
for after_index in range(index + 1, min(len(split_entitys), index + 4)):
|
|
for after_index in range(index + 1, min(len(split_entitys), index + 4)):
|
|
after_entity = split_entitys[after_index]
|
|
after_entity = split_entitys[after_index]
|
|
if after_entity.entity_type in ['person']:
|
|
if after_entity.entity_type in ['person']:
|
|
-
|
|
|
|
|
|
+ distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
|
|
|
|
+ tokens_num_dict[entity.sentence_index] + entity.end_index)
|
|
# 实体为中标人/候选人,联系人已确定类别【1,2】
|
|
# 实体为中标人/候选人,联系人已确定类别【1,2】
|
|
if entity.label in [2, 3, 4] and after_entity.label in [1, 2]:
|
|
if entity.label in [2, 3, 4] and after_entity.label in [1, 2]:
|
|
break
|
|
break
|
|
|
|
+ if entity.label in [2, 3, 4] and distance>=20:
|
|
|
|
+ break
|
|
# 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
|
|
# 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
|
|
- if entity.label in [2, 3, 4] and re.search("质疑|投诉|监督|受理", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
|
|
|
|
|
|
+ if entity.label in [2, 3, 4] and re.search("质疑|投诉|监督|受理|项目(单位)?联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
|
|
break
|
|
break
|
|
if after_entity.label in [1, 2, 3]:
|
|
if after_entity.label in [1, 2, 3]:
|
|
- distance = (tokens_num_dict[
|
|
|
|
- after_entity.sentence_index] + after_entity.begin_index) - (
|
|
|
|
- tokens_num_dict[entity.sentence_index] + entity.end_index)
|
|
|
|
|
|
+ # distance = (tokens_num_dict[
|
|
|
|
+ # after_entity.sentence_index] + after_entity.begin_index) - (
|
|
|
|
+ # tokens_num_dict[entity.sentence_index] + entity.end_index)
|
|
sentence_distance = after_entity.sentence_index - entity.sentence_index
|
|
sentence_distance = after_entity.sentence_index - entity.sentence_index
|
|
if sentence_distance == 0:
|
|
if sentence_distance == 0:
|
|
if distance < 100:
|
|
if distance < 100:
|
|
|
|
+ if entity.label in [2, 3, 4] and distance>40:
|
|
|
|
+ break
|
|
if (entity.label == 0 and after_entity.label == 1) or (
|
|
if (entity.label == 0 and after_entity.label == 1) or (
|
|
entity.label == 1 and after_entity.label == 2):
|
|
entity.label == 1 and after_entity.label == 2):
|
|
distance = distance / 100
|
|
distance = distance / 100
|
|
@@ -1980,6 +1990,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
match_nums += 1
|
|
match_nums += 1
|
|
else:
|
|
else:
|
|
if distance < 60:
|
|
if distance < 60:
|
|
|
|
+ if entity.label in [2, 3, 4] and distance>20:
|
|
|
|
+ break
|
|
if (entity.label == 0 and after_entity.label == 1) or (
|
|
if (entity.label == 0 and after_entity.label == 1) or (
|
|
entity.label == 1 and after_entity.label == 2):
|
|
entity.label == 1 and after_entity.label == 2):
|
|
distance = distance / 100
|
|
distance = distance / 100
|
|
@@ -2008,17 +2020,15 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
if re.search("地,?址", after_entity_left):
|
|
if re.search("地,?址", after_entity_left):
|
|
is_skip = True
|
|
is_skip = True
|
|
continue
|
|
continue
|
|
- if re.search("\(|(", after_entity_left) and re.search("\)|)",
|
|
|
|
- after_entity_right):
|
|
|
|
|
|
+ if re.search("\(|(", after_entity_left) and re.search("\)|)",after_entity_right):
|
|
is_skip = True
|
|
is_skip = True
|
|
continue
|
|
continue
|
|
- if entity.label in [0, 1] and after_entity.label in [0,
|
|
|
|
- 1] and entity.label == after_entity.label:
|
|
|
|
|
|
+ if entity.label in [0, 1] and after_entity.label in [0, 1] and entity.label == after_entity.label:
|
|
break
|
|
break
|
|
if entity.label in [0, 1] and after_entity.label in [0, 1] and split_entitys[
|
|
if entity.label in [0, 1] and after_entity.label in [0, 1] and split_entitys[
|
|
index + 1].entity_type == "person":
|
|
index + 1].entity_type == "person":
|
|
break
|
|
break
|
|
- if entity.label in [0, 1] and after_entity.label in [2, 3, 4]:
|
|
|
|
|
|
+ if entity.label in [0, 1 ,5] and after_entity.label in [2, 3, 4]:
|
|
break
|
|
break
|
|
if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
|
|
if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
|
|
break
|
|
break
|
|
@@ -2044,23 +2054,30 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
distance = 1
|
|
distance = 1
|
|
if is_same_sentence:
|
|
if is_same_sentence:
|
|
if phone_begin <= 200:
|
|
if phone_begin <= 200:
|
|
|
|
+ if entity.label in [2,3,4] and phone_begin>80:
|
|
|
|
+ break
|
|
value = (-1 / 2 * (distance ** 2)) / 10000
|
|
value = (-1 / 2 * (distance ** 2)) / 10000
|
|
match_list2.append(Match(entity, (entity, _phone), value))
|
|
match_list2.append(Match(entity, (entity, _phone), value))
|
|
match_nums += 1
|
|
match_nums += 1
|
|
else:
|
|
else:
|
|
if phone_begin <= 60:
|
|
if phone_begin <= 60:
|
|
|
|
+ if entity.label in [2,3,4] and phone_begin>40:
|
|
|
|
+ break
|
|
value = (-1 / 2 * (distance ** 2)) / 10000
|
|
value = (-1 / 2 * (distance ** 2)) / 10000
|
|
match_list2.append(Match(entity, (entity, _phone), value))
|
|
match_list2.append(Match(entity, (entity, _phone), value))
|
|
match_nums += 1
|
|
match_nums += 1
|
|
else:
|
|
else:
|
|
next_entity = split_entitys[index + 1]
|
|
next_entity = split_entitys[index + 1]
|
|
if next_entity.entity_type in ["org","company"]:
|
|
if next_entity.entity_type in ["org","company"]:
|
|
- _entity_left = list_sentence[next_entity.sentence_index].sentence_text[max(0, next_entity.wordOffset_begin - 20):next_entity.wordOffset_begin]
|
|
|
|
|
|
+ _entity_left = list_sentence[next_entity.sentence_index].sentence_text[entity.wordOffset_end:next_entity.wordOffset_begin]
|
|
_entity_left2 = re.sub(",()\(\)::", "", _entity_left)
|
|
_entity_left2 = re.sub(",()\(\)::", "", _entity_left)
|
|
_entity_left2 = _entity_left2[-5:]
|
|
_entity_left2 = _entity_left2[-5:]
|
|
if re.search("(地,?址|地,?点)[::][^,。]*$", _entity_left) or re.search("地址|地点", _entity_left2):
|
|
if re.search("(地,?址|地,?点)[::][^,。]*$", _entity_left) or re.search("地址|地点", _entity_left2):
|
|
if index + 2<= len(split_entitys) - 1:
|
|
if index + 2<= len(split_entitys) - 1:
|
|
next_entity = split_entitys[index + 2]
|
|
next_entity = split_entitys[index + 2]
|
|
|
|
+ if len(_entity_left)<=2 and re.search("[、(\(]",_entity_left):
|
|
|
|
+ if index + 2 <= len(split_entitys) - 1:
|
|
|
|
+ next_entity = split_entitys[index + 2]
|
|
if entity.sentence_index == next_entity.sentence_index:
|
|
if entity.sentence_index == next_entity.sentence_index:
|
|
mid_tokens += list_sentence[entity.sentence_index].tokens[
|
|
mid_tokens += list_sentence[entity.sentence_index].tokens[
|
|
entity.end_index + 1:next_entity.begin_index]
|
|
entity.end_index + 1:next_entity.begin_index]
|
|
@@ -2226,6 +2243,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
prepare_link.append(after_entity)
|
|
prepare_link.append(after_entity)
|
|
last_person = after_entity
|
|
last_person = after_entity
|
|
continue
|
|
continue
|
|
|
|
+
|
|
# 统一同类角色的属性
|
|
# 统一同类角色的属性
|
|
for k in PackDict.keys():
|
|
for k in PackDict.keys():
|
|
for i in range(len(PackDict[k]["roleList"])):
|
|
for i in range(len(PackDict[k]["roleList"])):
|
|
@@ -2259,8 +2277,11 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
(_pointer_person.entity_text, _p))
|
|
(_pointer_person.entity_text, _p))
|
|
|
|
|
|
# "roleList"中联系人电话去重
|
|
# "roleList"中联系人电话去重
|
|
|
|
+ tenderee_agency_phone = []
|
|
for k in PackDict.keys():
|
|
for k in PackDict.keys():
|
|
for i in range(len(PackDict[k]["roleList"])):
|
|
for i in range(len(PackDict[k]["roleList"])):
|
|
|
|
+ if PackDict[k]["roleList"][i].role_name in ['agency','tenderee']:
|
|
|
|
+ tenderee_agency_phone.extend([person_phone[1] for person_phone in PackDict[k]["roleList"][i].linklist if person_phone[1]])
|
|
# 带有联系人的电话
|
|
# 带有联系人的电话
|
|
with_person = [person_phone[1] for person_phone in PackDict[k]["roleList"][i].linklist if person_phone[0]]
|
|
with_person = [person_phone[1] for person_phone in PackDict[k]["roleList"][i].linklist if person_phone[0]]
|
|
# 带有电话的联系人
|
|
# 带有电话的联系人
|
|
@@ -2276,7 +2297,26 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
remove_list.append(item)
|
|
remove_list.append(item)
|
|
for _item in remove_list:
|
|
for _item in remove_list:
|
|
PackDict[k]["roleList"][i].linklist.remove(_item)
|
|
PackDict[k]["roleList"][i].linklist.remove(_item)
|
|
-
|
|
|
|
|
|
+ # 中标候选人联系方式异常排除
|
|
|
|
+ for k in PackDict.keys():
|
|
|
|
+ for i in range(len(PackDict[k]["roleList"])):
|
|
|
|
+ if PackDict[k]["roleList"][i].role_name in ['win_tenderer', 'second_tenderer','third_tenderer']:
|
|
|
|
+ if tenderee_agency_phone:
|
|
|
|
+ remove_list = []
|
|
|
|
+ for item in PackDict[k]["roleList"][i].linklist:
|
|
|
|
+ if item[1] and item[1] in tenderee_agency_phone:
|
|
|
|
+ remove_list.append(item)
|
|
|
|
+ for _item in remove_list:
|
|
|
|
+ PackDict[k]["roleList"][i].linklist.remove(_item)
|
|
|
|
+ # else:
|
|
|
|
+ # # 公告中无招标代理联系方式时,可排除中标联系方式
|
|
|
|
+ # remove_list = []
|
|
|
|
+ # for _item in PackDict[k]["roleList"][i].linklist:
|
|
|
|
+ # # 有联系方式
|
|
|
|
+ # if _item[1]:
|
|
|
|
+ # remove_list.append(_item)
|
|
|
|
+ # for _item in remove_list:
|
|
|
|
+ # PackDict[k]["roleList"][i].linklist.remove(_item)
|
|
# PackDict更新company/org地址
|
|
# PackDict更新company/org地址
|
|
last_role_prob = {}
|
|
last_role_prob = {}
|
|
for ent in pre_entity:
|
|
for ent in pre_entity:
|
|
@@ -2704,9 +2744,19 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
# 公告中只有"招标人"且无"联系人"链接时
|
|
# 公告中只有"招标人"且无"联系人"链接时
|
|
if len(PackDict)==1:
|
|
if len(PackDict)==1:
|
|
k = list(PackDict.keys())[0]
|
|
k = list(PackDict.keys())[0]
|
|
- if len(PackDict[k]["roleList"])==1:
|
|
|
|
- if PackDict[k]["roleList"][0].role_name == "tenderee":
|
|
|
|
- if not PackDict[k]["roleList"][0].linklist:
|
|
|
|
|
|
+ tenderee_agency_role = [role for role in PackDict[k]["roleList"] if role.role_name in ['tenderee','agency']]
|
|
|
|
+ if len(tenderee_agency_role)==1:
|
|
|
|
+ exist_person = []
|
|
|
|
+ exist_phone = []
|
|
|
|
+ for role in PackDict[k]["roleList"]:
|
|
|
|
+ for group in role.linklist:
|
|
|
|
+ if group[0]:
|
|
|
|
+ exist_person.append(group[0])
|
|
|
|
+ if group[1]:
|
|
|
|
+ exist_phone.append(group[1])
|
|
|
|
+
|
|
|
|
+ if tenderee_agency_role[0].role_name == "tenderee":
|
|
|
|
+ if not tenderee_agency_role[0].linklist:
|
|
get_contacts = False
|
|
get_contacts = False
|
|
if not get_contacts:
|
|
if not get_contacts:
|
|
# 根据大纲Outline类召回联系人
|
|
# 根据大纲Outline类召回联系人
|
|
@@ -2718,8 +2768,9 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
if t_person.person_phone:
|
|
if t_person.person_phone:
|
|
_phone = [p.entity_text for p in t_person.person_phone]
|
|
_phone = [p.entity_text for p in t_person.person_phone]
|
|
for _p in _phone:
|
|
for _p in _phone:
|
|
- PackDict[k]["roleList"][0].linklist.append((t_person.entity_text, _p))
|
|
|
|
- get_contacts = True
|
|
|
|
|
|
+ if t_person.entity_text not in exist_person and _p not in exist_phone:
|
|
|
|
+ tenderee_agency_role[0].linklist.append((t_person.entity_text, _p))
|
|
|
|
+ get_contacts = True
|
|
break
|
|
break
|
|
elif words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= \
|
|
elif words_num_dict[t_person.sentence_index] + t_person.wordOffset_begin >= \
|
|
words_num_dict[outline.sentence_end_index] + outline.wordOffset_end:
|
|
words_num_dict[outline.sentence_end_index] + outline.wordOffset_end:
|
|
@@ -2727,9 +2778,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
if not get_contacts:
|
|
if not get_contacts:
|
|
sentence_phone = phone.findall(outline.outline_text)
|
|
sentence_phone = phone.findall(outline.outline_text)
|
|
if sentence_phone:
|
|
if sentence_phone:
|
|
- PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
|
|
|
|
- get_contacts = True
|
|
|
|
- break
|
|
|
|
|
|
+ if sentence_phone[0] not in exist_phone:
|
|
|
|
+ tenderee_agency_role[0].linklist.append(("", sentence_phone[0]))
|
|
|
|
+ get_contacts = True
|
|
|
|
+ break
|
|
if not get_contacts:
|
|
if not get_contacts:
|
|
# 直接取文中倒数第一个联系人
|
|
# 直接取文中倒数第一个联系人
|
|
for _entity in temporary_list2[::-1]:
|
|
for _entity in temporary_list2[::-1]:
|
|
@@ -2737,14 +2789,16 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
if _entity.person_phone:
|
|
if _entity.person_phone:
|
|
_phone = [p.entity_text for p in _entity.person_phone]
|
|
_phone = [p.entity_text for p in _entity.person_phone]
|
|
for _p in _phone:
|
|
for _p in _phone:
|
|
- PackDict[k]["roleList"][0].linklist.append((_entity.entity_text, _p))
|
|
|
|
- get_contacts = True
|
|
|
|
|
|
+ if _entity.entity_text not in exist_person and _p not in exist_phone:
|
|
|
|
+ tenderee_agency_role[0].linklist.append((_entity.entity_text, _p))
|
|
|
|
+ get_contacts = True
|
|
break
|
|
break
|
|
if not get_contacts:
|
|
if not get_contacts:
|
|
# 如果文中只有一个“phone”实体,则直接取为联系人电话
|
|
# 如果文中只有一个“phone”实体,则直接取为联系人电话
|
|
if len(phone_entitys) == 1:
|
|
if len(phone_entitys) == 1:
|
|
- PackDict[k]["roleList"][0].linklist.append(("", phone_entitys[0].entity_text))
|
|
|
|
- get_contacts = True
|
|
|
|
|
|
+ if phone_entitys[0].entity_text not in exist_phone:
|
|
|
|
+ tenderee_agency_role[0].linklist.append(("", phone_entitys[0].entity_text))
|
|
|
|
+ get_contacts = True
|
|
if not get_contacts:
|
|
if not get_contacts:
|
|
# 通过大纲Outline类直接取电话
|
|
# 通过大纲Outline类直接取电话
|
|
if len(new_split_list) > 1:
|
|
if len(new_split_list) > 1:
|
|
@@ -2754,8 +2808,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系", sentence_outline):
|
|
if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系", sentence_outline):
|
|
sentence_phone = phone.findall(temp_sentence)
|
|
sentence_phone = phone.findall(temp_sentence)
|
|
if sentence_phone:
|
|
if sentence_phone:
|
|
- if sentence_phone[0] in [ent.entity_text for ent in phone_entitys]:
|
|
|
|
- PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
|
|
|
|
|
|
+ if sentence_phone[0] in [ent.entity_text for ent in phone_entitys] and sentence_phone[0] not in exist_phone:
|
|
|
|
+ tenderee_agency_role[0].linklist.append(("", sentence_phone[0]))
|
|
get_contacts = True
|
|
get_contacts = True
|
|
break
|
|
break
|
|
if not get_contacts:
|
|
if not get_contacts:
|
|
@@ -2773,9 +2827,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
|
|
match_text = match_text.split("。")[0]
|
|
match_text = match_text.split("。")[0]
|
|
sentence_phone = phone.findall(match_text)
|
|
sentence_phone = phone.findall(match_text)
|
|
if sentence_phone:
|
|
if sentence_phone:
|
|
- PackDict[k]["roleList"][0].linklist.append(("", sentence_phone[0]))
|
|
|
|
- get_tenderee_contacts = True
|
|
|
|
- break
|
|
|
|
|
|
+ if sentence_phone[0] not in exist_phone:
|
|
|
|
+ tenderee_agency_role[0].linklist.append(("", sentence_phone[0]))
|
|
|
|
+ get_tenderee_contacts = True
|
|
|
|
+ break
|
|
if get_tenderee_contacts:
|
|
if get_tenderee_contacts:
|
|
break
|
|
break
|
|
|
|
|
|
@@ -2990,48 +3045,84 @@ def getTimeAttributes(list_entity,list_sentence):
|
|
in_attachment = entity.in_attachment
|
|
in_attachment = entity.in_attachment
|
|
extract_time = my_timeFormat(entity_text)
|
|
extract_time = my_timeFormat(entity_text)
|
|
# definite_time = "00:00:00"
|
|
# definite_time = "00:00:00"
|
|
- # if extract_time:
|
|
|
|
- # t = re.compile("(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{2})?[::分]?(?P<second>\d{2})?秒?")
|
|
|
|
- # t_in_word = re.search(t,entity_text)
|
|
|
|
- # t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,sentence_text[entity.wordOffset_end:])
|
|
|
|
- # if t_in_word:
|
|
|
|
- # print('t_in_word',entity_text,t_in_word.groupdict())
|
|
|
|
- # day = t_in_word.groupdict().get('day',"")
|
|
|
|
- # hour = t_in_word.groupdict().get('hour',"")
|
|
|
|
- # half_hour = t_in_word.groupdict().get('half_hour',"")
|
|
|
|
- # minute = t_in_word.groupdict().get('minute',"")
|
|
|
|
- # second = t_in_word.groupdict().get('second',"")
|
|
|
|
- # if hour:
|
|
|
|
- # if day=='下午' and int(hour)<12:
|
|
|
|
- # hour = str(int(hour)+12)
|
|
|
|
- # if int(hour)>24:
|
|
|
|
- # continue
|
|
|
|
- # else:
|
|
|
|
- # hour = "00"
|
|
|
|
- # if not minute:
|
|
|
|
- # if half_hour:
|
|
|
|
- # minute = "30"
|
|
|
|
- # else:
|
|
|
|
- # minute = "00"
|
|
|
|
- # if int(minute)>60:
|
|
|
|
- # continue
|
|
|
|
- # if not second:
|
|
|
|
- # second = "00"
|
|
|
|
- # if int(second)>60:
|
|
|
|
- # continue
|
|
|
|
- # # 数字字符格式化
|
|
|
|
- # # hour = str(int(hour))
|
|
|
|
- # # minute = str(int(minute))
|
|
|
|
- # # second = str(int(second))
|
|
|
|
- # definite_time = "%s:%s:%s"%(hour.rjust(2,"0"),minute.rjust(2,"0"),second.rjust(2,"0"))
|
|
|
|
- # print(definite_time)
|
|
|
|
- #
|
|
|
|
- # elif t_out_of_word:
|
|
|
|
- # print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
|
|
|
|
|
|
+ if extract_time:
|
|
|
|
+ definite_time_list = []
|
|
|
|
+ t = re.compile("(?P<day>下午|上午|早上)?(?P<hour>\d{1,2})[::时点](?P<half_hour>半)?(?P<minute>\d{2})?[::分]?(?P<second>\d{2})?秒?")
|
|
|
|
+ t_in_word = re.search(t,entity_text.replace(" ",""))
|
|
|
|
+ t_out_of_word = re.search("^[^\d]{,2}"+t.pattern,sentence_text[entity.wordOffset_end:])
|
|
|
|
+ if t_in_word:
|
|
|
|
+ # print('t_in_word',entity_text,t_in_word.groupdict())
|
|
|
|
+ day = t_in_word.groupdict().get('day',"")
|
|
|
|
+ hour = t_in_word.groupdict().get('hour',"")
|
|
|
|
+ half_hour = t_in_word.groupdict().get('half_hour',"")
|
|
|
|
+ minute = t_in_word.groupdict().get('minute',"")
|
|
|
|
+ second = t_in_word.groupdict().get('second',"")
|
|
|
|
+ if hour:
|
|
|
|
+ if day=='下午' and int(hour)<12:
|
|
|
|
+ hour = str(int(hour)+12)
|
|
|
|
+ if int(hour)>24:
|
|
|
|
+ continue
|
|
|
|
+ else:
|
|
|
|
+ hour = "00"
|
|
|
|
+ if not minute:
|
|
|
|
+ if half_hour:
|
|
|
|
+ minute = "30"
|
|
|
|
+ else:
|
|
|
|
+ minute = "00"
|
|
|
|
+ if int(minute)>60:
|
|
|
|
+ continue
|
|
|
|
+ if not second:
|
|
|
|
+ second = "00"
|
|
|
|
+ if int(second)>60:
|
|
|
|
+ continue
|
|
|
|
+ definite_time = "%s:%s:%s"%(hour.rjust(2,"0"),minute.rjust(2,"0"),second.rjust(2,"0"))
|
|
|
|
+ # print(definite_time)
|
|
|
|
+ definite_time_list.append(definite_time)
|
|
|
|
+
|
|
|
|
+ if t_out_of_word:
|
|
|
|
+ # print('t_out_of_word', entity_text+sentence_text[entity.wordOffset_end:], t_out_of_word.groupdict())
|
|
|
|
+ day = t_out_of_word.groupdict().get('day', "")
|
|
|
|
+ hour = t_out_of_word.groupdict().get('hour', "")
|
|
|
|
+ half_hour = t_out_of_word.groupdict().get('half_hour', "")
|
|
|
|
+ minute = t_out_of_word.groupdict().get('minute', "")
|
|
|
|
+ second = t_out_of_word.groupdict().get('second', "")
|
|
|
|
+ if hour:
|
|
|
|
+ if day == '下午' and int(hour) < 12:
|
|
|
|
+ hour = str(int(hour) + 12)
|
|
|
|
+ if int(hour) > 24:
|
|
|
|
+ continue
|
|
|
|
+ else:
|
|
|
|
+ hour = "00"
|
|
|
|
+ if not minute:
|
|
|
|
+ if half_hour:
|
|
|
|
+ minute = "30"
|
|
|
|
+ else:
|
|
|
|
+ minute = "00"
|
|
|
|
+ if int(minute) > 60:
|
|
|
|
+ continue
|
|
|
|
+ if not second:
|
|
|
|
+ second = "00"
|
|
|
|
+ if int(second) > 60:
|
|
|
|
+ continue
|
|
|
|
+ definite_time = "%s:%s:%s" % (hour.rjust(2, "0"), minute.rjust(2, "0"), second.rjust(2, "0"))
|
|
|
|
+ # print(definite_time)
|
|
|
|
+ definite_time_list.append(definite_time)
|
|
|
|
|
|
|
|
|
|
|
|
+ min_len = min(len(extract_time),len(definite_time_list))
|
|
|
|
+ for i in range(min_len):
|
|
|
|
+ if definite_time_list[i] != "00:00:00":
|
|
|
|
+ extract_time[i] = extract_time[i] + " " + definite_time_list[i]
|
|
|
|
|
|
if extract_time:
|
|
if extract_time:
|
|
|
|
+ # 时间变更prob优化
|
|
|
|
+ if re.search("原",entity_left2):
|
|
|
|
+ last_index = 0
|
|
|
|
+ for item in re.finditer("原",entity_left2):
|
|
|
|
+ last_index = item.start() + 1
|
|
|
|
+ label_prob = label_prob - 0.2 * last_index / len(entity_left2)
|
|
|
|
+ # print('prob优化',label_prob,extract_time)
|
|
|
|
+
|
|
# 优化多个并列的时间,如:开标时间和截标时间,截标时间和报名结束时间
|
|
# 优化多个并列的时间,如:开标时间和截标时间,截标时间和报名结束时间
|
|
if entity.label in [2,3,9]:
|
|
if entity.label in [2,3,9]:
|
|
if entity.label==2 and re.search("截标|投标.{,2}截止|递交(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
|
|
if entity.label==2 and re.search("截标|投标.{,2}截止|递交(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
|
|
@@ -3042,8 +3133,12 @@ def getTimeAttributes(list_entity,list_sentence):
|
|
dict_time['time_registrationEnd'].append((extract_time[0], 0.5, in_attachment))
|
|
dict_time['time_registrationEnd'].append((extract_time[0], 0.5, in_attachment))
|
|
if entity.label==9 and re.search("截标|投标.{,2}截止|递交(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
|
|
if entity.label==9 and re.search("截标|投标.{,2}截止|递交(?:文件)?.{,2}截止|报价.{,2}截止|响应.{,2}截止",entity_left3):
|
|
dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
|
|
dict_time['time_bidclose'].append((extract_time[0], 0.5, in_attachment))
|
|
-
|
|
|
|
-
|
|
|
|
|
|
+ # 补充公告末尾处的发布时间
|
|
|
|
+ if entity.label==0:
|
|
|
|
+ if entity.is_tail:
|
|
|
|
+ entity.label = 1
|
|
|
|
+ entity.values[1] = 0.5
|
|
|
|
+ dict_time['time_release'].append((extract_time[0], 0.5, in_attachment))
|
|
# 2022/12/12 新增挂牌时间正则
|
|
# 2022/12/12 新增挂牌时间正则
|
|
if re.search("挂牌.{,4}(?:时间|日期)",entity_left2):
|
|
if re.search("挂牌.{,4}(?:时间|日期)",entity_left2):
|
|
if re.search("挂牌.{,4}(?:时间|日期)",entity_left2).end()>len(entity_left2)/2:
|
|
if re.search("挂牌.{,4}(?:时间|日期)",entity_left2).end()>len(entity_left2)/2:
|
|
@@ -3206,7 +3301,7 @@ def getTimeAttributes(list_entity,list_sentence):
|
|
last_time_type = ""
|
|
last_time_type = ""
|
|
last_sentence_index = entity.sentence_index
|
|
last_sentence_index = entity.sentence_index
|
|
|
|
|
|
-
|
|
|
|
|
|
+ # print(dict_time)
|
|
result_dict = dict((key,"") for key in dict_time.keys())
|
|
result_dict = dict((key,"") for key in dict_time.keys())
|
|
for time_type,value in dict_time.items():
|
|
for time_type,value in dict_time.items():
|
|
list_time = dict_time[time_type]
|
|
list_time = dict_time[time_type]
|