|
@@ -1093,7 +1093,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
|
|
|
return [(main_roles[row], attributes[col]) for row, col in max_dispatch]
|
|
|
|
|
|
# 正则提取电话号码实体
|
|
|
- key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
|
|
|
+ # key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
|
|
|
phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
|
|
|
'\+86.?1[3|4|5|6|7|8|9]\d{9}|'
|
|
|
# '0[^0]\d{1,2}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
|
|
@@ -1118,8 +1118,6 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
|
|
|
res_set = set()
|
|
|
for i in re.finditer(phone, sentence_text):
|
|
|
res_set.add((i.group(), i.start(), i.end()))
|
|
|
- # for i in re.finditer(key_word, sentence_text):
|
|
|
- # res_set.add((i.group(2), i.start() + len(i.group(1)), i.end()))
|
|
|
res_set = sorted(list(res_set),key=lambda x:x[1])
|
|
|
last_phone_mask = True
|
|
|
for item_idx in range(len(res_set)):
|
|
@@ -1131,10 +1129,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
|
|
|
if not re.search("电,?话", phone_left):
|
|
|
last_phone_mask = False
|
|
|
continue
|
|
|
- if re.search("帐,?号|编,?号|报,?价|证,?号|价,?格|[\((]万?元[\))]", phone_left):
|
|
|
+ if re.search("注册[证号]|帐,?号|编,?[号码]|报,?价|证,?号|价,?格|[\(\(]万?元[\)\)]|[a-zA-Z]+\d*$", phone_left):
|
|
|
last_phone_mask = False
|
|
|
continue
|
|
|
- if re.search("^\d{0,4}[.,]\d{2,}", phone_right):
|
|
|
+ if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+", phone_right):
|
|
|
last_phone_mask = False
|
|
|
continue
|
|
|
# if:上一个phone实体不符合条件
|
|
@@ -1742,6 +1740,73 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
|
|
|
for _item in remove_list:
|
|
|
PackDict["Project"]["roleList"][i].linklist.remove(_item)
|
|
|
|
|
|
+ # 联系人——电子邮箱链接
|
|
|
+ temporary_list3 = [entity for entity in list_entity if entity.entity_type=='email' or (entity.entity_type=='person' and entity.label in [1,2,3])]
|
|
|
+ temporary_list3 = sorted(temporary_list3, key=lambda x: (x.sentence_index, x.begin_index))
|
|
|
+ new_temporary_list3 = []
|
|
|
+ for _split in new_split_list:
|
|
|
+ temp_list = []
|
|
|
+ for _entity in temporary_list3:
|
|
|
+ if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[
|
|
|
+ _entity.sentence_index] + _entity.wordOffset_end < _split[1]:
|
|
|
+ temp_list.append(_entity)
|
|
|
+ elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]:
|
|
|
+ break
|
|
|
+ new_temporary_list3.append(temp_list)
|
|
|
+ # print(new_temporary_list3)
|
|
|
+ match_list3 = []
|
|
|
+ for split_index in range(len(new_temporary_list3)):
|
|
|
+ split_entitys = new_temporary_list3[split_index]
|
|
|
+ for index in range(len(split_entitys)):
|
|
|
+ entity = split_entitys[index]
|
|
|
+ if entity.entity_type == 'person':
|
|
|
+ match_nums = 0
|
|
|
+ for after_index in range(index + 1, min(len(split_entitys), index + 4)):
|
|
|
+ after_entity = split_entitys[after_index]
|
|
|
+ if match_nums > 2:
|
|
|
+ break
|
|
|
+ if after_entity.entity_type == 'email':
|
|
|
+ distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
|
|
|
+ tokens_num_dict[entity.sentence_index] + entity.end_index)
|
|
|
+ sentence_distance = after_entity.sentence_index - entity.sentence_index
|
|
|
+ if sentence_distance == 0:
|
|
|
+ if distance < 100:
|
|
|
+ if (entity.label == 0 and after_entity.label == 1) or (
|
|
|
+ entity.label == 1 and after_entity.label == 2):
|
|
|
+ distance = distance / 100
|
|
|
+ value = (-1 / 2 * (distance ** 2)) / 10000
|
|
|
+ match_list3.append(Match(entity, after_entity, value))
|
|
|
+ match_nums += 1
|
|
|
+ else:
|
|
|
+ if distance < 60:
|
|
|
+ if (entity.label == 0 and after_entity.label == 1) or (
|
|
|
+ entity.label == 1 and after_entity.label == 2):
|
|
|
+ distance = distance / 100
|
|
|
+ value = (-1 / 2 * (distance ** 2)) / 10000
|
|
|
+ match_list3.append(Match(entity, after_entity, value))
|
|
|
+ match_nums += 1
|
|
|
+ # 前向查找匹配
|
|
|
+ # if not match_nums:
|
|
|
+ if index != 0:
|
|
|
+ previous_entity = split_entitys[index - 1]
|
|
|
+ if previous_entity.entity_type == 'email':
|
|
|
+ if previous_entity.sentence_index == entity.sentence_index:
|
|
|
+ distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
|
|
|
+ tokens_num_dict[
|
|
|
+ previous_entity.sentence_index] + previous_entity.end_index)
|
|
|
+ if distance < 30:
|
|
|
+ # 距离相等时,前向添加处罚值
|
|
|
+ # distance += 1
|
|
|
+ # 前向 没有 /10000
|
|
|
+ value = (-1 / 2 * (distance ** 2))
|
|
|
+ match_list3.append(Match(entity, previous_entity, value))
|
|
|
+ # print(match_list3)
|
|
|
+ # km算法分配求解
|
|
|
+ result3 = dispatch(match_list3)
|
|
|
+ for match in result3:
|
|
|
+ match_person = match[0]
|
|
|
+ match_email = match[1]
|
|
|
+ match_person.pointer_email = match_email
|
|
|
|
|
|
# # 1)第一个公司实体的招标人,则看看下一个实体是否为代理人,如果是则联系人错位连接 。2)在同一句中往后找联系人。3)连接不上在整个文章找联系人。
|
|
|
# temp_ent_list = [] # 临时列表,记录0,1角色及3联系人
|