"""
Created on 2019-05-21

@author: User

Helpers for comparing entity names and linking near-duplicate
organisation/company entities to each other.
"""
import itertools
import re

# Entity types that take part in linking.
_LINKABLE_TYPES = ("org", "company")

# Compiled once; the pattern text is part of the runtime behaviour.
_COMPANY_ANYWHERE = re.compile("公司")
_COMPANY_SUFFIX = re.compile("公司$")


def edit_distance(source, target):
    """Return a weighted edit distance between two sequences.

    Insertions and deletions cost 1, substitutions cost 2 (so a
    substitution is never cheaper than a delete followed by an insert).

    Args:
        source: First sequence (typically a ``str``).
        target: Second sequence (typically a ``str``).

    Returns:
        The minimal total cost, as an ``int``, of turning one sequence
        into the other.
    """
    # Only the previous row of the DP table is ever read, so two rolling
    # rows are enough. Row 0 is the cost of building a prefix of `source`
    # from an empty `target` prefix.
    previous = list(range(len(source) + 1))
    for row, target_item in enumerate(target, start=1):
        current = [row]
        for col, source_item in enumerate(source, start=1):
            cost = 0 if source_item == target_item else 2
            current.append(
                min(
                    previous[col] + 1,         # skip an item of target
                    current[col - 1] + 1,      # skip an item of source
                    previous[col - 1] + cost,  # match / substitute
                )
            )
        previous = current
    return previous[-1]


def jaccard_score(source, target):
    """Return the character-set overlap of two strings.

    Despite the name this is not the classic Jaccard index
    (intersection / union); it is the size of the intersection divided by
    the size of the *smaller* character set, i.e. the larger of the two
    containment ratios.

    Args:
        source: First string (any iterable of hashable items works).
        target: Second string (any iterable of hashable items works).

    Returns:
        ``0`` if either input has no items, otherwise a float in
        ``[0.0, 1.0]``.
    """
    source_set = set(source)
    target_set = set(target)
    if not source_set or not target_set:
        return 0
    shared = len(source_set & target_set)
    # max(shared / a, shared / b) is the same as dividing by the smaller size.
    return shared / min(len(source_set), len(target_set))


def link_entitys(list_entitys, on_value=0.8, max_entities=1000):
    """Cross-link similar org/company entities and expand short names.

    For every inner list, the entities whose ``entity_type`` is ``"org"``
    or ``"company"`` are compared pairwise. Two entities with different
    ``entity_text`` whose :func:`jaccard_score` is at least ``on_value``
    are appended to each other's ``linked_entitys`` list.

    Afterwards, an entity whose text does not contain "公司" takes over the
    text of a linked entity whose text ends in "公司" and is longer than
    its own current text.

    The entities are modified in place; nothing is returned.

    Args:
        list_entitys: Iterable of entity lists. Each entity must expose
            ``entity_type``, ``entity_text`` and a mutable
            ``linked_entitys`` list.
        on_value: Minimum similarity score for two entities to be linked.
        max_entities: Upper bound on how many org/company entities of one
            inner list are compared (pairwise comparison is quadratic).
    """
    for list_entity in list_entitys:
        candidates = [
            entity
            for entity in list_entity
            if entity.entity_type in _LINKABLE_TYPES
        ][:max_entities]

        # combinations() yields (i, j) pairs with i < j in the same order as
        # two nested index loops would.
        for first, second in itertools.combinations(candidates, 2):
            if first.entity_text == second.entity_text:
                continue
            score = jaccard_score(first.entity_text, second.entity_text)
            if score >= on_value:
                first.linked_entitys.append(second)
                second.linked_entitys.append(first)

        # Replace company names.
        # NOTE(review): the original indentation was lost; this pass is
        # assumed to run once per inner list because `candidates` only
        # exists per iteration -- confirm against the upstream file.
        for entity in candidates:
            if _COMPANY_ANYWHERE.search(entity.entity_text) is not None:
                continue
            for linked in entity.linked_entitys:
                if _COMPANY_SUFFIX.search(linked.entity_text) is None:
                    continue
                # Compared against the *current* text, so the longest
                # qualifying linked name wins.
                if len(linked.entity_text) > len(entity.entity_text):
                    entity.entity_text = linked.entity_text


if __name__ == "__main__":
    print(edit_distance("GUMBO", "GAMBOL"))
    print(jaccard_score("GUMBO", "GAMBOL"))