"""String-similarity helpers and entity linking.

Created on 2019-05-21

@author: User
"""

from itertools import combinations

# Upper bound on how many org/company entities of one group are compared
# pairwise; keeps the quadratic comparison loop bounded.
_MAX_LINK_CANDIDATES = 1000


def edit_distance(source, target):
    """Return the edit distance between *source* and *target*.

    Insertions and deletions cost 1; a substitution costs 2 (the same as
    one deletion plus one insertion).

    Args:
        source: Sequence (typically a string) to transform.
        target: Sequence (typically a string) to transform into.

    Returns:
        The minimal total cost as an int.
    """
    rows = len(target) + 1
    cols = len(source) + 1
    dp = [[0] * cols for _ in range(rows)]

    # Row 0: building a prefix of ``source`` against an empty target.
    for j in range(cols):
        dp[0][j] = j

    for i in range(1, rows):
        # Column 0: building a prefix of ``target`` from an empty source.
        dp[i][0] = i
        for j in range(1, cols):
            cost = 0 if source[j - 1] == target[i - 1] else 2
            dp[i][j] = min(
                dp[i - 1][j] + 1,
                dp[i][j - 1] + 1,
                dp[i - 1][j - 1] + cost,
            )
    return dp[-1][-1]


def jaccard_score(source, target):
    """Return a character-set overlap score for two strings.

    Despite the name this is not the textbook Jaccard index: the size of
    the shared character set is divided by the size of each input's
    character set, and the larger of the two ratios is returned. A string
    whose characters are all contained in the other therefore scores 1.0.

    Args:
        source: First iterable of hashable items (typically a string).
        target: Second iterable of hashable items (typically a string).

    Returns:
        0 if either input has no items, otherwise a float in (0, 1] or 0.0
        when nothing is shared.
    """
    source_set = set(source)
    target_set = set(target)
    if not source_set or not target_set:
        return 0
    shared = len(source_set & target_set)
    return max(shared / len(source_set), shared / len(target_set))


def link_entitys(list_entitys, on_value=0.8):
    """Cross-link similar org/company entities within each entity group.

    For every group in *list_entitys*, the entities whose ``entity_type``
    is ``"org"`` or ``"company"`` are collected (at most the first 1000),
    and every pair with different ``entity_text`` whose ``jaccard_score``
    reaches *on_value* is linked in both directions by appending each
    entity to the other's ``linked_entitys`` list.

    Args:
        list_entitys: Iterable of entity groups; each group is an iterable
            of objects exposing ``entity_type``, ``entity_text`` and a
            mutable ``linked_entitys`` list.
        on_value: Minimum score (inclusive) required to link two entities.

    Returns:
        None. Entities are modified in place.
    """
    for list_entity in list_entitys:
        range_entity = [
            _entity
            for _entity in list_entity
            if _entity.entity_type in ("org", "company")
        ][:_MAX_LINK_CANDIDATES]

        # Pair up the *filtered* candidates. Indexing the unfiltered group
        # with positions from the filtered list would compare the wrong
        # entities whenever the group also holds other entity types.
        for _entity, _ent in combinations(range_entity, 2):
            if _entity.entity_text == _ent.entity_text:
                continue
            _score = jaccard_score(_entity.entity_text, _ent.entity_text)
            if _score >= on_value:
                _entity.linked_entitys.append(_ent)
                _ent.linked_entitys.append(_entity)


if __name__ == "__main__":
    edit_distance("GUMBO", "GAMBOL")
    print(jaccard_score("GUMBO", "GAMBOL"))