"""String-similarity helpers and similarity-based entity linking.

Created on 2019-05-21

@author: User
"""

# Entity types that receive a ``linked_entitys`` attribute.
_LINKABLE_TYPES = ("org", "company", "location")
# Entity types that are allowed to appear inside a ``linked_entitys`` list.
_LINK_TARGET_TYPES = ("org", "company")


def edit_distance(source, target, substitution_cost=2):
    """Return the weighted edit distance between two sequences.

    Insertions and deletions each cost 1.  Replacing one item by a
    different one costs ``substitution_cost``; the default of 2 makes a
    substitution exactly as expensive as a deletion plus an insertion.

    Args:
        source: Sequence (typically a string) to transform.
        target: Sequence (typically a string) to obtain.
        substitution_cost: Cost of replacing one item with a different one.

    Returns:
        The minimum total cost of turning ``source`` into ``target``.

    >>> edit_distance("GUMBO", "GAMBOL")
    3
    """
    # Only the previous row of the DP table is ever read, so keep two rows
    # instead of the whole (len(target) + 1) x (len(source) + 1) matrix.
    previous_row = list(range(len(source) + 1))
    for row_index, target_item in enumerate(target, start=1):
        # First column: delete/insert every item seen so far.
        current_row = [row_index]
        for col_index, source_item in enumerate(source, start=1):
            cost = 0 if source_item == target_item else substitution_cost
            current_row.append(
                min(
                    previous_row[col_index] + 1,
                    current_row[col_index - 1] + 1,
                    previous_row[col_index - 1] + cost,
                )
            )
        previous_row = current_row
    return previous_row[-1]


def jaccard_score(source, target):
    """Return the character-set overlap between two strings.

    Despite its name this is not the Jaccard index (intersection over
    union): the shared characters are divided by the size of the *smaller*
    character set, i.e. the overlap coefficient.  A string whose characters
    all occur in the other one therefore scores 1.0.

    Args:
        source: First string (any iterable of hashable items works).
        target: Second string.

    Returns:
        0 if either input has no characters, otherwise a float in [0, 1].
    """
    source_chars = set(source)
    target_chars = set(target)
    if not source_chars or not target_chars:
        return 0
    shared = len(source_chars & target_chars)
    # max(shared / a, shared / b) == shared / min(a, b)
    return shared / min(len(source_chars), len(target_chars))


def link_entitys(list_entitys, on_value=0.8):
    """Attach similar org/company entities to each linkable entity.

    For every inner list in ``list_entitys``, each entity whose
    ``entity_type`` is "org", "company" or "location" gets a
    ``linked_entitys`` attribute: the "org"/"company" entities of the same
    inner list whose ``entity_text`` differs from its own and whose
    ``jaccard_score`` against it is at least ``on_value``.  Entities of
    other types are left untouched.

    Args:
        list_entitys: Iterable of entity lists; entities expose
            ``entity_type`` and ``entity_text`` attributes.
        on_value: Minimum similarity score required to link two entities.

    Returns:
        None.  The entities are modified in place.
    """
    for list_entity in list_entitys:
        # The candidate pool is the same for every entity of this list.
        candidates = [
            candidate
            for candidate in list_entity
            if candidate.entity_type in _LINK_TARGET_TYPES
        ]
        for _entity in list_entity:
            if _entity.entity_type not in _LINKABLE_TYPES:
                continue
            # Cheap text comparison first; only score genuinely different
            # texts.
            _entity.linked_entitys = [
                candidate
                for candidate in candidates
                if candidate.entity_text != _entity.entity_text
                and jaccard_score(_entity.entity_text, candidate.entity_text)
                >= on_value
            ]


if __name__ == "__main__":
    print(edit_distance("GUMBO", "GAMBOL"))
    print(jaccard_score("GUMBO", "GAMBOL"))