1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950 |
- '''
- Created on 2019-05-21
- @author: User
- '''
def edit_distance(source, target, substitution_cost=2):
    """Return the weighted edit distance between ``source`` and ``target``.

    Insertions and deletions each cost 1. Replacing one element with a
    different one costs ``substitution_cost``; matching elements cost 0.
    With the default cost of 2 a substitution is never cheaper than a
    delete followed by an insert. Pass ``substitution_cost=1`` for the
    classic Levenshtein distance.

    Args:
        source: Sequence (typically a string) to transform.
        target: Sequence (typically a string) to transform ``source`` into.
        substitution_cost: Cost charged for replacing one element by a
            different one.

    Returns:
        The minimum total cost of turning ``source`` into ``target``.
    """
    # Only the previous row of the DP table is needed to fill the next one,
    # so two rows are kept instead of the full table.
    # Row 0: turning a prefix of ``source`` into the empty target costs one
    # deletion per element.
    previous = list(range(len(source) + 1))
    for row, target_item in enumerate(target, start=1):
        # Column 0: building a target prefix from empty source costs one
        # insertion per element.
        current = [row]
        for col, source_item in enumerate(source, start=1):
            cost = 0 if source_item == target_item else substitution_cost
            current.append(
                min(
                    previous[col] + 1,         # insert target_item
                    current[col - 1] + 1,      # delete source_item
                    previous[col - 1] + cost,  # match / substitute
                )
            )
        previous = current
    return previous[-1]
-
def jaccard_score(source, target):
    """Score how much the distinct elements of two sequences overlap.

    Both inputs are reduced to the set of their elements (characters, for
    strings). The score is the number of shared elements divided by the
    size of the smaller set, i.e. the larger of the two coverage ratios.
    Note that despite the name this is the overlap coefficient, not the
    Jaccard index (intersection over union).

    Args:
        source: First iterable, typically a string.
        target: Second iterable, typically a string.

    Returns:
        ``0`` if either input has no elements, otherwise the larger of
        ``shared / len(set(source))`` and ``shared / len(set(target))``.
    """
    source_items = set(source)
    target_items = set(target)
    if not source_items or not target_items:
        return 0
    shared = len(source_items & target_items)
    coverage_of_source = shared / len(source_items)
    coverage_of_target = shared / len(target_items)
    return max(coverage_of_source, coverage_of_target)
def link_entitys(list_entitys, on_value=0.8, max_entities=1000):
    """Cross-link similar organisation/company entities within each group.

    For every group in ``list_entitys``, the entities whose ``entity_type``
    is ``"org"`` or ``"company"`` are compared pairwise. Two entities with
    different ``entity_text`` whose ``jaccard_score`` is at least
    ``on_value`` are appended to each other's ``linked_entitys`` list.

    Args:
        list_entitys: Iterable of entity groups. Each entity must expose
            ``entity_type``, ``entity_text`` and a mutable
            ``linked_entitys`` list.
        on_value: Minimum ``jaccard_score`` for two entities to be linked.
        max_entities: Upper bound on how many org/company entities per
            group take part in the pairwise comparison, which bounds the
            quadratic cost. ``None`` disables the cap.

    Returns:
        None. The entities are modified in place.
    """
    for list_entity in list_entitys:
        candidates = [
            entity
            for entity in list_entity
            if entity.entity_type in ("org", "company")
        ][:max_entities]
        # Pairs must be drawn from the filtered candidates: positions in
        # ``candidates`` do not correspond to positions in ``list_entity``.
        for first_i, first in enumerate(candidates):
            for second in candidates[first_i + 1:]:
                # Identical texts are the same mention, not an alias.
                if first.entity_text == second.entity_text:
                    continue
                score = jaccard_score(first.entity_text, second.entity_text)
                if score >= on_value:
                    first.linked_entitys.append(second)
                    second.linked_entitys.append(first)
-
if __name__ == "__main__":
    # Manual smoke check: show both similarity measures for a sample pair.
    print(edit_distance("GUMBO", "GAMBOL"))
    print(jaccard_score("GUMBO", "GAMBOL"))
|