# entityLink.py
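'''
Helpers for linking similar entities: an edit-distance function, a
character-overlap score, and pairwise linking of org/company entities.

Created on 2019-05-21
@author: User
'''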
  5. def edit_distance(source,target):
  6. dp = [["" for i in range(len(source)+1)] for j in range(len(target)+1)]
  7. for i in range(len(dp)):
  8. for j in range(len(dp[i])):
  9. if i==0:
  10. dp[i][j] = j
  11. elif j==0:
  12. dp[i][j] = i
  13. else:
  14. if source[j-1]==target[i-1]:
  15. cost = 0
  16. else:
  17. cost = 2
  18. dp[i][j] = min([dp[i-1][j]+1,dp[i][j-1]+1,dp[i-1][j-1]+cost])
  19. return dp[-1][-1]
  20. def jaccard_score(source,target):
  21. source_set = set([s for s in source])
  22. target_set = set([s for s in target])
  23. if len(source_set)==0 or len(target_set)==0:
  24. return 0
  25. return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
  26. def link_entitys(list_entitys,on_value=0.8):
  27. for list_entity in list_entitys:
  28. range_entity = []
  29. for _entity in list_entity:
  30. if _entity.entity_type in ["org","company"]:
  31. range_entity.append(_entity)
  32. range_entity = range_entity[:1000]
  33. for first_i in range(len(range_entity)):
  34. _entity = list_entity[first_i]
  35. for second_i in range(first_i+1,len(range_entity)):
  36. _ent = list_entity[second_i]
  37. _score = jaccard_score(_entity.entity_text, _ent.entity_text)
  38. if _entity.entity_text!=_ent.entity_text and _score>=on_value:
  39. _entity.linked_entitys.append(_ent)
  40. _ent.linked_entitys.append(_entity)
  41. if __name__=="__main__":
  42. edit_distance("GUMBO","GAMBOL")
  43. print(jaccard_score("GUMBO","GAMBOL"))