entityLink.py 1.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546
'''
Created on May 21, 2019
@author: User
'''
  5. def edit_distance(source,target):
  6. dp = [["" for i in range(len(source)+1)] for j in range(len(target)+1)]
  7. for i in range(len(dp)):
  8. for j in range(len(dp[i])):
  9. if i==0:
  10. dp[i][j] = j
  11. elif j==0:
  12. dp[i][j] = i
  13. else:
  14. if source[j-1]==target[i-1]:
  15. cost = 0
  16. else:
  17. cost = 2
  18. dp[i][j] = min([dp[i-1][j]+1,dp[i][j-1]+1,dp[i-1][j-1]+cost])
  19. return dp[-1][-1]
  20. def jaccard_score(source,target):
  21. source_set = set([s for s in source])
  22. target_set = set([s for s in target])
  23. if len(source_set)==0 or len(target_set)==0:
  24. return 0
  25. return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
  26. def link_entitys(list_entitys,on_value=0.8):
  27. for list_entity in list_entitys:
  28. for _entity in list_entity:
  29. if _entity.entity_type in ["org","company","location"]:
  30. linked_entitys = []
  31. for _ent in list_entity:
  32. if _ent.entity_type in ["org","company"]:
  33. _score = jaccard_score(_entity.entity_text, _ent.entity_text)
  34. if _entity.entity_text!=_ent.entity_text and _score>=on_value:
  35. linked_entitys.append(_ent)
  36. _entity.linked_entitys = linked_entitys
  37. if __name__=="__main__":
  38. edit_distance("GUMBO","GAMBOL")
  39. print(jaccard_score("GUMBO","GAMBOL"))