entityLink.py 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758
'''
Created on 2019-05-21

@author: User
'''
  5. import re
  6. def edit_distance(source,target):
  7. dp = [["" for i in range(len(source)+1)] for j in range(len(target)+1)]
  8. for i in range(len(dp)):
  9. for j in range(len(dp[i])):
  10. if i==0:
  11. dp[i][j] = j
  12. elif j==0:
  13. dp[i][j] = i
  14. else:
  15. if source[j-1]==target[i-1]:
  16. cost = 0
  17. else:
  18. cost = 2
  19. dp[i][j] = min([dp[i-1][j]+1,dp[i][j-1]+1,dp[i-1][j-1]+cost])
  20. return dp[-1][-1]
  21. def jaccard_score(source,target):
  22. source_set = set([s for s in source])
  23. target_set = set([s for s in target])
  24. if len(source_set)==0 or len(target_set)==0:
  25. return 0
  26. return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
  27. def link_entitys(list_entitys,on_value=0.8):
  28. for list_entity in list_entitys:
  29. range_entity = []
  30. for _entity in list_entity:
  31. if _entity.entity_type in ["org","company"]:
  32. range_entity.append(_entity)
  33. range_entity = range_entity[:1000]
  34. for first_i in range(len(range_entity)):
  35. _entity = range_entity[first_i]
  36. for second_i in range(first_i+1,len(range_entity)):
  37. _ent = range_entity[second_i]
  38. _score = jaccard_score(_entity.entity_text, _ent.entity_text)
  39. if _entity.entity_text!=_ent.entity_text and _score>=on_value:
  40. _entity.linked_entitys.append(_ent)
  41. _ent.linked_entitys.append(_entity)
  42. #替换公司名称
  43. for _entity in range_entity:
  44. if re.search("公司",_entity.entity_text) is None:
  45. for _ent in _entity.linked_entitys:
  46. if re.search("公司$",_ent.entity_text) is not None:
  47. if len(_ent.entity_text)>len(_entity.entity_text):
  48. _entity.entity_text = _ent.entity_text
  49. if __name__=="__main__":
  50. edit_distance("GUMBO","GAMBOL")
  51. print(jaccard_score("GUMBO","GAMBOL"))