'''
Created on May 21, 2019

@author: User
'''
- import re
def edit_distance(source, target, substitution_cost=2):
    """Return the weighted edit distance between ``source`` and ``target``.

    Inserting or deleting one element costs 1. Replacing one element by a
    different one costs ``substitution_cost``. With the default of 2 a
    substitution is never cheaper than a deletion followed by an insertion,
    so ``edit_distance("a", "b")`` is 2 rather than 1. Pass
    ``substitution_cost=1`` to obtain the classic Levenshtein distance.

    Args:
        source: Sequence (typically a ``str``) to transform.
        target: Sequence (typically a ``str``) to transform ``source`` into.
        substitution_cost: Cost charged when two aligned elements differ.

    Returns:
        The minimal total cost, as a number; 0 when both inputs are empty.
    """
    # Row 0 of the DP table: turning the first j elements of ``source`` into
    # an empty target prefix takes j deletions.
    previous_row = list(range(len(source) + 1))

    # Only the previous row is needed to compute the next one, so the full
    # (len(target)+1) x (len(source)+1) matrix is never materialised.
    for row_index, target_item in enumerate(target, start=1):
        # Column 0: building row_index target elements from an empty source.
        current_row = [row_index]
        for col_index, source_item in enumerate(source, start=1):
            cost = 0 if source_item == target_item else substitution_cost
            current_row.append(
                min(
                    previous_row[col_index] + 1,         # cell above
                    current_row[col_index - 1] + 1,      # cell to the left
                    previous_row[col_index - 1] + cost,  # diagonal: match/substitute
                )
            )
        previous_row = current_row

    return previous_row[-1]
-
def jaccard_score(source, target):
    """Return a set-overlap similarity between ``source`` and ``target``.

    Both inputs are reduced to the set of their distinct elements (distinct
    characters for strings). The score is the number of shared elements
    divided by the size of the smaller set, i.e. the larger of
    ``|A & B| / |A|`` and ``|A & B| / |B|``.

    NOTE: despite the name this is not the classic Jaccard index
    (``|A & B| / |A | B|``); a set fully contained in the other scores 1.

    Args:
        source: Iterable of hashable elements, typically a ``str``.
        target: Iterable of hashable elements, typically a ``str``.

    Returns:
        0 when either input has no elements, otherwise the overlap ratio.
    """
    source_items = set(source)
    target_items = set(target)
    if not source_items or not target_items:
        return 0

    shared = len(source_items & target_items)
    # Dividing by the smaller set size yields the larger of the two ratios.
    return shared / min(len(source_items), len(target_items))
def link_entitys(list_entitys, on_value=0.8, max_entities=1000):
    """Cross-link similar organisation entities and expand short company names.

    Each group in ``list_entitys`` is processed independently:

    1. Entities whose ``entity_type`` is ``"org"`` or ``"company"`` are
       collected, keeping at most ``max_entities`` of them.
    2. Every pair of collected entities with different ``entity_text`` whose
       ``jaccard_score`` is at least ``on_value`` is linked in both
       directions through their ``linked_entitys`` lists.
    3. An entity whose text does not contain "公司" takes over the text of a
       linked entity whose text ends in "公司" and is longer than its own.

    The entities are modified in place.

    Args:
        list_entitys: Iterable of groups; each group is an iterable of
            objects exposing ``entity_type``, ``entity_text`` and a
            ``linked_entitys`` list.
        on_value: Minimum similarity score for two entities to be linked.
        max_entities: Upper bound on the organisation entities considered per
            group, which bounds the quadratic pairwise comparison. ``None``
            removes the limit.

    Returns:
        None.
    """
    organisation_types = ("org", "company")

    for list_entity in list_entitys:
        candidates = [
            entity
            for entity in list_entity
            if entity.entity_type in organisation_types
        ][:max_entities]

        # Compare each unordered pair exactly once.
        for position, entity in enumerate(candidates):
            for other in candidates[position + 1:]:
                # Identical texts are never linked, so skip the scoring.
                if entity.entity_text == other.entity_text:
                    continue
                score = jaccard_score(entity.entity_text, other.entity_text)
                if score >= on_value:
                    entity.linked_entitys.append(other)
                    other.linked_entitys.append(entity)

        # Replace company names: an entity lacking "公司" adopts the text of a
        # longer linked entity that ends with "公司".
        for entity in candidates:
            if re.search("公司", entity.entity_text) is not None:
                continue
            # The scan continues after a replacement, so a later, even longer
            # linked name can still override an earlier one.
            for linked in entity.linked_entitys:
                ends_with_company = re.search("公司$", linked.entity_text) is not None
                if ends_with_company and len(linked.entity_text) > len(entity.entity_text):
                    entity.entity_text = linked.entity_text
-
if __name__ == "__main__":
    # Smoke-run both helpers on a sample word pair. The edit distance result
    # is discarded; only the set-overlap score is printed.
    sample_source, sample_target = "GUMBO", "GAMBOL"
    edit_distance(sample_source, sample_target)
    print(jaccard_score(sample_source, sample_target))
|