# entityLink.py (13 KB)
  1. '''
  2. Created on 2019年5月21日
  3. @author: User
  4. '''
  5. import re
  6. import os
  7. import time
  8. _time = time.time()
  9. from BiddingKG.dl.common.Utils import *
  10. from BiddingKG.dl.interface.Entitys import *
  11. import json
  12. def edit_distance(source,target):
  13. dp = [["" for i in range(len(source)+1)] for j in range(len(target)+1)]
  14. for i in range(len(dp)):
  15. for j in range(len(dp[i])):
  16. if i==0:
  17. dp[i][j] = j
  18. elif j==0:
  19. dp[i][j] = i
  20. else:
  21. if source[j-1]==target[i-1]:
  22. cost = 0
  23. else:
  24. cost = 2
  25. dp[i][j] = min([dp[i-1][j]+1,dp[i][j-1]+1,dp[i-1][j-1]+cost])
  26. return dp[-1][-1]
  27. def jaccard_score(source,target):
  28. source_set = set([s for s in source])
  29. target_set = set([s for s in target])
  30. if len(source_set)==0 or len(target_set)==0:
  31. return 0
  32. return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
  33. def link_entitys(list_entitys,on_value=0.8):
  34. for list_entity in list_entitys:
  35. range_entity = []
  36. for _entity in list_entity:
  37. if _entity.entity_type in ["org","company"]:
  38. range_entity.append(_entity)
  39. range_entity = range_entity[:1000]
  40. for first_i in range(len(range_entity)):
  41. _entity = range_entity[first_i]
  42. for second_i in range(first_i+1,len(range_entity)):
  43. _ent = range_entity[second_i]
  44. _score = jaccard_score(_entity.entity_text, _ent.entity_text)
  45. if _entity.entity_text!=_ent.entity_text and _score>=on_value:
  46. _entity.linked_entitys.append(_ent)
  47. _ent.linked_entitys.append(_entity)
  48. #替换公司名称
  49. for _entity in range_entity:
  50. if re.search("公司",_entity.entity_text) is None:
  51. for _ent in _entity.linked_entitys:
  52. if re.search("公司$",_ent.entity_text) is not None:
  53. if len(_ent.entity_text)>len(_entity.entity_text):
  54. _entity.entity_text = _ent.entity_text
  55. DICT_ENTERPRISE = {}
  56. DICT_ENTERPRISE_DONE = False
  57. def getDict_enterprise():
  58. global DICT_ENTERPRISE,DICT_ENTERPRISE_DONE
  59. filename = os.path.dirname(__file__)+"/../LEGAL_ENTERPRISE.txt"
  60. filepath = os.path.dirname(__file__)+"/../"
  61. real_path = filename
  62. if os.path.exists(os.path.join(filepath,filename)):
  63. real_path = os.path.join(filepath,filename)
  64. with open(real_path,"r",encoding="UTF8") as f:
  65. for _e in f:
  66. if not _e:
  67. continue
  68. _e = _e.strip()
  69. if len(_e)>=4:
  70. key_enter = _e[:4]
  71. if key_enter not in DICT_ENTERPRISE:
  72. DICT_ENTERPRISE[key_enter] = set()
  73. DICT_ENTERPRISE[key_enter].add(_e[4:])
  74. # for _e in ["河南省柘源","建筑工程有限公司"]:
  75. # if not _e:
  76. # continue
  77. # _e = _e.strip()
  78. # if len(_e)>=4:
  79. # key_enter = _e[:4]
  80. # if key_enter not in DICT_ENTERPRISE:
  81. # DICT_ENTERPRISE[key_enter] = set()
  82. # DICT_ENTERPRISE[key_enter].add(_e[4:])
  83. DICT_ENTERPRISE_DONE = True
  84. return DICT_ENTERPRISE
  85. import threading
  86. import time
# Start loading the enterprise dictionary on a background thread as soon as
# this module is imported; match_enterprise_max_first() polls
# DICT_ENTERPRISE_DONE until getDict_enterprise() has set it.
load_enterprise_thread = threading.Thread(target=getDict_enterprise)
load_enterprise_thread.start()
# Longest candidate name, in characters, that match_enterprise_max_first()
# tries at any position (four-character key included).
MAX_ENTERPRISE_LEN = 30
  90. def match_enterprise_max_first(sentence):
  91. while True:
  92. if not DICT_ENTERPRISE_DONE:
  93. time.sleep(1)
  94. else:
  95. break
  96. list_match = []
  97. begin_index = 0
  98. if len(sentence)>4:
  99. while True:
  100. if begin_index+4<len(sentence):
  101. key_enter = sentence[begin_index:begin_index+4]
  102. if key_enter in DICT_ENTERPRISE:
  103. for _i in range(MAX_ENTERPRISE_LEN-4+1):
  104. enter_name = sentence[begin_index+4:begin_index+MAX_ENTERPRISE_LEN-_i]
  105. if enter_name in DICT_ENTERPRISE[key_enter]:
  106. match_item = {"entity_text":"%s%s"%(key_enter,enter_name),"begin_index":begin_index,"end_index":begin_index+len(key_enter)+len(enter_name)}
  107. list_match.append(match_item)
  108. begin_index += (len(key_enter)+len(enter_name))-1
  109. break
  110. begin_index += 1
  111. else:
  112. break
  113. return list_match
  114. def calibrateEnterprise(list_articles,list_sentences,list_entitys):
  115. for _article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
  116. list_calibrate = []
  117. match_add = False
  118. match_replace = False
  119. range_entity = []
  120. for p_entity in list_entity:
  121. if p_entity.entity_type in ("org","company","location"):
  122. range_entity.append(p_entity)
  123. if len(range_entity)>1000:
  124. break
  125. for p_sentence in list_sentence:
  126. sentence = p_sentence.sentence_text
  127. list_match = match_enterprise_max_first(sentence)
  128. doc_id = p_sentence.doc_id
  129. sentence_index = p_sentence.sentence_index
  130. tokens = p_sentence.tokens
  131. list_match.sort(key=lambda x:x["begin_index"])
  132. for _match_index in range(len(list_match)):
  133. _match = list_match[_match_index]
  134. find_flag = False
  135. for p_entity in range_entity:
  136. if p_entity.sentence_index!=p_sentence.sentence_index:
  137. continue
  138. if p_entity=="location" and p_entity.entity_text==_match["entity_text"]:
  139. find_flag = True
  140. p_entity.entity_type = "company"
  141. #有重叠
  142. #match部分被包含则不处理
  143. if _match["begin_index"]>=p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
  144. find_flag = True
  145. #判断是否是多个公司
  146. for _match_j in range(_match_index,len(list_match)):
  147. if not list_match[_match_j]["end_index"]<=p_entity.wordOffset_end:
  148. _match_j -= 1
  149. break
  150. if _match_j>_match_index:
  151. match_replace = True
  152. match_add = True
  153. begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
  154. end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
  155. list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
  156. p_entity.entity_text = _match["entity_text"]
  157. p_entity.wordOffset_begin = _match["begin_index"]
  158. p_entity.wordOffset_end = _match["end_index"]
  159. p_entity.begin_index = begin_index
  160. p_entity.end_index = end_index
  161. for _match_h in range(_match_index+1,_match_j+1):
  162. entity_text = list_match[_match_h]["entity_text"]
  163. entity_type = "company"
  164. begin_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["begin_index"])
  165. end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"])
  166. entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
  167. add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"])
  168. list_entity.append(add_entity)
  169. range_entity.append(add_entity)
  170. list_calibrate.append({"type":"add","from":"","to":entity_text})
  171. _match_index = _match_j
  172. break
  173. continue
  174. elif _match["begin_index"]<=p_entity.wordOffset_begin and _match["end_index"]>p_entity.wordOffset_begin:
  175. find_flag = True
  176. if p_entity.entity_type in ("org","company"):
  177. if _match["begin_index"]<p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
  178. _diff_text = sentence[p_entity.wordOffset_end:_match["end_index"]]
  179. if re.search("分",_diff_text) is not None:
  180. pass
  181. else:
  182. match_replace = True
  183. begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
  184. end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
  185. list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
  186. p_entity.entity_text = _match["entity_text"]
  187. p_entity.wordOffset_begin = _match["begin_index"]
  188. p_entity.wordOffset_end = _match["end_index"]
  189. p_entity.begin_index = begin_index
  190. p_entity.end_index = end_index
  191. elif _match["end_index"]>=p_entity.wordOffset_end:
  192. match_replace = True
  193. begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
  194. end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
  195. list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
  196. p_entity.entity_text = _match["entity_text"]
  197. p_entity.wordOffset_begin = _match["begin_index"]
  198. p_entity.wordOffset_end = _match["end_index"]
  199. p_entity.begin_index = begin_index
  200. p_entity.end_index = end_index
  201. elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
  202. find_flag = True
  203. if p_entity.entity_type in ("org","company"):
  204. match_replace = True
  205. begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
  206. end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
  207. list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
  208. p_entity.entity_text = _match["entity_text"]
  209. p_entity.wordOffset_begin = _match["begin_index"]
  210. p_entity.wordOffset_end = _match["end_index"]
  211. p_entity.begin_index = begin_index
  212. p_entity.end_index = end_index
  213. if not find_flag:
  214. match_add = True
  215. entity_text = _match["entity_text"]
  216. entity_type = "company"
  217. begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
  218. end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
  219. entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
  220. add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,_match["begin_index"],_match["end_index"])
  221. list_entity.append(add_entity)
  222. range_entity.append(add_entity)
  223. list_calibrate.append({"type":"add","from":"","to":entity_text})
  224. #去重
  225. set_calibrate = set()
  226. list_match_enterprise = []
  227. for _calibrate in list_calibrate:
  228. _from = _calibrate.get("from","")
  229. _to = _calibrate.get("to","")
  230. _key = _from+_to
  231. if _key not in set_calibrate:
  232. list_match_enterprise.append(_calibrate)
  233. set_calibrate.add(_key)
  234. match_enterprise_type = 0
  235. if match_add:
  236. match_enterprise_type += 1
  237. if match_replace:
  238. match_enterprise_type += 2
  239. _article.match_enterprise = list_match_enterprise
  240. _article.match_enterprise_type = match_enterprise_type
  241. if __name__=="__main__":
  242. # edit_distance("GUMBO","GAMBOL")
  243. # print(jaccard_score("GUMBO","GAMBOL"))
  244. sentences = "广州比地数据科技有限公司比地数据科技有限公司1111111123沈阳南光工贸有限公司"
  245. print(match_enterprise_max_first(sentences))
  246. print("takes %d s"%(time.time()-_time))