entityLink.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357
'''
Created on 2019-05-21
@author: User
'''
  5. import re
  6. import os
  7. import time
  8. _time = time.time()
  9. from BiddingKG.dl.common.Utils import *
  10. from BiddingKG.dl.interface.Entitys import *
  11. import json
  12. def edit_distance(source,target):
  13. dp = [["" for i in range(len(source)+1)] for j in range(len(target)+1)]
  14. for i in range(len(dp)):
  15. for j in range(len(dp[i])):
  16. if i==0:
  17. dp[i][j] = j
  18. elif j==0:
  19. dp[i][j] = i
  20. else:
  21. if source[j-1]==target[i-1]:
  22. cost = 0
  23. else:
  24. cost = 2
  25. dp[i][j] = min([dp[i-1][j]+1,dp[i][j-1]+1,dp[i-1][j-1]+cost])
  26. return dp[-1][-1]
  27. def jaccard_score(source,target):
  28. source_set = set([s for s in source])
  29. target_set = set([s for s in target])
  30. if len(source_set)==0 or len(target_set)==0:
  31. return 0
  32. return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
  33. def link_entitys(list_entitys,on_value=0.8):
  34. for list_entity in list_entitys:
  35. range_entity = []
  36. for _entity in list_entity:
  37. if _entity.entity_type in ["org","company"]:
  38. range_entity.append(_entity)
  39. range_entity = range_entity[:1000]
  40. for first_i in range(len(range_entity)):
  41. _entity = range_entity[first_i]
  42. for second_i in range(first_i+1,len(range_entity)):
  43. _ent = range_entity[second_i]
  44. # 2021/5/21 update: 两个实体标签互斥(一个是招标人、一个是代理人)且entity_text不相等时,跳过
  45. if _entity.entity_text != _ent.entity_text and _entity.label != _ent.label and _entity.label in [0,1] and _ent.label in [0, 1]:
  46. continue
  47. _score = jaccard_score(re.sub("股份|责任|有限|公司","",_entity.entity_text), re.sub("股份|责任|有限|公司","",_ent.entity_text))
  48. if _entity.entity_text!=_ent.entity_text and _score>=on_value:
  49. _entity.linked_entitys.append(_ent)
  50. _ent.linked_entitys.append(_entity)
  51. #替换公司名称
  52. for _entity in range_entity:
  53. if re.search("公司",_entity.entity_text) is None:
  54. for _ent in _entity.linked_entitys:
  55. if re.search("公司$",_ent.entity_text) is not None:
  56. if len(_ent.entity_text)>len(_entity.entity_text):
  57. _entity.entity_text = _ent.entity_text
  58. # 2021/12/21 替换通过字典识别到的取长度最大的相似实体
  59. for _entity in range_entity:
  60. used_linked_entitys = []
  61. if not _entity.linked_entitys:
  62. continue
  63. _entity.linked_entitys.sort(key=lambda x: len(x.entity_text), reverse=True)
  64. for _ent in _entity.linked_entitys:
  65. if _ent in used_linked_entitys:
  66. break
  67. # print("_entity, _ent", _entity.entity_text, _ent.if_dict_match, _ent.entity_text)
  68. if _ent.if_dict_match == 1:
  69. if len(_ent.entity_text) > len(_entity.entity_text):
  70. # print("字典替换", _entity.entity_text, "->", _ent.entity_text)
  71. _entity.entity_text = _ent.entity_text
  72. used_linked_entitys.append(_ent)
  73. def getEnterprisePath():
  74. filename = "LEGAL_ENTERPRISE.txt"
  75. real_path = getFileFromSysPath(filename)
  76. if real_path is None:
  77. real_path = filename
  78. return real_path
  79. DICT_ENTERPRISE = {}
  80. DICT_ENTERPRISE_DONE = False
  81. def getDict_enterprise():
  82. global DICT_ENTERPRISE,DICT_ENTERPRISE_DONE
  83. real_path = getEnterprisePath()
  84. with open(real_path,"r",encoding="UTF8") as f:
  85. for _e in f:
  86. if not _e:
  87. continue
  88. _e = _e.strip()
  89. if len(_e)>=4:
  90. key_enter = _e[:4]
  91. if key_enter not in DICT_ENTERPRISE:
  92. DICT_ENTERPRISE[key_enter] = set()
  93. DICT_ENTERPRISE[key_enter].add(_e[4:])
  94. # for _e in ["河南省柘源","建筑工程有限公司"]:
  95. # if not _e:
  96. # continue
  97. # _e = _e.strip()
  98. # if len(_e)>=4:
  99. # key_enter = _e[:4]
  100. # if key_enter not in DICT_ENTERPRISE:
  101. # DICT_ENTERPRISE[key_enter] = set()
  102. # DICT_ENTERPRISE[key_enter].add(_e[4:])
  103. DICT_ENTERPRISE_DONE = True
  104. return DICT_ENTERPRISE
import threading
import time
# Start loading the enterprise dictionary on a separate thread as soon as this module is imported;
# match_enterprise_max_first waits for DICT_ENTERPRISE_DONE before using the dictionary.
load_enterprise_thread = threading.Thread(target=getDict_enterprise)
load_enterprise_thread.start()
# Window size, in characters and counted from the start of the 4-character key,
# of the longest candidate name tried by match_enterprise_max_first.
MAX_ENTERPRISE_LEN = 30
  110. def match_enterprise_max_first(sentence):
  111. while True:
  112. if not DICT_ENTERPRISE_DONE:
  113. time.sleep(1)
  114. else:
  115. break
  116. list_match = []
  117. begin_index = 0
  118. if len(sentence)>4:
  119. while True:
  120. if begin_index+4<len(sentence):
  121. key_enter = sentence[begin_index:begin_index+4]
  122. if key_enter in DICT_ENTERPRISE:
  123. for _i in range(MAX_ENTERPRISE_LEN-4+1):
  124. enter_name = sentence[begin_index+4:begin_index+MAX_ENTERPRISE_LEN-_i]
  125. if enter_name in DICT_ENTERPRISE[key_enter]:
  126. match_item = {"entity_text":"%s%s"%(key_enter,enter_name),"begin_index":begin_index,"end_index":begin_index+len(key_enter)+len(enter_name)}
  127. list_match.append(match_item)
  128. begin_index += (len(key_enter)+len(enter_name))-1
  129. break
  130. begin_index += 1
  131. else:
  132. break
  133. return list_match
def calibrateEnterprise(list_articles,list_sentences,list_entitys):
    '''
    Reconcile dictionary-matched enterprise names with the already extracted entities.

    The three arguments are iterated in parallel (one article, its sentences and
    its entities per step). For every sentence, match_enterprise_max_first
    supplies the dictionary hits; each hit is compared, by character offsets,
    with the org/company/location entities of the same sentence:

    * a hit lying inside an entity leaves the entity alone, unless several
      consecutive hits lie inside it, in which case the entity is shrunk to
      the first hit and new company entities are added for the others;
    * a hit that overlaps an entity on the left or right may overwrite the
      entity's text and offsets (see the branches below);
    * a hit that touches no entity is added as a new company entity.

    Entities are modified in place and new ones are appended to the entity
    list. The article receives ``match_enterprise`` (de-duplicated list of
    {"type", "from", "to"} records) and ``match_enterprise_type``
    (0, +1 if anything was added, +2 if anything was replaced).

    :param list_articles: articles, one per document
    :param list_sentences: per-document lists of sentence objects
    :param list_entitys: per-document lists of entity objects
    '''
    for _article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
        list_calibrate = []
        match_add = False
        match_replace = False
        # Only these entity types take part in the calibration.
        range_entity = []
        for p_entity in list_entity:
            if p_entity.entity_type in ("org","company","location"):
                range_entity.append(p_entity)
        # NOTE(review): as placed here this leaves the per-article loop, so this article and
        # any following ones get no match_enterprise attributes - confirm this is intended
        # (the nesting of this check could not be verified).
        if len(range_entity)>1000:
            break
        for p_sentence in list_sentence:
            sentence = p_sentence.sentence_text
            list_match = match_enterprise_max_first(sentence)
            # print("list_match", list_match)
            doc_id = p_sentence.doc_id
            sentence_index = p_sentence.sentence_index
            tokens = p_sentence.tokens
            list_match.sort(key=lambda x:x["begin_index"])
            for _match_index in range(len(list_match)):
                _match = list_match[_match_index]
                # Set as soon as the match touches any existing entity of this sentence.
                find_flag = False
                for p_entity in range_entity:
                    if p_entity.sentence_index!=p_sentence.sentence_index:
                        continue
                    # A location whose text equals the dictionary name is re-typed as a company.
                    if p_entity.entity_type=="location" and p_entity.entity_text==_match["entity_text"]:
                        find_flag = True
                        p_entity.entity_type = "company"
                        p_entity.if_dict_match = 1
                    if p_entity.entity_type not in ["location","org","company"]:
                        continue
                    if _match["entity_text"] == p_entity.entity_text:
                        p_entity.if_dict_match = 1
                    # There is an overlap.
                    # If the match is contained in the entity, it is normally left untouched.
                    if _match["begin_index"]>=p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
                        find_flag = True
                        # Check whether the entity actually covers several companies:
                        # find the last following match that still ends inside the entity.
                        for _match_j in range(_match_index,len(list_match)):
                            if not list_match[_match_j]["end_index"]<=p_entity.wordOffset_end:
                                _match_j -= 1
                                break
                        if _match_j>_match_index:
                            match_replace = True
                            match_add = True
                            # presumably converts a character offset into a token index - confirm in Utils
                            begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
                            list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                            p_entity.entity_text = _match["entity_text"]
                            p_entity.wordOffset_begin = _match["begin_index"]
                            p_entity.wordOffset_end = _match["end_index"]
                            p_entity.begin_index = begin_index
                            p_entity.end_index = end_index
                            # This company entity was recognised through the dictionary.
                            p_entity.if_dict_match = 1
                            # Add one new company entity for each further match inside the old span.
                            for _match_h in range(_match_index+1,_match_j+1):
                                entity_text = list_match[_match_h]["entity_text"]
                                entity_type = "company"
                                begin_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["begin_index"])
                                end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"])
                                entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                                add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"])
                                add_entity.if_dict_match = 1
                                list_entity.append(add_entity)
                                range_entity.append(add_entity)
                                list_calibrate.append({"type":"add","from":"","to":entity_text})
                            # NOTE: rebinding the loop variable does not make the enclosing
                            # for-range loop skip ahead; the next iteration continues as usual.
                            _match_index = _match_j
                            break
                        continue
                    # The match starts at or before the entity start and reaches into the entity.
                    elif _match["begin_index"]<=p_entity.wordOffset_begin and _match["end_index"]>p_entity.wordOffset_begin:
                        find_flag = True
                        # The match sticks out on the left only.
                        if _match["begin_index"]<p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
                            if p_entity.entity_type in ("org","company"):
                                _diff_text = sentence[p_entity.wordOffset_end:_match["end_index"]]
                                # Keep the entity when the text between the two ends contains "分".
                                if re.search("分",_diff_text) is not None:
                                    pass
                                else:
                                    match_replace = True
                                    begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
                                    list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                                    p_entity.entity_text = _match["entity_text"]
                                    p_entity.wordOffset_begin = _match["begin_index"]
                                    p_entity.wordOffset_end = _match["end_index"]
                                    p_entity.begin_index = begin_index
                                    p_entity.end_index = end_index
                                    p_entity.if_dict_match = 1
                        # The match covers the entity up to or past its end: take the match
                        # and force the type to company.
                        elif _match["end_index"]>=p_entity.wordOffset_end:
                            match_replace = True
                            begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
                            list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                            p_entity.entity_text = _match["entity_text"]
                            p_entity.wordOffset_begin = _match["begin_index"]
                            p_entity.wordOffset_end = _match["end_index"]
                            p_entity.begin_index = begin_index
                            p_entity.end_index = end_index
                            p_entity.entity_type = "company"
                            p_entity.if_dict_match = 1
                    # The match starts inside the entity and runs past its end.
                    elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
                        find_flag = True
                        if p_entity.entity_type in ("org","company"):
                            match_replace = True
                            begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
                            list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                            p_entity.entity_text = _match["entity_text"]
                            p_entity.wordOffset_begin = _match["begin_index"]
                            p_entity.wordOffset_end = _match["end_index"]
                            p_entity.begin_index = begin_index
                            p_entity.end_index = end_index
                            p_entity.if_dict_match = 1
                # No existing entity touched this match: add it as a new company entity.
                if not find_flag:
                    match_add = True
                    entity_text = _match["entity_text"]
                    entity_type = "company"
                    begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"])
                    entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                    add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,_match["begin_index"],_match["end_index"])
                    list_entity.append(add_entity)
                    range_entity.append(add_entity)
                    list_calibrate.append({"type":"add","from":"","to":entity_text})
        # De-duplicate the calibration records, keyed on the concatenation of "from" and "to".
        set_calibrate = set()
        list_match_enterprise = []
        for _calibrate in list_calibrate:
            _from = _calibrate.get("from","")
            _to = _calibrate.get("to","")
            _key = _from+_to
            if _key not in set_calibrate:
                list_match_enterprise.append(_calibrate)
                set_calibrate.add(_key)
        # 0 = nothing changed, +1 = entities added, +2 = entities replaced.
        match_enterprise_type = 0
        if match_add:
            match_enterprise_type += 1
        if match_replace:
            match_enterprise_type += 2
        _article.match_enterprise = list_match_enterprise
        _article.match_enterprise_type = match_enterprise_type
  274. def isLegalEnterprise(name):
  275. is_legal = True
  276. if re.search("^[省市区县]",name) is not None or re.search("^.{,3}(分(公司|行|支)|街道|中心|办事处|经营部|委员会)$",name) or re.search("标段|标包|名称",name) is not None:
  277. is_legal = False
  278. return is_legal
  279. def fix_LEGAL_ENTERPRISE():
  280. unlegal_enterprise = []
  281. _path = getEnterprisePath()
  282. _sum = 0
  283. set_enter = set()
  284. paths = [_path]
  285. for _p in paths:
  286. with open(_p,"r",encoding="utf8") as f:
  287. while True:
  288. line = f.readline()
  289. if not line:
  290. break
  291. line = line.strip()
  292. if line=="工会委员会":
  293. print(line,isLegalEnterprise(line))
  294. if isLegalEnterprise(line):
  295. set_enter.add(line)
  296. with open("enter.txt","w",encoding="utf8") as fwrite:
  297. for line in list(set_enter):
  298. fwrite.write(line.replace("(","(").replace(")",")"))
  299. fwrite.write("\n")
  300. # if re.search("标段|地址|标包|名称",line) is not None:#\(|\)||
  301. # _count += 1
  302. # print("=",line)
  303. # print("%d/%d"%(_count,_sum))
if __name__=="__main__":
    # Script entry point: only fix_LEGAL_ENTERPRISE() is executed.
    # The commented-out calls are ad-hoc manual checks of the helpers in this module.
    # edit_distance("GUMBO","GAMBOL")
    # print(jaccard_score("周口经济开发区陈营运粮河两岸拆迁工地土工布覆盖项目竞争性谈判公告","周口经济开发区陈营运粮河两岸拆迁工地土工布覆盖项目-成交公告"))
    #
    # sentences = "广州比地数据科技有限公司比地数据科技有限公司1111111123沈阳南光工贸有限公司"
    # print(match_enterprise_max_first(sentences))
    #
    # print("takes %d s"%(time.time()-_time))
    fix_LEGAL_ENTERPRISE()
    # print(jaccard_score("中国南方航空股份有限公司上海分公司","南方航空上海分公司"))
    # print(match_enterprise_max_first("中国南方航空股份有限公司黑龙江分公司"))