entityLink.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402
  1. #coding:UTF8
  2. '''
  3. Created on 2019年5月21日
  4. @author: User
  5. '''
  6. import csv
  7. import re
  8. import os
  9. import time
  10. _time = time.time()
  11. from BiddingKG.dl.common.Utils import *
  12. from BiddingKG.dl.interface.Entitys import *
  13. import json
  14. def edit_distance(source,target):
  15. dp = [["" for i in range(len(source)+1)] for j in range(len(target)+1)]
  16. for i in range(len(dp)):
  17. for j in range(len(dp[i])):
  18. if i==0:
  19. dp[i][j] = j
  20. elif j==0:
  21. dp[i][j] = i
  22. else:
  23. if source[j-1]==target[i-1]:
  24. cost = 0
  25. else:
  26. cost = 2
  27. dp[i][j] = min([dp[i-1][j]+1,dp[i][j-1]+1,dp[i-1][j-1]+cost])
  28. return dp[-1][-1]
  29. def jaccard_score(source,target):
  30. source_set = set([s for s in source])
  31. target_set = set([s for s in target])
  32. if len(source_set)==0 or len(target_set)==0:
  33. return 0
  34. return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
  35. def get_place_list():
  36. with open(os.path.abspath(__file__) + '/../../place_info.csv', 'r') as f:
  37. reader = csv.reader(f)
  38. place_list = []
  39. for r in reader:
  40. place_list.append(r[1:3])
  41. place_list = place_list[1:]
  42. place_list.append(['台湾', '台湾'])
  43. place_list.append(['澳门', '澳门'])
  44. place_list.append(['香港', '香港'])
  45. place_list.append(['東莞', '東莞'])
  46. place_list.append(['廣州', '廣州'])
  47. place_list.append(['韩国', '韩国'])
  48. place_list.append(['德国', '德国'])
  49. place_list.append(['英国', '英国'])
  50. place_list.append(['日本', '日本'])
  51. place_list.append(['意大利', '意大利'])
  52. place_list.append(['新加坡', '新加坡'])
  53. place_list.append(['加拿大', '加拿大'])
  54. place_list.append(['西班牙', '西班牙'])
  55. place_list.append(['澳大利亚', '澳大利亚'])
  56. place_list.append(['美国', '美国'])
  57. # 去重
  58. place_list_str = []
  59. for place in place_list:
  60. place_list_str.append(str(place))
  61. place_list_str = list(set(place_list_str))
  62. place_list = []
  63. for place in place_list_str:
  64. place_list.append(eval(place))
  65. return place_list
  66. place_list = get_place_list()
def link_entitys(list_entitys, on_value=0.8):
    """Link similar org/company entities within each document and normalise
    their entity_text to the best (longest / dictionary-matched) variant.

    :param list_entitys: list of per-document entity lists
    :param on_value: minimum jaccard_score for two names to be linked
    """
    for list_entity in list_entitys:
        range_entity = []
        for _entity in list_entity:
            if _entity.entity_type in ["org", "company"]:
                range_entity.append(_entity)
        # cap the O(n^2) pairwise linking work at 1000 entities per document
        range_entity = range_entity[:1000]
        for first_i in range(len(range_entity)):
            _entity = range_entity[first_i]
            for second_i in range(first_i + 1, len(range_entity)):
                _ent = range_entity[second_i]
                # 2021/5/21 update: skip pairs whose labels are mutually
                # exclusive (one tenderee, one agency) and whose texts differ
                if _entity.entity_text != _ent.entity_text and _entity.label != _ent.label and _entity.label in [0, 1] and _ent.label in [0, 1]:
                    continue
                _score = jaccard_score(_entity.entity_text, _ent.entity_text)
                # link both directions when the names are similar enough
                if _entity.entity_text != _ent.entity_text and _score >= on_value:
                    _entity.linked_entitys.append(_ent)
                    _ent.linked_entitys.append(_entity)
        # Replace company names: if an entity's text lacks "公司" but a
        # linked entity ends with it and is longer, adopt the linked name.
        for _entity in range_entity:
            if re.search("公司", _entity.entity_text) is None:
                for _ent in _entity.linked_entitys:
                    if re.search("公司$", _ent.entity_text) is not None:
                        if len(_ent.entity_text) > len(_entity.entity_text):
                            _entity.entity_text = _ent.entity_text
        # 2021/12/21: replace with the longest dictionary-matched similar
        # entity (linked entities sorted longest-first).
        for _entity in range_entity:
            used_linked_entitys = []
            if not _entity.linked_entitys:
                continue
            _entity.linked_entitys.sort(key=lambda x: len(x.entity_text), reverse=True)
            for _ent in _entity.linked_entitys:
                # NOTE(review): `break` aborts on the first already-used
                # entity; `continue` looks like the intent — confirm.
                if _ent in used_linked_entitys:
                    break
                if _ent.if_dict_match == 1:
                    if len(_ent.entity_text) > len(_entity.entity_text):
                        # only replace when both names mention exactly the
                        # same places, in the same order
                        match_list_1, match_list_2 = [], []
                        for place in place_list:
                            if place[0] in _entity.entity_text:
                                match_list_1.append(place[0])
                            if place[0] in _ent.entity_text:
                                match_list_2.append(place[0])
                        if str(match_list_1) == str(match_list_2):
                            _entity.origin_entity_text = _entity.entity_text
                            _entity.entity_text = _ent.entity_text
                used_linked_entitys.append(_ent)
  117. def getEnterprisePath():
  118. filename = "../LEGAL_ENTERPRISE.txt"
  119. real_path = getFileFromSysPath(filename)
  120. if real_path is None:
  121. real_path = filename
  122. return real_path
# Enterprise-name dictionary: maps a 4-character name prefix to the set of
# suffixes seen after it (populated by getDict_enterprise).
DICT_ENTERPRISE = {}
# Flipped to True once the background load completes; polled by
# match_enterprise_max_first before it starts matching.
DICT_ENTERPRISE_DONE = False
def getDict_enterprise():
    """Load the enterprise-name dictionary from the LEGAL_ENTERPRISE file.

    Every name of length >= 4 is split into a 4-character prefix key and the
    remaining suffix; DICT_ENTERPRISE maps prefix -> set of suffixes. Sets
    DICT_ENTERPRISE_DONE when finished. Intended to run in a background
    thread (see load_enterprise_thread below).

    :return: the populated DICT_ENTERPRISE mapping
    """
    global DICT_ENTERPRISE, DICT_ENTERPRISE_DONE
    real_path = getEnterprisePath()
    with open(real_path, "r", encoding="UTF8") as f:
        for _e in f:
            if not _e:
                continue
            _e = _e.strip()
            if len(_e) >= 4:
                key_enter = _e[:4]
                if key_enter not in DICT_ENTERPRISE:
                    DICT_ENTERPRISE[key_enter] = set()
                DICT_ENTERPRISE[key_enter].add(_e[4:])
    # signal waiters (the matcher busy-waits on this flag)
    DICT_ENTERPRISE_DONE = True
    return DICT_ENTERPRISE
import threading
import time  # NOTE: time is already imported at the top of the file

# Load the (large) enterprise dictionary in the background so module import
# is not blocked; match_enterprise_max_first waits on DICT_ENTERPRISE_DONE.
load_enterprise_thread = threading.Thread(target=getDict_enterprise)
load_enterprise_thread.start()
# Maximum total length (prefix + suffix) of an enterprise name considered
# by the matcher.
MAX_ENTERPRISE_LEN = 30
def match_enterprise_max_first(sentence):
    """Greedy longest-first dictionary match of enterprise names in a sentence.

    Blocks (polling once a second) until the background dictionary load has
    finished. Scans left to right: at each position the 4-character prefix is
    looked up in DICT_ENTERPRISE, and candidate suffixes are tried from the
    longest possible (MAX_ENTERPRISE_LEN total) down to the shortest, so the
    longest dictionary name wins at each position.

    :param sentence: text to scan
    :return: list of {"entity_text", "begin_index", "end_index"} dicts, with
             indices being character offsets into `sentence`
    """
    # busy-wait for the loader thread (see load_enterprise_thread)
    while True:
        if not DICT_ENTERPRISE_DONE:
            time.sleep(1)
        else:
            break
    list_match = []
    begin_index = 0
    if len(sentence) > 4:
        while True:
            if begin_index + 4 < len(sentence):
                key_enter = sentence[begin_index:begin_index + 4]
                if key_enter in DICT_ENTERPRISE:
                    # try suffix lengths from longest to shortest
                    for _i in range(MAX_ENTERPRISE_LEN - 4 + 1):
                        enter_name = sentence[begin_index + 4:begin_index + MAX_ENTERPRISE_LEN - _i]
                        if enter_name in DICT_ENTERPRISE[key_enter]:
                            match_item = {"entity_text": "%s%s" % (key_enter, enter_name),
                                          "begin_index": begin_index,
                                          "end_index": begin_index + len(key_enter) + len(enter_name)}
                            list_match.append(match_item)
                            # jump past the match; -1 compensates the
                            # unconditional +1 below
                            begin_index += (len(key_enter) + len(enter_name)) - 1
                            break
                begin_index += 1
            else:
                break
    return list_match
def calibrateEnterprise(list_articles, list_sentences, list_entitys):
    """Calibrate extracted org/company entities against the enterprise dictionary.

    For each article, runs the longest-first dictionary matcher over every
    sentence and reconciles the matches with already-extracted entities:
    overlapping entities are renamed/re-typed to the dictionary form, and
    matches with no overlapping entity are appended as new "company"
    entities. Records the changes on the article as ``match_enterprise``
    (de-duplicated change log) and ``match_enterprise_type`` (bitmask:
    1 = entities added, 2 = entities replaced).
    """
    for _article, list_sentence, list_entity in zip(list_articles, list_sentences, list_entitys):
        list_calibrate = []
        match_add = False
        match_replace = False
        range_entity = []
        # consider at most ~1000 org/company/location entities per article
        for p_entity in list_entity:
            if p_entity.entity_type in ("org", "company", "location"):
                range_entity.append(p_entity)
            if len(range_entity) > 1000:
                break
        for p_sentence in list_sentence:
            sentence = p_sentence.sentence_text
            list_match = match_enterprise_max_first(sentence)
            doc_id = p_sentence.doc_id
            sentence_index = p_sentence.sentence_index
            tokens = p_sentence.tokens
            list_match.sort(key=lambda x: x["begin_index"])
            for _match_index in range(len(list_match)):
                _match = list_match[_match_index]
                find_flag = False
                for p_entity in range_entity:
                    if p_entity.sentence_index != p_sentence.sentence_index:
                        continue
                    # a location whose text equals the matched name is
                    # promoted to a dictionary-matched company
                    if p_entity.entity_type == "location" and p_entity.entity_text == _match["entity_text"]:
                        find_flag = True
                        p_entity.entity_type = "company"
                        p_entity.if_dict_match = 1
                    if p_entity.entity_type not in ["location", "org", "company"]:
                        continue
                    if _match["entity_text"] == p_entity.entity_text:
                        p_entity.if_dict_match = 1
                    # Overlap handling; wordOffset_* are char offsets.
                    # Case 1: the match is fully contained in the entity span.
                    if _match["begin_index"] >= p_entity.wordOffset_begin and _match["end_index"] <= p_entity.wordOffset_end:
                        find_flag = True
                        # find how many consecutive matches the entity span
                        # covers (i.e. several companies glued together)
                        for _match_j in range(_match_index, len(list_match)):
                            if not list_match[_match_j]["end_index"] <= p_entity.wordOffset_end:
                                _match_j -= 1
                                break
                        if _match_j > _match_index:
                            match_replace = True
                            match_add = True
                            begin_index = changeIndexFromWordToWords(tokens, _match["begin_index"])
                            end_index = changeIndexFromWordToWords(tokens, _match["end_index"])
                            list_calibrate.append({"type": "update", "from": p_entity.entity_text, "to": _match["entity_text"]})
                            # shrink the entity to the first dictionary match
                            p_entity.entity_text = _match["entity_text"]
                            p_entity.wordOffset_begin = _match["begin_index"]
                            p_entity.wordOffset_end = _match["end_index"]
                            p_entity.begin_index = begin_index
                            p_entity.end_index = end_index
                            # this entity is now dictionary-identified
                            p_entity.if_dict_match = 1
                            # append the remaining contained matches as new entities
                            for _match_h in range(_match_index + 1, _match_j + 1):
                                entity_text = list_match[_match_h]["entity_text"]
                                entity_type = "company"
                                begin_index = changeIndexFromWordToWords(tokens, list_match[_match_h]["begin_index"])
                                end_index = changeIndexFromWordToWords(tokens, list_match[_match_h]["end_index"])
                                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                                add_entity = Entity(p_sentence.doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index, list_match[_match_h]["begin_index"], list_match[_match_h]["end_index"])
                                add_entity.if_dict_match = 1
                                list_entity.append(add_entity)
                                range_entity.append(add_entity)
                                list_calibrate.append({"type": "add", "from": "", "to": entity_text})
                            # NOTE(review): rebinding the `for` variable does
                            # NOT skip the consumed matches on the next pass of
                            # range(len(list_match)) — confirm duplicates are
                            # harmless downstream.
                            _match_index = _match_j
                            break
                        continue
                    # Case 2: the match starts at/before the entity and
                    # overlaps its head.
                    elif _match["begin_index"] <= p_entity.wordOffset_begin and _match["end_index"] > p_entity.wordOffset_begin:
                        find_flag = True
                        if _match["begin_index"] < p_entity.wordOffset_begin and _match["end_index"] <= p_entity.wordOffset_end:
                            if p_entity.entity_type in ("org", "company"):
                                # NOTE(review): with end_index <= wordOffset_end
                                # this slice is always empty, so the "分"
                                # (branch-company) guard below can never fire —
                                # confirm intended slice bounds.
                                _diff_text = sentence[p_entity.wordOffset_end:_match["end_index"]]
                                if re.search("分", _diff_text) is not None:
                                    pass
                                else:
                                    match_replace = True
                                    begin_index = changeIndexFromWordToWords(tokens, _match["begin_index"])
                                    end_index = changeIndexFromWordToWords(tokens, _match["end_index"])
                                    list_calibrate.append({"type": "update", "from": p_entity.entity_text, "to": _match["entity_text"]})
                                    p_entity.entity_text = _match["entity_text"]
                                    p_entity.wordOffset_begin = _match["begin_index"]
                                    p_entity.wordOffset_end = _match["end_index"]
                                    p_entity.begin_index = begin_index
                                    p_entity.end_index = end_index
                                    p_entity.if_dict_match = 1
                        elif _match["end_index"] >= p_entity.wordOffset_end:
                            # the match covers the whole entity: replace and
                            # force the type to company
                            match_replace = True
                            begin_index = changeIndexFromWordToWords(tokens, _match["begin_index"])
                            end_index = changeIndexFromWordToWords(tokens, _match["end_index"])
                            list_calibrate.append({"type": "update", "from": p_entity.entity_text, "to": _match["entity_text"]})
                            p_entity.entity_text = _match["entity_text"]
                            p_entity.wordOffset_begin = _match["begin_index"]
                            p_entity.wordOffset_end = _match["end_index"]
                            p_entity.begin_index = begin_index
                            p_entity.end_index = end_index
                            p_entity.entity_type = "company"
                            p_entity.if_dict_match = 1
                    # Case 3: the match starts inside the entity and extends
                    # past its tail.
                    elif _match["begin_index"] < p_entity.wordOffset_end and _match["end_index"] > p_entity.wordOffset_end:
                        find_flag = True
                        if p_entity.entity_type in ("org", "company"):
                            match_replace = True
                            begin_index = changeIndexFromWordToWords(tokens, _match["begin_index"])
                            end_index = changeIndexFromWordToWords(tokens, _match["end_index"])
                            list_calibrate.append({"type": "update", "from": p_entity.entity_text, "to": _match["entity_text"]})
                            p_entity.entity_text = _match["entity_text"]
                            p_entity.wordOffset_begin = _match["begin_index"]
                            p_entity.wordOffset_end = _match["end_index"]
                            p_entity.begin_index = begin_index
                            p_entity.end_index = end_index
                            p_entity.if_dict_match = 1
                if not find_flag:
                    # no overlapping entity: add the match as a new company
                    match_add = True
                    entity_text = _match["entity_text"]
                    entity_type = "company"
                    begin_index = changeIndexFromWordToWords(tokens, _match["begin_index"])
                    end_index = changeIndexFromWordToWords(tokens, _match["end_index"])
                    entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                    add_entity = Entity(p_sentence.doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index, _match["begin_index"], _match["end_index"])
                    list_entity.append(add_entity)
                    range_entity.append(add_entity)
                    list_calibrate.append({"type": "add", "from": "", "to": entity_text})
        # de-duplicate the change log by the (from, to) text pair
        set_calibrate = set()
        list_match_enterprise = []
        for _calibrate in list_calibrate:
            _from = _calibrate.get("from", "")
            _to = _calibrate.get("to", "")
            _key = _from + _to
            if _key not in set_calibrate:
                list_match_enterprise.append(_calibrate)
                set_calibrate.add(_key)
        # bitmask: 1 = entities added, 2 = entities replaced
        match_enterprise_type = 0
        if match_add:
            match_enterprise_type += 1
        if match_replace:
            match_enterprise_type += 2
        _article.match_enterprise = list_match_enterprise
        _article.match_enterprise_type = match_enterprise_type
  318. def isLegalEnterprise(name):
  319. is_legal = True
  320. if re.search("^[省市区县]",name) is not None or re.search("^.{,3}(分(公司|行|支)|街道|中心|办事处|经营部)$",name) or re.search("标段|标包|名称",name) is not None:
  321. is_legal = False
  322. return is_legal
def fix_LEGAL_ENTERPRISE():
    """One-off maintenance script: merge and clean the enterprise name lists.

    Reads the main LEGAL_ENTERPRISE file plus a local "enterprise_name.txt",
    keeps only names accepted by isLegalEnterprise, normalises ASCII
    parentheses to full-width ones, and writes the de-duplicated result to
    "enter.txt" in the working directory.

    NOTE(review): `unlegal_enterprise` and `_sum` are assigned but never
    used; "enterprise_name.txt" must exist in the working directory or
    open() raises.
    """
    unlegal_enterprise = []
    _path = getEnterprisePath()
    _sum = 0
    set_enter = set()
    paths = [_path, "enterprise_name.txt"]
    for _p in paths:
        with open(_p, "r", encoding="utf8") as f:
            while True:
                line = f.readline()
                if not line:
                    break
                line = line.strip()
                if isLegalEnterprise(line):
                    set_enter.add(line)
    with open("enter.txt", "w", encoding="utf8") as fwrite:
        for line in list(set_enter):
            # normalise half-width parentheses to full-width
            fwrite.write(line.replace("(", "(").replace(")", ")"))
            fwrite.write("\n")
if __name__ == "__main__":
    # Ad-hoc development entry point: rebuild the cleaned enterprise list
    # (see fix_LEGAL_ENTERPRISE).
    fix_LEGAL_ENTERPRISE()