entityLink.py 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589
  #coding:UTF8
  '''
  Created on 2019-05-21
  @author: User
  '''
  6. import re
  7. import os
  8. import time
  9. import pandas as pd
  10. _time = time.time()
  11. from BiddingKG.dl.common.Utils import *
  12. from BiddingKG.dl.interface.Entitys import *
  13. import json
  14. from BiddingKG.dl.common.constDict import ConstDict
  def edit_distance(source, target):
      """Return a weighted edit distance between ``source`` and ``target``.

      Insertions and deletions cost 1; a substitution costs 2 (the same as one
      deletion plus one insertion), so two sequences that share nothing are
      ``len(source) + len(target)`` apart.

      :param source: first sequence (a string or any indexable sequence)
      :param target: second sequence
      :return: non-negative integer distance, 0 when both inputs are equal
      """
      # Only the previous DP row is ever read, so keep one rolling row instead
      # of the full (len(target)+1) x (len(source)+1) matrix.
      previous = list(range(len(source) + 1))
      for i, target_item in enumerate(target, start=1):
          current = [i]
          for j, source_item in enumerate(source, start=1):
              cost = 0 if source_item == target_item else 2
              current.append(min(previous[j] + 1,
                                 current[j - 1] + 1,
                                 previous[j - 1] + cost))
          previous = current
      return previous[-1]
  def jaccard_score(source, target):
      """Score the character overlap of two strings in the range [0, 1].

      Despite the name this is not the classic Jaccard index: the size of the
      intersection is divided by the size of the *smaller* character set, so a
      string whose characters are all contained in the other scores 1.0.

      :param source: first string (any iterable of hashable items works)
      :param target: second string
      :return: 0 when either input is empty, otherwise a float in [0, 1]
      """
      source_set = set(source)
      target_set = set(target)
      if not source_set or not target_set:
          return 0
      common = len(source_set & target_set)
      # max(common/a, common/b) is the same value as common/min(a, b).
      return common / min(len(source_set), len(target_set))
  def get_place_list(path=None):
      """Load the list of known place names.

      Reads the second column of ``place_info.csv`` and adds 台湾, 澳门 and 香港.
      Duplicates are removed; names are returned in first-seen order so the
      result is stable between runs.

      Other names (東莞, 廣州, 韩国, 德国, 英国, 日本, 意大利, 新加坡, 加拿大,
      西班牙, 澳大利亚, 美国) were present only as disabled lines in the previous
      version and are still not added.

      :param path: CSV file to read; defaults to ``../place_info.csv`` relative
          to this module
      :return: list of unique place-name values
      """
      if path is None:
          path = os.path.dirname(__file__) + '/../place_info.csv'
      place_df = pd.read_csv(path)
      # Select the second column by position explicitly: integer keys on a
      # label-indexed row (``row[1]``) are deprecated in recent pandas.
      names = place_df.iloc[:, 1].tolist()
      names.extend(['台湾', '澳门', '香港'])
      return list(dict.fromkeys(names))
  # Loaded once at import time; link_entitys() scans this list to compare the
  # place names contained in two entity texts.
  place_list = get_place_list()
  # Regex alternation of every place name. Within this file it is referenced
  # only by the disabled fuzzy-linking code in link_entitys().
  place_pattern = "|".join(place_list)
  def is_short(shorter_cut, longer):
      """Check whether ``shorter_cut`` is an abbreviation of ``longer``.

      Every item of ``shorter_cut`` (each character when it is a string) must
      occur in ``longer`` in the same order; after an item is found, the search
      continues only in the text that follows it.

      :param shorter_cut: the abbreviation (string or iterable of strings)
      :param longer: the full name (string)
      :return: 1 if all items are found in order, otherwise 0
      """
      remaining = longer
      for words in shorter_cut:
          position = remaining.find(words)
          if position < 0:
              return 0
          remaining = remaining[position + len(words):]
      return 1
  # link_entitys(list_entitys, on_value=1)
  # Purpose: for each entity list, replace the text of short or abbreviated
  # org/company entities with the text of a longer entity judged to be the same.
  # Works in place on the entity objects; no return statement is visible.
  #   list_entitys: iterable of entity lists (one list is handled per outer loop pass).
  #   on_value:     only referenced by the disabled (commented-out) fuzzy-linking code below.
  # NOTE(review): this listing has lost its indentation and carries a line-number
  # gutter ("79. ", "80. ", ...). The nesting described in these comments is inferred
  # from statement order; in particular it is unclear whether "n += 1 / if n > 1000: break"
  # is inside the org/company test or directly in the loop body - confirm in the real file.
  79. def link_entitys(list_entitys,on_value=1):#on_value=0.81
  80. for list_entity in list_entitys:
  # range_entity: org/company entities; short_entity: those with 4-6 character text;
  # long_entity: those with more than 6 characters.
  81. range_entity = []
  82. short_entity = []
  83. long_entity = []
  84. n = 0
  85. for _entity in list_entity:
  86. if _entity.entity_type in ["org","company"]:
  87. range_entity.append(_entity)
  88. if len(_entity.entity_text) in [4, 5, 6]:
  89. short_entity.append(_entity)
  90. if len(_entity.entity_text)>6:
  91. long_entity.append(_entity)
  # Counter-based cap: collection stops once n exceeds 1000.
  92. n += 1
  93. if n > 1000:
  94. break
  95. range_entity = range_entity[:1000]
  # (EN) The comment on the next line reads: "the company-replacement logic is flawed;
  # disabled for now". Everything down to original line 115 is commented-out code that
  # linked entities by jaccard_score and then replaced names lacking "公司".
  # The inner comment dated 2021/5/21 reads: "when two entities have mutually exclusive
  # labels (one tenderee, one agent) and different entity_text, skip".
  96. #替换公司的逻辑有问题,先取消
  97. # for first_i in range(len(range_entity)):
  98. # _entity = range_entity[first_i]
  99. # for second_i in range(first_i+1,len(range_entity)):
  100. # _ent = range_entity[second_i]
  101. # # 2021/5/21 update: 两个实体标签互斥(一个是招标人、一个是代理人)且entity_text不相等时,跳过
  102. # if _entity.entity_text != _ent.entity_text and _entity.label != _ent.label and _entity.label in [0,1] and _ent.label in [0, 1]:
  103. # continue
  104. # _score = jaccard_score(re.sub("%s|%s"%("股份|责任|有限|公司",place_pattern),"",_entity.entity_text), re.sub("%s|%s"%("股份|责任|有限|公司",place_pattern),"",_ent.entity_text))
  105. # if _entity.entity_text!=_ent.entity_text and _score>=on_value:
  106. # _entity.linked_entitys.append(_ent)
  107. # _ent.linked_entitys.append(_entity)
  108. # print("=-===",_entity.entity_text,_ent.entity_text,_score)
  109. # #替换公司名称
  110. # for _entity in range_entity:
  111. # if re.search("公司",_entity.entity_text) is None:
  112. # for _ent in _entity.linked_entitys:
  113. # if re.search("公司$",_ent.entity_text) is not None:
  114. # if len(_ent.entity_text)>len(_entity.entity_text):
  115. # _entity.entity_text = _ent.entity_text
  # Step 1: expand short names. Runs only when both a short and a long entity exist.
  116. if short_entity and long_entity:
  117. for first_i in range(len(short_entity)):
  118. _entity = short_entity[first_i]
  # (EN) trailing comment: "names that exist in the enterprise table are not replaced".
  119. if is_enterprise_exist(_entity.entity_text): # 实体表存在的不替换
  120. continue
  # Only label-0 entities whose text contains none of the listed hospital/school words
  # are candidates (label semantics are defined elsewhere - presumably 0 = tenderee).
  121. if _entity.label == 0 and re.search('(医院|学院|学校|中学|小学|大学|幼儿园|保健院|党校)', _entity.entity_text)==None:
  122. ree_l = []
  123. other_l = []
  124. for second_i in range(len(long_entity)):
  125. _ent = long_entity[second_i]
  # Candidate long names: label in 0/1/5 and is_short() accepts the short text as an
  # in-order abbreviation. Labels 0/1 go to ree_l, label 5 to other_l.
  126. if _ent.label in [0,1,5] and is_short(_entity.entity_text, _ent.entity_text):
  127. if _ent.label in [0 ,1]:
  128. ree_l.append(_ent)
  129. elif _ent.label in [5]:
  130. other_l.append(_ent)
  # ree_l candidates are visited before other_l. There is no break in this loop, so when
  # several candidates qualify the text of the last qualifying one is kept.
  131. for _ent in ree_l + other_l:
  132. if is_enterprise_exist(_ent.entity_text) or re.search('有限(责任)?公司', _ent.entity_text):
  133. _entity.entity_text = _ent.entity_text
  # (EN) next comment: "2021/12/21 - replace with the longest similar entity that was
  # recognised through the dictionary".
  134. # 2021/12/21 替换通过字典识别到的取长度最大的相似实体
  # Step 2: for every org/company entity that has linked entities, try them longest first.
  # linked_entitys is only appended to by the disabled code above within this file;
  # presumably it is populated elsewhere.
  135. for _entity in range_entity:
  136. used_linked_entitys = []
  137. if not _entity.linked_entitys:
  138. continue
  139. _entity.linked_entitys.sort(key=lambda x: len(x.entity_text), reverse=True)
  140. for _ent in _entity.linked_entitys:
  141. if _ent in used_linked_entitys:
  142. break
  143. # print("_entity, _ent", _entity.entity_text, _ent.if_dict_match, _ent.entity_text)
  144. if _ent.if_dict_match == 1:
  145. if len(_ent.entity_text) > len(_entity.entity_text):
  # (EN) next comment: "check that both companies are in the same region". The check
  # compares the lists of place names (from place_list) contained in each text.
  146. # 判断两个公司地区相同
  147. match_list_1, match_list_2 = [], []
  148. for place in place_list:
  149. if place in _entity.entity_text:
  150. match_list_1.append(place)
  151. if place in _ent.entity_text:
  152. match_list_2.append(place)
  153. if str(match_list_1) == str(match_list_2):
  # (EN) the disabled print on the next line is labelled "dictionary replacement".
  # The original text is kept in origin_entity_text before being overwritten.
  154. # print("字典替换", _entity.entity_text, "->", _ent.entity_text)
  155. _entity.origin_entity_text = _entity.entity_text
  156. _entity.entity_text = _ent.entity_text
  157. used_linked_entitys.append(_ent)
  158. # print(_entity.entity_text, _entity.if_dict_match, _ent.entity_text, _ent.if_dict_match)
  # Title used for de-duplication.
  def doctitle_refine(doctitle):
      """Strip generic bidding vocabulary from a document title.

      Removes every occurrence of the listed procurement/announcement words
      (e.g. 工程, 招标, 公告, 中标) so that titles differing only in such
      boilerplate reduce to the same string.

      :param doctitle: original title string
      :return: the title with all listed words removed
      """
      return re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|'
                    r'交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|竞价|合同', '', doctitle)
  # First 100 company entities.
  def get_nlp_enterprise(list_entity):
      """Collect the distinct org/company names of a document in reading order.

      Entities are ordered by ``(sentence_index, begin_index)``. Names found in
      the main text and names found in attachments (``in_attachment`` truthy)
      are collected separately, each without duplicates.

      :param list_entity: iterable of entity objects exposing ``entity_type``,
          ``entity_text``, ``in_attachment``, ``sentence_index``, ``begin_index``
      :return: ``(body_names, attachment_names)``, each capped at 100 names
      """
      max_num = 100
      nlp_enterprise = []
      nlp_enterprise_attachment = []
      # Sets give O(1) duplicate checks; the lists keep first-seen order.
      seen_body = set()
      seen_attachment = set()
      ordered = sorted(list_entity, key=lambda x: (x.sentence_index, x.begin_index))
      for entity in ordered:
          if entity.entity_type not in ('org', 'company'):
              continue
          if entity.in_attachment:
              names, seen = nlp_enterprise_attachment, seen_attachment
          else:
              names, seen = nlp_enterprise, seen_body
          if entity.entity_text not in seen:
              seen.add(entity.entity_text)
              names.append(entity.entity_text)
      return nlp_enterprise[:max_num], nlp_enterprise_attachment[:max_num]
  # None until getEnterprisePath() has run; then True when the huge dictionary
  # file is in use and False when the regular one is.
  ENTERPRISE_HUGE = None


  def getEnterprisePath():
      """Locate the enterprise dictionary file and record which variant is used.

      Preference order:
        1. ``LEGAL_ENTERPRISE_HUGE.txt`` as resolved by ``getFileFromSysPath``
           (defined elsewhere; presumably searches ``sys.path``),
        2. ``LEGAL_ENTERPRISE_HUGE.txt`` in the current working directory,
        3. ``LEGAL_ENTERPRISE.txt`` as resolved by ``getFileFromSysPath``, or
           the bare file name when that returns ``None``.

      Side effect: sets the module-level ``ENTERPRISE_HUGE`` flag.

      :return: ``(path, is_huge)`` tuple
      """
      global ENTERPRISE_HUGE
      filename_huge = "LEGAL_ENTERPRISE_HUGE.txt"
      huge_path = getFileFromSysPath(filename_huge)
      if huge_path is None and os.path.exists(filename_huge):
          huge_path = filename_huge
      if huge_path is not None:
          log("enterprise path:%s"%(huge_path))
          ENTERPRISE_HUGE = True
          return huge_path,ENTERPRISE_HUGE

      filename = "LEGAL_ENTERPRISE.txt"
      real_path = getFileFromSysPath(filename)
      if real_path is None:
          # Fall back to the bare name; existence is not checked here.
          real_path = filename
      log("ENTERPRISE path:%s"%(real_path))
      ENTERPRISE_HUGE = False
      return real_path,ENTERPRISE_HUGE
  # Set to True by getDict_enterprise() when loading has finished;
  # match_enterprise_max_first() waits on this flag.
  DICT_ENTERPRISE_DONE = False
  # Redis connector pool, created on first use by init_redis_pool().
  POOL_REDIS = None
  # Number of leading characters used as the lookup key for a candidate name.
  ENTERPRISE_KEY_LEN = 3
  ENTERPRISE_PREFIX_LEN = 3
  # Number of trailing characters checked against SET_TAIL_ENTERPRISE.
  ENTERPRISE_TAIL_LEN = 3
  # Full names (filled only when the regular, non-huge dictionary is used).
  SET_ENTERPRISE = set()
  # Prefixes / suffixes of every dictionary name, used as cheap pre-filters.
  SET_PREFIX_ENTERPRISE = set()
  SET_TAIL_ENTERPRISE = set()
  # Cache files for the prefix / suffix sets of the huge dictionary.
  SET_PREFIX_ENTERPRISE_HUGE_FILE = "SET_PREFIX_ENTERPRISE_HUGE.pk"
  SET_TAIL_ENTERPRISE_HUGE_FILE = "SET_TAIL_ENTERPRISE_HUGE.pk"
  # getDict_enterprise()
  # Purpose: populate the module-level SET_PREFIX_ENTERPRISE / SET_TAIL_ENTERPRISE
  # (and, for the non-huge dictionary, SET_ENTERPRISE) and finally set
  # DICT_ENTERPRISE_DONE = True. Takes no arguments; no return statement is visible.
  # NOTE(review): this listing has lost its indentation and carries a line-number gutter.
  # It cannot be told from here whether "if not is_huge: SET_ENTERPRISE.add(_e)" sits inside
  # the "len(_e)>=4" guard, nor whether the save() calls are inside "if not _ok" - confirm
  # in the real file before restructuring.
  # NOTE(review): DICT_ENTERPRISE_DONE is assigned only on the last line. If anything above
  # raises (e.g. the dictionary file cannot be opened), the flag stays False and
  # match_enterprise_max_first() keeps sleeping in its wait loop.
  # NOTE(review): "sys" is used in the log lines but is not imported explicitly in the
  # visible part of this file - presumably it arrives through one of the star imports.
  210. def getDict_enterprise():
  211. global DICT_ENTERPRISE_DONE,SET_ENTERPRISE,SET_PREFIX_ENTERPRISE,SET_TAIL_ENTERPRISE
  212. real_path,is_huge = getEnterprisePath()
  # _ok records whether the prefix/tail sets were restored from the cache files.
  213. _ok = False
  214. if is_huge:
  215. if os.path.exists(SET_PREFIX_ENTERPRISE_HUGE_FILE) and os.path.exists(SET_TAIL_ENTERPRISE_HUGE_FILE):
  # load() is defined elsewhere; presumably it unpickles the ".pk" file - only use it on
  # cache files this process wrote itself.
  216. SET_PREFIX_ENTERPRISE = load(SET_PREFIX_ENTERPRISE_HUGE_FILE)
  217. SET_TAIL_ENTERPRISE = load(SET_TAIL_ENTERPRISE_HUGE_FILE)
  218. _ok = True
  # No usable cache: read the dictionary file, one enterprise name per line.
  219. if not _ok:
  220. with open(real_path,"r",encoding="UTF8") as f:
  221. for _e in f:
  222. if not _e:
  223. continue
  224. _e = _e.strip()
  # Names shorter than 4 characters contribute no prefix/tail entry.
  225. if len(_e)>=4:
  226. key_enter = _e[:ENTERPRISE_KEY_LEN]
  227. SET_PREFIX_ENTERPRISE.add(key_enter)
  228. SET_TAIL_ENTERPRISE.add(_e[-ENTERPRISE_TAIL_LEN:])
  # Full names are kept in memory only for the regular dictionary; with the huge one,
  # is_enterprise_exist() queries redis instead.
  229. if not is_huge:
  230. SET_ENTERPRISE.add(_e)
  # (EN) next comment: "the cache is only used when the huge file is in use".
  231. #仅在大文件情况下才使用缓存加载
  232. if is_huge:
  233. save(SET_PREFIX_ENTERPRISE,SET_PREFIX_ENTERPRISE_HUGE_FILE)
  234. save(SET_TAIL_ENTERPRISE,SET_TAIL_ENTERPRISE_HUGE_FILE)
  # sys.getsizeof reports the size of the set object itself, not of the strings it holds.
  235. log("SET_PREFIX_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_PREFIX_ENTERPRISE)/1024/1024,len(SET_PREFIX_ENTERPRISE)))
  236. log("SET_TAIL_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_TAIL_ENTERPRISE)/1024/1024,len(SET_TAIL_ENTERPRISE)))
  237. log("SET_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_ENTERPRISE)/1024/1024,len(SET_ENTERPRISE)))
  # The commented-out lines below are leftover test code for an earlier DICT_ENTERPRISE
  # structure that is no longer defined in this file.
  238. # for _e in ["河南省柘源","建筑工程有限公司"]:
  239. # if not _e:
  240. # continue
  241. # _e = _e.strip()
  242. # if len(_e)>=4:
  243. # key_enter = _e[:4]
  244. # if key_enter not in DICT_ENTERPRISE:
  245. # DICT_ENTERPRISE[key_enter] = set()
  246. # DICT_ENTERPRISE[key_enter].add(_e[4:])
  # Signal waiting callers that the sets are ready.
  247. DICT_ENTERPRISE_DONE = True
  def init_redis_pool():
      """Create the module-level redis connector pool if it does not exist yet.

      Side effect: assigns ``POOL_REDIS``. Calling it again once the pool
      exists does nothing.
      """
      # Imported here rather than at module level, so the redis-related modules
      # are only loaded when this function is first called.
      from BiddingKG.dl.common.pool import ConnectorPool
      from BiddingKG.dl.common.source import getConnect_redis_baseline
      global POOL_REDIS
      if POOL_REDIS is None:
          POOL_REDIS = ConnectorPool(init_num=1,max_num=10,method_init=getConnect_redis_baseline)
  def is_enterprise_exist(enterprise_name):
      """Tell whether ``enterprise_name`` is a known enterprise.

      With the huge dictionary (``ENTERPRISE_HUGE`` truthy) the name is looked
      up as a redis key; otherwise it is tested against the in-memory
      ``SET_ENTERPRISE``. While ``ENTERPRISE_HUGE`` is still ``None`` (before
      ``getEnterprisePath`` has run) the in-memory set is used.

      :param enterprise_name: full enterprise name
      :return: True when found; False when missing or when the redis lookup
          raises (the traceback is printed, the error is not propagated)
      """
      if not ENTERPRISE_HUGE:
          return enterprise_name in SET_ENTERPRISE

      if POOL_REDIS is None:
          init_redis_pool()
      _db = POOL_REDIS.getConnector()
      try:
          started = time.time()
          _v = _db.get(enterprise_name)
          # NOTE(review): the connector is handed back only on success; when
          # get() raises it is not returned to the pool. Presumably intentional
          # (a failed connection is dropped) - confirm against ConnectorPool.
          POOL_REDIS.putConnector(_db)
          if _v is None:
              return False
          log("redis take %.5f of '%s' exists"%(time.time()-started,enterprise_name))
          return True
      except Exception:
          # Imported locally: this file does not import traceback explicitly.
          import traceback
          traceback.print_exc()
          return False
  import threading
  import time

  # Import-time side effect: load the enterprise dictionary in a background
  # thread so that importing this module does not block on the file read.
  # The thread's daemon flag is not set explicitly.
  load_enterprise_thread = threading.Thread(target=getDict_enterprise)
  load_enterprise_thread.start()

  # Upper bound used when sizing candidate names in match_enterprise_max_first().
  MAX_ENTERPRISE_LEN = 30
  def match_enterprise_max_first(sentence):
      """Find dictionary enterprise names in ``sentence``, longest match first.

      Scans left to right. At each position whose next ``ENTERPRISE_KEY_LEN``
      characters are a known prefix, candidate substrings are tried from the
      longest to the shortest; the first one whose tail is a known suffix and
      which ``is_enterprise_exist`` confirms is recorded, and scanning resumes
      right after it.

      Blocks (polling once per second) until ``DICT_ENTERPRISE_DONE`` is set by
      the background loader.

      :param sentence: text to scan; sentences of 4 characters or fewer yield
          no matches
      :return: list of ``{"entity_text", "begin_index", "end_index"}`` dicts in
          order of appearance; ``end_index`` is exclusive
      """
      while not DICT_ENTERPRISE_DONE:
          time.sleep(1)

      list_match = []
      sentence_len = len(sentence)
      if sentence_len <= 4:
          return list_match

      begin_index = 0
      while begin_index + ENTERPRISE_KEY_LEN < sentence_len:
          key_enter = sentence[begin_index:begin_index + ENTERPRISE_KEY_LEN]
          if key_enter in SET_PREFIX_ENTERPRISE:
              # Longest candidate considered at this position.
              # NOTE(review): this evaluates to MAX_ENTERPRISE_LEN - 2, not
              # MAX_ENTERPRISE_LEN; kept as in the original.
              _len = min(MAX_ENTERPRISE_LEN - ENTERPRISE_KEY_LEN + 1, sentence_len - begin_index)
              for end_index in range(begin_index + _len, begin_index, -1):
                  enter_name = sentence[begin_index:end_index]
                  # The cheap suffix test runs before the (possibly remote) lookup.
                  if enter_name[-ENTERPRISE_TAIL_LEN:] not in SET_TAIL_ENTERPRISE:
                      continue
                  if is_enterprise_exist(enter_name):
                      match_item = {"entity_text":"%s"%(enter_name),"begin_index":begin_index,"end_index":begin_index+len(enter_name)}
                      list_match.append(match_item)
                      # Jump to the last character of the match; the += 1
                      # below then moves just past it.
                      begin_index += len(enter_name) - 1
                      break
          begin_index += 1
      return list_match
  # calibrateEnterprise(list_articles, list_sentences, list_entitys)
  # Purpose: reconcile the entities of each article with the enterprise names that
  # match_enterprise_max_first() finds in each sentence by dictionary lookup.
  # The three arguments are iterated in lockstep with zip(): one article, its sentence list
  # and its entity list per pass.
  # Side effects: entity objects are modified in place, new Entity objects are appended to
  # list_entity, and _article.match_enterprise / _article.match_enterprise_type are set.
  # No return statement is visible.
  # NOTE(review): this listing has lost its indentation and carries a line-number gutter.
  # The nesting described in these comments is inferred from statement order; e.g. it cannot
  # be told from here whether "if len(range_entity)>1000: break" stops collecting entities
  # or leaves the article loop - confirm in the real file before restructuring.
  321. def calibrateEnterprise(list_articles,list_sentences,list_entitys):
  322. for _article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
  # list_calibrate logs every change as {"type", "from", "to"}; match_add / match_replace
  # record whether any entity was added / rewritten for this article.
  323. list_calibrate = []
  324. match_add = False
  325. match_replace = False
  # Only org / company / location entities take part in the comparison.
  326. range_entity = []
  327. for p_entity in list_entity:
  328. if p_entity.entity_type in ("org","company","location"):
  329. range_entity.append(p_entity)
  330. if len(range_entity)>1000:
  331. break
  332. for p_sentence in list_sentence:
  333. sentence = p_sentence.sentence_text
  # (text, begin, end) of the org/company entities already present in this sentence;
  # used further down to avoid creating a duplicate of an existing entity.
  334. sentence_entitys = [(ent.entity_text,ent.wordOffset_begin,ent.wordOffset_end) for ent in list_entity if ent.sentence_index==p_sentence.sentence_index and ent.entity_type in ['org','company']]
  335. list_match = match_enterprise_max_first(sentence)
  336. # print("list_match", list_match)
  337. doc_id = p_sentence.doc_id
  338. sentence_index = p_sentence.sentence_index
  339. tokens = p_sentence.tokens
  340. list_match.sort(key=lambda x:x["begin_index"])
  341. for _match_index in range(len(list_match)):
  342. _match = list_match[_match_index]
  # find_flag becomes True as soon as the match overlaps (or equals) an existing entity.
  343. find_flag = False
  344. for p_entity in range_entity:
  345. if p_entity.sentence_index!=p_sentence.sentence_index:
  346. continue
  # A location entity whose text is exactly a dictionary name is re-typed as a company.
  347. if p_entity.entity_type=="location" and p_entity.entity_text==_match["entity_text"]:
  348. find_flag = True
  349. p_entity.entity_type = "company"
  350. p_entity.if_dict_match = 1
  351. if p_entity.entity_type not in ["location","org","company"]:
  352. continue
  353. if _match["entity_text"] == p_entity.entity_text:
  354. p_entity.if_dict_match = 1
  # (EN) the next two comments read: "overlapping" and "if the match is contained in the
  # entity, leave it alone".
  355. #有重叠
  356. #match部分被包含则不处理
  # Case 1: the match lies entirely inside the entity span.
  357. if _match["begin_index"]>=p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
  358. find_flag = True
  # (EN) next comment: "check whether the entity actually covers several companies".
  # _match_j ends up as the index of the last match that still ends inside the entity.
  359. #判断是否是多个公司
  360. for _match_j in range(_match_index,len(list_match)):
  361. if not list_match[_match_j]["end_index"]<=p_entity.wordOffset_end:
  362. _match_j -= 1
  363. break
  # Several matches inside one entity: the entity is narrowed to the first match and the
  # remaining matches are added as new company entities.
  364. if _match_j>_match_index:
  365. match_replace = True
  366. match_add = True
  # changeIndexFromWordToWords is defined elsewhere; presumably it converts a character
  # offset into a token index using the sentence tokens.
  367. begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
  368. end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
  369. list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
  370. p_entity.entity_text = _match["entity_text"]
  371. p_entity.wordOffset_begin = _match["begin_index"]
  372. p_entity.wordOffset_end = _match["end_index"]
  373. p_entity.begin_index = begin_index
  374. p_entity.end_index = end_index
  # (EN) next comment: "this company entity was recognised by the dictionary".
  375. # 该公司实体是字典识别的
  376. p_entity.if_dict_match = 1
  377. for _match_h in range(_match_index+1,_match_j+1):
  378. entity_text = list_match[_match_h]["entity_text"]
  379. entity_type = "company"
  380. begin_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["begin_index"])
  381. end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"]-1)
  382. entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
  383. add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"],in_attachment=p_sentence.in_attachment)
  384. add_entity.if_dict_match = 1
  385. list_entity.append(add_entity)
  386. range_entity.append(add_entity)
  387. list_calibrate.append({"type":"add","from":"","to":entity_text})
  # NOTE(review): _match_index is the variable of the enclosing "for ... in range(...)"
  # loop, so this assignment does not make that loop skip the matches just consumed;
  # the next pass overwrites it.
  388. _match_index = _match_j
  389. break
  390. continue
  # Case 2: the match starts at or before the entity start and reaches into it.
  391. elif _match["begin_index"]<=p_entity.wordOffset_begin and _match["end_index"]>p_entity.wordOffset_begin:
  392. find_flag = True
  # 2a: the match starts earlier and ends inside the entity. An org/company entity is
  # replaced by the match unless the text between the two end offsets contains "分".
  393. if _match["begin_index"]<p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
  394. if p_entity.entity_type in ("org","company"):
  395. _diff_text = sentence[p_entity.wordOffset_end:_match["end_index"]]
  396. if re.search("分",_diff_text) is not None:
  397. pass
  398. else:
  399. match_replace = True
  400. begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
  401. end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
  402. list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
  403. p_entity.entity_text = _match["entity_text"]
  404. p_entity.wordOffset_begin = _match["begin_index"]
  405. p_entity.wordOffset_end = _match["end_index"]
  406. p_entity.begin_index = begin_index
  407. p_entity.end_index = end_index
  408. p_entity.if_dict_match = 1
  # 2b: the match extends to or past the entity end.
  409. elif _match["end_index"]>=p_entity.wordOffset_end:
  # (EN) next comment: "if the entity list already holds this exact entity, do not add it
  # again". Otherwise the entity takes over the match and becomes a company.
  410. # 原entity列表已有实体,则不重复添加
  411. if (_match["entity_text"],_match["begin_index"],_match["end_index"]) not in sentence_entitys:
  412. match_replace = True
  413. begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
  414. end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
  415. list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
  416. p_entity.entity_text = _match["entity_text"]
  417. p_entity.wordOffset_begin = _match["begin_index"]
  418. p_entity.wordOffset_end = _match["end_index"]
  419. p_entity.begin_index = begin_index
  420. p_entity.end_index = end_index
  421. p_entity.entity_type = "company"
  422. p_entity.if_dict_match = 1
  # Case 3: the match starts inside the entity and runs past its end; an org/company
  # entity is replaced by the match.
  423. elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
  424. find_flag = True
  425. if p_entity.entity_type in ("org","company"):
  426. match_replace = True
  427. begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
  428. end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
  429. list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
  430. p_entity.entity_text = _match["entity_text"]
  431. p_entity.wordOffset_begin = _match["begin_index"]
  432. p_entity.wordOffset_end = _match["end_index"]
  433. p_entity.begin_index = begin_index
  434. p_entity.end_index = end_index
  435. p_entity.if_dict_match = 1
  # No existing entity overlaps the match: add it as a new company entity.
  # NOTE(review): unlike the entities added in case 1, if_dict_match is not set on this
  # one here - confirm whether Entity sets it or whether that is an omission.
  436. if not find_flag:
  437. match_add = True
  438. entity_text = _match["entity_text"]
  439. entity_type = "company"
  440. begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
  441. end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
  442. entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
  443. add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,_match["begin_index"],_match["end_index"],in_attachment=p_sentence.in_attachment)
  444. list_entity.append(add_entity)
  445. range_entity.append(add_entity)
  446. list_calibrate.append({"type":"add","from":"","to":entity_text})
  # (EN) next comment: "de-duplicate". Changes are keyed by from+to; the first occurrence
  # of each key is kept.
  447. #去重
  448. set_calibrate = set()
  449. list_match_enterprise = []
  450. for _calibrate in list_calibrate:
  451. _from = _calibrate.get("from","")
  452. _to = _calibrate.get("to","")
  453. _key = _from+_to
  454. if _key not in set_calibrate:
  455. list_match_enterprise.append(_calibrate)
  456. set_calibrate.add(_key)
  # match_enterprise_type: 0 = no change, 1 = additions only, 2 = replacements only, 3 = both.
  457. match_enterprise_type = 0
  458. if match_add:
  459. match_enterprise_type += 1
  460. if match_replace:
  461. match_enterprise_type += 2
  462. _article.match_enterprise = list_match_enterprise
  463. _article.match_enterprise_type = match_enterprise_type
  def isLegalEnterprise(name):
      """Decide whether ``name`` is acceptable as an enterprise dictionary entry.

      A name is rejected when it
        * starts with one of 省 / 市 / 区 / 县,
        * consists of at most three characters (after optional leading ``*``)
          followed by a generic ending such as 分公司, 街道, 中心 or 有限公司, or
        * contains field-label vocabulary such as 标段, 联系人 or 采购人.

      :param name: candidate enterprise name
      :return: True when none of the rejection rules match, otherwise False
      """
      if re.search(r"^[省市区县]", name) is not None:
          return False
      # Raw string: "\*" is not a valid escape sequence in a normal string literal.
      if re.search(r"^\**.{,3}(分(公司|行|支)|街道|中心|办事处|经营部|委员会|有限公司)$", name) is not None:
          return False
      # NOTE(review): "(主)" is a regex group here and matches any name that
      # contains 主; possibly a literal full-width "（主）" was intended - confirm.
      if re.search(r"标段|标包|名称|联系人|联系方式|中标单位|中标人|测试单位|采购单位|采购人|代理人|代理机构|盖章|(主)", name) is not None:
          return False
      return True
  def fix_LEGAL_ENTERPRISE():
      """Filter the enterprise dictionary and write the result to ``enter.txt``.

      Reads the dictionary file located by ``getEnterprisePath``, keeps the
      lines accepted by ``isLegalEnterprise``, then drops a few known-bad
      entries (有限责任公司, 设计研究院, 限责任公司, and branch names of at most
      four characters plus 分公司/支行/分行 that mention none of the listed
      telecom or bank names). Every line matching that removal rule is printed.

      The surviving names are written one per line, in no particular order, to
      ``enter.txt`` in the current working directory.
      """
      # getEnterprisePath() returns (path, is_huge). The previous code passed
      # the whole tuple to open(), which raises TypeError.
      _path, _is_huge = getEnterprisePath()
      set_enter = set()
      with open(_path,"r",encoding="utf8") as f:
          for raw_line in f:
              line = raw_line.strip()
              if isLegalEnterprise(line):
                  set_enter.add(line)
              if line=="有限责任公司" or line=='设计研究院' or line=='限责任公司' or (re.search("^.{,4}(分公司|支行|分行)$",line) is not None and re.search("电信|移动|联通|建行|工行|农行|中行|交行",line) is None):
                  print(line)
                  set_enter.discard(line)
      with open("enter.txt","w",encoding="utf8") as fwrite:
          for line in set_enter:
              # NOTE(review): as written both replace() calls map a character to
              # itself; presumably full-width brackets were meant - confirm.
              fwrite.write(line.replace("(","(").replace(")",")"))
              fwrite.write("\n")
  492. # if re.search("标段|地址|标包|名称",line) is not None:#\(|\)||
  493. # _count += 1
  494. # print("=",line)
  495. # print("%d/%d"%(_count,_sum))
  496. # a_list = []
  497. # with open("电信分公司.txt","r",encoding="utf8") as f:
  498. # while True:
  499. # _line = f.readline()
  500. # if not _line:
  501. # break
  502. # if _line.strip()!="":
  503. # a_list.append(_line.strip())
  504. # with open("enter.txt","a",encoding="utf8") as f:
  505. # for _line in a_list:
  506. # f.write(_line)
  507. # f.write("\n")
  if __name__=="__main__":
      # Ad-hoc manual checks; the commented lines are earlier experiments.
      # edit_distance("GUMBO","GAMBOL")
      # print(jaccard_score("周口经济开发区陈营运粮河两岸拆迁工地土工布覆盖项目竞争性谈判公告","周口经济开发区陈营运粮河两岸拆迁工地土工布覆盖项目-成交公告"))
      #
      # sentences = "广州比地数据科技有限公司比地数据科技有限公司1111111123沈阳南光工贸有限公司"
      # print(match_enterprise_max_first(sentences))
      #
      # print("takes %d s"%(time.time()-_time))
      # fix_LEGAL_ENTERPRISE()
      print(jaccard_score("吉林省九台","吉林省建苑设计集团有限公司"))
      # print(match_enterprise_max_first("中国南方航空股份有限公司黑龙江分公司"))