entityLink.py 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826
  1. #coding:UTF8
  2. '''
  3. Created on 2019年5月21日
  4. @author: User
  5. '''
  6. import re
  7. import os
  8. import time
  9. import pandas as pd
  10. _time = time.time()
  11. from BiddingKG.dl.common.Utils import *
  12. from BiddingKG.dl.interface.Entitys import *
  13. import json
  14. from BiddingKG.dl.common.constDict import ConstDict
  15. # from BiddingKG.dl.interface.classification_process import entity_classify_process
  16. def edit_distance(source,target):
  17. dp = [["" for i in range(len(source)+1)] for j in range(len(target)+1)]
  18. for i in range(len(dp)):
  19. for j in range(len(dp[i])):
  20. if i==0:
  21. dp[i][j] = j
  22. elif j==0:
  23. dp[i][j] = i
  24. else:
  25. if source[j-1]==target[i-1]:
  26. cost = 0
  27. else:
  28. cost = 2
  29. dp[i][j] = min([dp[i-1][j]+1,dp[i][j-1]+1,dp[i-1][j-1]+cost])
  30. return dp[-1][-1]
  31. def jaccard_score(source,target):
  32. source_set = set([s for s in source])
  33. target_set = set([s for s in target])
  34. if len(source_set)==0 or len(target_set)==0:
  35. return 0
  36. return max(len(source_set&target_set)/len(source_set),len(source_set&target_set)/len(target_set))
  37. def get_place_list():
  38. path = os.path.dirname(__file__) + '/../place_info.csv'
  39. place_df = pd.read_csv(path)
  40. place_list = []
  41. for index, row in place_df.iterrows():
  42. place_list.append(row[1])
  43. place_list.append('台湾')
  44. place_list.append('澳门')
  45. place_list.append('香港')
  46. # place_list.append('東莞')
  47. # place_list.append('廣州')
  48. # place_list.append('韩国')
  49. # place_list.append('德国')
  50. # place_list.append('英国')
  51. # place_list.append('日本')
  52. # place_list.append('意大利')
  53. # place_list.append('新加坡')
  54. # place_list.append('加拿大')
  55. # place_list.append('西班牙')
  56. # place_list.append('澳大利亚')
  57. # place_list.append('美国')
  58. place_list = list(set(place_list))
  59. return place_list
# Place names are loaded once, when the module is imported (this reads the CSV via get_place_list).
place_list = get_place_list()
# All place names joined with "|" so they can be used as a regex alternation.
# NOTE(review): the names are not regex-escaped before joining.
place_pattern = "|".join(place_list)
  62. def is_short(shorter_cut, longer):
  63. '''
  64. 判断是否为简称
  65. :param shorter_cut: 简称
  66. :param longer: 全称
  67. :return:
  68. '''
  69. flag = 1
  70. for words in shorter_cut:
  71. if words in longer:
  72. longer = longer[longer.find(words) + len(words):]
  73. else:
  74. flag = 0
  75. break
  76. if flag:
  77. return 1
  78. else:
  79. return 0
  80. def get_business_data(enterprise_name):
  81. '''
  82. 获取指定公司名称是否有工商数据,有就返回True及相关招投标数据,没有返回False及{}
  83. :param enterprise_name: 公司名称
  84. :return:
  85. '''
  86. global ENTERPRISE_HUGE,SET_ENTERPRISE,POOL_REDIS
  87. # print("test",enterprise_name)
  88. if ENTERPRISE_HUGE:
  89. if POOL_REDIS is None:
  90. init_redis_pool()
  91. _db = POOL_REDIS.getConnector()
  92. try:
  93. _time = time.time()
  94. _v = _db.get(enterprise_name)
  95. POOL_REDIS.putConnector(_db)
  96. if _v is None:
  97. return False, {}
  98. else:
  99. _v = str(_v, 'utf-8')
  100. if 'have_business' in _v:
  101. # log("redis take %.5f of '%s' exists"%(time.time()-_time,enterprise_name))
  102. d = json.loads(_v)
  103. if d.get('have_business', '') == 1:
  104. return True, d
  105. return False, d
  106. else:
  107. return False, {}
  108. except Exception as e:
  109. traceback.print_exc()
  110. return False, {}
  111. else:
  112. if enterprise_name in SET_ENTERPRISE:
  113. return True, {}
  114. else:
  115. return False, {}
  116. def get_role(dic):
  117. '''
  118. 通过字典统计 招标、代理、中标公告数量 返回最大比例及对应类别
  119. :param dic: redics 获取实体的工商数据字典
  120. :return:
  121. '''
  122. if 'zhao_biao_number' in dic:
  123. zhaobiao = dic.get('zhao_biao_number', 0)
  124. daili = dic.get('dai_li_number', 0)
  125. zhongbiao = dic.get('zhong_biao_number', 0)
  126. bid = zhaobiao+ daili+ zhongbiao
  127. if bid > 100: # 总数大于100的才统计
  128. if zhaobiao>=daili:
  129. if zhaobiao>=zhongbiao:
  130. return 0, zhaobiao/bid
  131. else:
  132. return 2, zhongbiao/bid
  133. elif daili >= zhongbiao:
  134. return 1, daili/bid
  135. else:
  136. return 2, zhongbiao/bid
  137. return 5, 0
def link_entitys(list_entitys,on_value=1):#on_value=0.81
    '''
    Normalise names and roles of org/company entities, one document at a time.

    For every entity list in ``list_entitys`` the function, in order:
      1. strips a trailing tendering/procurement office suffix from names;
      2. looks up business data via ``get_business_data`` (cached across
         documents in ``business_dic``) and sorts entities into
         ``long_entity`` (has business data) and ``short_entity``;
      3. nudges ``label`` / ``values`` towards the role reported by
         ``get_role`` when the statistics are strong and the in-text
         probability is low;
      4. replaces short names labelled 0 with a matching long name;
      5. replaces names by longer dictionary-matched linked entities when
         both contain the same place names.

    Entities are modified in place; nothing is returned.

    :param list_entitys: per-document lists of entities; the code reads
        ``entity_type``, ``entity_text``, ``label``, ``values``,
        ``linked_entitys`` and ``if_dict_match`` on each entity
    :param on_value: only referenced by the disabled Jaccard-linking code
        below; it has no effect on the active code path
    '''
    # Business data already fetched, shared by all documents of this call.
    business_dic = {}
    for list_entity in list_entitys:
        range_entity = []
        short_entity = [] # entities without business data
        long_entity = [] # entities with business data
        n = 0
        bus_dic = {} # name -> (label, probability) from get_role for entities with business data (tender / agency / winning bid)
        find_tenderee = False
        bus_tenderee = []
        for _entity in list_entity:
            if _entity.entity_type in ["org","company"]:
                ser = re.search('(?P<name>.{2,}(医院|大学|公司))(招[投议]?标|采购)(中心|办公室)$', _entity.entity_text) # 2024-06-07 normalise the organisation name by dropping the non-essential suffix
                if ser:
                    _entity.entity_text = ser.group('name')
                range_entity.append(_entity)
                if _entity.entity_text in bus_dic:
                    have_bus = True
                else:
                    if _entity.entity_text not in business_dic:
                        have_bus, dic = get_business_data(_entity.entity_text)
                        business_dic[_entity.entity_text] = (have_bus, dic)
                    else:
                        have_bus, dic = business_dic.get(_entity.entity_text) # 20240708 cache looked-up business data to avoid querying redis repeatedly
                    # Very short branch / company / section names are treated as having no business data.
                    if re.search('^\w{,5}[分支](行|公司)$|^\w{1,3}公司$|^\w{2,5}段$', _entity.entity_text):
                        have_bus = False
                    if have_bus:
                        lb, prob = get_role(dic)
                        bus_dic[_entity.entity_text] = (lb, prob)
                        if lb == 0 and prob > 0.9 and re.search('医院|学院|学校|中学|小学|大学|中心|幼儿园|保健院|党校|研究院|血站|分校|红十字会|防治院|研究所', _entity.entity_text) and _entity.entity_text not in ['中华人民共和国', '营业执照', '人民法院','民办非企业单位','个体工商户','运输服务', '社会团体']:
                            bus_tenderee.append(_entity)
                    elif re.search('^\w{2,6}银行\w{2,10}[分支]行$', _entity.entity_text): # 2024/05/22 some bank sub-branches have no collected business data; accept them anyway
                        have_bus = True
                        bus_dic[_entity.entity_text] = (0, 0.5)
                if have_bus: # 20231115 only check whether business data exists; without it the entity is a replacement candidate
                    long_entity.append(_entity)
                    # Short names (other than universities/hospitals) may still be replaced by a longer name later.
                    if len(_entity.entity_text)< 6 and re.search('(大学|医院)', _entity.entity_text) == None:
                        short_entity.append(_entity)
                    lb, prob = bus_dic[_entity.entity_text]
                    if lb in [0,1] and prob>0.9 and _entity.label in [0, 1] and _entity.values[_entity.label]<0.55: # statistics are strong but the in-text probability is low: switch to the statistical role (mainly titles / publishers where tenderee vs. agency is unclear)
                        if _entity.label != lb:
                            _entity.label = lb
                            _entity.values[_entity.label] = 0.55
                        else:
                            _entity.values[_entity.label] += 0.05
                else:
                    short_entity.append(_entity)
                if _entity.label == 0: # a tenderee was found
                    find_tenderee = True
                n += 1
                # Stop after 1000 org/company entities per document.
                if n > 1000:
                    break
        if find_tenderee == False and len(bus_tenderee)==1 and bus_tenderee[0].label==5: # no tenderee in the whole document and exactly one high-probability statistical tenderee: promote it
            bus_tenderee[0].label = 0
            bus_tenderee[0].values[0] = 0.55
        range_entity = range_entity[:1000]
        # The company-replacement logic below is flawed and is disabled for now.
        # for first_i in range(len(range_entity)):
        #     _entity = range_entity[first_i]
        #     for second_i in range(first_i+1,len(range_entity)):
        #         _ent = range_entity[second_i]
        #         # 2021/5/21 update: skip when the two labels are mutually exclusive (tenderee vs. agency) and the texts differ
        #         if _entity.entity_text != _ent.entity_text and _entity.label != _ent.label and _entity.label in [0,1] and _ent.label in [0, 1]:
        #             continue
        #         _score = jaccard_score(re.sub("%s|%s"%("股份|责任|有限|公司",place_pattern),"",_entity.entity_text), re.sub("%s|%s"%("股份|责任|有限|公司",place_pattern),"",_ent.entity_text))
        #         if _entity.entity_text!=_ent.entity_text and _score>=on_value:
        #             _entity.linked_entitys.append(_ent)
        #             _ent.linked_entitys.append(_entity)
        #             print("=-===",_entity.entity_text,_ent.entity_text,_score)
        # # replace the company name
        # for _entity in range_entity:
        #     if re.search("公司",_entity.entity_text) is None:
        #         for _ent in _entity.linked_entitys:
        #             if re.search("公司$",_ent.entity_text) is not None:
        #                 if len(_ent.entity_text)>len(_entity.entity_text):
        #                     _entity.entity_text = _ent.entity_text
        if short_entity and long_entity: #
            for first_i in range(len(short_entity)):
                _entity = short_entity[first_i]
                # Only short entities labelled 0 (tenderee) are replaced.
                if _entity.label == 0:
                    for second_i in range(len(long_entity)):
                        _ent = long_entity[second_i]
                        if _ent.label in [0,1,5]:
                            if len(_entity.entity_text)<len(_ent.entity_text) and is_short(_entity.entity_text, _ent.entity_text): # the abbreviation is contained, in order, in the registered name: replace
                                if _entity.entity_text.endswith('大学'): # fix for 533357339: 东北大学 was replaced by 中国银行沈阳东北大学支行
                                    continue
                                _entity.entity_text = _ent.entity_text
                                lb, prob = bus_dic[_entity.entity_text]
                                if lb in [0, 1] and prob > 0.9 and _entity.values[
                                    _entity.label] < 0.55: # statistics are strong but the in-text probability is low: switch to the statistical role
                                    if _entity.label != lb:
                                        _entity.label = lb
                                        _entity.values[_entity.label] = 0.55
                                    else:
                                        _entity.values[_entity.label] += 0.05
                                break
                            elif len(_entity.entity_text)>len(_ent.entity_text) and _ent.entity_text in _entity.entity_text \
                                    and re.search('(医院|大学)$', _ent.entity_text) and re.search('[部处室科]$', _entity.entity_text): # the entity fully contains a registered name; 20240520: only replace when it ends with a department suffix, to avoid mistakes
                                _entity.entity_text = _ent.entity_text
                                lb, prob = bus_dic[_entity.entity_text]
                                if lb in [0, 1] and prob > 0.9 and _entity.values[
                                    _entity.label] < 0.55: # statistics are strong but the in-text probability is low: switch to the statistical role
                                    if _entity.label != lb:
                                        _entity.label = lb
                                        _entity.values[_entity.label] = 0.55
                                    else:
                                        _entity.values[_entity.label] += 0.05
                                break
        # 2021/12/21 replace with the longest similar entity that was recognised through the dictionary
        for _entity in range_entity:
            used_linked_entitys = []
            if not _entity.linked_entitys:
                continue
            # Longest candidate first.
            _entity.linked_entitys.sort(key=lambda x: len(x.entity_text), reverse=True)
            for _ent in _entity.linked_entitys:
                if _ent in used_linked_entitys:
                    break
                # print("_entity, _ent", _entity.entity_text, _ent.if_dict_match, _ent.entity_text)
                if _ent.if_dict_match == 1:
                    if len(_ent.entity_text) > len(_entity.entity_text):
                        # check that both companies mention the same places
                        match_list_1, match_list_2 = [], []
                        for place in place_list:
                            if place in _entity.entity_text:
                                match_list_1.append(place)
                            if place in _ent.entity_text:
                                match_list_2.append(place)
                        if str(match_list_1) == str(match_list_2):
                            # print("dictionary replacement", _entity.entity_text, "->", _ent.entity_text)
                            # Keep the original text before overwriting it.
                            _entity.origin_entity_text = _entity.entity_text
                            _entity.entity_text = _ent.entity_text
                            used_linked_entitys.append(_ent)
                            # print(_entity.entity_text, _entity.if_dict_match, _ent.entity_text, _ent.if_dict_match)
  271. # 用于去重的标题
  272. def doctitle_refine(doctitle):
  273. _doctitle_refine = re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|'
  274. r'交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|竞价|合同', '', doctitle)
  275. return _doctitle_refine
  276. # 前100个公司实体
  277. def get_nlp_enterprise(list_entity):
  278. nlp_enterprise = []
  279. nlp_enterprise_attachment = []
  280. dict_enterprise = {}
  281. business_dic = {}
  282. max_num = 100
  283. list_entity = sorted(list_entity,key=lambda x:(x.sentence_index,x.begin_index))
  284. for entity in list_entity:
  285. if entity.entity_type in ['org','company']:
  286. if entity.entity_text not in dict_enterprise:
  287. if entity.entity_text not in business_dic:
  288. have_bus, dic = get_business_data(entity.entity_text)
  289. business_dic[entity.entity_text] = (have_bus, dic)
  290. else:
  291. have_bus, dic = business_dic.get(entity.entity_text) # 20240708 字典保存查询过的工商数据,避免重复查询redis
  292. credit_code = dic.get('credit_code', '')
  293. in_text = 0 if entity.in_attachment else 1
  294. if entity.label in [0,1,2,3,4] or len(dict_enterprise)<=max_num:
  295. dict_enterprise[entity.entity_text] = {'in_text': in_text}
  296. if credit_code != "":
  297. dict_enterprise[entity.entity_text]['credit_code'] = credit_code
  298. else:
  299. in_text = 0 if entity.in_attachment else 1
  300. if in_text != dict_enterprise[entity.entity_text]['in_text']:
  301. dict_enterprise[entity.entity_text]['in_text'] = 2
  302. if not entity.in_attachment:
  303. if entity.entity_text not in nlp_enterprise:
  304. nlp_enterprise.append(entity.entity_text)
  305. else:
  306. if entity.entity_text not in nlp_enterprise_attachment:
  307. nlp_enterprise_attachment.append(entity.entity_text)
  308. # for enterprise,value in dict_enterprise.items():
  309. # enterprise_class = entity_classify_process(enterprise)
  310. # _class = [{"first_level":key.split("-")[0],"second_level":key.split("-")[1]} for key in enterprise_class]
  311. # value['class'] = _class
  312. return nlp_enterprise[:max_num],nlp_enterprise_attachment[:max_num], dict_enterprise
# Set by getEnterprisePath(): True when the huge dictionary file is used (lookups then go through Redis),
# False for the small dictionary, None until the path has been resolved.
ENTERPRISE_HUGE = None
  314. def getEnterprisePath():
  315. global ENTERPRISE_HUGE
  316. filename_huge = "LEGAL_ENTERPRISE_HUGE.txt"
  317. huge_path = getFileFromSysPath(filename_huge)
  318. if huge_path is None:
  319. if os.path.exists(filename_huge):
  320. log("enterprise path:%s"%(filename_huge))
  321. ENTERPRISE_HUGE = True
  322. return filename_huge,ENTERPRISE_HUGE
  323. else:
  324. log("enterprise path:%s"%(huge_path))
  325. ENTERPRISE_HUGE = True
  326. return huge_path,ENTERPRISE_HUGE
  327. filename = "LEGAL_ENTERPRISE.txt"
  328. real_path = getFileFromSysPath(filename)
  329. if real_path is None:
  330. real_path = filename
  331. log("ENTERPRISE path:%s"%(real_path))
  332. ENTERPRISE_HUGE = False
  333. return real_path,ENTERPRISE_HUGE
# Set to True by getDict_enterprise() once loading finished; match_enterprise_max_first() waits for it.
DICT_ENTERPRISE_DONE = False
# Redis connector pool, created lazily by init_redis_pool().
POOL_REDIS = None
# Number of leading characters of a name stored in SET_PREFIX_ENTERPRISE.
ENTERPRISE_KEY_LEN = 3
# NOTE(review): not referenced by the code in this file — confirm whether it is still needed.
ENTERPRISE_PREFIX_LEN = 3
# Number of trailing characters of a name stored in SET_TAIL_ENTERPRISE.
ENTERPRISE_TAIL_LEN = 3
# Full names (only filled when the small dictionary is used).
SET_ENTERPRISE = set()
# Name prefixes / suffixes used to pre-filter dictionary candidates.
SET_PREFIX_ENTERPRISE = set()
SET_TAIL_ENTERPRISE = set()
# Cache files for the prefix / suffix sets of the huge dictionary.
SET_PREFIX_ENTERPRISE_HUGE_FILE = "SET_PREFIX_ENTERPRISE_HUGE.pk"
SET_TAIL_ENTERPRISE_HUGE_FILE = "SET_TAIL_ENTERPRISE_HUGE.pk"
  344. def getDict_enterprise():
  345. global DICT_ENTERPRISE_DONE,SET_ENTERPRISE,SET_PREFIX_ENTERPRISE,SET_TAIL_ENTERPRISE
  346. real_path,is_huge = getEnterprisePath()
  347. _ok = False
  348. if is_huge:
  349. if os.path.exists(SET_PREFIX_ENTERPRISE_HUGE_FILE) and os.path.exists(SET_TAIL_ENTERPRISE_HUGE_FILE):
  350. SET_PREFIX_ENTERPRISE = load(SET_PREFIX_ENTERPRISE_HUGE_FILE)
  351. SET_TAIL_ENTERPRISE = load(SET_TAIL_ENTERPRISE_HUGE_FILE)
  352. _ok = True
  353. if not _ok:
  354. with open(real_path,"r",encoding="UTF8") as f:
  355. for _e in f:
  356. if not _e:
  357. continue
  358. _e = _e.strip()
  359. if len(_e)>=4:
  360. key_enter = _e[:ENTERPRISE_KEY_LEN]
  361. SET_PREFIX_ENTERPRISE.add(key_enter)
  362. SET_TAIL_ENTERPRISE.add(_e[-ENTERPRISE_TAIL_LEN:])
  363. if not is_huge:
  364. SET_ENTERPRISE.add(_e)
  365. #仅在大文件情况下才使用缓存加载
  366. if is_huge:
  367. save(SET_PREFIX_ENTERPRISE,SET_PREFIX_ENTERPRISE_HUGE_FILE)
  368. save(SET_TAIL_ENTERPRISE,SET_TAIL_ENTERPRISE_HUGE_FILE)
  369. log("SET_PREFIX_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_PREFIX_ENTERPRISE)/1024/1024,len(SET_PREFIX_ENTERPRISE)))
  370. log("SET_TAIL_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_TAIL_ENTERPRISE)/1024/1024,len(SET_TAIL_ENTERPRISE)))
  371. log("SET_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_ENTERPRISE)/1024/1024,len(SET_ENTERPRISE)))
  372. # for _e in ["河南省柘源","建筑工程有限公司"]:
  373. # if not _e:
  374. # continue
  375. # _e = _e.strip()
  376. # if len(_e)>=4:
  377. # key_enter = _e[:4]
  378. # if key_enter not in DICT_ENTERPRISE:
  379. # DICT_ENTERPRISE[key_enter] = set()
  380. # DICT_ENTERPRISE[key_enter].add(_e[4:])
  381. DICT_ENTERPRISE_DONE = True
  382. def init_redis_pool():
  383. from BiddingKG.dl.common.pool import ConnectorPool
  384. from BiddingKG.dl.common.source import getConnect_redis_baseline
  385. global POOL_REDIS
  386. if POOL_REDIS is None:
  387. POOL_REDIS = ConnectorPool(init_num=1,max_num=10,method_init=getConnect_redis_baseline)
  388. # 插入 Redis
  389. # def add_redis(company_list):
  390. # global ENTERPRISE_HUGE,POOL_REDIS
  391. # if ENTERPRISE_HUGE:
  392. # _db = POOL_REDIS.getConnector()
  393. # for enterprise_name in company_list:
  394. # _v = _db.get(enterprise_name)
  395. # if _v is None:
  396. # if isLegalNewName(enterprise_name):
  397. # _db.set(enterprise_name,1)
  398. # 新实体合法判断
  399. def isLegalNewName(enterprise_name):
  400. # head_character_list = ["[",'【',"(",'(']
  401. # tail_character_list = ["]",'】',")",')']
  402. # 名称开头判断
  403. if re.search("^[\da-zA-Z][^\da-zA-Z]|"
  404. "^[^\da-zA-Z\u4e00-\u9fa5\[【((]|"
  405. "^[\[【((].{,1}[\]】))]|"
  406. "^[0〇]|"
  407. "^(20[0-2][0-9]|[0-2]?[0-9]年|[0-1]?[0-9]月|[0-3]?[0-9]日)",enterprise_name):
  408. return -1
  409. if len(re.findall("[\u4e00-\u9fa5]",enterprise_name))<2:
  410. return -1
  411. if re.search("╳|*|\*|×|xx|XX",enterprise_name):
  412. return -1
  413. if re.search("^(省|自治[县州区]|市|县|区|镇|乡|街道)",enterprise_name) and not re.search("^(镇江|乡宁|镇原|镇海|镇安|镇巴|镇坪|镇赉|镇康|镇沅|镇雄|镇远|镇宁|乡城|镇平|市中|市南|市北)",enterprise_name):
  414. return -1
  415. if re.search("\d{1,2}:\d{2}(:\d{2})?|(rar|xlsx|zip|png|jpg|swf|docx|txt|pdf|PDF|doc|xls|bmp|&?nbsp)",enterprise_name):
  416. return -1
  417. if re.search("(招标|代理)(人|机构)|联系(人|方式)|中标|候选|第.名",enterprise_name):
  418. return -1
  419. if re.search("[a-zA-Z\d]{1,2}(包|标段?)|第.批"):
  420. return 0
  421. return 1
  422. # 过滤掉Redis里值为0的错误实体
  423. def enterprise_filter(entity_list):
  424. global ENTERPRISE_HUGE,SET_ENTERPRISE,POOL_REDIS
  425. if ENTERPRISE_HUGE:
  426. if POOL_REDIS is None:
  427. init_redis_pool()
  428. _db = POOL_REDIS.getConnector()
  429. remove_list = []
  430. try:
  431. for entity in entity_list:
  432. if entity.entity_type in ['company','org']:
  433. _v = _db.get(entity.entity_text)
  434. if _v==0:
  435. remove_list.append(entity)
  436. except Exception as e:
  437. traceback.print_exc()
  438. POOL_REDIS.putConnector(_db)
  439. for _entity in remove_list:
  440. entity_list.remove(_entity)
  441. return entity_list
  442. def is_enterprise_exist(enterprise_name):
  443. global ENTERPRISE_HUGE,SET_ENTERPRISE,POOL_REDIS
  444. # print("test",enterprise_name)
  445. if ENTERPRISE_HUGE:
  446. if POOL_REDIS is None:
  447. init_redis_pool()
  448. _db = POOL_REDIS.getConnector()
  449. try:
  450. _time = time.time()
  451. _v = _db.get(enterprise_name)
  452. POOL_REDIS.putConnector(_db)
  453. if _v is None:
  454. return False
  455. else:
  456. if _v:
  457. # log("redis take %.5f of '%s' exists"%(time.time()-_time,enterprise_name))
  458. return True
  459. else:
  460. return False
  461. except Exception as e:
  462. traceback.print_exc()
  463. return False
  464. else:
  465. if enterprise_name in SET_ENTERPRISE:
  466. return True
  467. else:
  468. return False
  469. import threading
  470. import time
# The enterprise dictionary is loaded in a background thread as soon as the module is imported;
# match_enterprise_max_first() waits until getDict_enterprise() sets DICT_ENTERPRISE_DONE.
load_enterprise_thread = threading.Thread(target=getDict_enterprise)
load_enterprise_thread.start()
# Used by match_enterprise_max_first() to bound the length of the candidate window.
MAX_ENTERPRISE_LEN = 30
  474. def match_enterprise_max_first(sentence, business_dic):
  475. while True:
  476. if not DICT_ENTERPRISE_DONE:
  477. time.sleep(1)
  478. else:
  479. break
  480. list_match = []
  481. begin_index = 0
  482. if len(sentence)>4:
  483. while True:
  484. if begin_index+ENTERPRISE_KEY_LEN<len(sentence):
  485. key_enter = sentence[begin_index:begin_index+ENTERPRISE_KEY_LEN]
  486. if key_enter.find(',') > 0:
  487. key_enter = sentence[begin_index:begin_index + ENTERPRISE_KEY_LEN+1].replace(',', '') # 20241212 修复实体名称被分割问题 例:北,京千里马网信科技有限公司
  488. # if key_enter in DICT_ENTERPRISE:
  489. # _len = min(MAX_ENTERPRISE_LEN-ENTERPRISE_KEY_LEN+1,len(sentence)-begin_index)
  490. # for _i in range(_len):
  491. # enter_name = sentence[begin_index+ENTERPRISE_KEY_LEN:begin_index+_len-_i]
  492. # if enter_name in DICT_ENTERPRISE[key_enter]:
  493. # match_item = {"entity_text":"%s%s"%(key_enter,enter_name),"begin_index":begin_index,"end_index":begin_index+len(key_enter)+len(enter_name)}
  494. # list_match.append(match_item)
  495. # begin_index += (len(key_enter)+len(enter_name))-1
  496. # break
  497. if key_enter in SET_PREFIX_ENTERPRISE:
  498. _len = min(MAX_ENTERPRISE_LEN-ENTERPRISE_KEY_LEN+1,len(sentence)-begin_index)
  499. for _i in range(_len):
  500. enter_name = sentence[begin_index:begin_index+_len-_i]
  501. if enter_name.endswith(','):
  502. continue
  503. fix_name = enter_name.replace(',', '') # 20241212 修复实体名称被分割问题 例:北,京千里马网信科技有限公司
  504. enter_tail = fix_name[-ENTERPRISE_TAIL_LEN:]
  505. if re.search('[\u4e00-\u9fa5]', enter_tail) == None: # 20240111不包含中文后缀不要
  506. continue
  507. elif fix_name in ['黄埔军校', '五金建材', '铝合金门窗', '测试单位' ,'生产管理部', '华电XXX发电有限公司']: # '国有资产管理处',
  508. continue
  509. elif re.search('^\w{,3}(有限)?(责任)?分?公司$|^第[一二三四五六七八九十](工程|建筑)?分?公司$|交汇处$|大厦$|大楼$|^华电X{1,4}发电有限公司$', fix_name):
  510. continue
  511. if len(fix_name)<4: # 20240521 短于4个字的不要
  512. break
  513. if enter_tail in SET_TAIL_ENTERPRISE or re.search('(中心|中学|小学|医院|学院|大学|学校|体校|监狱|大队|支队|林场|海关|分局|商行)$', enter_tail):
  514. if fix_name not in business_dic:
  515. have_bus, dic = get_business_data(fix_name) # 20210124 改为有工商数据的实体才添加
  516. if have_bus == False and 'have_business' in dic and re.search('^(上海|云南|内蒙古|北京|吉林|四川|天津|宁夏|安徽|山东|山西|广东|广西|新疆|江苏|江西|河北|河南|浙江|海南|湖北|湖南'
  517. '|甘肃|福建|西藏|贵州|辽宁|重庆|陕西|青海|黑龙江|\w{1,5}[市县])[\w()]{2,15}[厂店铺市场行部城室馆中心站处社会狱所园关局司署段厅院队小学]$',fix_name) and re.search(
  518. '某|x|X|^\w{2,3}[分支闵](行|局|院|会|园|中心)$|^\w{2,5}中小学$', fix_name)==None: # 无工商数据有前面地址后面有关键词且在字典表的添加
  519. have_bus = True
  520. log('字典表补充无工商数据有关键词实体:%s'%fix_name)
  521. business_dic[fix_name] = (have_bus, dic)
  522. else:
  523. have_bus, dic = business_dic.get(fix_name) # 20240708 字典保存查询过的工商数据,避免重复查询redis
  524. if have_bus:
  525. # if is_enterprise_exist(enter_name):
  526. match_item = {"entity_text":"%s"%(fix_name),"begin_index":begin_index,"end_index":begin_index+len(enter_name)}
  527. # print("match_item",key_enter,enter_name)
  528. list_match.append(match_item)
  529. begin_index += len(enter_name)-1
  530. break
  531. begin_index += 1
  532. else:
  533. break
  534. # print("======",list_match)
  535. not_match_names = ['乌鲁木齐经济技术开发区(乌鲁木齐市头屯河区)市场监督管理局(区知识产权局、区市场监管综合行政执法队)', '政采云有限公司', '徽县发展和改革局', '徽县以工代赈易地搬迁办公室'] # 字典匹配不到的名称列表
  536. pattern = re.compile('|'.join(not_match_names))
  537. for it in re.finditer(pattern, sentence):
  538. match_item = {"entity_text": "%s" % (it.group(0)), "begin_index": it.start(), "end_index": it.end()}
  539. list_match.append(match_item)
  540. return list_match
def calibrateEnterprise(list_articles,list_sentences,list_entitys):
    '''
    Calibrate the recognised entities of each document against the enterprise dictionary.

    For every sentence the dictionary matches from ``match_enterprise_max_first``
    are compared with the org/company/location entities of the same sentence
    (at most about 1000 entities per document are considered). Depending on how
    a match overlaps an entity, the entity is left alone, updated to the match
    (text, offsets, ``if_dict_match``), or additional ``Entity`` objects are
    created. Matches that overlap no entity are added as new company entities.

    Side effects: entities are modified in place, new entities are appended to
    the per-document entity list, and each article receives
    ``match_enterprise`` (de-duplicated list of ``{"type","from","to"}``
    records) and ``match_enterprise_type`` (0 nothing, +1 if something was
    added, +2 if something was replaced).

    :param list_articles: articles, one per document
    :param list_sentences: per-document lists of sentences; the code reads
        ``sentence_text``, ``sentence_index``, ``doc_id``, ``tokens`` and
        ``in_attachment``
    :param list_entitys: per-document lists of entities
    '''
    # Business-data cache shared by all sentences and documents of this call.
    business_dic = {}
    for _article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
        list_calibrate = []
        match_add = False
        match_replace = False
        range_entity = []
        for p_entity in list_entity:
            if p_entity.entity_type in ("org","company","location"):
                range_entity.append(p_entity)
                if len(range_entity)>1000:
                    break
        for p_sentence in list_sentence:
            sentence = p_sentence.sentence_text
            # (text, begin, end) of the org/company entities already present in this sentence.
            sentence_entitys = [(ent.entity_text,ent.wordOffset_begin,ent.wordOffset_end) for ent in list_entity if ent.sentence_index==p_sentence.sentence_index and ent.entity_type in ['org','company']]
            list_match = match_enterprise_max_first(sentence, business_dic)
            # print("list_match", list_match)
            doc_id = p_sentence.doc_id
            sentence_index = p_sentence.sentence_index
            tokens = p_sentence.tokens
            list_match.sort(key=lambda x:x["begin_index"])
            for _match_index in range(len(list_match)):
                _match = list_match[_match_index]
                find_flag = False
                for p_entity in range_entity:
                    if p_entity.sentence_index!=p_sentence.sentence_index:
                        continue
                    # A location whose text equals the dictionary match is really a company.
                    if p_entity.entity_type=="location" and p_entity.entity_text==_match["entity_text"]:
                        find_flag = True
                        p_entity.entity_type = "company"
                        p_entity.if_dict_match = 1
                    if p_entity.entity_type not in ["location","org","company"]:
                        continue
                    if _match["entity_text"] == p_entity.entity_text:
                        p_entity.if_dict_match = 1
                    # overlap handling
                    # case 1: the match lies inside the entity
                    if _match["begin_index"]>=p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
                        find_flag = True
                        # check whether the entity really consists of several companies;
                        # names of the listed shapes are kept as they are
                        if re.search('[分支](公司|中心|监狱|部|行)|^\w{4,15}公司\w{2,3}公司$'
                                     '|(大学|学院)\w{,2}附属\w{,6}医院$|(\w{2,5}办事处\w{2,6}$|^\w{2,6}银行\w{2,10}[分支]行$'
                                     '|\w{2,4}[省市县]\w{2,14}村)(股份)?经济(合作|联合)社$|国家税务总局\w{2,10}税务局$',
                                     p_entity.entity_text):
                            continue
                        if p_entity.entity_type == "location" and re.search('\d[楼室号]', p_entity.entity_text): # explicit addresses are not replaced, e.g. 434052508: 西宁市城西区西关大街128号山东大厦15楼1152室 must not become 西宁市城西
                            continue
                        # Find the last match that still ends inside the entity.
                        for _match_j in range(_match_index,len(list_match)):
                            if not list_match[_match_j]["end_index"]<=p_entity.wordOffset_end:
                                _match_j -= 1
                                break
                        # Several matches inside one entity: the entity becomes the first match,
                        # the remaining matches are added as new company entities.
                        if _match_j>_match_index:
                            match_replace = True
                            match_add = True
                            begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                            list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                            p_entity.entity_text = _match["entity_text"]
                            p_entity.wordOffset_begin = _match["begin_index"]
                            p_entity.wordOffset_end = _match["end_index"]
                            p_entity.begin_index = begin_index
                            p_entity.end_index = end_index
                            # this company entity was recognised through the dictionary
                            p_entity.if_dict_match = 1
                            for _match_h in range(_match_index+1,_match_j+1):
                                entity_text = list_match[_match_h]["entity_text"]
                                entity_type = "company"
                                begin_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["begin_index"])
                                end_index = changeIndexFromWordToWords(tokens,list_match[_match_h]["end_index"]-1)
                                entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                                add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,list_match[_match_h]["begin_index"],list_match[_match_h]["end_index"],in_attachment=p_sentence.in_attachment)
                                add_entity.if_dict_match = 1
                                list_entity.append(add_entity)
                                range_entity.append(add_entity)
                                list_calibrate.append({"type":"add","from":"","to":entity_text})
                            # NOTE(review): reassigning the loop variable does not make the enclosing
                            # `for _match_index in range(...)` skip the matches handled above.
                            _match_index = _match_j
                            break
                        continue
                    # case 2: the match starts at or before the entity and reaches into it
                    elif _match["begin_index"]<=p_entity.wordOffset_begin and _match["end_index"]>p_entity.wordOffset_begin:
                        find_flag = True
                        if _match["begin_index"]<p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
                            if p_entity.entity_type in ("org","company"):
                                _diff_text = sentence[p_entity.wordOffset_end:_match["end_index"]]
                                if re.search("分",_diff_text) is not None:
                                    pass
                                else:
                                    match_replace = True
                                    begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                                    list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                                    p_entity.entity_text = _match["entity_text"]
                                    p_entity.wordOffset_begin = _match["begin_index"]
                                    p_entity.wordOffset_end = _match["end_index"]
                                    p_entity.begin_index = begin_index
                                    p_entity.end_index = end_index
                                    p_entity.if_dict_match = 1
                        elif _match["end_index"]>=p_entity.wordOffset_end:
                            # do not update when the identical entity already exists in the sentence
                            if (_match["entity_text"],_match["begin_index"],_match["end_index"]) not in sentence_entitys:
                                match_replace = True
                                begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                                end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                                list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                                p_entity.entity_text = _match["entity_text"]
                                p_entity.wordOffset_begin = _match["begin_index"]
                                p_entity.wordOffset_end = _match["end_index"]
                                p_entity.begin_index = begin_index
                                p_entity.end_index = end_index
                                p_entity.entity_type = "company"
                                p_entity.if_dict_match = 1
                    # case 3: the match starts inside the entity and runs past its end
                    elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
                        find_flag = True
                        if p_entity.entity_type in ("org","company"):
                            match_replace = True
                            begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                            end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                            list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
                            p_entity.entity_text = _match["entity_text"]
                            p_entity.wordOffset_begin = _match["begin_index"]
                            p_entity.wordOffset_end = _match["end_index"]
                            p_entity.begin_index = begin_index
                            p_entity.end_index = end_index
                            p_entity.if_dict_match = 1
                # The match overlaps no existing entity: add it as a new company entity.
                if not find_flag:
                    match_add = True
                    entity_text = _match["entity_text"]
                    entity_type = "company"
                    begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
                    end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
                    entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
                    add_entity = Entity(p_sentence.doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,_match["begin_index"],_match["end_index"],in_attachment=p_sentence.in_attachment)
                    list_entity.append(add_entity)
                    range_entity.append(add_entity)
                    list_calibrate.append({"type":"add","from":"","to":entity_text})
        # de-duplicate the calibration records on from+to
        set_calibrate = set()
        list_match_enterprise = []
        for _calibrate in list_calibrate:
            _from = _calibrate.get("from","")
            _to = _calibrate.get("to","")
            _key = _from+_to
            if _key not in set_calibrate:
                list_match_enterprise.append(_calibrate)
                set_calibrate.add(_key)
        # 0 = nothing changed, 1 = added, 2 = replaced, 3 = both
        match_enterprise_type = 0
        if match_add:
            match_enterprise_type += 1
        if match_replace:
            match_enterprise_type += 2
        _article.match_enterprise = list_match_enterprise
        _article.match_enterprise_type = match_enterprise_type
  692. def isLegalEnterprise(name):
  693. is_legal = True
  694. if re.search("^[省市区县]",name) is not None or re.search("^\**.{,3}(分(公司|行|支)|街道|中心|办事处|经营部|委员会|有限公司)$",name) or re.search("标段|标包|名称|联系人|联系方式|中标单位|中标人|测试单位|采购单位|采购人|代理人|代理机构|盖章|(主)",name) is not None:
  695. is_legal = False
  696. return is_legal
  697. def fix_LEGAL_ENTERPRISE():
  698. unlegal_enterprise = []
  699. _path = getEnterprisePath()
  700. _sum = 0
  701. set_enter = set()
  702. paths = [_path]
  703. for _p in paths:
  704. with open(_p,"r",encoding="utf8") as f:
  705. while True:
  706. line = f.readline()
  707. if not line:
  708. break
  709. line = line.strip()
  710. if isLegalEnterprise(line):
  711. set_enter.add(line)
  712. if line=="有限责任公司" or line=='设计研究院' or line=='限责任公司' or (re.search("^.{,4}(分公司|支行|分行)$",line) is not None and re.search("电信|移动|联通|建行|工行|农行|中行|交行",line) is None):
  713. print(line)
  714. if line in set_enter:
  715. set_enter.remove(line)
  716. with open("enter.txt","w",encoding="utf8") as fwrite:
  717. for line in list(set_enter):
  718. fwrite.write(line.replace("(","(").replace(")",")"))
  719. fwrite.write("\n")
  720. # if re.search("标段|地址|标包|名称",line) is not None:#\(|\)||
  721. # _count += 1
  722. # print("=",line)
  723. # print("%d/%d"%(_count,_sum))
  724. # a_list = []
  725. # with open("电信分公司.txt","r",encoding="utf8") as f:
  726. # while True:
  727. # _line = f.readline()
  728. # if not _line:
  729. # break
  730. # if _line.strip()!="":
  731. # a_list.append(_line.strip())
  732. # with open("enter.txt","a",encoding="utf8") as f:
  733. # for _line in a_list:
  734. # f.write(_line)
  735. # f.write("\n")
  736. if __name__=="__main__":
  737. # edit_distance("GUMBO","GAMBOL")
  738. # print(jaccard_score("周口经济开发区陈营运粮河两岸拆迁工地土工布覆盖项目竞争性谈判公告","周口经济开发区陈营运粮河两岸拆迁工地土工布覆盖项目-成交公告"))
  739. #
  740. # sentences = "广州比地数据科技有限公司比地数据科技有限公司1111111123沈阳南光工贸有限公司"
  741. # print(match_enterprise_max_first(sentences))
  742. #
  743. # print("takes %d s"%(time.time()-_time))
  744. # fix_LEGAL_ENTERPRISE()
  745. # print(jaccard_score("吉林省九台","吉林省建苑设计集团有限公司"))
  746. print(match_enterprise_max_first("中国南方航空股份有限公司黑龙江分公司"))