'''
Created on 2019-05-21

Entity-linking helpers for bidding documents: link similar org/company
entities inside one document, and calibrate recognized entities against a
dictionary of legal enterprise names loaded from LEGAL_ENTERPRISE.txt.

NOTE(review): this file was recovered from a corrupted copy in which all
newlines were lost and every span between '<' and '>' had been stripped
(as if by an HTML sanitizer).  Comparison operators and a few dropped
statements below were reconstructed from context — confirm against the
upstream BiddingKG repository before relying on exact behavior.

@author: User
'''
import re
import os
import time

_time = time.time()

from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.Entitys import *
import json


def edit_distance(source, target):
    """Return a weighted Levenshtein distance between two sequences.

    Insertions and deletions cost 1; substitutions cost 2, so a substitution
    is never cheaper than a delete+insert pair.

    :param source: first sequence (indexed by the column axis of the table)
    :param target: second sequence (indexed by the row axis of the table)
    :return: the distance as an int
    """
    # Original initialized the table with "" placeholders; ints are correct.
    dp = [[0 for _ in range(len(source) + 1)] for _ in range(len(target) + 1)]
    for i in range(len(dp)):
        for j in range(len(dp[i])):
            if i == 0:
                dp[i][j] = j
            elif j == 0:
                dp[i][j] = i
            else:
                cost = 0 if source[j - 1] == target[i - 1] else 2
                dp[i][j] = min(dp[i - 1][j] + 1,
                               dp[i][j - 1] + 1,
                               dp[i - 1][j - 1] + cost)
    return dp[-1][-1]


def jaccard_score(source, target):
    """Character-set similarity of two strings.

    Returns max(|S∩T|/|S|, |S∩T|/|T|) over the character sets of the two
    strings, or 0 if either string is empty.
    """
    source_set = set(source)
    target_set = set(target)
    if len(source_set) == 0 or len(target_set) == 0:
        return 0
    inter = len(source_set & target_set)
    return max(inter / len(source_set), inter / len(target_set))


def link_entitys(list_entitys, on_value=0.8):
    """Link org/company entities within each document by name similarity.

    Two entities with different texts whose jaccard_score reaches
    ``on_value`` are cross-linked via their ``linked_entitys`` lists.
    Afterwards, an entity whose text does not contain "公司" is replaced by
    the longest linked name ending in "公司" (i.e. prefer the full company
    name over an abbreviation).

    :param list_entitys: list of per-document entity lists (mutated in place)
    :param on_value: similarity threshold for linking (default 0.8)
    """
    for list_entity in list_entitys:
        range_entity = []
        for _entity in list_entity:
            if _entity.entity_type in ["org", "company"]:
                range_entity.append(_entity)
        # cap the candidate set to keep the O(n^2) pair loop bounded
        range_entity = range_entity[:1000]
        for first_i in range(len(range_entity)):
            _entity = range_entity[first_i]
            for second_i in range(first_i + 1, len(range_entity)):
                _ent = range_entity[second_i]
                _score = jaccard_score(_entity.entity_text, _ent.entity_text)
                if _entity.entity_text != _ent.entity_text and _score >= on_value:
                    _entity.linked_entitys.append(_ent)
                    _ent.linked_entitys.append(_entity)
        # replace abbreviated company names with the longest linked full name
        for _entity in range_entity:
            if re.search("公司", _entity.entity_text) is None:
                for _ent in _entity.linked_entitys:
                    if re.search("公司$", _ent.entity_text) is not None:
                        if len(_ent.entity_text) > len(_entity.entity_text):
                            _entity.entity_text = _ent.entity_text


# Prefix-indexed dictionary of legal enterprise names:
# {first-4-chars: set(remaining suffixes)}.  Filled by getDict_enterprise,
# which runs in a background thread started at import time.
DICT_ENTERPRISE = {}
DICT_ENTERPRISE_DONE = False


def getDict_enterprise():
    """Load LEGAL_ENTERPRISE.txt into DICT_ENTERPRISE.

    Each stripped line of at least 4 characters is split into a 4-character
    key and its suffix; sets DICT_ENTERPRISE_DONE when loading is finished
    so waiters in match_enterprise_max_first can proceed.

    :return: the populated DICT_ENTERPRISE mapping
    """
    global DICT_ENTERPRISE, DICT_ENTERPRISE_DONE
    filename = os.path.dirname(__file__) + "/../LEGAL_ENTERPRISE.txt"
    filepath = os.path.dirname(__file__) + "/../"
    real_path = filename
    if os.path.exists(os.path.join(filepath, filename)):
        real_path = os.path.join(filepath, filename)
    with open(real_path, "r", encoding="UTF8") as f:
        for _e in f:
            if not _e:
                continue
            _e = _e.strip()
            if len(_e) >= 4:
                key_enter = _e[:4]
                if key_enter not in DICT_ENTERPRISE:
                    DICT_ENTERPRISE[key_enter] = set()
                DICT_ENTERPRISE[key_enter].add(_e[4:])
    DICT_ENTERPRISE_DONE = True
    return DICT_ENTERPRISE


import threading

# Load the (large) enterprise dictionary in the background so importing this
# module stays fast; consumers busy-wait on DICT_ENTERPRISE_DONE.
load_enterprise_thread = threading.Thread(target=getDict_enterprise)
load_enterprise_thread.start()

# Maximum length (in characters) of an enterprise name we try to match.
MAX_ENTERPRISE_LEN = 30


def match_enterprise_max_first(sentence):
    """Greedy longest-first dictionary match of enterprise names.

    Blocks until the background dictionary load has finished, then scans
    ``sentence`` left to right: at each position the 4-character prefix is
    looked up in DICT_ENTERPRISE and the longest suffix present in the
    prefix's set wins.  After a hit, scanning resumes past the matched name.

    NOTE(review): the inner-loop bounds and slice arithmetic were
    reconstructed from a corrupted source — confirm against upstream.

    :param sentence: text to scan
    :return: list of {"entity_text", "begin_index", "end_index"} dicts,
             indices being character offsets into ``sentence``
    """
    while True:
        if not DICT_ENTERPRISE_DONE:
            time.sleep(1)
        else:
            break
    list_match = []
    begin_index = 0
    if len(sentence) > 4:
        while True:
            if begin_index + 4 < len(sentence):
                key_enter = sentence[begin_index:begin_index + 4]
                if key_enter in DICT_ENTERPRISE:
                    # try the longest candidate suffix first (max-first match)
                    for _length in range(MAX_ENTERPRISE_LEN - 4, 0, -1):
                        enter_name = sentence[begin_index + 4:begin_index + 4 + _length]
                        if enter_name in DICT_ENTERPRISE[key_enter]:
                            list_match.append({"entity_text": "%s%s" % (key_enter, enter_name),
                                               "begin_index": begin_index,
                                               "end_index": begin_index + 4 + len(enter_name)})
                            # jump past the matched name (the +1 below re-adds one)
                            begin_index += 4 + len(enter_name) - 1
                            break
                begin_index += 1
            else:
                break
    return list_match


def calibrateEnterprise(list_articles, list_sentences, list_entitys):
    """Calibrate recognized entities against the enterprise dictionary.

    For every article, dictionary matches found in each sentence are used to:
      (a) promote a "location" entity whose text equals a match to "company";
      (b) replace/extend overlapping org/company entities with the dictionary
          text (including splitting one entity that covers several companies);
      (c) add brand-new company entities for hits no existing entity covers.

    The deduplicated calibration actions are stored on the article as
    ``match_enterprise`` and a bitmask ``match_enterprise_type``
    (1 = entities added, 2 = entities replaced, 3 = both).

    NOTE(review): the ``def`` line and several comparison operators of this
    function were lost in the corrupted source; the overlap conditions below
    were reconstructed from the surviving fragments — confirm against the
    upstream repository.

    :param list_articles: one Article per document (mutated in place)
    :param list_sentences: per-document lists of Sentence objects
    :param list_entitys: per-document lists of Entity objects (mutated)
    """
    for _article, list_sentence, list_entity in zip(list_articles, list_sentences, list_entitys):
        list_calibrate = []
        match_add = False
        match_replace = False
        range_entity = []
        for p_entity in list_entity:
            if p_entity.entity_type in ("org", "company", "location"):
                range_entity.append(p_entity)
            if len(range_entity) > 1000:
                break
        for p_sentence in list_sentence:
            sentence = p_sentence.sentence_text
            list_match = match_enterprise_max_first(sentence)
            doc_id = p_sentence.doc_id
            sentence_index = p_sentence.sentence_index
            tokens = p_sentence.tokens
            list_match.sort(key=lambda x: x["begin_index"])
            for _match_index in range(len(list_match)):
                _match = list_match[_match_index]
                find_flag = False
                for p_entity in range_entity:
                    if p_entity.sentence_index != p_sentence.sentence_index:
                        continue
                    # A "location" with exactly the matched text is really a
                    # company.  (Corrupted source compared the entity object
                    # itself to "location"; .entity_type is clearly intended.)
                    if p_entity.entity_type == "location" and p_entity.entity_text == _match["entity_text"]:
                        find_flag = True
                        p_entity.entity_type = "company"
                    # -- overlap handling --
                    # Case 1: match fully contained in the entity span.
                    if _match["begin_index"] >= p_entity.wordOffset_begin and _match["end_index"] <= p_entity.wordOffset_end:
                        find_flag = True
                        # Does this entity actually cover several consecutive
                        # dictionary matches (several companies)?
                        for _match_j in range(_match_index, len(list_match)):
                            if not list_match[_match_j]["end_index"] <= p_entity.wordOffset_end:
                                _match_j -= 1
                                break
                        if _match_j > _match_index:
                            match_replace = True
                            match_add = True
                            begin_index = changeIndexFromWordToWords(tokens, _match["begin_index"])
                            end_index = changeIndexFromWordToWords(tokens, _match["end_index"])
                            list_calibrate.append({"type": "update", "from": p_entity.entity_text, "to": _match["entity_text"]})
                            # shrink the entity to the first company...
                            p_entity.entity_text = _match["entity_text"]
                            p_entity.wordOffset_begin = _match["begin_index"]
                            p_entity.wordOffset_end = _match["end_index"]
                            p_entity.begin_index = begin_index
                            p_entity.end_index = end_index
                            # ...and add the remaining companies as new entities
                            for _match_h in range(_match_index + 1, _match_j + 1):
                                entity_text = list_match[_match_h]["entity_text"]
                                entity_type = "company"
                                begin_index = changeIndexFromWordToWords(tokens, list_match[_match_h]["begin_index"])
                                end_index = changeIndexFromWordToWords(tokens, list_match[_match_h]["end_index"])
                                entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                                add_entity = Entity(p_sentence.doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index, list_match[_match_h]["begin_index"], list_match[_match_h]["end_index"])
                                list_entity.append(add_entity)
                                range_entity.append(add_entity)
                                list_calibrate.append({"type": "add", "from": "", "to": entity_text})
                            # NOTE(review): rebinding the loop variable does
                            # not skip iterations in Python; already-consumed
                            # matches are re-visited but find nothing to do.
                            _match_index = _match_j
                        break
                    # Case 2: match overlaps the left edge of the entity.
                    elif _match["begin_index"] <= p_entity.wordOffset_begin and _match["end_index"] > p_entity.wordOffset_begin:
                        find_flag = True
                        if p_entity.entity_type in ("org", "company"):
                            if _match["begin_index"] < p_entity.wordOffset_begin and _match["end_index"] >= p_entity.wordOffset_end:
                                match_replace = True
                                begin_index = changeIndexFromWordToWords(tokens, _match["begin_index"])
                                end_index = changeIndexFromWordToWords(tokens, _match["end_index"])
                                list_calibrate.append({"type": "update", "from": p_entity.entity_text, "to": _match["entity_text"]})
                                p_entity.entity_text = _match["entity_text"]
                                p_entity.wordOffset_begin = _match["begin_index"]
                                p_entity.wordOffset_end = _match["end_index"]
                                p_entity.begin_index = begin_index
                                p_entity.end_index = end_index
                    # Case 3: match overlaps the right edge of the entity.
                    elif _match["begin_index"] < p_entity.wordOffset_end and _match["end_index"] > p_entity.wordOffset_end:
                        find_flag = True
                        if p_entity.entity_type in ("org", "company"):
                            match_replace = True
                            begin_index = changeIndexFromWordToWords(tokens, _match["begin_index"])
                            end_index = changeIndexFromWordToWords(tokens, _match["end_index"])
                            list_calibrate.append({"type": "update", "from": p_entity.entity_text, "to": _match["entity_text"]})
                            p_entity.entity_text = _match["entity_text"]
                            p_entity.wordOffset_begin = _match["begin_index"]
                            p_entity.wordOffset_end = _match["end_index"]
                            p_entity.begin_index = begin_index
                            p_entity.end_index = end_index
                # No existing entity covers this match: add a new company entity.
                if not find_flag:
                    match_add = True
                    entity_text = _match["entity_text"]
                    entity_type = "company"
                    begin_index = changeIndexFromWordToWords(tokens, _match["begin_index"])
                    end_index = changeIndexFromWordToWords(tokens, _match["end_index"])
                    entity_id = "%s_%d_%d_%d" % (doc_id, sentence_index, begin_index, end_index)
                    add_entity = Entity(p_sentence.doc_id, entity_id, entity_text, entity_type, sentence_index, begin_index, end_index, _match["begin_index"], _match["end_index"])
                    list_entity.append(add_entity)
                    range_entity.append(add_entity)
                    list_calibrate.append({"type": "add", "from": "", "to": entity_text})
        # deduplicate calibration records by (from, to) text pair
        set_calibrate = set()
        list_match_enterprise = []
        for _calibrate in list_calibrate:
            _from = _calibrate.get("from", "")
            _to = _calibrate.get("to", "")
            _key = _from + _to
            if _key not in set_calibrate:
                list_match_enterprise.append(_calibrate)
                set_calibrate.add(_key)
        # bitmask: 1 = entities added, 2 = entities replaced
        match_enterprise_type = 0
        if match_add:
            match_enterprise_type += 1
        if match_replace:
            match_enterprise_type += 2
        _article.match_enterprise = list_match_enterprise
        _article.match_enterprise_type = match_enterprise_type


if __name__ == "__main__":
    # edit_distance("GUMBO","GAMBOL")
    # print(jaccard_score("GUMBO","GAMBOL"))
    sentences = "广州比地数据科技有限公司比地数据科技有限公司1111111123沈阳南光工贸有限公司"
    print(match_enterprise_max_first(sentences))
    print("takes %d s" % (time.time() - _time))