#coding:UTF8
'''
Created on 2019-05-21

@author: User

Helpers for linking organisation/company entities that refer to the same
enterprise and for calibrating recognised entities against a dictionary of
legal enterprise names.

NOTE(review): this file was recovered from a copy whose line breaks were
collapsed and in which three spans of the form "<identifier ... >" had been
stripped. The reconstructed places are marked with NOTE(review) below and
should be compared with version control before merging.
'''
import re
import os
import time
import pandas as pd
_time = time.time()
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.Entitys import *
import json


def edit_distance(source,target):
    """Return the edit distance between *source* and *target*.

    Insertions and deletions cost 1, a substitution costs 2.
    """
    rows = len(target)+1
    cols = len(source)+1
    dp = [[0]*cols for _ in range(rows)]
    for j in range(cols):
        dp[0][j] = j
    for i in range(1,rows):
        dp[i][0] = i
        for j in range(1,cols):
            cost = 0 if source[j-1]==target[i-1] else 2
            dp[i][j] = min(dp[i-1][j]+1,dp[i][j-1]+1,dp[i-1][j-1]+cost)
    return dp[-1][-1]


def jaccard_score(source,target):
    """Return a character-set overlap score in [0, 1].

    The number of shared distinct characters is divided by the size of each
    character set and the larger ratio is returned, so the score is 1 as soon
    as one character set is contained in the other. Returns 0 when either
    input has no characters.
    """
    source_set = set(source)
    target_set = set(target)
    if not source_set or not target_set:
        return 0
    common = len(source_set&target_set)
    return max(common/len(source_set),common/len(target_set))


def get_place_list():
    """Load the place names used to strip regions from company names.

    Reads the second column of ``../place_info.csv`` (relative to this file),
    adds three names that are appended explicitly, and returns the distinct
    values as a list (order is not defined).
    """
    path = os.path.dirname(__file__) + '/../place_info.csv'
    place_df = pd.read_csv(path)
    # Explicit positional access: plain ``row[1]`` on a label-indexed Series
    # is deprecated in recent pandas releases.
    place_list = place_df.iloc[:, 1].tolist()
    place_list.append('台湾')
    place_list.append('澳门')
    place_list.append('香港')
    # Disabled candidates kept for reference:
    # '東莞', '廣州', '韩国', '德国', '英国', '日本', '意大利', '新加坡',
    # '加拿大', '西班牙', '澳大利亚', '美国'
    place_list = list(set(place_list))
    return place_list


place_list = get_place_list()
place_pattern = "|".join(place_list)


def link_entitys(list_entitys,on_value=1):#on_value=0.81
    """Link similar org/company entities and unify their names, in place.

    For every entity list in *list_entitys* (only the first 1000 org/company
    entities of each list are considered):

    1. Two entities with different texts are linked (each is appended to the
       other's ``linked_entitys``) when the ``jaccard_score`` of their texts,
       after removing company suffix words and place names, is at least
       *on_value*.
    2. An entity whose text does not contain "公司" takes the text of a longer
       linked entity whose text ends with "公司".
    3. An entity takes the text of a longer linked entity that was recognised
       by the dictionary (``if_dict_match == 1``) when both texts contain the
       same place names; the previous text is kept in ``origin_entity_text``.

    :param list_entitys: iterable of entity lists; entities are mutated.
    :param on_value: minimum score for two entities to be linked.
    """
    # Same pattern text as before, compiled once per call instead of being
    # rebuilt and applied twice for every entity pair.
    noise_pattern = re.compile("%s|%s"%("股份|责任|有限|公司",place_pattern))
    for list_entity in list_entitys:
        range_entity = [_entity for _entity in list_entity
                        if _entity.entity_type in ["org","company"]]
        range_entity = range_entity[:1000]
        # Entity texts are not modified while pairs are compared, so the
        # cleaned texts can be computed once per entity.
        cleaned_texts = [noise_pattern.sub("",_entity.entity_text) for _entity in range_entity]

        for first_i,_entity in enumerate(range_entity):
            for second_i in range(first_i+1,len(range_entity)):
                _ent = range_entity[second_i]
                # Entities with identical text are never linked.
                if _entity.entity_text==_ent.entity_text:
                    continue
                # 2021/5/21 update: skip when the two labels are mutually
                # exclusive (one is the tenderee, the other the agency).
                if _entity.label != _ent.label and _entity.label in [0,1] and _ent.label in [0, 1]:
                    continue
                _score = jaccard_score(cleaned_texts[first_i],cleaned_texts[second_i])
                if _score>=on_value:
                    _entity.linked_entitys.append(_ent)
                    _ent.linked_entitys.append(_entity)

        # Replace names lacking "公司" by a longer linked name ending in "公司".
        for _entity in range_entity:
            if re.search("公司",_entity.entity_text) is None:
                for _ent in _entity.linked_entitys:
                    if re.search("公司$",_ent.entity_text) is not None:
                        if len(_ent.entity_text)>len(_entity.entity_text):
                            _entity.entity_text = _ent.entity_text

        # 2021/12/21: replace by the longest similar entity that was
        # recognised through the dictionary.
        for _entity in range_entity:
            used_linked_entitys = []
            if not _entity.linked_entitys:
                continue
            _entity.linked_entitys.sort(key=lambda x: len(x.entity_text), reverse=True)
            for _ent in _entity.linked_entitys:
                if _ent in used_linked_entitys:
                    break
                if _ent.if_dict_match == 1:
                    if len(_ent.entity_text) > len(_entity.entity_text):
                        # Both names must mention the same places.
                        match_list_1 = [place for place in place_list if place in _entity.entity_text]
                        match_list_2 = [place for place in place_list if place in _ent.entity_text]
                        if match_list_1 == match_list_2:
                            _entity.origin_entity_text = _entity.entity_text
                            _entity.entity_text = _ent.entity_text
                            # NOTE(review): nesting of this append could not be
                            # recovered from the collapsed source - confirm.
                            used_linked_entitys.append(_ent)


# Title used for de-duplication
def doctitle_refine(doctitle):
    """Return *doctitle* with generic bidding/announcement words removed."""
    _doctitle_refine = re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|'
                              r'交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|竞价|合同', '', doctitle)
    return _doctitle_refine


# First 100 company entities
def get_nlp_enterprise(list_entity):
    """Collect distinct org/company names in document order.

    Entities are ordered by ``(sentence_index, begin_index)``. Names of
    entities with a falsy ``in_attachment`` go to the first list, the others
    to the second list.

    :return: ``(names, attachment_names)``, each truncated to 100 items.
    """
    max_num = 100
    # dicts keep insertion order, giving an order-preserving de-duplication.
    nlp_enterprise = {}
    nlp_enterprise_attachment = {}
    for entity in sorted(list_entity,key=lambda x:(x.sentence_index,x.begin_index)):
        if entity.entity_type not in ['org','company']:
            continue
        if not entity.in_attachment:
            nlp_enterprise.setdefault(entity.entity_text)
        else:
            nlp_enterprise_attachment.setdefault(entity.entity_text)
    return list(nlp_enterprise)[:max_num],list(nlp_enterprise_attachment)[:max_num]


def getEnterprisePath():
    """Return the path of ``LEGAL_ENTERPRISE.txt``.

    Uses ``getFileFromSysPath`` and falls back to the bare file name when it
    returns ``None``.
    """
    filename = "LEGAL_ENTERPRISE.txt"
    real_path = getFileFromSysPath(filename)
    if real_path is None:
        real_path = filename
    return real_path


# Enterprise dictionary: first four characters of a name -> set of the
# remaining parts of all names starting with those characters.
DICT_ENTERPRISE = {}
# Set to True once getDict_enterprise has read the whole file.
DICT_ENTERPRISE_DONE = False


def getDict_enterprise():
    """Fill ``DICT_ENTERPRISE`` from the enterprise file and return it.

    Every stripped line of at least four characters is stored under its
    first four characters. ``DICT_ENTERPRISE_DONE`` is set when finished.
    """
    global DICT_ENTERPRISE,DICT_ENTERPRISE_DONE
    real_path = getEnterprisePath()
    with open(real_path,"r",encoding="UTF8") as f:
        for _e in f:
            _e = _e.strip()
            if len(_e)>=4:
                DICT_ENTERPRISE.setdefault(_e[:4],set()).add(_e[4:])
    DICT_ENTERPRISE_DONE = True
    return DICT_ENTERPRISE


import threading
import time

# The dictionary is loaded in a background thread started at import time.
load_enterprise_thread = threading.Thread(target=getDict_enterprise)
load_enterprise_thread.start()
MAX_ENTERPRISE_LEN = 30


def match_enterprise_max_first(sentence):
    """Find dictionary enterprise names in *sentence*, longest match first.

    Blocks (polling once per second) until the dictionary has been loaded.

    :return: list of dicts with keys ``entity_text``, ``begin_index`` and
        ``end_index`` (end exclusive, as used by the calibration code).
    """
    while not DICT_ENTERPRISE_DONE:
        time.sleep(1)
    list_match = []
    begin_index = 0
    if len(sentence)>4:
        # NOTE(review): everything after "begin_index+4" in this loop was
        # lost in the recovered source and has been reconstructed from the
        # dictionary layout and from how the result is consumed - verify.
        while begin_index+4<len(sentence):
            key_enter = sentence[begin_index:begin_index+4]
            if key_enter in DICT_ENTERPRISE:
                # Try the longest candidate first, then shrink.
                for _i in range(MAX_ENTERPRISE_LEN-4+1):
                    enter_name = sentence[begin_index+4:begin_index+MAX_ENTERPRISE_LEN-_i]
                    if enter_name in DICT_ENTERPRISE[key_enter]:
                        match_item = {"entity_text":"%s%s"%(key_enter,enter_name),
                                      "begin_index":begin_index,
                                      "end_index":begin_index+len(key_enter)+len(enter_name)}
                        list_match.append(match_item)
                        # Continue right after the matched name.
                        begin_index += (len(key_enter)+len(enter_name))-1
                        break
            begin_index += 1
    return list_match


def _retarget_entity(p_entity,_match,tokens,list_calibrate):
    """Make *p_entity* cover *_match* and record the update in *list_calibrate*."""
    # changeIndexFromWordToWords is imported from Utils; presumably it maps a
    # character offset to a token index - TODO confirm.
    begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
    end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
    list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
    p_entity.entity_text = _match["entity_text"]
    p_entity.wordOffset_begin = _match["begin_index"]
    p_entity.wordOffset_end = _match["end_index"]
    p_entity.begin_index = begin_index
    p_entity.end_index = end_index
    # This company entity was recognised by the dictionary.
    p_entity.if_dict_match = 1


def _new_match_entity(p_sentence,_match):
    """Create a "company" Entity in *p_sentence* for the dictionary match *_match*."""
    tokens = p_sentence.tokens
    sentence_index = p_sentence.sentence_index
    begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
    end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
    entity_id = "%s_%d_%d_%d"%(p_sentence.doc_id,sentence_index,begin_index,end_index)
    return Entity(p_sentence.doc_id,entity_id,_match["entity_text"],"company",sentence_index,begin_index,end_index,_match["begin_index"],_match["end_index"],in_attachment=p_sentence.in_attachment)


# NOTE(review): the "def" line and the first statements of this function were
# lost in the recovered source. The name, the parameter names and the
# prologue down to the size check are reconstructed from the variables the
# surviving body uses - confirm against callers before merging.
def calibrateEnterprise(list_articles,list_sentences,list_entitys):
    """Calibrate recognised entities against dictionary matches, in place.

    For every sentence the dictionary matches are compared with the
    location/org/company entities of that sentence: entities are re-targeted
    to the dictionary name where they overlap, and new "company" entities are
    added for matches no entity covers. The de-duplicated list of changes is
    stored in ``_article.match_enterprise`` and a bit flag in
    ``_article.match_enterprise_type`` (1 = entity added, 2 = entity replaced).
    """
    for _article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
        list_calibrate = []
        match_add = False
        match_replace = False
        range_entity = []
        for p_entity in list_entity:
            if p_entity.entity_type in ("org","company","location"):
                range_entity.append(p_entity)
        if len(range_entity)>1000:
            break
        for p_sentence in list_sentence:
            sentence = p_sentence.sentence_text
            sentence_entitys = [(ent.entity_text,ent.wordOffset_begin,ent.wordOffset_end) for ent in list_entity if ent.sentence_index==p_sentence.sentence_index and ent.entity_type in ['org','company']]
            list_match = match_enterprise_max_first(sentence)
            tokens = p_sentence.tokens
            list_match.sort(key=lambda x:x["begin_index"])
            for _match_index,_match in enumerate(list_match):
                find_flag = False
                for p_entity in range_entity:
                    if p_entity.sentence_index!=p_sentence.sentence_index:
                        continue
                    # A location whose text equals a dictionary name is
                    # promoted to a company.
                    if p_entity.entity_type=="location" and p_entity.entity_text==_match["entity_text"]:
                        find_flag = True
                        p_entity.entity_type = "company"
                        p_entity.if_dict_match = 1
                    if p_entity.entity_type not in ["location","org","company"]:
                        continue
                    if _match["entity_text"] == p_entity.entity_text:
                        p_entity.if_dict_match = 1
                    # Overlap handling.
                    # The match lies inside the entity: nothing to do unless
                    # the entity spans several dictionary names.
                    if _match["begin_index"]>=p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
                        find_flag = True
                        # Find the last match that still ends inside the entity.
                        for _match_j in range(_match_index,len(list_match)):
                            if not list_match[_match_j]["end_index"]<=p_entity.wordOffset_end:
                                _match_j -= 1
                                break
                        if _match_j>_match_index:
                            # The entity covers several companies: shrink it to
                            # the first one and add the others as new entities.
                            match_replace = True
                            match_add = True
                            _retarget_entity(p_entity,_match,tokens,list_calibrate)
                            for _match_h in range(_match_index+1,_match_j+1):
                                add_entity = _new_match_entity(p_sentence,list_match[_match_h])
                                add_entity.if_dict_match = 1
                                list_entity.append(add_entity)
                                range_entity.append(add_entity)
                                list_calibrate.append({"type":"add","from":"","to":add_entity.entity_text})
                            # NOTE(review): the original assigned the outer loop
                            # variable here, which does not skip the absorbed
                            # matches; they are still visited afterwards.
                            break
                        continue
                    elif _match["begin_index"]<=p_entity.wordOffset_begin and _match["end_index"]>p_entity.wordOffset_begin:
                        find_flag = True
                        # NOTE(review): the middle of this comparison was lost;
                        # only "=p_entity.wordOffset_end" survived - verify.
                        if _match["begin_index"]<p_entity.wordOffset_begin and _match["end_index"]>=p_entity.wordOffset_end:
                            # Do not add it again when the entity list already
                            # contains this exact entity.
                            if (_match["entity_text"],_match["begin_index"],_match["end_index"]) not in sentence_entitys:
                                match_replace = True
                                _retarget_entity(p_entity,_match,tokens,list_calibrate)
                                p_entity.entity_type = "company"
                    # NOTE(review): the middle of this comparison was lost;
                    # reconstructed as "match overlaps the entity's end" - verify.
                    elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
                        find_flag = True
                        if p_entity.entity_type in ("org","company"):
                            match_replace = True
                            _retarget_entity(p_entity,_match,tokens,list_calibrate)
                if not find_flag:
                    # No entity overlaps the match: add it as a new company.
                    # if_dict_match is left at the Entity default here.
                    match_add = True
                    add_entity = _new_match_entity(p_sentence,_match)
                    list_entity.append(add_entity)
                    range_entity.append(add_entity)
                    list_calibrate.append({"type":"add","from":"","to":add_entity.entity_text})

        # De-duplicate the recorded changes on "from"+"to".
        set_calibrate = set()
        list_match_enterprise = []
        for _calibrate in list_calibrate:
            _from = _calibrate.get("from","")
            _to = _calibrate.get("to","")
            _key = _from+_to
            if _key not in set_calibrate:
                list_match_enterprise.append(_calibrate)
                set_calibrate.add(_key)

        match_enterprise_type = 0
        if match_add:
            match_enterprise_type += 1
        if match_replace:
            match_enterprise_type += 2
        _article.match_enterprise = list_match_enterprise
        _article.match_enterprise_type = match_enterprise_type


def isLegalEnterprise(name):
    """Return False for names that should not be kept in the dictionary.

    A name is rejected when it starts with one of "省市区县", when it is a
    generic suffix preceded by at most three characters (optionally after
    leading asterisks), or when it contains one of the listed marker words.
    """
    # NOTE(review): "(主)" is a regex group here, so any name containing "主"
    # is rejected - confirm this is intended.
    if re.search("^[省市区县]",name) is not None:
        return False
    # Raw string: "\*" is an invalid escape sequence in a normal literal.
    if re.search(r"^\**.{,3}(分(公司|行|支)|街道|中心|办事处|经营部|委员会|有限公司)$",name):
        return False
    if re.search("标段|标包|名称|联系人|联系方式|中标单位|中标人|测试单位|采购单位|采购人|代理人|代理机构|盖章|(主)",name) is not None:
        return False
    return True


def fix_LEGAL_ENTERPRISE():
    """Filter the enterprise dictionary file and write the result to ``enter.txt``.

    Keeps the distinct lines accepted by ``isLegalEnterprise`` and drops (and
    prints) a few generic names as well as short branch names that do not
    mention one of the listed telecom operators or banks.
    """
    _path = getEnterprisePath()
    set_enter = set()
    paths = [_path]
    for _p in paths:
        with open(_p,"r",encoding="utf8") as f:
            for line in f:
                line = line.strip()
                if isLegalEnterprise(line):
                    set_enter.add(line)
                if line=="有限责任公司" or line=='设计研究院' or line=='限责任公司' or (re.search("^.{,4}(分公司|支行|分行)$",line) is not None and re.search("电信|移动|联通|建行|工行|农行|中行|交行",line) is None):
                    print(line)
                    set_enter.discard(line)
    with open("enter.txt","w",encoding="utf8") as fwrite:
        for line in set_enter:
            # NOTE(review): both replacements are no-ops as written; they may
            # originally have converted full-width parentheses - confirm.
            fwrite.write(line.replace("(","(").replace(")",")"))
            fwrite.write("\n")


if __name__=="__main__":
    # Manual checks that were used during development:
    #   edit_distance("GUMBO","GAMBOL")
    #   jaccard_score("中国南方航空股份有限公司上海分公司","南方航空上海分公司")
    #   match_enterprise_max_first("广州比地数据科技有限公司比地数据科技有限公司1111111123沈阳南光工贸有限公司")
    #   match_enterprise_max_first("中国南方航空股份有限公司黑龙江分公司")
    fix_LEGAL_ENTERPRISE()