#coding:UTF8
'''
Entity linking and enterprise-dictionary calibration for bidding documents.

Created on 2019-05-21

@author: User

NOTE(review): the copy of this module that was reviewed had lost its line
breaks and indentation, had every ``<identifier ... >`` span stripped (as if it
were an HTML tag) and had full-width punctuation folded to ASCII.  Block
nesting and the stripped comparisons were reconstructed from context; every
reconstructed spot is marked ``NOTE(review)`` and must be confirmed against
version control before this file is trusted.
'''
import re
import os
import time
import pandas as pd
_time = time.time()
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.Entitys import *
import json
from BiddingKG.dl.common.constDict import ConstDict
# Imported explicitly (and after the star imports) so that these names are
# guaranteed to be the standard-library modules used below.
import sys
import threading
import time
import traceback

# Process-wide cache: entity text -> (have_business, business_dict), so that
# Redis is queried at most once per name.
business_dic = {}


def edit_distance(source,target):
    '''
    Return the edit distance between two sequences.

    Insertions and deletions cost 1, substitutions cost 2.

    :param source: first sequence
    :param target: second sequence
    :return: integer distance
    '''
    dp = [[0 for _ in range(len(source)+1)] for _ in range(len(target)+1)]
    for i in range(len(dp)):
        for j in range(len(dp[i])):
            if i==0:
                dp[i][j] = j
            elif j==0:
                dp[i][j] = i
            else:
                cost = 0 if source[j-1]==target[i-1] else 2
                dp[i][j] = min(dp[i-1][j]+1,dp[i][j-1]+1,dp[i-1][j-1]+cost)
    return dp[-1][-1]


def jaccard_score(source,target):
    '''
    Return the character-set overlap of two strings.

    The intersection size is divided by the size of each set and the larger
    ratio is returned; 0 is returned when either string is empty.
    '''
    source_set = set(source)
    target_set = set(target)
    if len(source_set)==0 or len(target_set)==0:
        return 0
    common = len(source_set&target_set)
    return max(common/len(source_set),common/len(target_set))


def get_place_list():
    '''
    Load the place names from ``../place_info.csv`` (second column) and add
    Taiwan, Macao and Hong Kong.

    :return: de-duplicated list of place names
    '''
    path = os.path.dirname(__file__) + '/../place_info.csv'
    place_df = pd.read_csv(path)
    # Second column by position; ``iloc`` avoids the deprecated positional
    # fallback of ``row[1]`` on a label-indexed Series.
    place_list = list(place_df.iloc[:, 1])
    place_list.append('台湾')
    place_list.append('澳门')
    place_list.append('香港')
    place_list = list(set(place_list))
    return place_list


place_list = get_place_list()
place_pattern = "|".join(place_list)


def is_short(shorter_cut, longer):
    '''
    Tell whether ``shorter_cut`` is an abbreviation of ``longer``.

    Every element of ``shorter_cut`` must occur in ``longer``, in order and
    without overlapping.

    :param shorter_cut: the abbreviation (iterated element by element)
    :param longer: the full name
    :return: 1 if it is an abbreviation, otherwise 0
    '''
    for words in shorter_cut:
        if words not in longer:
            return 0
        longer = longer[longer.find(words) + len(words):]
    return 1


def get_business_data(enterprise_name):
    '''
    Look up whether a company name has business-registration data.

    In "huge" mode the name is looked up in Redis and the stored JSON is
    decoded; otherwise the name is checked against the in-memory set.

    :param enterprise_name: company name
    :return: ``(True, data)`` when registration data exists, otherwise
             ``(False, data_or_empty_dict)``
    '''
    global ENTERPRISE_HUGE,SET_ENTERPRISE,POOL_REDIS
    if ENTERPRISE_HUGE:
        if POOL_REDIS is None:
            init_redis_pool()
        _db = POOL_REDIS.getConnector()
        try:
            _v = _db.get(enterprise_name)
            if _v is None:
                return False, {}
            _v = str(_v, 'utf-8')
            if 'have_business' in _v:
                d = json.loads(_v)
                if d.get('have_business', '') == 1:
                    return True, d
                return False, d
            return False, {}
        except Exception:
            traceback.print_exc()
            return False, {}
        finally:
            # Always hand the connector back, also when the lookup failed,
            # so repeated errors cannot drain the pool.
            POOL_REDIS.putConnector(_db)
    else:
        if enterprise_name in SET_ENTERPRISE:
            return True, {}
        return False, {}


def _cached_business_data(name):
    '''Return ``(have_business, data)`` for ``name``, querying Redis only once per name.'''
    if name not in business_dic:
        business_dic[name] = get_business_data(name)
    return business_dic[name]


def get_role(dic):
    '''
    Derive the dominant role of a company from its announcement counters.

    :param dic: business data dictionary fetched from Redis
    :return: ``(label, ratio)`` where label is 0 (tenderee), 1 (agency) or
             2 (winner); ``(5, 0)`` when counters are missing or the total
             is not above 100
    '''
    if 'zhao_biao_number' in dic:
        zhaobiao = dic.get('zhao_biao_number', 0)
        daili = dic.get('dai_li_number', 0)
        zhongbiao = dic.get('zhong_biao_number', 0)
        bid = zhaobiao + daili + zhongbiao
        if bid > 100:  # only trust the statistics above 100 announcements
            if zhaobiao>=daili:
                if zhaobiao>=zhongbiao:
                    return 0, zhaobiao/bid
                return 2, zhongbiao/bid
            elif daili >= zhongbiao:
                return 1, daili/bid
            else:
                return 2, zhongbiao/bid
    return 5, 0


def _apply_business_role(entity, lb, prob):
    '''
    Adjust an entity's label using the business statistics.

    When the statistical role is tenderee/agency with ratio above 0.9 and the
    in-text probability is below 0.55, switch to the statistical label (or
    boost the probability by 0.05 when the labels already agree).
    '''
    if lb in [0, 1] and prob > 0.9 and entity.values[entity.label] < 0.55:
        if entity.label != lb:
            entity.label = lb
            entity.values[entity.label] = 0.55
        else:
            entity.values[entity.label] += 0.05


def link_entitys(list_entitys,on_value=1):#on_value=0.81
    '''
    Link and normalise org/company entities of each document in place.

    For every document: strip bidding-office suffixes from names, consult the
    business data to adjust tenderee/agency labels, replace short names by a
    matching name that has business data, and replace names by their longest
    dictionary-matched linked entity when both mention the same places.

    :param list_entitys: list of per-document entity lists
    :param on_value: similarity threshold of the (currently disabled)
                     jaccard linking pass; unused
    '''
    for list_entity in list_entitys:
        range_entity = []
        short_entity = []  # entities without business data
        long_entity = []  # entities with business data
        n = 0
        bus_dic = {}  # entity text -> (role label, ratio) from business data
        find_tenderee = False
        bus_tenderee = []
        # NOTE(review): nesting of this loop body was reconstructed.
        for _entity in list_entity:
            if _entity.entity_type in ["org","company"]:
                # 2024-06-07: normalise unit names, drop the bidding/purchase office suffix.
                # NOTE(review): the group name was stripped in the reviewed
                # copy; ``name`` is what ``ser.group('name')`` requires.
                ser = re.search('(?P<name>.{2,}(医院|大学|公司))(招[投议]?标|采购)(中心|办公室)$', _entity.entity_text)
                if ser:
                    _entity.entity_text = ser.group('name')
                range_entity.append(_entity)
                if _entity.entity_text in bus_dic:
                    have_bus = True
                else:
                    # 2024-07-08: cached to avoid repeated Redis queries.
                    have_bus, dic = _cached_business_data(_entity.entity_text)
                    if re.search(r'^\w{,5}[分支](行|公司)$|^\w{1,3}公司$|^\w{2,5}段$', _entity.entity_text):
                        have_bus = False
                    if have_bus:
                        lb, prob = get_role(dic)
                        bus_dic[_entity.entity_text] = (lb, prob)
                        if lb == 0 and prob > 0.9 and re.search('医院|学院|学校|中学|小学|大学|中心|幼儿园|保健院|党校|研究院|血站|分校|红十字会|防治院|研究所', _entity.entity_text) and _entity.entity_text not in ['中华人民共和国', '营业执照', '人民法院','民办非企业单位','个体工商户','运输服务', '社会团体']:
                            bus_tenderee.append(_entity)
                    elif re.search(r'^\w{2,6}银行\w{2,10}[分支]行$', _entity.entity_text):
                        # 2024-05-22: some bank branches have no collected business data.
                        have_bus = True
                        bus_dic[_entity.entity_text] = (0, 0.5)
                if have_bus:
                    # 2023-11-15: only the presence of business data decides; without it the name may be replaced.
                    long_entity.append(_entity)
                    if len(_entity.entity_text)< 6 and re.search('(大学|医院)', _entity.entity_text) is None:
                        short_entity.append(_entity)
                    lb, prob = bus_dic[_entity.entity_text]
                    # Mainly for titles/publishers where tenderee vs agency is unclear.
                    if _entity.label in [0, 1]:
                        _apply_business_role(_entity, lb, prob)
                else:
                    short_entity.append(_entity)
                if _entity.label == 0:  # a tenderee was found
                    find_tenderee = True
                n += 1
                if n > 1000:
                    break
        # No tenderee in the whole document and exactly one statistically
        # likely tenderee: promote it.
        if find_tenderee == False and len(bus_tenderee)==1 and bus_tenderee[0].label==5:
            bus_tenderee[0].label = 0
            bus_tenderee[0].values[0] = 0.55
        range_entity = range_entity[:1000]

        # NOTE: an earlier pass that linked entities by jaccard_score (place
        # names and company suffixes removed) and replaced names with the
        # linked "...公司" name was disabled here because the replacement
        # logic was unreliable.

        if short_entity and long_entity:
            for _entity in short_entity:
                if _entity.label != 0:
                    continue
                for _ent in long_entity:
                    if _ent.label not in [0,1,5]:
                        continue
                    # NOTE(review): the reviewed copy lost everything between
                    # ``len(_entity.entity_text)`` and ``0.9`` on this branch.
                    # It is rebuilt by symmetry with the ``elif`` below
                    # (containment in the other direction) - confirm the
                    # original condition.
                    if len(_entity.entity_text)<len(_ent.entity_text) and _entity.entity_text in _ent.entity_text:
                        _entity.entity_text = _ent.entity_text
                        lb, prob = bus_dic[_entity.entity_text]
                        _apply_business_role(_entity, lb, prob)
                        break
                    elif len(_entity.entity_text)>len(_ent.entity_text) and _ent.entity_text in _entity.entity_text \
                            and re.search('(医院|大学)$', _ent.entity_text) and re.search('[部处室科]$', _entity.entity_text):
                        # The entity without business data fully contains the
                        # one with business data. 2024-05-20: only replace
                        # when it ends with a department suffix.
                        _entity.entity_text = _ent.entity_text
                        lb, prob = bus_dic[_entity.entity_text]
                        _apply_business_role(_entity, lb, prob)
                        break

        # 2021-12-21: replace with the longest dictionary-matched linked entity.
        for _entity in range_entity:
            used_linked_entitys = []
            if not _entity.linked_entitys:
                continue
            _entity.linked_entitys.sort(key=lambda x: len(x.entity_text), reverse=True)
            for _ent in _entity.linked_entitys:
                if _ent in used_linked_entitys:
                    break
                if _ent.if_dict_match == 1:
                    if len(_ent.entity_text) > len(_entity.entity_text):
                        # Both names must mention the same places.
                        match_list_1 = [place for place in place_list if place in _entity.entity_text]
                        match_list_2 = [place for place in place_list if place in _ent.entity_text]
                        if str(match_list_1) == str(match_list_2):
                            _entity.origin_entity_text = _entity.entity_text
                            _entity.entity_text = _ent.entity_text
                            used_linked_entitys.append(_ent)


def doctitle_refine(doctitle):
    '''Return the document title with generic bidding words removed (used for de-duplication).'''
    _doctitle_refine = re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|'
                              r'交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|竞价|合同', '', doctitle)
    return _doctitle_refine


def get_nlp_enterprise(list_entity):
    '''
    Collect the company/org names of a document.

    :param list_entity: entities of one document
    :return: ``(names_in_body, names_in_attachment, dict_enterprise)``; both
             name lists are capped at 100 entries.  ``dict_enterprise`` maps
             a name to ``{'in_text': 0|1|2, 'credit_code': ...}`` where
             in_text is 1 for body only, 0 for attachment only and 2 for both.
    '''
    nlp_enterprise = []
    nlp_enterprise_attachment = []
    dict_enterprise = {}
    max_num = 100
    list_entity = sorted(list_entity,key=lambda x:(x.sentence_index,x.begin_index))
    for entity in list_entity:
        if entity.entity_type not in ['org','company']:
            continue
        in_text = 0 if entity.in_attachment else 1
        if entity.entity_text not in dict_enterprise:
            _, dic = _cached_business_data(entity.entity_text)
            credit_code = dic.get('credit_code', '')
            if entity.label in [0,1,2,3,4] or len(dict_enterprise)<=max_num:
                dict_enterprise[entity.entity_text] = {'in_text': in_text}
                if credit_code != "":
                    dict_enterprise[entity.entity_text]['credit_code'] = credit_code
        elif in_text != dict_enterprise[entity.entity_text]['in_text']:
            dict_enterprise[entity.entity_text]['in_text'] = 2
        if not entity.in_attachment:
            if entity.entity_text not in nlp_enterprise:
                nlp_enterprise.append(entity.entity_text)
        else:
            if entity.entity_text not in nlp_enterprise_attachment:
                nlp_enterprise_attachment.append(entity.entity_text)
    return nlp_enterprise[:max_num],nlp_enterprise_attachment[:max_num], dict_enterprise


ENTERPRISE_HUGE = None


def getEnterprisePath():
    '''
    Locate the enterprise dictionary file and record which mode is in use.

    The huge dictionary is preferred (sys.path first, then the working
    directory); otherwise the regular dictionary is used.  Sets the module
    global ``ENTERPRISE_HUGE``.

    :return: ``(path, is_huge)``
    '''
    global ENTERPRISE_HUGE
    filename_huge = "LEGAL_ENTERPRISE_HUGE.txt"
    huge_path = getFileFromSysPath(filename_huge)
    if huge_path is None:
        if os.path.exists(filename_huge):
            log("enterprise path:%s"%(filename_huge))
            ENTERPRISE_HUGE = True
            return filename_huge,ENTERPRISE_HUGE
    else:
        log("enterprise path:%s"%(huge_path))
        ENTERPRISE_HUGE = True
        return huge_path,ENTERPRISE_HUGE
    filename = "LEGAL_ENTERPRISE.txt"
    real_path = getFileFromSysPath(filename)
    if real_path is None:
        real_path = filename
    log("ENTERPRISE path:%s"%(real_path))
    ENTERPRISE_HUGE = False
    return real_path,ENTERPRISE_HUGE


DICT_ENTERPRISE_DONE = False
POOL_REDIS = None
ENTERPRISE_KEY_LEN = 3
ENTERPRISE_PREFIX_LEN = 3
ENTERPRISE_TAIL_LEN = 3
SET_ENTERPRISE = set()
SET_PREFIX_ENTERPRISE = set()
SET_TAIL_ENTERPRISE = set()
SET_PREFIX_ENTERPRISE_HUGE_FILE = "SET_PREFIX_ENTERPRISE_HUGE.pk"
SET_TAIL_ENTERPRISE_HUGE_FILE = "SET_TAIL_ENTERPRISE_HUGE.pk"


def getDict_enterprise():
    '''
    Load the prefix/tail sets (and, in non-huge mode, the full name set) of
    the enterprise dictionary, then set ``DICT_ENTERPRISE_DONE``.

    In huge mode the two sets are read from / written to pickle cache files
    so the dictionary file is only scanned once.
    '''
    global DICT_ENTERPRISE_DONE,SET_ENTERPRISE,SET_PREFIX_ENTERPRISE,SET_TAIL_ENTERPRISE
    real_path,is_huge = getEnterprisePath()
    _ok = False
    if is_huge:
        if os.path.exists(SET_PREFIX_ENTERPRISE_HUGE_FILE) and os.path.exists(SET_TAIL_ENTERPRISE_HUGE_FILE):
            SET_PREFIX_ENTERPRISE = load(SET_PREFIX_ENTERPRISE_HUGE_FILE)
            SET_TAIL_ENTERPRISE = load(SET_TAIL_ENTERPRISE_HUGE_FILE)
            _ok = True
    if not _ok:
        with open(real_path,"r",encoding="UTF8") as f:
            for _e in f:
                _e = _e.strip()
                if len(_e)>=4:
                    SET_PREFIX_ENTERPRISE.add(_e[:ENTERPRISE_KEY_LEN])
                    SET_TAIL_ENTERPRISE.add(_e[-ENTERPRISE_TAIL_LEN:])
                    if not is_huge:
                        SET_ENTERPRISE.add(_e)
        # The pickle cache is only used for the huge dictionary.
        if is_huge:
            save(SET_PREFIX_ENTERPRISE,SET_PREFIX_ENTERPRISE_HUGE_FILE)
            save(SET_TAIL_ENTERPRISE,SET_TAIL_ENTERPRISE_HUGE_FILE)

    log("SET_PREFIX_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_PREFIX_ENTERPRISE)/1024/1024,len(SET_PREFIX_ENTERPRISE)))
    log("SET_TAIL_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_TAIL_ENTERPRISE)/1024/1024,len(SET_TAIL_ENTERPRISE)))
    log("SET_ENTERPRISE takes memory:%.2fM size:%d"%(sys.getsizeof(SET_ENTERPRISE)/1024/1024,len(SET_ENTERPRISE)))
    DICT_ENTERPRISE_DONE = True


def init_redis_pool():
    '''Create the module-level Redis connector pool if it does not exist yet.'''
    global POOL_REDIS
    from BiddingKG.dl.common.pool import ConnectorPool
    from BiddingKG.dl.common.source import getConnect_redis_baseline
    if POOL_REDIS is None:
        POOL_REDIS = ConnectorPool(init_num=1,max_num=10,method_init=getConnect_redis_baseline)


def isLegalNewName(enterprise_name):
    '''
    Judge whether a newly seen entity name is a legal enterprise name.

    :param enterprise_name: candidate name
    :return: -1 for an illegal name, 0 for a name carrying a lot/package/batch
             marker, 1 for a legal name
    '''
    # Checks on the beginning of the name.
    # NOTE(review): the duplicated ASCII parentheses in the reviewed copy are
    # taken to be folded full-width parentheses and are restored as such.
    if re.search(r"^[\da-zA-Z][^\da-zA-Z]|"
                 r"^[^\da-zA-Z\u4e00-\u9fa5\[【(\uff08]|"
                 r"^[\[【(\uff08].{,1}[\]】)\uff09]|"
                 r"^[0〇]|"
                 r"^(20[0-2][0-9]|[0-2]?[0-9]年|[0-1]?[0-9]月|[0-3]?[0-9]日)",enterprise_name):
        return -1
    if len(re.findall("[\u4e00-\u9fa5]",enterprise_name))<2:
        return -1
    # The reviewed copy had a bare ``*`` alternative here, which is not a
    # valid regex (nothing to repeat); it is restored as the full-width
    # asterisk U+FF0A.  NOTE(review): confirm.
    if re.search(r"╳|\uff0a|\*|×|xx|XX",enterprise_name):
        return -1
    if re.search("^(省|自治[县州区]|市|县|区|镇|乡|街道)",enterprise_name) and not re.search("^(镇江|乡宁|镇原|镇海|镇安|镇巴|镇坪|镇赉|镇康|镇沅|镇雄|镇远|镇宁|乡城|镇平|市中|市南|市北)",enterprise_name):
        return -1
    if re.search(r"\d{1,2}:\d{2}(:\d{2})?|(rar|xlsx|zip|png|jpg|swf|docx|txt|pdf|PDF|doc|xls|bmp|&?nbsp)",enterprise_name):
        return -1
    if re.search("(招标|代理)(人|机构)|联系(人|方式)|中标|候选|第.名",enterprise_name):
        return -1
    # The subject string was missing from this call (TypeError at runtime).
    if re.search(r"[a-zA-Z\d]{1,2}(包|标段?)|第.批",enterprise_name):
        return 0
    return 1


def enterprise_filter(entity_list):
    '''
    Remove company/org entities whose Redis value is 0 (known bad entities).

    Only active in huge mode.  The list is modified in place and returned.
    '''
    global ENTERPRISE_HUGE,SET_ENTERPRISE,POOL_REDIS
    if ENTERPRISE_HUGE:
        if POOL_REDIS is None:
            init_redis_pool()
        _db = POOL_REDIS.getConnector()
        remove_list = []
        try:
            for entity in entity_list:
                if entity.entity_type in ['company','org']:
                    _v = _db.get(entity.entity_text)
                    # NOTE(review): get_business_data decodes the value from
                    # bytes; if this connector also returns bytes, ``== 0``
                    # can never be true - confirm the stored value type.
                    if _v==0:
                        remove_list.append(entity)
        except Exception:
            traceback.print_exc()
        POOL_REDIS.putConnector(_db)
        for _entity in remove_list:
            entity_list.remove(_entity)
    return entity_list


def is_enterprise_exist(enterprise_name):
    '''
    Tell whether a name exists in the enterprise dictionary.

    Huge mode: the Redis value must exist and be truthy.  Otherwise the name
    must be in ``SET_ENTERPRISE``.  Lookup errors are printed and reported as
    ``False``.
    '''
    global ENTERPRISE_HUGE,SET_ENTERPRISE,POOL_REDIS
    if ENTERPRISE_HUGE:
        if POOL_REDIS is None:
            init_redis_pool()
        _db = POOL_REDIS.getConnector()
        try:
            _v = _db.get(enterprise_name)
            return bool(_v)
        except Exception:
            traceback.print_exc()
            return False
        finally:
            # Return the connector on every path, including errors.
            POOL_REDIS.putConnector(_db)
    else:
        return enterprise_name in SET_ENTERPRISE


# The dictionary is loaded in the background at import time;
# match_enterprise_max_first waits for DICT_ENTERPRISE_DONE.
load_enterprise_thread = threading.Thread(target=getDict_enterprise)
load_enterprise_thread.start()

MAX_ENTERPRISE_LEN = 30

# Names the dictionary cannot match; they are searched literally.
NOT_MATCH_NAMES = ['乌鲁木齐经济技术开发区(乌鲁木齐市头屯河区)市场监督管理局(区知识产权局、区市场监管综合行政执法队)',
                   '政采云有限公司']


def _literal_name_pattern(names):
    '''
    Compile an alternation that matches each name literally, accepting ASCII
    or full-width parentheses wherever the name contains a parenthesis.
    '''
    parts = []
    for _name in names:
        _escaped = re.escape(_name)
        _escaped = _escaped.replace(r"\(", "[(\uff08]").replace(r"\)", "[)\uff09]")
        parts.append(_escaped)
    return re.compile("|".join(parts))


_NOT_MATCH_PATTERN = _literal_name_pattern(NOT_MATCH_NAMES)


def match_enterprise_max_first(sentence):
    '''
    Find enterprise names in a sentence, preferring the longest candidate.

    A position is a candidate start when its first ``ENTERPRISE_KEY_LEN``
    characters are a known prefix; candidates are tried from longest to
    shortest and accepted when the tail is known (or ends with a typical
    institution word) and the name has business data.  Commas inside a
    candidate are ignored (2024-12-12, names split by a comma).

    Blocks until the background dictionary load has finished.

    :param sentence: sentence text
    :return: list of ``{"entity_text", "begin_index", "end_index"}`` dicts;
             the indexes refer to the original sentence
    '''
    while not DICT_ENTERPRISE_DONE:
        time.sleep(1)
    list_match = []
    begin_index = 0
    if len(sentence)>4:
        # NOTE(review): the head of this loop (the bound check, the plain
        # key_enter slice and the comma test) was stripped in the reviewed
        # copy and is reconstructed from the surviving fragments.
        while begin_index+ENTERPRISE_KEY_LEN<len(sentence):
            key_enter = sentence[begin_index:begin_index+ENTERPRISE_KEY_LEN]
            if key_enter.find(',') > 0:
                key_enter = sentence[begin_index:begin_index + ENTERPRISE_KEY_LEN+1].replace(',', '')
            if key_enter in SET_PREFIX_ENTERPRISE:
                _len = min(MAX_ENTERPRISE_LEN-ENTERPRISE_KEY_LEN+1,len(sentence)-begin_index)
                for _i in range(_len):
                    enter_name = sentence[begin_index:begin_index+_len-_i]
                    if enter_name.endswith(','):
                        continue
                    fix_name = enter_name.replace(',', '')
                    enter_tail = fix_name[-ENTERPRISE_TAIL_LEN:]
                    if re.search('[\u4e00-\u9fa5]', enter_tail) is None:
                        # 2024-01-11: the tail must contain Chinese characters.
                        continue
                    elif fix_name in ['黄埔军校', '五金建材', '铝合金门窗', '测试单位' ,'生产管理部', '华电XXX发电有限公司']:
                        continue
                    elif re.search(r'^\w{,3}(有限)?(责任)?分?公司$|^第[一二三四五六七八九十](工程|建筑)?分?公司$|交汇处$|大厦$|大楼$|^华电X{1,4}发电有限公司$', fix_name):
                        continue
                    if len(fix_name)<4:
                        # 2024-05-21: names shorter than 4 characters are dropped.
                        break
                    if enter_tail in SET_TAIL_ENTERPRISE or re.search('(中心|中学|小学|医院|学院|大学|学校|监狱|大队|支队|林场|海关|分局|商行)$', enter_tail):
                        # Only names with business data are accepted.
                        have_bus, dic = _cached_business_data(fix_name)
                        if have_bus:
                            match_item = {"entity_text":fix_name,"begin_index":begin_index,"end_index":begin_index+len(enter_name)}
                            list_match.append(match_item)
                            begin_index += len(enter_name)-1
                            break
            begin_index += 1
    for it in _NOT_MATCH_PATTERN.finditer(sentence):
        match_item = {"entity_text": it.group(0), "begin_index": it.start(), "end_index": it.end()}
        list_match.append(match_item)
    return list_match


def _retarget_entity(p_entity, _match, tokens, list_calibrate):
    '''Move an existing entity onto a dictionary match and record the update.'''
    begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
    end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
    list_calibrate.append({"type":"update","from":p_entity.entity_text,"to":_match["entity_text"]})
    p_entity.entity_text = _match["entity_text"]
    p_entity.wordOffset_begin = _match["begin_index"]
    p_entity.wordOffset_end = _match["end_index"]
    p_entity.begin_index = begin_index
    p_entity.end_index = end_index
    # This company entity now comes from the dictionary.
    p_entity.if_dict_match = 1


def _build_match_entity(p_sentence, _match):
    '''Create a new "company" Entity for a dictionary match in ``p_sentence``.'''
    tokens = p_sentence.tokens
    begin_index = changeIndexFromWordToWords(tokens,_match["begin_index"])
    end_index = changeIndexFromWordToWords(tokens,_match["end_index"]-1)
    entity_id = "%s_%d_%d_%d"%(p_sentence.doc_id,p_sentence.sentence_index,begin_index,end_index)
    return Entity(p_sentence.doc_id,entity_id,_match["entity_text"],"company",p_sentence.sentence_index,begin_index,end_index,_match["begin_index"],_match["end_index"],in_attachment=p_sentence.in_attachment)


def calibrateEnterprise(list_articles,list_sentences,list_entitys):
    '''
    Calibrate recognised org/company/location entities with dictionary matches.

    For every sentence the dictionary matches are compared with the existing
    entities: overlapping entities are updated to the match, entities that
    span several matches are split, and matches without any overlapping
    entity are added as new company entities.  The entity lists are modified
    in place and each article receives ``match_enterprise`` (de-duplicated
    change log) and ``match_enterprise_type`` (1 = added, 2 = replaced,
    3 = both).
    '''
    for _article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
        list_calibrate = []
        match_add = False
        match_replace = False
        range_entity = []
        # NOTE(review): the nesting of the 1000-entity guard was lost; it is
        # reconstructed as a cap on the collected entities.  If it originally
        # sat at document level it would instead abort calibration - confirm.
        for p_entity in list_entity:
            if p_entity.entity_type in ("org","company","location"):
                range_entity.append(p_entity)
                if len(range_entity)>1000:
                    break
        for p_sentence in list_sentence:
            sentence = p_sentence.sentence_text
            sentence_entitys = [(ent.entity_text,ent.wordOffset_begin,ent.wordOffset_end) for ent in list_entity if ent.sentence_index==p_sentence.sentence_index and ent.entity_type in ['org','company']]
            list_match = match_enterprise_max_first(sentence)
            tokens = p_sentence.tokens
            list_match.sort(key=lambda x:x["begin_index"])
            for _match_index in range(len(list_match)):
                _match = list_match[_match_index]
                find_flag = False
                for p_entity in range_entity:
                    if p_entity.sentence_index!=p_sentence.sentence_index:
                        continue
                    if p_entity.entity_type=="location" and p_entity.entity_text==_match["entity_text"]:
                        find_flag = True
                        p_entity.entity_type = "company"
                        p_entity.if_dict_match = 1
                    if p_entity.entity_type not in ["location","org","company"]:
                        continue
                    if _match["entity_text"] == p_entity.entity_text:
                        p_entity.if_dict_match = 1
                    # Overlap handling.
                    if _match["begin_index"]>=p_entity.wordOffset_begin and _match["end_index"]<=p_entity.wordOffset_end:
                        # The match lies inside the entity: normally nothing to do,
                        # unless the entity actually spans several companies.
                        find_flag = True
                        if re.search(r'[分支](公司|中心|监狱|部|行)|^\w{4,15}公司\w{2,3}公司$'
                                     r'|(大学|学院)\w{,2}附属\w{,6}医院$|(\w{2,5}办事处\w{2,6}$|^\w{2,6}银行\w{2,10}[分支]行$'
                                     r'|\w{2,4}[省市县]\w{2,14}村)(股份)?经济(合作|联合)社$|国家税务总局\w{2,10}税务局$', p_entity.entity_text):
                            continue
                        if p_entity.entity_type == "location" and re.search(r'\d[楼室号]', p_entity.entity_text):
                            # An explicit address is never replaced (e.g. a street
                            # address containing a building named like a company).
                            continue
                        # Find the last match that still ends inside the entity.
                        for _match_j in range(_match_index,len(list_match)):
                            if not list_match[_match_j]["end_index"]<=p_entity.wordOffset_end:
                                _match_j -= 1
                                break
                        if _match_j>_match_index:
                            # The entity covers several matches: shrink it to the
                            # first one and add the others as new entities.
                            match_replace = True
                            match_add = True
                            _retarget_entity(p_entity, _match, tokens, list_calibrate)
                            for _match_h in range(_match_index+1,_match_j+1):
                                add_entity = _build_match_entity(p_sentence, list_match[_match_h])
                                add_entity.if_dict_match = 1
                                list_entity.append(add_entity)
                                range_entity.append(add_entity)
                                list_calibrate.append({"type":"add","from":"","to":add_entity.entity_text})
                            # (The original also assigned the loop variable
                            # _match_index here, which has no effect on a
                            # ``for ... in range`` loop.)
                            break
                        continue
                    elif _match["begin_index"]<=p_entity.wordOffset_begin and _match["end_index"]>p_entity.wordOffset_begin:
                        find_flag = True
                        # NOTE(review): comparison operators on the next line were
                        # stripped in the reviewed copy and are reconstructed.
                        if _match["begin_index"]<p_entity.wordOffset_begin and _match["end_index"]>=p_entity.wordOffset_end:
                            # Do not duplicate an entity that already exists.
                            if (_match["entity_text"],_match["begin_index"],_match["end_index"]) not in sentence_entitys:
                                match_replace = True
                                _retarget_entity(p_entity, _match, tokens, list_calibrate)
                                p_entity.entity_type = "company"
                    # NOTE(review): comparison operators on the next line were
                    # stripped in the reviewed copy and are reconstructed.
                    elif _match["begin_index"]<p_entity.wordOffset_end and _match["end_index"]>p_entity.wordOffset_end:
                        find_flag = True
                        if p_entity.entity_type in ("org","company"):
                            match_replace = True
                            _retarget_entity(p_entity, _match, tokens, list_calibrate)
                if not find_flag:
                    match_add = True
                    add_entity = _build_match_entity(p_sentence, _match)
                    list_entity.append(add_entity)
                    range_entity.append(add_entity)
                    list_calibrate.append({"type":"add","from":"","to":add_entity.entity_text})

        # De-duplicate the change log.
        set_calibrate = set()
        list_match_enterprise = []
        for _calibrate in list_calibrate:
            _key = _calibrate.get("from","")+_calibrate.get("to","")
            if _key not in set_calibrate:
                list_match_enterprise.append(_calibrate)
                set_calibrate.add(_key)

        match_enterprise_type = 0
        if match_add:
            match_enterprise_type += 1
        if match_replace:
            match_enterprise_type += 2
        _article.match_enterprise = list_match_enterprise
        _article.match_enterprise_type = match_enterprise_type


def isLegalEnterprise(name):
    '''
    Tell whether a dictionary line looks like a legal enterprise name.

    Rejected: names starting with a bare administrative-level character,
    very short generic branch/office names, and names containing bidding
    field words or the parenthesised 主 marker.
    '''
    # NOTE(review): the reviewed copy had ``(主)`` with ASCII parentheses,
    # which as a regex group matches any name containing 主; it is treated
    # as the literal parenthesised marker (either paren width) - confirm.
    if re.search("^[省市区县]",name) is not None:
        return False
    if re.search(r"^\**.{,3}(分(公司|行|支)|街道|中心|办事处|经营部|委员会|有限公司)$",name):
        return False
    if re.search("标段|标包|名称|联系人|联系方式|中标单位|中标人|测试单位|采购单位|采购人|代理人|代理机构|盖章|[(\uff08]主[)\uff09]",name) is not None:
        return False
    return True


def fix_LEGAL_ENTERPRISE():
    '''
    Clean the enterprise dictionary file and write the result to ``enter.txt``.

    Keeps the lines accepted by isLegalEnterprise, drops (and prints) a few
    generic names and short branch names, and normalises full-width
    parentheses to ASCII.
    '''
    # getEnterprisePath returns (path, is_huge); only the path can be opened.
    _path, _ = getEnterprisePath()
    set_enter = set()
    with open(_path,"r",encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if isLegalEnterprise(line):
                set_enter.add(line)
            if line=="有限责任公司" or line=='设计研究院' or line=='限责任公司' or (re.search("^.{,4}(分公司|支行|分行)$",line) is not None and re.search("电信|移动|联通|建行|工行|农行|中行|交行",line) is None):
                print(line)
                set_enter.discard(line)
    with open("enter.txt","w",encoding="utf8") as fwrite:
        for line in set_enter:
            # The reviewed copy replaced "(" by "(" (a no-op after width
            # folding); the intent is full-width -> ASCII parentheses.
            fwrite.write(line.replace("\uff08","(").replace("\uff09",")"))
            fwrite.write("\n")


if __name__=="__main__":
    # Other manual checks used during development: edit_distance("GUMBO","GAMBOL"),
    # jaccard_score on two titles, fix_LEGAL_ENTERPRISE().
    print(match_enterprise_max_first("中国南方航空股份有限公司黑龙江分公司"))