'''
Created on May 20, 2019

@author: User
'''
import pymongo
import re
from .entityLink import *
from BiddingKG.dl.common.Utils import *
from multiprocessing import Process


def getData():
    """Load enterprise names from MongoDB and scan them for similar pairs.

    Reads up to 3000 ``enterprise_name`` values from the
    ``enterprise_profile`` collection, pairs each name with a copy that has
    common organisational suffixes stripped, splits the resulting list into
    ``parts`` contiguous index ranges and starts one worker process per
    range running :func:`process_thread`.

    The worker processes are started but not joined here.
    """
    # NOTE(review): the connection string embeds credentials in source;
    # consider loading it from configuration or the environment.
    client = pymongo.MongoClient("mongodb://bxkc:bidizhaobiaowang2017@121.46.18.113:17017/bxkc")
    list_entitys = []
    try:
        db = client["bxkc"]
        collection_enterprise = db['enterprise_profile']
        cursor = collection_enterprise.find({}, {"enterprise_name": 1}).limit(3000)
        for row in cursor:
            entity_name = row["enterprise_name"]
            # Strip generic company / procurement-centre suffixes.
            entity_name_sub = re.sub("(有限(责任)?公司|子公司|(政府)?采购中心)", "", entity_name)
            list_entitys.append([entity_name, entity_name_sub])
    finally:
        # Close the connection before starting child processes: the data is
        # fully materialised, and a MongoClient must not be shared across
        # a fork.
        client.close()

    parts = 2
    # Chunk size follows the number of workers (the original divided by a
    # hard-coded 20, giving the first worker only 1/20 of the rows).
    nums = len(list_entitys) // parts
    list_index = [[i * nums, (i + 1) * nums] for i in range(parts - 1)]
    # The last range absorbs the remainder so every row is covered.
    list_index.append([(parts - 1) * nums, len(list_entitys)])

    for index_begin, index_end in list_index:
        proc = Process(target=process_thread,
                       args=(list_entitys, index_begin, index_end))
        proc.start()


def process_thread(list_entitys, index_begin, index_end):
    """Find similar entity-name pairs for one slice of source rows.

    Each row ``i`` in ``[index_begin, index_end)`` is compared with every
    later row of the *whole* list (not just the slice). A pair is kept when
    ``edit_distance`` of the full names is below 4 and ``jaccard_score`` is
    above 0.6. The pairs are printed and then handed to ``save`` under a
    file name that encodes the slice bounds.

    Args:
        list_entitys: list of ``[entity_name, entity_name_sub]`` items; only
            element 0 (the full name) is used for comparison.
        index_begin: first source row index (inclusive).
        index_end: last source row index (exclusive).
    """
    # edit_distance, jaccard_score and save are not defined in this file;
    # presumably they come from the star imports above -- TODO confirm.
    list_entitys_pair = []
    total = len(list_entitys)
    for i in range(index_begin, index_end):
        print(i)  # progress output
        source_name = list_entitys[i][0]
        # Start at i + 1 so each unordered pair is examined only once.
        for j in range(i + 1, total):
            target_name = list_entitys[j][0]
            if (edit_distance(source_name, target_name) < 4
                    and jaccard_score(source_name, target_name) > 0.6):
                list_entitys_pair.append([source_name, target_name])

    for item in list_entitys_pair:
        print(item)
    save(list_entitys_pair,
         "list_entitys_pair_" + str(index_begin) + "_" + str(index_end) + ".pk")


if __name__ == "__main__":
    getData()