import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
from scipy.sparse.linalg import eigsh
import sys
import tensorflow as tf
import math

flags = tf.app.flags
FLAGS = flags.FLAGS


def sparse_to_tuple(sparse_mx):
    """Convert sparse matrix (or list of matrices) to tuple representation."""
    def to_tuple(mx):
        if not sp.isspmatrix_coo(mx):
            mx = mx.tocoo()
        coords = np.vstack((mx.row, mx.col)).transpose()
        values = mx.data
        shape = mx.shape
        return coords, values, shape

    if isinstance(sparse_mx, list):
        for i in range(len(sparse_mx)):
            sparse_mx[i] = to_tuple(sparse_mx[i])
    else:
        sparse_mx = to_tuple(sparse_mx)

    return sparse_mx


def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix: D^-0.5 * A * D^-0.5."""
    adj = sp.coo_matrix(adj)
    # Sum each row to get the (weighted) degree of every node.
    rowsum = np.array(adj.sum(1))
    # Raise each degree to the power -0.5 and flatten to a 1-D array.
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    # Isolated nodes have degree 0, which yields inf; reset those to 0.
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    # Put the inverse square roots on the diagonal of D^-0.5.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()


# Preprocess the weighted (head, tail) score adjacency matrix.
def preprocess_adj(adj):
    """Preprocessing of adjacency matrix for simple GCN model and conversion to tuple representation."""
    adj_normalized = normalize_adj(adj + sp.eye(adj.shape[0]))
    return sparse_to_tuple(adj_normalized)


def construct_feed_dict(features, support, placeholders):
    """Construct feed dictionary for GCN-Align."""
    feed_dict = dict()
    feed_dict.update({placeholders['features']: features})
    feed_dict.update({placeholders['support'][i]: support[i] for i in range(len(support))})
    return feed_dict


def chebyshev_polynomials(adj, k):
    """Calculate Chebyshev polynomials up to order k. Return a list of sparse matrices (tuple representation)."""
    print("Calculating Chebyshev polynomials up to order {}...".format(k))

    adj_normalized = normalize_adj(adj)
    laplacian = sp.eye(adj.shape[0]) - adj_normalized
    largest_eigval, _ = eigsh(laplacian, 1, which='LM')
    scaled_laplacian = (2. / largest_eigval[0]) * laplacian - sp.eye(adj.shape[0])

    t_k = list()
    t_k.append(sp.eye(adj.shape[0]))
    t_k.append(scaled_laplacian)

    def chebyshev_recurrence(t_k_minus_one, t_k_minus_two, scaled_lap):
        s_lap = sp.csr_matrix(scaled_lap, copy=True)
        return 2 * s_lap.dot(t_k_minus_one) - t_k_minus_two

    for i in range(2, k + 1):
        t_k.append(chebyshev_recurrence(t_k[-1], t_k[-2], scaled_laplacian))

    return sparse_to_tuple(t_k)
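
# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original GCN-Align code): what
# preprocess_adj produces for a hypothetical 3-node graph. The returned
# (coords, values, shape) tuple is exactly what sparse_to_tuple emits and is
# the format fed to the 'support' placeholders via construct_feed_dict.
# ---------------------------------------------------------------------------
def _demo_preprocess_adj():
    # Undirected path graph: edges 0-1 and 1-2 (made-up toy data).
    adj = sp.coo_matrix(
        np.array([[0, 1, 0],
                  [1, 0, 1],
                  [0, 1, 0]], dtype=np.float64))
    coords, values, shape = preprocess_adj(adj)
    # After adding self-loops, each non-zero entry equals 1/sqrt(d_i * d_j),
    # where d_i is the degree of node i including its self-loop.
    print(coords)   # (row, col) index pairs of the non-zero entries
    print(values)   # normalized edge weights
    print(shape)    # (3, 3)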
# Read a file given its name and the number of fields to keep per line.
def loadfile(fn, num=1):
    """Load a file and return a list of tuples, each containing $num integers from one line."""
    print('loading a file...' + fn)
    ret = []
    with open(fn, encoding='utf-8') as f:
        for line in f:
            th = line[:-1].split('\t')
            x = []
            for i in range(num):
                x.append(int(th[i]))
            ret.append(tuple(x))
    return ret


def get_ent2id(fns):
    ent2id = {}
    for fn in fns:
        with open(fn, 'r', encoding='utf-8') as f:
            for line in f:
                th = line[:-1].split('\t')
                ent2id[th[1]] = int(th[0])
    return ent2id


# Read the attribute files and build a sparse entity-by-attribute matrix.
def loadattr(fns, e, ent2id):
    """The most frequent attributes are selected to save space."""
    # Count every property (all fields after the leading resource) and sort
    # the counts in descending order.
    cnt = {}
    for fn in fns:
        with open(fn, 'r', encoding='utf-8') as f:
            for line in f:
                th = line[:-1].split('\t')
                if th[0] not in ent2id:
                    continue
                for i in range(1, len(th)):
                    if th[i] not in cnt:
                        cnt[th[i]] = 1
                    else:
                        cnt[th[i]] += 1
    fre = [(k, cnt[k]) for k in sorted(cnt, key=cnt.get, reverse=True)]
    # Map the top 2000 properties to ids 0..1999.
    num_features = min(len(fre), 2000)
    attr2id = {}
    for i in range(num_features):
        attr2id[fre[i][0]] = i
    # If the resource exists in ent2id, walk its remaining properties; every
    # property found in attr2id adds an entry M[(entity_id, attr_id)] = 1.0.
    M = {}
    for fn in fns:
        with open(fn, 'r', encoding='utf-8') as f:
            for line in f:
                th = line[:-1].split('\t')
                if th[0] in ent2id:
                    for i in range(1, len(th)):
                        if th[i] in attr2id:
                            M[(ent2id[th[0]], attr2id[th[i]])] = 1.0
    # Split M into coordinate lists: the entity_id is the row, the attr_id is
    # the column, and the value is the data.
    row = []
    col = []
    data = []
    for key in M:
        row.append(key[0])
        col.append(key[1])
        data.append(M[key])
    print(len(data), len(row), len(col), e, num_features)
    # coo_matrix places each data value at its (row, col) position inside a
    # matrix of the given shape.
    return sp.coo_matrix((data, (row, col)), shape=(e, num_features))


def get_dic_list(e, KG):
    M = {}
    for tri in KG:
        if tri[0] == tri[2]:
            continue
        M[(tri[0], tri[2])] = 1
        M[(tri[2], tri[0])] = 1
    dic_list = {}
    for i in range(e):
        dic_list[i] = []
    for pair in M:
        dic_list[pair[0]].append(pair[1])
    return dic_list


# Compute the functionality of each relation from the triples.
def func(KG):
    # head: the set of distinct head entities seen for each relation.
    head = {}
    # cnt: the number of triples in which each relation occurs.
    cnt = {}
    # For the relation (second element of each triple), count occurrences and
    # collect the distinct heads.
    for tri in KG:
        if tri[1] not in cnt:
            cnt[tri[1]] = 1
            head[tri[1]] = set([tri[0]])
        else:
            cnt[tri[1]] += 1
            head[tri[1]].add(tri[0])
    # Score each relation: number of distinct head entities divided by the
    # number of occurrences of the relation.
    r2f = {}
    for r in cnt:
        r2f[r] = len(head[r]) / cnt[r]
    return r2f


# Compute the inverse functionality of each relation from the triples.
def ifunc(KG):
    # tail: the set of distinct tail entities seen for each relation.
    tail = {}
    # cnt: the number of triples in which each relation occurs.
    cnt = {}
    for tri in KG:
        if tri[1] not in cnt:
            cnt[tri[1]] = 1
            tail[tri[1]] = set([tri[2]])
        else:
            cnt[tri[1]] += 1
            tail[tri[1]].add(tri[2])
    # Score each relation: number of distinct tail entities divided by the
    # number of occurrences of the relation.
    r2if = {}
    for r in cnt:
        r2if[r] = len(tail[r]) / cnt[r]
    return r2if


# Build the weighted adjacency matrix from the entity count and the triples.
def get_weighted_adj(e, KG):
    # r2f: functionality score of each relation (distinct heads / occurrences).
    r2f = func(KG)
    # r2if: inverse functionality score (distinct tails / occurrences).
    r2if = ifunc(KG)
    # Combine the scores into a dict M{(head_entity, tail_entity): score},
    # clipping each contribution from below at 0.3.
    M = {}
    for tri in KG:
        # Skip self-loops (head equals tail).
        if tri[0] == tri[2]:
            continue
        # For the (head, tail) direction use the inverse functionality score,
        # accumulating when the pair occurs under several relations.
        if (tri[0], tri[2]) not in M:
            M[(tri[0], tri[2])] = max(r2if[tri[1]], 0.3)
        else:
            M[(tri[0], tri[2])] += max(r2if[tri[1]], 0.3)
        # For the reversed (tail, head) direction use the functionality score.
        if (tri[2], tri[0]) not in M:
            M[(tri[2], tri[0])] = max(r2f[tri[1]], 0.3)
        else:
            M[(tri[2], tri[0])] += max(r2f[tri[1]], 0.3)
    # Assemble and return the sparse matrix (note the transposed indices).
    row = []
    col = []
    data = []
    for key in M:
        row.append(key[1])
        col.append(key[0])
        data.append(M[key])
    print(len(data), len(row), len(col), e)
    return sp.coo_matrix((data, (row, col)), shape=(e, e))


# Convert the sparse attribute matrix to tuple representation.
def get_ae_input(attr):
    return sparse_to_tuple(sp.coo_matrix(attr))
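
# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original repo): the input format loadfile
# expects is tab-separated integer ids, one record per line. The file written
# below is a made-up example.
# ---------------------------------------------------------------------------
def _demo_loadfile():
    import tempfile
    import os
    path = os.path.join(tempfile.gettempdir(), 'gcn_align_demo.tsv')
    with open(path, 'w', encoding='utf-8') as f:
        f.write('0\t3\n1\t4\n')
    print(loadfile(path, 2))   # [(0, 3), (1, 4)]
    os.remove(path)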
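
# ---------------------------------------------------------------------------
# Hedged sketch (illustration only, not in the original repo): the core of
# loadattr is a {(entity_id, attr_id): 1.0} dict turned into a sparse
# entity-by-attribute indicator matrix. The ids below are made up.
# ---------------------------------------------------------------------------
def _demo_attr_matrix():
    M = {(0, 0): 1.0, (0, 3): 1.0, (2, 1): 1.0}   # hypothetical (entity, attr) pairs
    row, col, data = [], [], []
    for (ent, att), v in M.items():
        row.append(ent)
        col.append(att)
        data.append(v)
    # Shape: (number of entities, number of kept attributes).
    attr = sp.coo_matrix((data, (row, col)), shape=(3, 4))
    print(attr.toarray())
    # get_ae_input converts it to the (coords, values, shape) tuple form:
    print(get_ae_input(attr))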
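
# ---------------------------------------------------------------------------
# Hedged sketch (illustration only): how func/ifunc turn a toy KG into the
# relation scores that weight the adjacency matrix. Triples are
# (head, relation, tail) integer ids, made up for this example.
# ---------------------------------------------------------------------------
def _demo_weighted_adj():
    KG = [(0, 0, 1), (2, 0, 1), (1, 1, 2)]
    # Relation 0 occurs twice with two distinct heads -> r2f[0] = 2/2 = 1.0,
    # but only one distinct tail -> r2if[0] = 1/2 = 0.5.
    print(func(KG))    # {0: 1.0, 1: 1.0}
    print(ifunc(KG))   # {0: 0.5, 1: 1.0}
    adj = get_weighted_adj(3, KG)   # 3 entities, ids 0..2
    print(adj.toarray())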
def load_data(dataset_str):
    names = [['ent_ids_1', 'ent_ids_2'], ['training_attrs_1', 'training_attrs_2'], ['triples_1', 'triples_2'], ['ref_ent_ids']]
    for fns in names:
        for i in range(len(fns)):
            fns[i] = 'C:/Users/admin/Desktop/Kaggle/KG/GCN-Align-master/data1/100000/' + dataset_str + '/' + fns[i]
    Es, As, Ts, ill = names
    ill = ill[0]
    # Read the ref_ent_ids file and split it into training and test sets.
    ILL = loadfile(ill, 2)
    illL = len(ILL)
    np.random.shuffle(ILL)
    train = np.array(ILL[:int(illL * FLAGS.seed / 10)])
    print("train.shape", train.shape)
    test = ILL[int(illL * FLAGS.seed / 10):]
    print("len(test)", len(test))
    # Read ent_ids_1 and ent_ids_2, take the leading ids, and count the
    # distinct entities across both files.
    e = len(set(loadfile(Es[0], 1)) | set(loadfile(Es[1], 1)))
    # Map entity names to ids using both ent_ids files.
    ent2id = get_ent2id([Es[0], Es[1]])
    print("len(ent2id)", len(ent2id))
    # Read training_attrs_1 and training_attrs_2 and build the sparse
    # entity-by-attribute matrix with value 1.0 at each (entity_id, attr_id).
    attr = loadattr([As[0], As[1]], e, ent2id)
    # ae_input: the attribute matrix in (coords, values, shape) tuple form.
    ae_input = get_ae_input(attr)
    # Read triples_1 and triples_2, taking three ids per line.
    KG = loadfile(Ts[0], 3) + loadfile(Ts[1], 3)
    # Build the relation-score sparse matrix used as the adjacency matrix.
    adj = get_weighted_adj(e, KG)  # nx.adjacency_matrix(nx.from_dict_of_lists(get_dic_list(e, KG)))
    return adj, ae_input, train, test
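
# ---------------------------------------------------------------------------
# Hedged end-to-end sketch (not part of the original repo): how the pieces
# above are typically wired together. 'zh_en' is a hypothetical dataset name,
# the data directory hard-wired into load_data must exist, and FLAGS.seed
# (training fraction, in tenths) must be defined by the caller. The
# placeholder dict mirrors the keys construct_feed_dict expects, using the
# TF1-style sparse placeholders this module assumes.
# ---------------------------------------------------------------------------
def _demo_pipeline():
    adj, ae_input, train, test = load_data('zh_en')
    # The weighted adjacency matrix becomes the single GCN support.
    support = [preprocess_adj(adj)]
    placeholders = {
        'features': tf.sparse_placeholder(tf.float32),
        'support': [tf.sparse_placeholder(tf.float32) for _ in support],
    }
    feed_dict = construct_feed_dict(ae_input, support, placeholders)
    return feed_dict, train, test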