123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304 |
- import numpy as np
- import pickle as pkl
- import networkx as nx
- import scipy.sparse as sp
- from scipy.sparse.linalg.eigen.arpack import eigsh
- import sys
- import tensorflow as tf
- import math
# Command-line flag registry from tf.app.flags (a TensorFlow 1.x API).
# FLAGS exposes the parsed flag values; load_data in this file reads FLAGS.seed.
flags = tf.app.flags
FLAGS = flags.FLAGS
def sparse_to_tuple(sparse_mx):
    """Convert a SciPy sparse matrix, or a list of them, to tuple form.

    Each matrix becomes ``(coords, values, shape)``: ``coords`` stacks the
    COO row and column indices as ``[row, col]`` pairs, ``values`` is the
    COO data array and ``shape`` is the matrix shape.

    When ``sparse_mx`` is a list, its elements are replaced in place and the
    same list object is returned.
    """

    def _as_triplet(matrix):
        # Only convert when the matrix is not already in COO format.
        coo = matrix if sp.isspmatrix_coo(matrix) else matrix.tocoo()
        indices = np.vstack((coo.row, coo.col)).transpose()
        return indices, coo.data, coo.shape

    if not isinstance(sparse_mx, list):
        return _as_triplet(sparse_mx)

    for position, matrix in enumerate(sparse_mx):
        sparse_mx[position] = _as_triplet(matrix)
    return sparse_mx
# Symmetric normalisation of the adjacency matrix.
def normalize_adj(adj):
    """Scale ``adj`` on both sides by the inverse square root of its row sums.

    With ``D = diag(rowsum(adj) ** -0.5)`` the result is
    ``(adj . D)^T . D``, returned as a COO sparse matrix. Entries of ``D``
    that evaluate to infinity (rows summing to zero) are replaced by 0.
    """
    coo = sp.coo_matrix(adj)
    # Sum along axis 1: one total per row.
    degrees = np.array(coo.sum(1))
    # Element-wise power, flattened to a plain 1-D array.
    inv_sqrt = np.power(degrees, -0.5).flatten()
    # Replace infinite entries with 0.
    inv_sqrt[np.isinf(inv_sqrt)] = 0.
    # Put the scaling factors on the diagonal of a sparse matrix.
    scaling = sp.diags(inv_sqrt)
    column_scaled = coo.dot(scaling)
    return column_scaled.transpose().dot(scaling).tocoo()
# Pre-process the (head entity, tail entity) score adjacency matrix.
def preprocess_adj(adj):
    """Prepare an adjacency matrix for the simple GCN model.

    Adds the identity matrix to ``adj``, normalises the sum with
    ``normalize_adj`` and returns the result in the tuple representation
    produced by ``sparse_to_tuple``.
    """
    with_identity = adj + sp.eye(adj.shape[0])
    normalized = normalize_adj(with_identity)
    return sparse_to_tuple(normalized)
def construct_feed_dict(features, support, placeholders):
    """Build the feed dictionary for GCN-Align.

    Maps ``placeholders['features']`` to ``features`` and, for every index
    ``i`` of ``support``, ``placeholders['support'][i]`` to ``support[i]``.
    """
    feed_dict = {placeholders['features']: features}
    for index, matrix in enumerate(support):
        feed_dict[placeholders['support'][index]] = matrix
    return feed_dict
def chebyshev_polynomials(adj, k):
    """Calculate Chebyshev polynomials up to order ``k``.

    The Laplacian ``I - normalize_adj(adj)`` is rescaled with its
    largest-magnitude eigenvalue as ``(2 / lambda_max) * L - I`` and the
    terms are generated with the recurrence
    ``T_i = 2 * L_scaled . T_{i-1} - T_{i-2}``.

    Args:
        adj: Square adjacency matrix (anything ``normalize_adj`` accepts).
        k: Highest polynomial order to generate.

    Returns:
        A list of the terms in the tuple representation produced by
        ``sparse_to_tuple``. The list always holds the first two terms
        (identity and scaled Laplacian), even when ``k`` is below 1.
    """
    print("Calculating Chebyshev polynomials up to order {}...".format(k))

    identity = sp.eye(adj.shape[0])
    laplacian = identity - normalize_adj(adj)
    # NOTE(review): the file imports eigsh from the private module
    # scipy.sparse.linalg.eigen.arpack, which is not available in recent
    # SciPy releases; the public location is scipy.sparse.linalg.eigsh.
    largest_eigval, _ = eigsh(laplacian, 1, which='LM')
    scaled_laplacian = (2. / largest_eigval[0]) * laplacian - identity

    t_k = [identity, scaled_laplacian]

    # The CSR form of the scaled Laplacian does not change between
    # iterations, so build it once instead of once per recurrence step.
    scaled_csr = sp.csr_matrix(scaled_laplacian, copy=True)
    for _ in range(2, k + 1):
        t_k.append(2 * scaled_csr.dot(t_k[-1]) - t_k[-2])

    return sparse_to_tuple(t_k)
# Read file `fn` and keep the first `num` tab-separated integer fields per line.
def loadfile(fn, num=1):
    """Load a file and return a list of tuples with ``num`` integers each.

    Every non-empty line is split on tabs and its first ``num`` fields are
    converted with ``int``.

    Args:
        fn: Path of a UTF-8 text file.
        num: Number of leading fields to read from each line.

    Returns:
        A list with one ``num``-tuple of ints per non-empty line.

    Raises:
        IndexError: If a line has fewer than ``num`` fields.
        ValueError: If a selected field is not an integer.
    """
    print('loading a file...' + fn)
    ret = []
    with open(fn, encoding='utf-8') as f:
        for line in f:
            # Strip only the line terminator: slicing off the last character
            # would corrupt the final line of a file that does not end with
            # a newline.
            line = line.rstrip('\n')
            if not line:
                # A blank line carries no record.
                continue
            th = line.split('\t')
            ret.append(tuple(int(th[i]) for i in range(num)))
    return ret
def get_ent2id(fns):
    """Build a mapping from entity name to integer id.

    Each non-empty line of every file is split on tabs; the first field is
    the integer id and the second field is the entity name. When a name
    occurs more than once, the last occurrence wins.

    Args:
        fns: Iterable of paths to UTF-8 text files.

    Returns:
        A dict mapping the second field of each line to ``int`` of the first.
    """
    ent2id = {}
    for fn in fns:
        with open(fn, 'r', encoding='utf-8') as f:
            for line in f:
                # Strip only the newline: slicing off the last character
                # would truncate the final entity name when the file has no
                # trailing newline.
                line = line.rstrip('\n')
                if not line:
                    continue
                th = line.split('\t')
                ent2id[th[1]] = int(th[0])
    return ent2id
# Read the attribute files and return the entity/attribute incidence matrix.
def loadattr(fns, e, ent2id, max_features=2000):
    """Build a sparse entity-by-attribute matrix from the attribute files.

    The most frequent attributes are selected to save space. Each line is
    tab-separated: the first field is an entity name and the remaining
    fields are attribute names. Lines whose entity is not in ``ent2id`` are
    ignored.

    Args:
        fns: Re-iterable collection of UTF-8 attribute file paths (the files
            are read twice: once to count, once to fill the matrix).
        e: Number of rows of the returned matrix.
        ent2id: Mapping from entity name to row index.
        max_features: Upper bound on the number of attribute columns; the
            most frequently occurring attributes are kept. Expected to be
            non-negative.

    Returns:
        A ``scipy.sparse.coo_matrix`` of shape ``(e, num_features)`` holding
        1.0 wherever a known entity carries one of the kept attributes.
    """

    def _known_entity_rows():
        # Yield (row index, attribute names) for every line of a known entity.
        for fn in fns:
            with open(fn, 'r', encoding='utf-8') as f:
                for line in f:
                    # Strip only the newline; slicing off the last character
                    # would damage the final attribute of a file that has no
                    # trailing newline.
                    th = line.rstrip('\n').split('\t')
                    if th[0] in ent2id:
                        yield ent2id[th[0]], th[1:]

    # Count every attribute occurrence over all known entities.
    cnt = {}
    for _, attrs in _known_entity_rows():
        for attr in attrs:
            cnt[attr] = cnt.get(attr, 0) + 1

    # Rank by descending frequency; the sort is stable, so ties keep
    # first-seen order. Column ids are 0 .. num_features - 1.
    ranked = sorted(cnt, key=cnt.get, reverse=True)
    num_features = min(len(ranked), max_features)
    attr2id = {attr: i for i, attr in enumerate(ranked[:num_features])}

    # A dict keyed by (row, col) collapses repeated entity/attribute pairs.
    cells = {}
    for ent_id, attrs in _known_entity_rows():
        for attr in attrs:
            if attr in attr2id:
                cells[(ent_id, attr2id[attr])] = 1.0

    row = [key[0] for key in cells]
    col = [key[1] for key in cells]
    data = list(cells.values())
    print(len(data), len(row), len(col), e, num_features)
    # coo_matrix places data[i] at (row[i], col[i]) in a matrix of `shape`.
    return sp.coo_matrix((data, (row, col)), shape=(e, num_features))
def get_dic_list(e, KG):
    """Return neighbour lists for the undirected graph induced by ``KG``.

    Every triple whose first and third elements differ contributes an edge
    in both directions; repeated pairs are stored once. The result maps each
    node ``0 .. e - 1`` to the list of its neighbours in first-seen order.
    """
    directed_pairs = {}
    for tri in KG:
        head, tail = tri[0], tri[2]
        if head == tail:
            # Self-loops are ignored.
            continue
        directed_pairs[(head, tail)] = 1
        directed_pairs[(tail, head)] = 1

    dic_list = {node: [] for node in range(e)}
    for source, target in directed_pairs:
        dic_list[source].append(target)
    return dic_list
# Takes the list of triples.
def func(KG):
    """Score every relation by its share of distinct head entities.

    For each distinct second element of the triples (the relation) the
    score is ``(number of distinct first elements seen with it) /
    (number of triples containing it)``.

    Returns:
        A dict mapping relation to that ratio.
    """
    # heads_by_rel: distinct first elements per relation.
    heads_by_rel = {}
    # occurrences: number of triples per relation.
    occurrences = {}
    for tri in KG:
        rel = tri[1]
        occurrences[rel] = occurrences.get(rel, 0) + 1
        heads_by_rel.setdefault(rel, set()).add(tri[0])

    return {rel: len(heads_by_rel[rel]) / total
            for rel, total in occurrences.items()}
# Takes the list of triples.
def ifunc(KG):
    """Score every relation by its share of distinct tail entities.

    For each distinct second element of the triples (the relation) the
    score is ``(number of distinct third elements seen with it) /
    (number of triples containing it)``.

    Returns:
        A dict mapping relation to that ratio.
    """
    # tails_by_rel: distinct third elements per relation.
    tails_by_rel = {}
    # occurrences: number of triples per relation.
    occurrences = {}
    for tri in KG:
        rel = tri[1]
        occurrences[rel] = occurrences.get(rel, 0) + 1
        tails_by_rel.setdefault(rel, set()).add(tri[2])

    return {rel: len(tails_by_rel[rel]) / total
            for rel, total in occurrences.items()}
# Takes the number of entity ids and the list of triples.
def get_weighted_adj(e, KG):
    """Build the weighted ``e x e`` adjacency matrix from the triples.

    Every triple ``(h, r, t)`` with ``h != t`` adds ``max(ifunc[r], 0.3)``
    to the pair ``(h, t)`` and ``max(func[r], 0.3)`` to the pair ``(t, h)``.
    The weight accumulated for a pair ``(a, b)`` is stored at row ``b``,
    column ``a`` of the returned COO matrix.
    """
    # Per-relation scores based on head entities / tail entities.
    r2f = func(KG)
    r2if = ifunc(KG)

    # weights: {(first entity, second entity): accumulated score}
    weights = {}
    for tri in KG:
        head, rel, tail = tri[0], tri[1], tri[2]
        # Skip triples whose two entities are identical.
        if head == tail:
            continue
        forward = (head, tail)
        backward = (tail, head)
        # Scores below 0.3 are raised to 0.3 before being accumulated.
        weights[forward] = weights.get(forward, 0.0) + max(r2if[rel], 0.3)
        weights[backward] = weights.get(backward, 0.0) + max(r2f[rel], 0.3)

    # Assemble the sparse matrix; note that the pair is stored as (col, row).
    row = [pair[1] for pair in weights]
    col = [pair[0] for pair in weights]
    data = list(weights.values())
    print(len(data), len(row), len(col), e)
    return sp.coo_matrix((data, (row, col)), shape=(e, e))
# Sparse matrix -> tuple representation.
def get_ae_input(attr):
    """Return ``attr`` as a COO matrix in ``sparse_to_tuple`` form."""
    attr_coo = sp.coo_matrix(attr)
    return sparse_to_tuple(attr_coo)
def load_data(dataset_str,
              data_root='C:/Users/admin/Desktop/Kaggle/KG/GCN-Align-master/data1/100000/'):
    """Load one dataset and build the inputs returned to the caller.

    Args:
        dataset_str: Name of the dataset sub-directory under ``data_root``.
        data_root: Directory prefix that ``dataset_str`` is appended to; it
            must end with a path separator. The default is the location that
            used to be hard-coded here.

    Returns:
        A tuple ``(adj, ae_input, train, test)``:
        ``adj`` is the matrix from ``get_weighted_adj``, ``ae_input`` the
        attribute matrix in tuple form, ``train`` a NumPy array with the
        first ``FLAGS.seed / 10`` share of the shuffled ``ref_ent_ids``
        pairs and ``test`` the list of the remaining pairs.
    """
    base = data_root + dataset_str + '/'
    # NOTE(review): every pair below names the *_1 file twice, so the same
    # file is read for both slots (the triples are therefore concatenated
    # with themselves). Kept as is -- confirm whether *_2 files were intended.
    Es = [base + 'ent_ids_1', base + 'ent_ids_1']
    As = [base + 'training_attrs_1', base + 'training_attrs_1']
    Ts = [base + 'triples_1', base + 'triples_1']
    ill = base + 'ref_ent_ids'

    # Read the reference pairs, shuffle them and split into train / test.
    ILL = loadfile(ill, 2)
    np.random.shuffle(ILL)
    split = int(len(ILL) * FLAGS.seed / 10)
    train = np.array(ILL[:split])
    print("train.shape", train.shape)
    test = ILL[split:]
    print("len(test)", len(test))

    # Matrix dimension: number of records in the first entity-id file plus one.
    e = len(loadfile(Es[0], 1)) + 1

    # Entity name -> id mapping built from the entity-id files.
    ent2id = get_ent2id(Es)
    print("len(ent2id)", len(ent2id))

    # Sparse entity/attribute matrix and its tuple representation.
    attr = loadattr(As, e, ent2id)
    ae_input = get_ae_input(attr)

    # All triples (three ids per line), then the weighted adjacency matrix.
    KG = loadfile(Ts[0], 3) + loadfile(Ts[1], 3)
    adj = get_weighted_adj(e, KG)
    return adj, ae_input, train, test
|