utils.py

import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
from scipy.sparse.linalg import eigsh
import sys
import tensorflow as tf
import math

flags = tf.app.flags
FLAGS = flags.FLAGS

def sparse_to_tuple(sparse_mx):
    """Convert sparse matrix to tuple representation."""
    def to_tuple(mx):
        if not sp.isspmatrix_coo(mx):
            mx = mx.tocoo()
        coords = np.vstack((mx.row, mx.col)).transpose()
        values = mx.data
        shape = mx.shape
        return coords, values, shape

    if isinstance(sparse_mx, list):
        for i in range(len(sparse_mx)):
            sparse_mx[i] = to_tuple(sparse_mx[i])
    else:
        sparse_mx = to_tuple(sparse_mx)

    return sparse_mx
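
# Example (illustrative, not part of the original module): the tuple layout
# above is what TF sparse placeholders consume, e.g.
#   sparse_to_tuple(sp.identity(2).tocoo())
#   -> (array([[0, 0], [1, 1]]), array([1., 1.]), (2, 2))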

# Symmetrically normalize the adjacency matrix.
def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix."""
    adj = sp.coo_matrix(adj)
    # Sum along each row to get the degree of every node.
    rowsum = np.array(adj.sum(1))
    # Raise the degrees to the power -1/2 and flatten to a 1-D array.
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    # Zero-degree nodes produce inf; replace those entries with 0.
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    # Put the result on the diagonal to form D^-1/2.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()
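
# Illustrative note (not in the original source): the return value is
# D^-1/2 * A^T * D^-1/2, which equals the usual GCN renormalization
# D^-1/2 * A * D^-1/2 whenever A is symmetric, e.g.
#   normalize_adj(sp.csr_matrix([[0, 1], [1, 0]]))  # unchanged: [[0, 1], [1, 0]]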

# Preprocess the (head entity, tail entity) score adjacency matrix.
def preprocess_adj(adj):
    """Preprocessing of adjacency matrix for simple GCN model and conversion to tuple representation."""
    adj_normalized = normalize_adj(adj + sp.eye(adj.shape[0]))
    return sparse_to_tuple(adj_normalized)

def construct_feed_dict(features, support, placeholders):
    """Construct feed dictionary for GCN-Align."""
    feed_dict = dict()
    feed_dict.update({placeholders['features']: features})
    feed_dict.update({placeholders['support'][i]: support[i] for i in range(len(support))})
    return feed_dict

def chebyshev_polynomials(adj, k):
    """Calculate Chebyshev polynomials up to order k. Return a list of sparse matrices (tuple representation)."""
    print("Calculating Chebyshev polynomials up to order {}...".format(k))

    adj_normalized = normalize_adj(adj)
    laplacian = sp.eye(adj.shape[0]) - adj_normalized
    largest_eigval, _ = eigsh(laplacian, 1, which='LM')
    scaled_laplacian = (2. / largest_eigval[0]) * laplacian - sp.eye(adj.shape[0])

    t_k = list()
    t_k.append(sp.eye(adj.shape[0]))
    t_k.append(scaled_laplacian)

    def chebyshev_recurrence(t_k_minus_one, t_k_minus_two, scaled_lap):
        s_lap = sp.csr_matrix(scaled_lap, copy=True)
        return 2 * s_lap.dot(t_k_minus_one) - t_k_minus_two

    for i in range(2, k + 1):
        t_k.append(chebyshev_recurrence(t_k[-1], t_k[-2], scaled_laplacian))

    return sparse_to_tuple(t_k)
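
# Usage sketch (illustrative, not in the original source): for k = 2,
#   chebyshev_polynomials(adj, 2)
# returns the tuple forms of [T_0, T_1, T_2] = [I, L_scaled, 2 * L_scaled^2 - I],
# the Chebyshev polynomials of the rescaled Laplacian.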

# Read the file named fn, taking the first num tab-separated integers of each line.
def loadfile(fn, num=1):
    """Load a file and return a list of tuples containing $num integers each."""
    print('loading a file...' + fn)
    ret = []
    with open(fn, encoding='utf-8') as f:
        for line in f:
            th = line[:-1].split('\t')
            x = []
            for i in range(num):
                x.append(int(th[i]))
            ret.append(tuple(x))
    return ret

def get_ent2id(fns):
    """Map each entity name (second column) to its integer id (first column)."""
    ent2id = {}
    for fn in fns:
        with open(fn, 'r', encoding='utf-8') as f:
            for line in f:
                th = line[:-1].split('\t')
                ent2id[th[1]] = int(th[0])
    return ent2id

# Read the attribute files and build a sparse entity-attribute feature matrix.
def loadattr(fns, e, ent2id):
    """The most frequent attributes are selected to save space."""
    # Count every property (each field after the leading resource) across all
    # files, then sort the counts in descending order.
    cnt = {}
    for fn in fns:
        with open(fn, 'r', encoding='utf-8') as f:
            for line in f:
                th = line[:-1].split('\t')
                if th[0] not in ent2id:
                    continue
                for i in range(1, len(th)):
                    if th[i] not in cnt:
                        cnt[th[i]] = 1
                    else:
                        cnt[th[i]] += 1
    fre = [(k, cnt[k]) for k in sorted(cnt, key=cnt.get, reverse=True)]
    # Keep the 2000 most frequent properties and map each one to an id.
    num_features = min(len(fre), 2000)
    attr2id = {}
    for i in range(num_features):
        attr2id[fre[i][0]] = i
    # For every resource present in ent2id, walk its remaining properties;
    # each property found in attr2id adds the entry M[(entity_id, attr_id)] = 1.0.
    M = {}
    for fn in fns:
        with open(fn, 'r', encoding='utf-8') as f:
            for line in f:
                th = line[:-1].split('\t')
                if th[0] in ent2id:
                    for i in range(1, len(th)):
                        if th[i] in attr2id:
                            M[(ent2id[th[0]], attr2id[th[i]])] = 1.0
    # Split M into COO components: entity_id -> row, attr_id -> col, value -> data.
    row = []
    col = []
    data = []
    for key in M:
        row.append(key[0])
        col.append(key[1])
        data.append(M[key])
    print(len(data), len(row), len(col), e, num_features)
    # coo_matrix places each data value at its (row, col) position in a matrix
    # of the given shape.
    return sp.coo_matrix((data, (row, col)), shape=(e, num_features))  # attr
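
# Shape note (illustrative, assumed data): given attribute lines such as
# "ent0\tpropA\tpropB" (names hypothetical), loadattr returns an
# e x num_features coo_matrix with a 1.0 at each (entity_id, attr_id) pair.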

def get_dic_list(e, KG):
    """Build an undirected adjacency list {entity_id: [neighbor ids]} from the triples."""
    M = {}
    for tri in KG:
        if tri[0] == tri[2]:
            continue
        M[(tri[0], tri[2])] = 1
        M[(tri[2], tri[0])] = 1
    dic_list = {}
    for i in range(e):
        dic_list[i] = []
    for pair in M:
        dic_list[pair[0]].append(pair[1])
    return dic_list

# Takes the list of (head, relation, tail) triples.
def func(KG):
    # head: the set of head entities seen with each relation.
    head = {}
    # cnt: the number of occurrences of each relation.
    cnt = {}
    # Count occurrences of each relation (the triple's second element) and
    # collect its distinct head entities.
    for tri in KG:
        if tri[1] not in cnt:
            cnt[tri[1]] = 1
            head[tri[1]] = set([tri[0]])
        else:
            cnt[tri[1]] += 1
            head[tri[1]].add(tri[0])
    # Functionality score per relation: the number of distinct head entities
    # divided by the number of occurrences of the relation.
    r2f = {}
    for r in cnt:
        r2f[r] = len(head[r]) / cnt[r]
    return r2f
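
# Worked toy example (illustrative, not in the original source): for
#   KG = [(0, 5, 1), (2, 5, 1), (2, 5, 3)]
# relation 5 occurs 3 times with distinct heads {0, 2}, so
#   func(KG)  -> {5: 2 / 3}
# and, symmetrically over its distinct tails {1, 3},
#   ifunc(KG) -> {5: 2 / 3}.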

# Takes the list of (head, relation, tail) triples.
def ifunc(KG):
    # tail: the set of tail entities seen with each relation.
    tail = {}
    # cnt: the number of occurrences of each relation.
    cnt = {}
    for tri in KG:
        if tri[1] not in cnt:
            cnt[tri[1]] = 1
            tail[tri[1]] = set([tri[2]])
        else:
            cnt[tri[1]] += 1
            tail[tri[1]].add(tri[2])
    # Inverse functionality score per relation: the number of distinct tail
    # entities divided by the number of occurrences of the relation.
    r2if = {}
    for r in cnt:
        r2if[r] = len(tail[r]) / cnt[r]
    return r2if

# e: number of entity ids; KG: list of triples.
def get_weighted_adj(e, KG):
    # r2f: functionality score of each relation (distinct heads / occurrences).
    r2f = func(KG)
    # r2if: inverse functionality score of each relation (distinct tails / occurrences).
    r2if = ifunc(KG)
    # Combine r2f/r2if with a floor of 0.3 into a dict M{(head, tail): score}.
    M = {}
    for tri in KG:
        # Skip triples whose head and tail are the same entity.
        if tri[0] == tri[2]:
            continue
        # If the (head, tail) pair is not yet in M, score it with the larger
        # of r2if and 0.3; otherwise add that amount to the existing score.
        if (tri[0], tri[2]) not in M:
            M[(tri[0], tri[2])] = max(r2if[tri[1]], 0.3)
        else:
            M[(tri[0], tri[2])] += max(r2if[tri[1]], 0.3)
        # Same check for the reversed (tail, head) pair, using r2f.
        if (tri[2], tri[0]) not in M:
            M[(tri[2], tri[0])] = max(r2f[tri[1]], 0.3)
        else:
            M[(tri[2], tri[0])] += max(r2f[tri[1]], 0.3)
    # Assemble and return the sparse matrix. Note the key is reversed here:
    # entry (t, h) receives M[(h, t)]; since M stores both directions of every
    # pair, the same set of positions is filled.
    row = []
    col = []
    data = []
    for key in M:
        row.append(key[1])
        col.append(key[0])
        data.append(M[key])
    print(len(data), len(row), len(col), e)
    return sp.coo_matrix((data, (row, col)), shape=(e, e))
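
# Continuing the toy example above (illustrative only): with e = 4 entities,
#   get_weighted_adj(4, KG)
# gives every ordered pair touched by a triple -- (0, 1), (1, 0), (2, 1),
# (1, 2), (2, 3), (3, 2) -- the weight max(2/3, 0.3) = 2/3, since no pair
# is contributed by more than one triple.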

# Convert the sparse attribute matrix to its tuple representation.
def get_ae_input(attr):
    return sparse_to_tuple(sp.coo_matrix(attr))

def load_data(dataset_str):
    names = [['ent_ids_1', 'ent_ids_1'], ['training_attrs_1', 'training_attrs_1'], ['triples_1', 'triples_1'], ['ref_ent_ids']]
    for fns in names:
        for i in range(len(fns)):
            fns[i] = 'C:/Users/admin/Desktop/Kaggle/KG/GCN-Align-master/data1/100000/' + dataset_str + '/' + fns[i]
    Es, As, Ts, ill = names
    ill = ill[0]
    # Read the ref_ent_ids file and split it into train and test sets.
    ILL = loadfile(ill, 2)
    illL = len(ILL)
    np.random.shuffle(ILL)
    train = np.array(ILL[:int(illL * FLAGS.seed / 10)])
    print("train.shape", train.shape)
    test = ILL[int(illL * FLAGS.seed / 10):]
    print("len(test)", len(test))
    # Read the entity id files, take the leading id from each line, and count them.
    # e = len(set(loadfile(Es[0], 1)) | set(loadfile(Es[1], 1)))
    e = len(loadfile(Es[0], 1))
    e = e + 1
    # Build a dict mapping entity names to ids from both entity id files.
    ent2id = get_ent2id([Es[0], Es[1]])
    print("len(ent2id)", len(ent2id))
    # Read the attribute files; returns a sparse matrix with value 1.0 at each
    # (entity_id, attr_id) position.
    attr = loadattr([As[0], As[1]], e, ent2id)
    # ae_input: tuple representation ([entity_id, attr_id], 1.0, shape).
    ae_input = get_ae_input(attr)
    # Read the triple files, taking three ids per line.
    KG = loadfile(Ts[0], 3) + loadfile(Ts[1], 3)
    # Head/tail score sparse matrix used as the adjacency matrix.
    adj = get_weighted_adj(e, KG)  # nx.adjacency_matrix(nx.from_dict_of_lists(get_dic_list(e, KG)))
    return adj, ae_input, train, test
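
# Minimal usage sketch (assumptions: a TF1 environment, a 'seed' flag normally
# defined by the training script, and the hard-coded data directory above
# containing a dataset folder such as 'zh_en' -- the folder name here is
# hypothetical).
if __name__ == '__main__':
    flags.DEFINE_integer('seed', 3, 'Proportion of seed alignments, in tenths.')
    adj, ae_input, train, test = load_data('zh_en')
    # preprocess_adj returns the (coords, values, shape) tuple fed to the GCN.
    support = [preprocess_adj(adj)]
    print('adjacency support shape:', support[0][2])
    print('attribute input shape:', ae_input[2])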