# fangjiasheng / ORGS_FUSION

import math
import pickle as pkl
import sys
from collections import Counter

import networkx as nx
import numpy as np
import scipy.sparse as sp
import tensorflow as tf
# NOTE(review): recent SciPy releases expose eigsh as scipy.sparse.linalg.eigsh;
# this private module path may fail to import there - confirm against the
# SciPy version this project pins.
from scipy.sparse.linalg.eigen.arpack import eigsh

# Handle to the TensorFlow flag registry (tf.app.flags); FLAGS exposes the
# parsed flag values. This file reads FLAGS.seed in load_data; the flag itself
# is presumably defined by the calling script - TODO confirm.
flags = tf.app.flags
FLAGS = flags.FLAGS


def sparse_to_tuple(sparse_mx):
    """Convert a sparse matrix, or a list of them, to tuple representation.

    Each matrix becomes ``(coords, values, shape)`` where ``coords`` is an
    ``(nnz, 2)`` array of ``[row, col]`` pairs, ``values`` is the matching data
    array and ``shape`` is the matrix shape.

    When ``sparse_mx`` is a list, its elements are replaced in place and the
    same list object is returned; otherwise a single tuple is returned.
    """
    def as_triplet(matrix):
        # Work on the COO form, which exposes parallel row/col/data arrays.
        coo = matrix if sp.isspmatrix_coo(matrix) else matrix.tocoo()
        indices = np.vstack((coo.row, coo.col)).transpose()
        return indices, coo.data, coo.shape

    if not isinstance(sparse_mx, list):
        return as_triplet(sparse_mx)

    # Lists are converted element by element, overwriting each slot.
    for position, matrix in enumerate(sparse_mx):
        sparse_mx[position] = as_triplet(matrix)
    return sparse_mx


def normalize_adj(adj):
    """Symmetrically normalize an adjacency matrix.

    With ``D`` the diagonal matrix of row sums of ``adj``, the result is
    ``D^-1/2 * adj^T * D^-1/2`` (equal to ``D^-1/2 * adj * D^-1/2`` when
    ``adj`` is symmetric), returned as a SciPy COO matrix.
    """
    adj = sp.coo_matrix(adj)
    # Sum along axis 1 to get one total per row.
    degrees = np.array(adj.sum(1))
    # Raise each row sum to the power -1/2 and flatten to a 1-D vector.
    scale = np.power(degrees, -0.5).flatten()
    # Rows that sum to zero yield inf above; use 0 for them instead.
    scale[np.isinf(scale)] = 0.
    # Place the scaling factors on the diagonal of a sparse matrix.
    scale_mat = sp.diags(scale)
    right_scaled = adj.dot(scale_mat)
    normalized = right_scaled.transpose().dot(scale_mat)
    return normalized.tocoo()


def preprocess_adj(adj):
    """Prepare an adjacency matrix for the simple GCN model.

    Adds the identity matrix to ``adj``, normalizes the sum with
    ``normalize_adj`` and returns the result in the tuple representation
    produced by ``sparse_to_tuple``.
    """
    size = adj.shape[0]
    with_identity = adj + sp.eye(size)
    normalized = normalize_adj(with_identity)
    return sparse_to_tuple(normalized)


def construct_feed_dict(features, support, placeholders):
    """Build the feed dictionary for GCN-Align.

    Maps ``placeholders['features']`` to ``features`` and the i-th entry of
    ``placeholders['support']`` to ``support[i]`` for every element of
    ``support``.
    """
    feed = {placeholders['features']: features}
    for idx, matrix in enumerate(support):
        feed[placeholders['support'][idx]] = matrix
    return feed


def chebyshev_polynomials(adj, k):
    """Calculate Chebyshev polynomials of the scaled Laplacian up to order k.

    The Laplacian is ``I - normalize_adj(adj)``; it is rescaled as
    ``(2 / lambda_max) * L - I`` using the largest-magnitude eigenvalue
    reported by ``eigsh``. The polynomials follow the recurrence
    ``T_0 = I``, ``T_1 = L_scaled``, ``T_i = 2 * L_scaled * T_{i-1} - T_{i-2}``.

    Returns:
        A list of sparse matrices in tuple representation (see
        ``sparse_to_tuple``). The list always holds ``T_0`` and ``T_1``, so it
        has ``max(k, 1) + 1`` entries.
    """
    print("Calculating Chebyshev polynomials up to order {}...".format(k))

    size = adj.shape[0]
    adj_normalized = normalize_adj(adj)
    laplacian = sp.eye(size) - adj_normalized
    largest_eigval, _ = eigsh(laplacian, 1, which='LM')
    scaled_laplacian = (2. / largest_eigval[0]) * laplacian - sp.eye(size)

    t_k = [sp.eye(size), scaled_laplacian]

    # The scaled Laplacian does not change between iterations, so convert it
    # to CSR once instead of copying it on every step of the recurrence.
    s_lap = sp.csr_matrix(scaled_laplacian)
    for _ in range(2, k + 1):
        t_k.append(2 * s_lap.dot(t_k[-1]) - t_k[-2])

    return sparse_to_tuple(t_k)


def loadfile(fn, num=1):
    """Load a tab-separated file and return a list of integer tuples.

    Args:
        fn: Path of the UTF-8 text file to read.
        num: Number of leading tab-separated fields to convert to ``int`` on
            each line.

    Returns:
        A list with one tuple of ``num`` integers per non-blank line.

    Raises:
        IndexError: If a non-blank line has fewer than ``num`` fields.
        ValueError: If one of the requested fields is not an integer.
    """
    print('loading a file...' + fn)
    ret = []
    with open(fn, encoding='utf-8') as f:
        for line in f:
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would corrupt a final line that has no newline.
            line = line.rstrip('\n')
            if not line.strip():
                # Blank lines carry no record.
                continue
            th = line.split('\t')
            ret.append(tuple(int(th[i]) for i in range(num)))
    return ret


def get_ent2id(fns):
    """Build a mapping from entity name to integer id.

    Each non-empty line of every file in ``fns`` is split on tabs; the first
    field is the integer id and the second field is the entity name. When a
    name occurs more than once, the last occurrence wins.

    Args:
        fns: Iterable of paths to UTF-8 text files.

    Returns:
        A dict mapping entity name (str) to id (int).
    """
    ent2id = {}
    for fn in fns:
        with open(fn, 'r', encoding='utf-8') as f:
            for line in f:
                # Strip only the line terminator, so that a final line without
                # a newline keeps the last character of the entity name.
                line = line.rstrip('\n')
                if not line:
                    continue
                th = line.split('\t')
                ent2id[th[1]] = int(th[0])
    return ent2id


def loadattr(fns, e, ent2id, max_features=2000):
    """Build a sparse entity-by-attribute indicator matrix.

    Only the most frequent attributes are kept, to save space. Each line of
    the input files is tab-separated: the first field is an entity name and
    the remaining fields are attribute names. Lines whose entity is not a key
    of ``ent2id`` are ignored.

    Args:
        fns: Sequence of paths to UTF-8 attribute files. It is iterated twice,
            once for counting and once for filling the matrix.
        e: Number of rows of the returned matrix.
        ent2id: Dict mapping entity name to row index.
        max_features: Upper bound on the number of attribute columns. The
            most frequent attributes are kept, ties in first-seen order.

    Returns:
        A ``scipy.sparse.coo_matrix`` of shape ``(e, num_features)`` holding
        1.0 wherever an entity has one of the kept attributes.
    """
    def known_rows():
        # Yield (row index, attribute names) for every line whose entity is
        # known. Only the line terminator is stripped, so a final line without
        # a newline keeps its last attribute intact.
        for fn in fns:
            with open(fn, 'r', encoding='utf-8') as f:
                for line in f:
                    th = line.rstrip('\n').split('\t')
                    if th[0] in ent2id:
                        yield ent2id[th[0]], th[1:]

    # First pass: count how often each attribute occurs.
    cnt = Counter()
    for _, attrs in known_rows():
        cnt.update(attrs)

    # Number the kept attributes 0..num_features-1 by descending frequency.
    attr2id = {
        attr: idx
        for idx, (attr, _) in enumerate(cnt.most_common(max_features))
    }
    num_features = len(attr2id)

    # Second pass: record each (entity, attribute) cell once. A dict is used
    # so that duplicates collapse while first-seen order is preserved.
    cells = {}
    for ent, attrs in known_rows():
        for attr in attrs:
            if attr in attr2id:
                cells[(ent, attr2id[attr])] = 1.0

    row = [r for r, _ in cells]
    col = [c for _, c in cells]
    data = list(cells.values())

    print(len(data), len(row), len(col), e, num_features)
    # Scatter the values into an (e, num_features) sparse matrix.
    return sp.coo_matrix((data, (row, col)), shape=(e, num_features))


def get_dic_list(e, KG):
    """Build an undirected neighbour list from triples.

    For every triple whose first and third elements differ, both directions
    of the pair are recorded once. The result maps each index in
    ``range(e)`` to the list of its neighbours, in first-seen order.
    """
    # Keys act as an insertion-ordered set of directed pairs.
    pairs = {}
    for tri in KG:
        src, dst = tri[0], tri[2]
        if src != dst:
            pairs[(src, dst)] = 1
            pairs[(dst, src)] = 1

    neighbours = {node: [] for node in range(e)}
    for src, dst in pairs:
        neighbours[src].append(dst)
    return neighbours


def func(KG):
    """Score each relation by its ratio of distinct heads to triples.

    ``KG`` is an iterable of triples; element 0 is the head entity and
    element 1 is the relation. For every relation the result holds
    ``(number of distinct heads) / (number of triples with that relation)``.
    """
    # distinct_heads: relation -> set of head entities seen with it.
    distinct_heads = {}
    # occurrences: relation -> number of triples using it.
    occurrences = {}
    for tri in KG:
        rel = tri[1]
        occurrences[rel] = occurrences.get(rel, 0) + 1
        distinct_heads.setdefault(rel, set()).add(tri[0])

    return {
        rel: len(distinct_heads[rel]) / total
        for rel, total in occurrences.items()
    }


def ifunc(KG):
    """Score each relation by its ratio of distinct tails to triples.

    ``KG`` is an iterable of triples; element 1 is the relation and element 2
    is the tail entity. For every relation the result holds
    ``(number of distinct tails) / (number of triples with that relation)``.
    """
    # Per relation: [set of tail entities, number of triples].
    stats = {}
    for tri in KG:
        rel = tri[1]
        if rel in stats:
            entry = stats[rel]
        else:
            entry = stats[rel] = [set(), 0]
        entry[0].add(tri[2])
        entry[1] += 1

    r2if = {}
    for rel, (tails, total) in stats.items():
        r2if[rel] = len(tails) / total
    return r2if


def get_weighted_adj(e, KG):
    """Build a weighted sparse adjacency matrix from triples.

    Args:
        e: Side length of the square matrix.
        KG: Iterable of (head, relation, tail) triples.

    Returns:
        A ``scipy.sparse.coo_matrix`` of shape ``(e, e)``.
    """
    # Per-relation scores from the head side and from the tail side.
    r2f = func(KG)
    r2if = ifunc(KG)

    # weights: (entity, entity) pair -> accumulated score.
    weights = {}

    def accumulate(pair, amount):
        if pair in weights:
            weights[pair] += amount
        else:
            weights[pair] = amount

    for tri in KG:
        head, rel, tail = tri[0], tri[1], tri[2]
        # Triples that link an entity to itself contribute nothing.
        if head == tail:
            continue
        # Forward pair uses the tail-side score, reverse pair the head-side
        # score; each contribution is floored at 0.3.
        accumulate((head, tail), max(r2if[rel], 0.3))
        accumulate((tail, head), max(r2f[rel], 0.3))

    # The second element of each pair is the row, the first is the column.
    row = [pair[1] for pair in weights]
    col = [pair[0] for pair in weights]
    data = list(weights.values())
    print(len(data), len(row), len(col), e)
    return sp.coo_matrix((data, (row, col)), shape=(e, e))


def get_ae_input(attr):
    """Return ``attr`` as a COO matrix in ``sparse_to_tuple`` representation."""
    attr_coo = sp.coo_matrix(attr)
    return sparse_to_tuple(attr_coo)


def load_data(dataset_str,
              data_root='C:/Users/admin/Desktop/Kaggle/KG/GCN-Align-master/data1/100000/'):
    """Load one dataset and build the model inputs.

    Args:
        dataset_str: Name of the dataset sub-directory under ``data_root``.
        data_root: Directory prefix (including the trailing slash) that holds
            the dataset directories. Defaults to the path that was previously
            hard-coded here.

    Returns:
        A tuple ``(adj, ae_input, train, test)``:
        ``adj`` is the weighted adjacency matrix from ``get_weighted_adj``,
        ``ae_input`` is the attribute matrix in tuple representation,
        ``train`` is a NumPy array with the first part of the shuffled
        ``ref_ent_ids`` pairs, and ``test`` is a list with the remaining pairs.
    """
    base = data_root + dataset_str + '/'
    # NOTE(review): each pair below names the same "_1" file twice, so the
    # entity, attribute and triple files are each read twice. In particular
    # every triple appears twice in KG, which changes the counts and summed
    # weights computed by get_weighted_adj. The original comments mention
    # "_2" files - confirm whether the duplication is intentional.
    Es = [base + 'ent_ids_1', base + 'ent_ids_1']
    As = [base + 'training_attrs_1', base + 'training_attrs_1']
    Ts = [base + 'triples_1', base + 'triples_1']
    ill = base + 'ref_ent_ids'

    # Read the ref_ent_ids pairs, shuffle them and split into train and test.
    # The shuffle uses NumPy's global random state; no seed is set here.
    ILL = loadfile(ill, 2)
    np.random.shuffle(ILL)
    # FLAGS.seed is used here as the number of tenths assigned to training.
    split = int(len(ILL) * FLAGS.seed / 10)
    train = np.array(ILL[:split])
    print("train.shape", train.shape)
    test = ILL[split:]
    print("len(test)", len(test))

    # Matrix size: number of lines in the entity file plus one.
    # NOTE(review): presumably the +1 leaves room for ids that start at 1 -
    # confirm against the data files.
    e = len(loadfile(Es[0], 1)) + 1
    # Entity name -> id mapping built from the entity files.
    ent2id = get_ent2id([Es[0], Es[1]])
    print("len(ent2id)", len(ent2id))

    # Sparse entity-by-attribute matrix, then its tuple representation.
    attr = loadattr([As[0], As[1]], e, ent2id)
    ae_input = get_ae_input(attr)

    # Read the first three integer fields of every triple line.
    KG = loadfile(Ts[0], 3) + loadfile(Ts[1], 3)
    # Weighted sparse matrix between entities, used as the adjacency matrix.
    adj = get_weighted_adj(e, KG)

    return adj, ae_input, train, test