"""Losses and evaluation helpers for embedding-based entity alignment.

The module contains TensorFlow 1.x graph-mode loss/metric builders
(`masked_softmax_cross_entropy`, `masked_accuracy`, `align_loss`), Hits@k
evaluation over L1 (cityblock) distances (`get_hits`, `get_combine_hits`),
two prediction routines that turn embeddings into candidate aligned pairs
(`predict`, `predict_new`) and small file helpers.
"""
import gc
import heapq
import json
import os
import time

import numpy as np
import scipy
import scipy.spatial.distance  # `import scipy` alone does not guarantee the submodule is loaded
import tensorflow as tf
from fuzzywuzzy import fuzz

from utils import loadfile

# Data directory, built with Windows-style separators relative to the CWD.
# NOTE: the name shadows the builtin `dir`; it is kept because other modules
# may import it under this name.
dir = os.getcwd()+"\\data1\\100000\\zh_en\\"


def masked_softmax_cross_entropy(preds, labels, mask):
    """Softmax cross-entropy loss with masking.

    The mask is cast to float32 and divided by its own mean, so the returned
    mean equals sum(loss * mask) / sum(mask).

    Args:
        preds: logits tensor.
        labels: label tensor passed as `labels` to
            `tf.nn.softmax_cross_entropy_with_logits`.
        mask: per-example weights (any dtype castable to float32).

    Returns:
        Scalar tensor holding the masked mean loss.
    """
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=preds, labels=labels)
    mask = tf.cast(mask, dtype=tf.float32)
    mask /= tf.reduce_mean(mask)
    loss *= mask
    return tf.reduce_mean(loss)


def masked_accuracy(preds, labels, mask):
    """Accuracy with masking.

    A prediction counts as correct when the argmax over axis 1 of `preds`
    equals the argmax over axis 1 of `labels`. The mask is normalised by its
    mean exactly as in `masked_softmax_cross_entropy`.

    Returns:
        Scalar tensor holding the masked mean accuracy.
    """
    correct_prediction = tf.equal(tf.argmax(preds, 1), tf.argmax(labels, 1))
    accuracy_all = tf.cast(correct_prediction, tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    mask /= tf.reduce_mean(mask)
    accuracy_all *= mask
    return tf.reduce_mean(accuracy_all)


def get_placeholder_by_name(name):
    """Return the tensor `<name>:0` from the default graph, creating it if absent.

    When the lookup fails, a new int32 placeholder called `name` is created
    and returned instead.
    """
    try:
        return tf.get_default_graph().get_tensor_by_name(name+":0")
    except (KeyError, ValueError):
        # The graph lookup reports a missing/invalid tensor name with these
        # exception types; anything else is a real error and should surface.
        return tf.placeholder(tf.int32, name=name)


def align_loss(outlayer, ILL, gamma, k, AE=True):
    """Build the margin-based alignment loss.

    For every positive pair the L1 distance `A` between its two embeddings is
    compared with the L1 distances `B` of `k` negative pairs through
    relu(A + gamma - B). Two sets of negatives are used and the result is
    averaged over 2 * k * t terms, with t = len(ILL).

    Args:
        outlayer: embedding tensor indexed by entity id.
        ILL: positive pairs; indexed as ILL[:, 0] / ILL[:, 1], so a 2-D array
            with (at least) two columns is required.
        gamma: margin added to the positive distance.
        k: number of negatives per positive pair.
        AE: when true the first negative set is fed through the "neg_left" /
            "neg_right" placeholders; otherwise it is read from the
            "ref_ent_ids_neg" file in the data directory.

    Returns:
        Scalar loss tensor.
    """
    left = ILL[:, 0]
    right = ILL[:, 1]
    t = len(ILL)

    # Positive samples. embedding_lookup gathers the rows of `outlayer` and
    # stays differentiable, so gradients flow back into the embeddings.
    left_x = tf.nn.embedding_lookup(outlayer, left)
    right_x = tf.nn.embedding_lookup(outlayer, right)
    A = tf.reduce_sum(tf.abs(left_x - right_x), 1)
    D = A + gamma

    # First set of negative samples.
    if AE:
        # Fed at run time; the reshape below requires t * k ids per side.
        neg_left = get_placeholder_by_name("neg_left")
        neg_right = get_placeholder_by_name("neg_right")
    else:
        # Only this branch uses the file, so it is only read here.
        negative_ILL = loadfile(dir+"ref_ent_ids_neg", 2)
        negative_ILL = negative_ILL[:t*k]
        np.random.shuffle(negative_ILL)
        negative_ILL = np.array(negative_ILL)
        neg_left = negative_ILL[:, 0]
        neg_right = negative_ILL[:, 1]
    neg_l_x = tf.nn.embedding_lookup(outlayer, neg_left)
    neg_r_x = tf.nn.embedding_lookup(outlayer, neg_right)
    B = tf.reduce_sum(tf.abs(neg_l_x - neg_r_x), 1)
    C = - tf.reshape(B, [t, k])
    L1 = tf.nn.relu(tf.add(C, tf.reshape(D, [t, 1])))

    # Second set of negative samples, always fed through placeholders.
    neg_left = get_placeholder_by_name("neg2_left")
    neg_right = get_placeholder_by_name("neg2_right")
    neg_l_x = tf.nn.embedding_lookup(outlayer, neg_left)
    neg_r_x = tf.nn.embedding_lookup(outlayer, neg_right)
    B = tf.reduce_sum(tf.abs(neg_l_x - neg_r_x), 1)
    C = - tf.reshape(B, [t, k])
    L2 = tf.nn.relu(tf.add(C, tf.reshape(D, [t, 1])))

    return (tf.reduce_sum(L1) + tf.reduce_sum(L2)) / (2.0 * k * t)


def _count_hits(sim, top_k):
    """Count, per cutoff in `top_k`, the rows of `sim` whose diagonal entry ranks below it."""
    hits = [0] * len(top_k)
    for i, row in enumerate(sim):
        # argsort orders ascending and yields indices; the position of `i` in
        # that order is the rank of the true counterpart.
        rank_index = np.where(row.argsort() == i)[0][0]
        for j, cutoff in enumerate(top_k):
            if rank_index < cutoff:
                hits[j] += 1
    return hits


def _print_hits(title, top_k, hits, total):
    """Print one Hits@k percentage line per cutoff under `title`."""
    print(title)
    for cutoff, count in zip(top_k, hits):
        print('Hits@%d: %.2f%%' % (cutoff, count / total * 100))


def get_hits(vec, test_pair, top_k=(1, 10, 30, 50)):
    """Print Hits@k in both directions for the given test pairs.

    Args:
        vec: embeddings indexable by entity id.
        test_pair: iterable of (e1, e2) id pairs; row i of the left vectors
            is expected to match row i of the right vectors.
        top_k: rank cutoffs to report.

    Nothing is returned; the results are printed. An empty `test_pair`
    prints nothing.
    """
    total = len(test_pair)
    if total == 0:
        # cdist cannot handle empty input and the percentages would divide by zero.
        return

    # Left and right embedding of every test pair.
    Lvec = np.array([vec[e1] for e1, e2 in test_pair])
    Rvec = np.array([vec[e2] for e1, e2 in test_pair])
    # Pairwise L1 distances (smaller means more similar).
    sim = scipy.spatial.distance.cdist(Lvec, Rvec, metric='cityblock')

    top_lr = _count_hits(sim, top_k)
    top_rl = _count_hits(sim.T, top_k)

    _print_hits('For each left:', top_k, top_lr, total)
    _print_hits('For each right:', top_k, top_rl, total)


def get_combine_hits(se_vec, ae_vec, beta, test_pair, top_k=(1, 10, 30, 50)):
    """Print Hits@k for the concatenation of two embeddings.

    `se_vec` is scaled by `beta`, `ae_vec` by `1 - beta`, and both are
    concatenated along axis 1 before being passed to `get_hits`.
    """
    vec = np.concatenate([se_vec*beta, ae_vec*(1.0-beta)], axis=1)
    get_hits(vec, test_pair, top_k)


def _load_reverse_id_map(filename):
    """Load the JSON mapping in `filename` and return it inverted as {str(value): int(key)}."""
    return {str(v): int(k) for k, v in loadDict(filename).items()}


def _load_id_to_name(filename):
    """Read tab-separated "id<TAB>name" lines into a dict; lines with fewer than two fields are skipped."""
    names = {}
    for line in file2Data(filename):
        fields = line.rstrip("\n").split("\t")
        if len(fields) < 2:
            continue
        names[str(fields[0])] = fields[1]
    return names


def predict(vec, test_pair):
    """Print the 20 closest (left name, right name) pairs among the test entities.

    Every left entity of `test_pair` is compared with every right entity by
    L1 distance; pairs with identical ids are skipped. Ids are translated
    through the inverted "ModelId2RealId" mapping and then to names through
    "ent_ids_1_real", both read from the data directory.

    Returns:
        None.
    """
    map_dict_reverse = _load_reverse_id_map(dir+"ModelId2RealId")
    id_dict = _load_id_to_name(dir+"ent_ids_1_real")

    if len(test_pair) == 0:
        return

    # Left and right ids of the test pairs and their embeddings.
    left_ids = [int(e1) for e1, e2 in test_pair]
    right_ids = [int(e2) for e1, e2 in test_pair]
    Lvec = np.array([vec[e1] for e1, e2 in test_pair])
    Rvec = np.array([vec[e2] for e1, e2 in test_pair])

    sim = scipy.spatial.distance.cdist(Lvec, Rvec, metric='cityblock')

    # (left name, right name) -> distance; a later pair with the same names
    # overwrites an earlier one.
    Org_sim_dict = {}
    for i, left_id in enumerate(left_ids):
        for j, right_id in enumerate(right_ids):
            # Skip the distance of an entity to itself.
            if left_id == right_id:
                continue
            Lid_real = map_dict_reverse[str(left_id)]
            Rid_real = map_dict_reverse[str(right_id)]
            Lorg = id_dict[str(Lid_real)]
            Rorg = id_dict[str(Rid_real)]
            Org_sim_dict[(Lorg, Rorg)] = float(sim[i][j])

    Org_sim_sorted_list = sorted(Org_sim_dict.items(), key=lambda x: x[1])
    # Slicing tolerates fewer than 20 results.
    for entry in Org_sim_sorted_list[:20]:
        print(entry)
    return


def _collect_candidates(sim, row_offset, col_offset, model_ids, reverse_map,
                        id_dict, n_nearest=10):
    """Map each row of `sim` to its `n_nearest` closest columns.

    Returns a dict keyed by ("leftId<TAB>rightId", "leftName<TAB>rightName")
    whose value is the distance. Pairs whose translated ids are equal are
    left out.
    """
    found = {}
    for k, row in enumerate(sim):
        left_real = reverse_map[str(model_ids[row_offset + k])]
        left_org = id_dict[str(left_real)]
        # Indices of the smallest distances in this row.
        for col in heapq.nsmallest(n_nearest, range(len(row)), row.take):
            right_real = reverse_map[str(model_ids[col_offset + col])]
            right_org = id_dict[str(right_real)]
            if left_real == right_real:
                continue
            key = (str(left_real)+"\t"+str(right_real), left_org+"\t"+right_org)
            found[key] = float(row[col])
    return found


def _format_align_line(id1, id2, org1, org2, length):
    """Format one output line with the numerically smaller id (and its name) first."""
    if int(id1) <= int(id2):
        return id1 + "\t" + id2 + "\t" + org1 + "\t" + org2 + "\t" + str(length) + "\n"
    return id2 + "\t" + id1 + "\t" + org2 + "\t" + org1 + "\t" + str(length) + "\n"


def _select_alignments(candidates, min_ratio=85):
    """Keep candidates whose names contain one another or whose fuzz ratio exceeds `min_ratio`.

    Returns the formatted output lines without duplicates, in first-seen order.
    """
    lines = {}
    for (ids, orgs), length in candidates:
        id1, id2 = ids.split("\t")[0], ids.split("\t")[1]
        org1, org2 = orgs.split("\t")[0], orgs.split("\t")[1]

        contained = (org1 in org2) or (org2 in org1)
        ratio = fuzz.ratio(org1, org2)
        if ratio > min_ratio:
            print(org1, org2, ratio)
        if contained or ratio > min_ratio:
            # The line used to be appended once per reference entry and then
            # collapsed by de-duplication; a single insert gives the same
            # output for a non-empty reference list.
            lines[_format_align_line(id1, id2, org1, org2, length)] = None
    return list(lines)


def _report_alignment_accuracy(model_align_list, align_list):
    """Print how many unique predicted id pairs occur in the reference list."""
    # Normalised "smallId<TAB>largeId" -> first output line carrying that pair.
    pair_to_line = {}
    for line in model_align_list:
        ss = line.split("\t")
        if int(ss[0]) > int(ss[1]):
            pair = ss[1]+"\t"+ss[0]
        else:
            pair = ss[0]+"\t"+ss[1]
        pair_to_line.setdefault(pair, line)

    known_pairs = {ref.rstrip("\n") for ref in align_list}

    accuracy_num = 0
    new_num = 0
    for pair, line in pair_to_line.items():
        first, second = pair.split("\t")
        if pair in known_pairs or (second + "\t" + first) in known_pairs:
            accuracy_num += 1
        else:
            print("new predict", line)
            new_num += 1

    # Hits are counted per unique pair, so the ratio uses the unique count too.
    total = len(pair_to_line)
    print("====================================")
    print("预测正确率", accuracy_num/total if total else 0.0)
    print("predict right number", accuracy_num)
    print("predict new number", new_num)
    print("predict all number", total)
    print("====================================")


def predict_new(vec, output_path=None):
    """Predict aligned pairs among the entities whose name starts with "O".

    Entities are taken from "ent_ids_1" in the data directory. Their
    embeddings are compared block by block (2000 x 2000) with L1 distance;
    for every row the 10 nearest columns are collected, the 200 closest
    pairs of each block are kept, and those whose names contain one another
    or have a fuzz ratio above 85 are appended to the output file. Finally
    the written pairs are compared with "ref_ent_ids_real" and the counts
    are printed.

    Args:
        vec: embeddings indexable by integer entity id.
        output_path: file that receives the predicted pairs; it is emptied
            first. Defaults to the path that was previously hard-coded.

    Returns:
        None.
    """
    if output_path is None:
        output_path = "C:\\Users\\admin\\Desktop\\Predict_Align_10w"
    # "w" empties an existing file and also creates a missing one.
    with open(output_path, "w", encoding='UTF-8'):
        pass

    entity_lines = file2Data(dir + "ent_ids_1")
    map_dict_reverse = _load_reverse_id_map(dir + "ModelId2RealId")
    id_dict = _load_id_to_name(dir + "ent_ids_1_real")
    align_list = file2Data(dir + "ref_ent_ids_real")

    # Ids of the entities whose name starts with "O".
    model_ids = []
    for line in entity_lines:
        fields = line.rstrip("\n").split("\t")
        if len(fields) > 1 and fields[1].startswith("O"):
            model_ids.append(fields[0])
    id_vec = np.array([vec[int(m)] for m in model_ids])

    # Drop references that are no longer needed before the heavy part.
    del vec
    del entity_lines
    gc.collect()

    # Distances are computed in blocks to bound memory use.
    splited = 2000
    top_per_block = 200
    print("len(id_vec)", len(id_vec))
    for i in range(0, len(id_vec), splited):
        splited_vec_left = id_vec[i:i+splited]
        for j in range(0, len(id_vec), splited):
            print("i,j", i, j)
            splited_vec_right = id_vec[j:j+splited]

            print("计算相似度")
            sim = scipy.spatial.distance.cdist(splited_vec_left, splited_vec_right, metric='cityblock')
            print("计算完成!")

            start_time = time.time()
            c_dict = _collect_candidates(sim, i, j, model_ids, map_dict_reverse, id_dict)
            print("字典去重")
            c_sort = sorted(c_dict.items(), key=lambda x: x[1])[:top_per_block]
            print("headq", time.time()-start_time)

            print("清内存")
            del sim
            del c_dict
            gc.collect()

            print("计算编辑距离")
            new_align_list = _select_alignments(c_sort)
            print("计算完成!")

            print("去重")
            print("len(new_align_list)", len(new_align_list))

            print("写入")
            data2FileAppend(new_align_list, output_path)
            print("Finished right", j+splited)
        print("Finished left", i+splited)

    # Share of the predicted id pairs that appear in the reference alignment.
    _report_alignment_accuracy(file2Data(output_path), align_list)


def loadDict(filename):
    """Load and return the JSON document stored in `filename` (read as UTF-8)."""
    with open(filename, "r", encoding='UTF-8') as json_file:
        return json.load(json_file)


def file2Data(filename):
    """Return the lines of the UTF-8 text file `filename`, line endings included."""
    with open(filename, 'r', encoding='UTF-8') as f:
        return f.readlines()


def data2File(_list, filename):
    """Write the strings in `_list` to `filename`, replacing its content."""
    with open(filename, 'w', encoding='UTF-8') as f:
        f.writelines(_list)


def data2FileAppend(_list, filename):
    """Append the strings in `_list` to `filename`, creating it when missing."""
    with open(filename, 'a+', encoding='UTF-8') as f:
        f.writelines(_list)