import tensorflow as tf
import numpy as np
import scipy.spatial.distance
import json
from fuzzywuzzy import fuzz
import gc
from utils import loadfile
import heapq
import time
import os

dir = os.getcwd() + "\\data1\\100000\\zh_en\\"

def masked_softmax_cross_entropy(preds, labels, mask):
    """Softmax cross-entropy loss with masking."""
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=preds, labels=labels)
    mask = tf.cast(mask, dtype=tf.float32)
    mask /= tf.reduce_mean(mask)
    loss *= mask
    return tf.reduce_mean(loss)

def masked_accuracy(preds, labels, mask):
    """Accuracy with masking."""
    correct_prediction = tf.equal(tf.argmax(preds, 1), tf.argmax(labels, 1))
    accuracy_all = tf.cast(correct_prediction, tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    mask /= tf.reduce_mean(mask)
    accuracy_all *= mask
    return tf.reduce_mean(accuracy_all)

def get_placeholder_by_name(name):
    """Return the placeholder with this name if it already exists, otherwise create it."""
    try:
        return tf.get_default_graph().get_tensor_by_name(name + ":0")
    except KeyError:
        return tf.placeholder(tf.int32, name=name)

def align_loss(outlayer, ILL, gamma, k, AE=True):
    """Margin-based alignment loss over positive pairs (ILL) with k negatives per pair."""
    left = ILL[:, 0]
    right = ILL[:, 1]
    t = len(ILL)
    # Positive pairs.
    # embedding_lookup gathers the rows of outlayer indexed by left/right; it is not
    # just a table lookup, since gradients flow through it and train the embeddings.
    left_x = tf.nn.embedding_lookup(outlayer, left)
    right_x = tf.nn.embedding_lookup(outlayer, right)
    # L1 distance between the embeddings of each aligned pair.
    A = tf.reduce_sum(tf.abs(left_x - right_x), 1)
    # Negative pairs.
    # With AE, negatives are sampled at random and fed through placeholders;
    # with SE, negatives come from a manually filtered list on disk.
    if AE:
        neg_left = get_placeholder_by_name("neg_left")    # tf.placeholder(tf.int32, [t * k], "neg_left")
        neg_right = get_placeholder_by_name("neg_right")  # tf.placeholder(tf.int32, [t * k], "neg_right")
    else:
        negative_ILL = loadfile(dir + "ref_ent_ids_neg", 2)
        negative_ILL = negative_ILL[:t * k]
        np.random.shuffle(negative_ILL)
        negative_ILL = np.array(negative_ILL)
        neg_left = negative_ILL[:, 0]
        neg_right = negative_ILL[:, 1]
    neg_l_x = tf.nn.embedding_lookup(outlayer, neg_left)
    neg_r_x = tf.nn.embedding_lookup(outlayer, neg_right)
    B = tf.reduce_sum(tf.abs(neg_l_x - neg_r_x), 1)
    C = -tf.reshape(B, [t, k])
    D = A + gamma
    L1 = tf.nn.relu(tf.add(C, tf.reshape(D, [t, 1])))
    neg_left = get_placeholder_by_name("neg2_left")    # tf.placeholder(tf.int32, [t * k], "neg2_left")
    neg_right = get_placeholder_by_name("neg2_right")  # tf.placeholder(tf.int32, [t * k], "neg2_right")
    neg_l_x = tf.nn.embedding_lookup(outlayer, neg_left)
    neg_r_x = tf.nn.embedding_lookup(outlayer, neg_right)
    B = tf.reduce_sum(tf.abs(neg_l_x - neg_r_x), 1)
    C = -tf.reshape(B, [t, k])
    L2 = tf.nn.relu(tf.add(C, tf.reshape(D, [t, 1])))
    return (tf.reduce_sum(L1) + tf.reduce_sum(L2)) / (2.0 * k * t)
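
# A minimal usage sketch for the AE branch above (illustrative, not from the
# original source): align_loss(AE=True) creates four int32 placeholders named
# "neg_left", "neg_right", "neg2_left" and "neg2_right", each of length t * k,
# which must be fed at session run time. The corruption scheme below (replace
# one side of each aligned pair with a random entity id) is an assumption
# modeled on common margin-based alignment losses; num_entities is assumed.
def _example_negative_feed(ILL, k, num_entities):
    t = len(ILL)
    return {
        "neg_left:0": np.random.randint(num_entities, size=t * k),    # corrupted left entities
        "neg_right:0": np.repeat(ILL[:, 1], k),                       # true right, repeated k times
        "neg2_left:0": np.repeat(ILL[:, 0], k),                       # true left, repeated k times
        "neg2_right:0": np.random.randint(num_entities, size=t * k),  # corrupted right entities
    }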

def get_hits(vec, test_pair, top_k=(1, 10, 30, 50)):
    # Gather the left/right embedding of every test pair.
    Lvec = np.array([vec[e1] for e1, e2 in test_pair])
    Rvec = np.array([vec[e2] for e1, e2 in test_pair])
    # Pairwise L1 (cityblock) distance between all left and right embeddings.
    sim = scipy.spatial.distance.cdist(Lvec, Rvec, metric='cityblock')
    # For each left entity, check whether its true counterpart ranks in the top k.
    top_lr = [0] * len(top_k)
    for i in range(Lvec.shape[0]):
        # argsort returns column indices ordered by ascending distance.
        rank = sim[i, :].argsort()
        rank_index = np.where(rank == i)[0][0]
        for j in range(len(top_k)):
            if rank_index < top_k[j]:
                top_lr[j] += 1
    # The same evaluation in the right-to-left direction.
    top_rl = [0] * len(top_k)
    for i in range(Rvec.shape[0]):
        rank = sim[:, i].argsort()
        rank_index = np.where(rank == i)[0][0]
        for j in range(len(top_k)):
            if rank_index < top_k[j]:
                top_rl[j] += 1
    print('For each left:')
    for i in range(len(top_lr)):
        print('Hits@%d: %.2f%%' % (top_k[i], top_lr[i] / len(test_pair) * 100))
    print('For each right:')
    for i in range(len(top_rl)):
        print('Hits@%d: %.2f%%' % (top_k[i], top_rl[i] / len(test_pair) * 100))

def get_combine_hits(se_vec, ae_vec, beta, test_pair, top_k=(1, 10, 30, 50)):
    # Concatenate structure and attribute embeddings, weighted by beta.
    vec = np.concatenate([se_vec * beta, ae_vec * (1.0 - beta)], axis=1)
    get_hits(vec, test_pair, top_k)
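
# Hedged usage sketch (the names and the 0.9 weighting are illustrative, not
# from the original source): evaluate the structure embeddings alone, then
# combined with the attribute embeddings via the weighted concatenation above.
def _example_evaluation(se_vec, ae_vec, test_pair):
    get_hits(se_vec, test_pair)                       # SE only
    get_combine_hits(se_vec, ae_vec, 0.9, test_pair)  # weighted SE + AE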

def predict(vec, test_pair):
    # Load the ModelId -> RealId mapping and the real-ID entity list.
    map_dict = loadDict(dir + "ModelId2RealId")
    id_list = file2Data(dir + "ent_ids_1_real")
    # Invert the mapping for reverse lookups.
    map_dict_reverse = {str(v): int(k) for k, v in map_dict.items()}
    # Turn the ID list into a dict for fast lookups.
    id_dict = {}
    for i in range(len(id_list)):
        ss = id_list[i][:-1].split("\t")
        id_dict[str(ss[0])] = ss[1]
    # Left/right IDs of every test pair.
    Lid = np.array([[e1] for e1, e2 in test_pair])
    Rid = np.array([[e2] for e1, e2 in test_pair])
    # Left/right embedding of every test pair.
    Lvec = np.array([vec[e1] for e1, e2 in test_pair])
    Rvec = np.array([vec[e2] for e1, e2 in test_pair])
    # Pairwise L1 (cityblock) distance between left and right embeddings.
    sim = scipy.spatial.distance.cdist(Lvec, Rvec, metric='cityblock')
    # Dicts mapping model-ID pairs / real-ID pairs / org-name pairs to distances.
    ModelId_sim_dict = {}
    RealId_sim_dict = {}
    Org_sim_dict = {}
    for i in range(len(sim)):
        for j in range(len(sim[i])):
            # Skip self-similarity.
            if int(Lid[i]) == int(Rid[j]):
                continue
            ModelId_sim_dict[(int(Lid[i]), int(Rid[j]))] = float(sim[i][j])
            # Map back to the real IDs.
            Lid_real = map_dict_reverse[str(Lid[i][0])]
            Rid_real = map_dict_reverse[str(Rid[j][0])]
            # Look up the company names by real ID.
            Lorg = id_dict[str(Lid_real)]
            Rorg = id_dict[str(Rid_real)]
            RealId_sim_dict[(str(Lid_real) + "\t" + str(Rid_real), Lorg + "\t" + Rorg)] = float(sim[i][j])
            Org_sim_dict[(Lorg, Rorg)] = float(sim[i][j])
    RealId_sim_sorted_list = sorted(RealId_sim_dict.items(), key=lambda x: x[1])
    Org_sim_sorted_list = sorted(Org_sim_dict.items(), key=lambda x: x[1])
    for i in range(min(20, len(Org_sim_sorted_list))):
        print(Org_sim_sorted_list[i])
    # TODO: keep only the ID pairs whose distance is below a threshold.
    return
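
# A sketch of the thresholding step left as a TODO in predict() above
# (illustrative; the 0.6 default is an assumption, echoing the cityblock
# cutoff used when filtering candidates in predict_new below).
def _example_threshold_filter(RealId_sim_sorted_list, threshold=0.6):
    # The list is sorted by ascending distance, so this keeps a prefix of it.
    return [item for item in RealId_sim_sorted_list if item[1] < threshold]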

def predict_new(vec):
    dir_new_align = "C:\\Users\\admin\\Desktop\\Predict_Align_10w"
    # Truncate the output file before the per-block appends below.
    with open(dir_new_align, "r+", encoding='UTF-8') as f:
        f.truncate()
    # Load the ModelId -> RealId mapping and the entity lists.
    id_list2 = file2Data(dir + "ent_ids_1")
    map_dict = loadDict(dir + "ModelId2RealId")
    id_list = file2Data(dir + "ent_ids_1_real")
    align_list = file2Data(dir + "ref_ent_ids_real")
    # Invert the mapping for reverse lookups.
    map_dict_reverse = {str(v): int(k) for k, v in map_dict.items()}
    # Turn the ID list into a dict for fast lookups.
    id_dict = {}
    for i in range(len(id_list)):
        ss = id_list[i][:-1].split("\t")
        if len(ss) < 2:
            continue
        id_dict[str(ss[0])] = ss[1]
    # Keep only the entities whose name starts with "O".
    id_list3 = []
    for i in range(len(id_list2)):
        ss = id_list2[i][:-1].split("\t")
        if ss[1][0] == "O":
            id_list3.append(ss[0])
    # Model IDs and their embeddings.
    id = np.array([[e] for e in id_list3])
    id_vec = np.array([vec[int(k)] for k in id_list3])
    # Free memory before the pairwise-distance loop.
    del vec
    del id_list2
    del id_list3
    del map_dict
    del id_list
    gc.collect()
    # Compute the similarity in 2000 x 2000 blocks to bound memory use.
    split_size = 2000
    print("len(id_vec)", len(id_vec))
    for i in range(0, len(id_vec), split_size):
        # Slicing clips at the end of the array, so no special casing is needed.
        block_left = id_vec[i:i + split_size]
        for j in range(0, len(id_vec), split_size):
            print("i,j", i, j)
            block_right = id_vec[j:j + split_size]
            print("Computing block similarity")
            sim = scipy.spatial.distance.cdist(block_left, block_right, metric='cityblock')
            print("Done!")
- # print("转换:ID-相似度")
- # start_time = time.time()
- # RealId_sim_dict = {}
- # for k in range(len(sim)):
- # # print("k", k)
- # for m in range(len(sim[k])):
- #
- # if i+k >= len(id) or j+m >= len(id):
- # # print("i, k, m", i, k, m)
- # continue
- #
- # # 跳过自己与自己的相似度
- # if int(id[i+k]) == int(id[j+m]):
- # continue
- #
- # # 跳过相似度距离大于1的
- # if float(sim[k][m]) > 0.6:
- # # print(k, m, sim[k][m])
- # continue
- #
- # # 取真正ID
- # Lid_real = map_dict_reverse[str(id[i+k][0])]
- # Rid_real = map_dict_reverse[str(id[j+m][0])]
- # # print("Lid_real/Rid_real", Lid_real, Rid_real)
- # # 根据ID取公司名
- # Lorg = id_dict[str(Lid_real)]
- # Rorg = id_dict[str(Rid_real)]
- # RealId_sim_dict[(str(Lid_real)+"\t"+str(Rid_real), Lorg+"\t"+Rorg)] = float(sim[k][m])
- #
- # print("len(RealId_sim_dict)", len(RealId_sim_dict))
- # print("排序")
- # RealId_sim_sorted_list = sorted(RealId_sim_dict.items(), key=lambda x: x[1])
- # RealId_sim_sorted_list = RealId_sim_sorted_list[0:200]
- # print("sort", time.time()-start_time)
- # print(RealId_sim_sorted_list[:5])
            start_time = time.time()
            c_dict = {}
            for k in range(len(sim)):
                if i + k >= len(id):
                    continue
                # Indices of the 10 nearest right entities for this left entity.
                c = heapq.nsmallest(10, range(len(sim[k])), sim[k].take)
                # Convert model IDs to real IDs and company names.
                for index in range(len(c)):
                    Lid_real = map_dict_reverse[str(id[i + k][0])]
                    Rid_real = map_dict_reverse[str(id[j + c[index]][0])]
                    if Lid_real == Rid_real:
                        continue
                    Lorg = id_dict[str(Lid_real)]
                    Rorg = id_dict[str(Rid_real)]
                    c_dict[(str(Lid_real) + "\t" + str(Rid_real), Lorg + "\t" + Rorg)] = float(sim[k][c[index]])
            print("Sorting candidates (dict keys are already deduplicated)")
            c_sort = sorted(c_dict.items(), key=lambda x: x[1])
            c_sort = c_sort[0:200]
            print("heapq", time.time() - start_time)
            # Free memory.
            print("Freeing memory")
            del sim
            del c_dict
            gc.collect()
- print("计算编辑距离")
- new_align_list = []
- for k in range(len(c_sort)):
- # print(RealId_sim_sorted_list[0:5])
- ids = c_sort[k][0][0]
- id1 = ids.split("\t")[0]
- id2 = ids.split("\t")[1]
- orgs = c_sort[k][0][1]
- org1 = orgs.split("\t")[0]
- org2 = orgs.split("\t")[1]
- length = c_sort[k][1]
- # 包含关系直接放入
- if (org1 in org2) or (org2 in org1):
- if int(id1) <= int(id2):
- new_align_list.append(id1 + "\t" + id2 + "\t" + org1 + "\t"
- + org2 + "\t" + str(length) + "\n")
- else:
- new_align_list.append(id2 + "\t" + id1 + "\t" + org2 + "\t"
- + org1 + "\t" + str(length) + "\n")
- # if org2 in org1:
- # if int(id1) <= int(id2):
- # new_align_list.append(ids + "\t" + orgs + str(length) + "\n")
- # else:
- # new_align_list.append(id2 + "\t" + id1 + "\t" + org2 + "\t"
- # + org1 + "\t" + str(length) + "\n")
- # 计算编辑距离
- sim = fuzz.ratio(org1, org2)
- if sim > 85:
- print(org1, org2, sim)
- for m in range(len(align_list)):
- if int(id1) <= int(id2):
- new_align_list.append(id1 + "\t" + id2 + "\t" + org1 + "\t"
- + org2 + "\t" + str(length) + "\n")
- else:
- new_align_list.append(id2 + "\t" + id1 + "\t" + org2 + "\t"
- + org1 + "\t" + str(length) + "\n")
- print("计算完成!")
- print("去重")
- new_align_list = list(set(new_align_list))
- print("len(new_align_list)", len(new_align_list))
- print("写入")
- data2FileAppend(new_align_list, dir_new_align)
- print("Finished right", j+splited)
- print("Finished left", i+splited)
    # Measure how many model-predicted ID pairs appear in the manually aligned pairs.
    accuracy_num = 0
    new_num = 0
    model_align_list = file2Data(dir_new_align)
    model_align_list2 = []
    for i in range(len(model_align_list)):
        ss = model_align_list[i].split("\t")
        # Normalize each pair so the smaller ID comes first, then deduplicate.
        if int(ss[0]) > int(ss[1]):
            model_align_list2.append(ss[1] + "\t" + ss[0])
        else:
            model_align_list2.append(ss[0] + "\t" + ss[1])
    model_align_list2 = list(set(model_align_list2))
    for i in range(len(model_align_list2)):
        ss = model_align_list2[i].split("\t")
        ids = ss[0] + "\t" + ss[1]
        ids_reverse = ss[1] + "\t" + ss[0]
        for ids2 in align_list:
            if ids == ids2[:-1] or ids_reverse == ids2[:-1]:
                accuracy_num += 1
                break
        else:
            # No match among the manual alignments: a genuinely new prediction.
            print("new predict", model_align_list2[i])
            new_num += 1
    print("====================================")
    print("prediction precision", accuracy_num / len(model_align_list2))
    print("predict right number", accuracy_num)
    print("predict new number", new_num)
    print("predict all number", len(model_align_list2))
    print("====================================")

def loadDict(filename):
    with open(filename, "r") as json_file:
        dic = json.load(json_file)
    return dic

def file2Data(filename):
    with open(filename, 'r', encoding='UTF-8') as f:
        _list = f.readlines()
    return _list

def data2File(_list, filename):
    with open(filename, 'w', encoding='UTF-8') as f:
        f.writelines(_list)

def data2FileAppend(_list, filename):
    with open(filename, 'a+', encoding='UTF-8') as f:
        f.writelines(_list)
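
if __name__ == "__main__":
    # Minimal driver sketch (illustrative assumptions: the embedding matrix is
    # produced by training the GCN model elsewhere and saved as "vec.npy", and
    # the test alignments live in "ref_ent_ids"; loadfile(path, 2) is assumed
    # to return ID pairs, as in align_loss above).
    vec = np.load(dir + "vec.npy")
    test_pair = np.array(loadfile(dir + "ref_ent_ids", 2))
    get_hits(vec, test_pair)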