# metrics.py

import tensorflow as tf
import numpy as np
import scipy.spatial.distance  # "import scipy" alone does not expose scipy.spatial.distance.cdist
import json
from fuzzywuzzy import fuzz
import gc
from utils import loadfile
import heapq
import time
import os

# Data directory (Windows-style path). Note: the name shadows the built-in dir().
dir = os.getcwd() + "\\data1\\100000\\zh_en\\"


def masked_softmax_cross_entropy(preds, labels, mask):
    """Softmax cross-entropy loss with masking."""
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=preds, labels=labels)
    mask = tf.cast(mask, dtype=tf.float32)
    mask /= tf.reduce_mean(mask)  # normalize the mask so it averages to 1
    loss *= mask
    return tf.reduce_mean(loss)


def masked_accuracy(preds, labels, mask):
    """Accuracy with masking."""
    correct_prediction = tf.equal(tf.argmax(preds, 1), tf.argmax(labels, 1))
    accuracy_all = tf.cast(correct_prediction, tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    mask /= tf.reduce_mean(mask)
    accuracy_all *= mask
    return tf.reduce_mean(accuracy_all)
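

# A minimal, illustrative check of the two masked metrics above (not part of
# the original pipeline), assuming a TF1-style graph/session environment.
# All tensor values here are made-up toy data; the third example is masked out.
def _demo_masked_metrics():
    preds = tf.constant([[2.0, 0.1], [0.2, 1.5], [1.0, 1.0]])
    labels = tf.constant([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0]])
    mask = tf.constant([1, 1, 0])  # only the first two examples count
    loss = masked_softmax_cross_entropy(preds, labels, mask)
    acc = masked_accuracy(preds, labels, mask)
    with tf.Session() as sess:
        print(sess.run([loss, acc]))  # both unmasked examples are correct -> acc 1.0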


def get_placeholder_by_name(name):
    """Return the placeholder called `name` if it already exists in the default
    graph, otherwise create it. This lets several loss graphs share one set of
    negative-sample placeholders without duplicate-name errors."""
    try:
        return tf.get_default_graph().get_tensor_by_name(name + ":0")
    except KeyError:
        return tf.placeholder(tf.int32, name=name)


def align_loss(outlayer, ILL, gamma, k, AE=True):
    """Margin-based alignment loss: relu(gamma + d(pos) - d(neg)), averaged
    over both corruption directions."""
    left = ILL[:, 0]
    right = ILL[:, 1]
    t = len(ILL)
    # Positive samples.
    # embedding_lookup gathers the rows of outlayer at the given indices; it is
    # not just a table lookup, since gradients flow back into those rows.
    left_x = tf.nn.embedding_lookup(outlayer, left)
    right_x = tf.nn.embedding_lookup(outlayer, right)
    # reduce_sum sums along the given axis: A is the L1 distance of each pair.
    A = tf.reduce_sum(tf.abs(left_x - right_x), 1)
    # Negative samples.
    # AE: negatives are drawn at random and fed through placeholders;
    # SE: negatives come from a manually screened list.
    if AE:
        neg_left = get_placeholder_by_name("neg_left")    # tf.placeholder(tf.int32, [t * k], "neg_left")
        neg_right = get_placeholder_by_name("neg_right")  # tf.placeholder(tf.int32, [t * k], "neg_right")
    else:
        negative_ILL = loadfile(dir + "ref_ent_ids_neg", 2)
        negative_ILL = negative_ILL[:t * k]
        np.random.shuffle(negative_ILL)
        negative_ILL = np.array(negative_ILL)
        neg_left = negative_ILL[:, 0]
        neg_right = negative_ILL[:, 1]
    neg_l_x = tf.nn.embedding_lookup(outlayer, neg_left)
    neg_r_x = tf.nn.embedding_lookup(outlayer, neg_right)
    B = tf.reduce_sum(tf.abs(neg_l_x - neg_r_x), 1)
    C = - tf.reshape(B, [t, k])
    D = A + gamma
    L1 = tf.nn.relu(tf.add(C, tf.reshape(D, [t, 1])))
    neg_left = get_placeholder_by_name("neg2_left")    # tf.placeholder(tf.int32, [t * k], "neg2_left")
    neg_right = get_placeholder_by_name("neg2_right")  # tf.placeholder(tf.int32, [t * k], "neg2_right")
    neg_l_x = tf.nn.embedding_lookup(outlayer, neg_left)
    neg_r_x = tf.nn.embedding_lookup(outlayer, neg_right)
    B = tf.reduce_sum(tf.abs(neg_l_x - neg_r_x), 1)
    C = - tf.reshape(B, [t, k])
    L2 = tf.nn.relu(tf.add(C, tf.reshape(D, [t, 1])))
    return (tf.reduce_sum(L1) + tf.reduce_sum(L2)) / (2.0 * k * t)
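

# A hedged sketch of the feed_dict that align_loss (AE=True) expects at train
# time: four placeholders of t*k entity ids. Drawing negatives by uniformly
# corrupting one side is an assumption here, as are the names train_ILL and
# num_entities; only the placeholder names come from the code above.
def _demo_align_loss_feed(train_ILL, num_entities, k):
    t = len(train_ILL)
    return {
        "neg_left:0": np.repeat(train_ILL[:, 0], k),                 # anchor left ids
        "neg_right:0": np.random.randint(num_entities, size=t * k),  # corrupted right ids
        "neg2_left:0": np.random.randint(num_entities, size=t * k),  # corrupted left ids
        "neg2_right:0": np.repeat(train_ILL[:, 1], k),               # anchor right ids
    }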


def get_hits(vec, test_pair, top_k=(1, 10, 30, 50)):
    """Print Hits@k in both directions for the aligned test pairs."""
    # Gather the left and right embedding of every test pair.
    Lvec = np.array([vec[e1] for e1, e2 in test_pair])
    Rvec = np.array([vec[e2] for e1, e2 in test_pair])
    # Pairwise similarity: cityblock (L1) distance, smaller is better.
    sim = scipy.spatial.distance.cdist(Lvec, Rvec, metric='cityblock')
    top_lr = [0] * len(top_k)
    for i in range(Lvec.shape[0]):
        # argsort returns indices in ascending distance order; the position of
        # the true counterpart i in that ordering is its rank.
        rank = sim[i, :].argsort()
        rank_index = np.where(rank == i)[0][0]
        for j in range(len(top_k)):
            if rank_index < top_k[j]:
                top_lr[j] += 1
    top_rl = [0] * len(top_k)
    for i in range(Rvec.shape[0]):
        rank = sim[:, i].argsort()
        rank_index = np.where(rank == i)[0][0]
        for j in range(len(top_k)):
            if rank_index < top_k[j]:
                top_rl[j] += 1
    print('For each left:')
    for i in range(len(top_lr)):
        print('Hits@%d: %.2f%%' % (top_k[i], top_lr[i] / len(test_pair) * 100))
    print('For each right:')
    for i in range(len(top_rl)):
        print('Hits@%d: %.2f%%' % (top_k[i], top_rl[i] / len(test_pair) * 100))
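

# An illustrative call of get_hits on random embeddings (not part of the
# original pipeline); test_pair rows are (left_id, right_id) indices into vec.
def _demo_get_hits():
    rng = np.random.RandomState(0)
    vec = rng.rand(100, 16)
    test_pair = [(i, 50 + i) for i in range(50)]
    get_hits(vec, test_pair, top_k=(1, 10))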


def get_combine_hits(se_vec, ae_vec, beta, test_pair, top_k=(1, 10, 30, 50)):
    vec = np.concatenate([se_vec * beta, ae_vec * (1.0 - beta)], axis=1)
    get_hits(vec, test_pair, top_k)
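

# Note on get_combine_hits: since the cityblock (L1) distance decomposes
# coordinate-wise, concatenating beta * se_vec with (1 - beta) * ae_vec makes
# the combined distance exactly beta * d_SE + (1 - beta) * d_AE, i.e. a
# weighted sum of the structural and attribute distances.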


def predict(vec, test_pair):
    # Load the ModelId -> RealId mapping and the real-id entity list.
    map_dict = loadDict(dir + "ModelId2RealId")
    id_list = file2Data(dir + "ent_ids_1_real")
    # Invert key/value for reverse lookup.
    map_dict_reverse = {str(v): int(k) for k, v in map_dict.items()}
    # Turn the id list into a dict for fast lookup.
    id_dict = {}
    for i in range(len(id_list)):
        ss = id_list[i][:-1].split("\t")
        id_dict[str(ss[0])] = ss[1]
    # Left and right ids of every test pair.
    Lid = np.array([[e1] for e1, e2 in test_pair])
    Rid = np.array([[e2] for e1, e2 in test_pair])
    # Gather the left and right embedding of every test pair.
    Lvec = np.array([vec[e1] for e1, e2 in test_pair])
    Rvec = np.array([vec[e2] for e1, e2 in test_pair])
    # Pairwise similarity: cityblock (L1) distance.
    sim = scipy.spatial.distance.cdist(Lvec, Rvec, metric='cityblock')
    # Similarity keyed by model id, real id, and company name.
    ModelId_sim_dict = {}
    RealId_sim_dict = {}
    Org_sim_dict = {}
    for i in range(len(sim)):
        for j in range(len(sim[i])):
            # Skip self-similarity.
            if int(Lid[i]) == int(Rid[j]):
                continue
            ModelId_sim_dict[(int(Lid[i]), int(Rid[j]))] = float(sim[i][j])
            # Map back to the real ids.
            Lid_real = map_dict_reverse[str(Lid[i][0])]
            Rid_real = map_dict_reverse[str(Rid[j][0])]
            # Look up the company names by real id.
            Lorg = id_dict[str(Lid_real)]
            Rorg = id_dict[str(Rid_real)]
            RealId_sim_dict[(str(Lid_real) + "\t" + str(Rid_real), Lorg + "\t" + Rorg)] = float(sim[i][j])
            Org_sim_dict[(Lorg, Rorg)] = float(sim[i][j])
    RealId_sim_sorted_list = sorted(RealId_sim_dict.items(), key=lambda x: x[1])
    Org_sim_sorted_list = sorted(Org_sim_dict.items(), key=lambda x: x[1])
    for i in range(20):
        print(Org_sim_sorted_list[i])
    # (Threshold-based filtering of RealId_sim_sorted_list was left unfinished.)
    return


def predict_new(vec):
    dir_new_align = "C:\\Users\\admin\\Desktop\\Predict_Align_10w"
    # Clear (and create, if missing) the output file.
    with open(dir_new_align, "w", encoding='UTF-8'):
        pass
    # Load the ModelId -> RealId mapping and the id lists.
    id_list2 = file2Data(dir + "ent_ids_1")
    map_dict = loadDict(dir + "ModelId2RealId")
    id_list = file2Data(dir + "ent_ids_1_real")
    align_list = file2Data(dir + "ref_ent_ids_real")
    # Invert key/value for reverse lookup.
    map_dict_reverse = {str(v): int(k) for k, v in map_dict.items()}
    # Turn the id list into a dict for fast lookup.
    id_dict = {}
    for i in range(len(id_list)):
        ss = id_list[i][:-1].split("\t")
        if len(ss) < 2:
            continue
        id_dict[str(ss[0])] = ss[1]
    # Keep only the ids whose name field starts with "O".
    id_list3 = []
    for i in range(len(id_list2)):
        ss = id_list2[i][:-1].split("\t")
        if ss[1][0] == "O":
            id_list3.append(ss[0])
    # Ids (note: the name shadows the built-in id()) and their embeddings.
    id = np.array([[e] for e in id_list3])
    id_vec = np.array([vec[int(k)] for k in id_list3])
    # Free memory.
    del vec
    del id_list2
    del id_list3
    del map_dict
    del id_list
    gc.collect()
    # Compute similarity in 2000 x 2000 blocks.
    splited = 2000
    print("len(id_vec)", len(id_vec))
    for i in range(0, len(id_vec), splited):
        splited_vec_left = id_vec[i:i + splited]
        if len(id_vec) - i < splited:
            splited_vec_left = id_vec[i:]
        for j in range(0, len(id_vec), splited):
            print("i,j", i, j)
            splited_vec_right = id_vec[j:j + splited]
            if len(id_vec) - j < splited:
                splited_vec_right = id_vec[j:]
            print("computing similarity")
            sim = scipy.spatial.distance.cdist(splited_vec_left, splited_vec_right, metric='cityblock')
            print("done")
            # (An earlier version built a full id-pair -> similarity dict here,
            # skipping self-pairs and distances above a threshold, then sorted
            # it; it was replaced by the heapq top-10 selection below.)
            start_time = time.time()
            c_dict = {}
            for k in range(len(sim)):
                if i + k >= len(id):
                    continue
                # Indices of the 10 smallest distances in this row.
                c = heapq.nsmallest(10, range(len(sim[k])), sim[k].take)
                # Map model ids back to real ids and company names.
                for index in range(len(c)):
                    Lid_real = map_dict_reverse[str(id[i + k][0])]
                    Rid_real = map_dict_reverse[str(id[j + c[index]][0])]
                    Lorg = id_dict[str(Lid_real)]
                    Rorg = id_dict[str(Rid_real)]
                    if Lid_real == Rid_real:
                        continue
                    c_dict[(str(Lid_real) + "\t" + str(Rid_real), Lorg + "\t" + Rorg)] = float(sim[k][c[index]])
            print("deduplicating via dict keys")
            c_sort = sorted(c_dict.items(), key=lambda x: x[1])
            c_sort = c_sort[0:200]  # keep the 200 closest pairs of this block
            print("heapq", time.time() - start_time)
            # Free memory.
            print("freeing memory")
            del sim
            del c_dict
            gc.collect()
            print("computing edit distance")
            new_align_list = []
            for k in range(len(c_sort)):
                ids = c_sort[k][0][0]
                id1 = ids.split("\t")[0]
                id2 = ids.split("\t")[1]
                orgs = c_sort[k][0][1]
                org1 = orgs.split("\t")[0]
                org2 = orgs.split("\t")[1]
                length = c_sort[k][1]
                # If one name contains the other, accept the pair directly.
                if (org1 in org2) or (org2 in org1):
                    if int(id1) <= int(id2):
                        new_align_list.append(id1 + "\t" + id2 + "\t" + org1 + "\t"
                                              + org2 + "\t" + str(length) + "\n")
                    else:
                        new_align_list.append(id2 + "\t" + id1 + "\t" + org2 + "\t"
                                              + org1 + "\t" + str(length) + "\n")
                # Otherwise accept the pair if the edit-distance ratio is high enough.
                ratio = fuzz.ratio(org1, org2)
                if ratio > 85:
                    print(org1, org2, ratio)
                    if int(id1) <= int(id2):
                        new_align_list.append(id1 + "\t" + id2 + "\t" + org1 + "\t"
                                              + org2 + "\t" + str(length) + "\n")
                    else:
                        new_align_list.append(id2 + "\t" + id1 + "\t" + org2 + "\t"
                                              + org1 + "\t" + str(length) + "\n")
            print("done")
            print("deduplicating")
            new_align_list = list(set(new_align_list))
            print("len(new_align_list)", len(new_align_list))
            print("writing")
            data2FileAppend(new_align_list, dir_new_align)
            print("Finished right", j + splited)
        print("Finished left", i + splited)
    # Fraction of model-aligned id pairs that also appear in the manual alignment.
    accuracy_num = 0
    new_num = 0
    model_align_list = file2Data(dir_new_align)
    model_align_list2 = []
    for i in range(len(model_align_list)):
        ss = model_align_list[i].split("\t")
        if int(ss[0]) > int(ss[1]):
            model_align_list2.append(ss[1] + "\t" + ss[0])
        else:
            model_align_list2.append(ss[0] + "\t" + ss[1])
    model_align_list2 = list(set(model_align_list2))
    for i in range(len(model_align_list2)):
        ss = model_align_list2[i].split("\t")
        ids = ss[0] + "\t" + ss[1]
        ids_reverse = ss[1] + "\t" + ss[0]
        for ids2 in align_list:
            if ids == ids2[:-1] or ids_reverse == ids2[:-1]:
                accuracy_num += 1
                break
        else:
            # No manual pair matched: this is a new prediction.
            print("new predict", model_align_list2[i])
            new_num += 1
    print("====================================")
    print("prediction accuracy", accuracy_num / len(model_align_list2))
    print("predict right number", accuracy_num)
    print("predict new number", new_num)
    print("predict all number", len(model_align_list2))
    print("====================================")
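

# A hedged, self-contained sketch of the block-wise pattern predict_new uses:
# compute cdist one row-block at a time and keep a running top-n via heapq,
# never materializing the full n x n matrix. Names and sizes are illustrative.
def _demo_blockwise_topn(vecs, block=2000, n=10):
    best = []
    for i in range(0, len(vecs), block):
        sim = scipy.spatial.distance.cdist(vecs[i:i + block], vecs, metric='cityblock')
        for k in range(len(sim)):
            # n + 1 candidates because the zero self-distance occupies one slot.
            for j in heapq.nsmallest(n + 1, range(sim.shape[1]), sim[k].take):
                if i + k != j:  # skip self-pairs
                    best.append((float(sim[k][j]), i + k, j))
    return sorted(best)[:n]  # globally closest n pairs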


def loadDict(filename):
    with open(filename, "r") as json_file:
        dic = json.load(json_file)
    return dic


def file2Data(filename):
    with open(filename, 'r', encoding='UTF-8') as f:
        _list = f.readlines()
    return _list


def data2File(_list, filename):
    with open(filename, 'w', encoding='UTF-8') as f:
        f.writelines(_list)


def data2FileAppend(_list, filename):
    with open(filename, 'a+', encoding='UTF-8') as f:
        f.writelines(_list)
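

# A small round-trip example for the I/O helpers above; the file name is
# illustrative only.
def _demo_io_helpers(path="_demo_lines.txt"):
    data2File(["a\tb\n", "c\td\n"], path)
    data2FileAppend(["e\tf\n"], path)
    print(file2Data(path))  # -> ['a\tb\n', 'c\td\n', 'e\tf\n']
    os.remove(path)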