|
@@ -6,112 +6,122 @@ import gc
|
|
import time
|
|
import time
|
|
import os
|
|
import os
|
|
|
|
|
|
-dir_best_model = os.getcwd()+"\\data1\\100000\\zh_en\\model.ckpt"
|
|
|
|
-sess = tf.Session()
|
|
|
|
-
|
|
|
|
-# Define placeholders
|
|
|
|
-num_supports = 1
|
|
|
|
-ph_ae = {
|
|
|
|
- 'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
|
|
|
|
- 'features': tf.sparse_placeholder(tf.float32), #tf.placeholder(tf.float32),
|
|
|
|
- 'dropout': tf.placeholder_with_default(0., shape=()),
|
|
|
|
- 'num_features_nonzero': tf.placeholder_with_default(0, shape=())
|
|
|
|
-}
|
|
|
|
-ph_se = {
|
|
|
|
- 'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
|
|
|
|
- 'features': tf.placeholder(tf.float32),
|
|
|
|
- 'dropout': tf.placeholder_with_default(0., shape=()),
|
|
|
|
- 'num_features_nonzero': tf.placeholder_with_default(0, shape=())
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-# some flags
|
|
|
|
-flags = tf.app.flags
|
|
|
|
-FLAGS = flags.FLAGS
|
|
|
|
-flags.DEFINE_string('lang', 'zh_en', 'Dataset string.') # 'zh_en', 'ja_en', 'fr_en'
|
|
|
|
-flags.DEFINE_float('learning_rate', 20, 'Initial learning rate.')
|
|
|
|
-flags.DEFINE_integer('epochs', 20, 'Number of epochs to train.')
|
|
|
|
-flags.DEFINE_float('dropout', 0.3, 'Dropout rate (1 - keep probability).')
|
|
|
|
-flags.DEFINE_float('gamma', 3.0, 'Hyper-parameter for margin based loss.')
|
|
|
|
-flags.DEFINE_integer('k', 5, 'Number of negative samples for each positive seed.')
|
|
|
|
-flags.DEFINE_float('beta', 0.3, 'Weight for structure embeddings.')
|
|
|
|
-flags.DEFINE_integer('se_dim', 100, 'Dimension for SE.')
|
|
|
|
-flags.DEFINE_integer('ae_dim', 100, 'Dimension for AE.')
|
|
|
|
-flags.DEFINE_integer('seed', 5, 'Proportion of seeds, 3 means 30%')
|
|
|
|
-
|
|
|
|
-# data process
|
|
|
|
-adj, ae_input, train, test = load_data(FLAGS.lang)
|
|
|
|
-support = [preprocess_adj(adj)]
|
|
|
|
-
|
|
|
|
-# 把具体值赋给事先定义好的placeholder
|
|
|
|
-feed_dict_ae = construct_feed_dict(ae_input, support, ph_ae)
|
|
|
|
-feed_dict_ae.update({ph_ae['dropout']: FLAGS.dropout})
|
|
|
|
-feed_dict_se = construct_feed_dict(1.0, support, ph_se)
|
|
|
|
-feed_dict_se.update({ph_se['dropout']: FLAGS.dropout})
|
|
|
|
-
|
|
|
|
-# 负样本填充placeholder
|
|
|
|
-t = 0
|
|
|
|
-k = 0
|
|
|
|
-e = ae_input[2][0]
|
|
|
|
-L = np.ones((t, k))
|
|
|
|
-neg_left = L.reshape((t * k,))
|
|
|
|
-L = np.ones((t, k))
|
|
|
|
-neg2_right = L.reshape((t * k,))
|
|
|
|
-neg2_left = np.random.choice(e, t * k)
|
|
|
|
-neg_right = np.random.choice(e, t * k)
|
|
|
|
-feed_dict_ae.update({'neg_left:0': neg_left, 'neg_right:0': neg_right, 'neg2_left:0': neg2_left, 'neg2_right:0': neg2_right})
|
|
|
|
-feed_dict_se.update({'neg_left:0': neg_left, 'neg_right:0': neg_right, 'neg2_left:0': neg2_left, 'neg2_right:0': neg2_right})
|
|
|
|
-
|
|
|
|
-# Create model
|
|
|
|
-model_func = GCN_Align
|
|
|
|
-# attribute embedding model
|
|
|
|
-model_ae = model_func(ph_ae, input_dim=ae_input[2][1], output_dim=FLAGS.ae_dim, ILL=train, sparse_inputs=True, featureless=False, logging=True)
|
|
|
|
-# structure embedding model
|
|
|
|
-model_se = model_func(ph_se, input_dim=ae_input[2][0], output_dim=FLAGS.se_dim, ILL=train, sparse_inputs=False, featureless=True, logging=True)
|
|
|
|
-
|
|
|
|
-# load model
|
|
|
|
-saver = tf.train.Saver()
|
|
|
|
-saver.restore(sess, dir_best_model)
|
|
|
|
-
|
|
|
|
-# run the last layer, get vector
|
|
|
|
-# print(len(feed_dict_ae))
|
|
|
|
-# for i in feed_dict_ae.keys():
|
|
|
|
-# print(i)
|
|
|
|
-vec_ae = sess.run(model_ae.outputs, feed_dict=feed_dict_ae)
|
|
|
|
-vec_se = sess.run(model_se.outputs, feed_dict=feed_dict_se)
|
|
|
|
-
|
|
|
|
-# 清内存
|
|
|
|
-print("清内存")
|
|
|
|
-del saver
|
|
|
|
-del model_ae
|
|
|
|
-del model_se
|
|
|
|
-del model_func
|
|
|
|
-del feed_dict_ae
|
|
|
|
-del feed_dict_se
|
|
|
|
-del adj
|
|
|
|
-del ae_input
|
|
|
|
-del train
|
|
|
|
-# del test
|
|
|
|
-del support
|
|
|
|
-del sess
|
|
|
|
-gc.collect()
|
|
|
|
-
|
|
|
|
-# print("AE")
|
|
|
|
-# get_hits(vec_ae, test)
|
|
|
|
-# print("SE")
|
|
|
|
-# get_hits(vec_se, test)
|
|
|
|
-# print("SE+AE")
|
|
|
|
-# get_combine_hits(vec_se, vec_ae, FLAGS.beta, test)
|
|
|
|
-#
|
|
|
|
-# calculate similarity
|
|
|
|
-# print("AE Similarity")
|
|
|
|
-# print(len(vec_ae), len(test))
|
|
|
|
-# predict(vec_ae, test)
|
|
|
|
-# print("SE Similarity")
|
|
|
|
-# predict(vec_se, test)
|
|
|
|
-# print("AE+SE Similarity")
|
|
|
|
-# predict(np.concatenate([vec_se*FLAGS.beta, vec_ae*(1.0-FLAGS.beta)], axis=1), test)
|
|
|
|
-
|
|
|
|
-print("Predict New Align Orgs")
|
|
|
|
-start_time = time.time()
|
|
|
|
-predict_new(np.concatenate([vec_se*FLAGS.beta, vec_ae*(1.0-FLAGS.beta)], axis=1))
|
|
|
|
-print("use time", time.time()-start_time)
|
|
|
|
|
|
def loadBestModel():
    """Restore the best GCN-Align checkpoint and predict new entity alignments.

    Rebuilds the TF graph (AE/SE placeholders and models), restores the
    trained weights from the saved checkpoint, runs one forward pass to get
    the attribute (AE) and structure (SE) embeddings, frees intermediates,
    and feeds the beta-weighted concatenation of both embeddings to
    ``predict_new``.  Intended to be called exactly once per process: the
    ``tf.app.flags`` definitions below raise on a second call.
    """
    # Checkpoint path. os.path.join keeps this portable; the original
    # hard-coded Windows "\\" separators.
    dir_best_model = os.path.join(os.getcwd(), "data1", "100000", "zh_en", "model.ckpt")
    sess = tf.Session()

    # Define placeholders (a single support matrix: the preprocessed adjacency).
    num_supports = 1
    ph_ae = {
        'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
        'features': tf.sparse_placeholder(tf.float32),  # sparse attribute features
        'dropout': tf.placeholder_with_default(0., shape=()),
        'num_features_nonzero': tf.placeholder_with_default(0, shape=())
    }
    ph_se = {
        'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
        'features': tf.placeholder(tf.float32),  # dense (featureless) input
        'dropout': tf.placeholder_with_default(0., shape=()),
        'num_features_nonzero': tf.placeholder_with_default(0, shape=())
    }

    # Hyper-parameter flags. NOTE(review): these presumably must match the
    # values used at training time for the restored weights to make sense —
    # confirm against the training script.
    flags = tf.app.flags
    FLAGS = flags.FLAGS
    flags.DEFINE_string('lang', 'zh_en', 'Dataset string.')  # 'zh_en', 'ja_en', 'fr_en'
    flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
    flags.DEFINE_integer('epochs', 500, 'Number of epochs to train.')
    flags.DEFINE_float('dropout', 0.3, 'Dropout rate (1 - keep probability).')
    flags.DEFINE_float('gamma', 3.0, 'Hyper-parameter for margin based loss.')
    flags.DEFINE_integer('k', 4, 'Number of negative samples for each positive seed.')
    flags.DEFINE_float('beta', 0.3, 'Weight for structure embeddings.')
    flags.DEFINE_integer('se_dim', 100, 'Dimension for SE.')
    flags.DEFINE_integer('ae_dim', 100, 'Dimension for AE.')
    flags.DEFINE_integer('seed', 9, 'Proportion of seeds, 3 means 30%')

    # Data processing: load the dataset and build the normalized adjacency.
    adj, ae_input, train, test = load_data(FLAGS.lang)
    support = [preprocess_adj(adj)]

    # Bind the concrete values to the placeholders defined above.
    feed_dict_ae = construct_feed_dict(ae_input, support, ph_ae)
    feed_dict_ae.update({ph_ae['dropout']: FLAGS.dropout})
    feed_dict_se = construct_feed_dict(1.0, support, ph_se)
    feed_dict_se.update({ph_se['dropout']: FLAGS.dropout})

    # Fill the negative-sample placeholders.  t == k == 0 deliberately:
    # inference needs no negative samples, so empty arrays merely satisfy
    # the graph's input requirements.
    t = 0
    k = 0
    e = ae_input[2][0]  # assumes ae_input[2] is the (rows, cols) shape — TODO confirm
    neg_left = np.ones((t, k)).reshape((t * k,))
    neg2_right = np.ones((t, k)).reshape((t * k,))
    neg2_left = np.random.choice(e, t * k)
    neg_right = np.random.choice(e, t * k)
    neg_feed = {'neg_left:0': neg_left, 'neg_right:0': neg_right,
                'neg2_left:0': neg2_left, 'neg2_right:0': neg2_right}
    feed_dict_ae.update(neg_feed)
    feed_dict_se.update(neg_feed)

    # Create models.
    model_func = GCN_Align
    # Attribute embedding model (sparse attribute features).
    model_ae = model_func(ph_ae, input_dim=ae_input[2][1], output_dim=FLAGS.ae_dim,
                          ILL=train, sparse_inputs=True, featureless=False, logging=True)
    # Structure embedding model (featureless: identity features).
    model_se = model_func(ph_se, input_dim=ae_input[2][0], output_dim=FLAGS.se_dim,
                          ILL=train, sparse_inputs=False, featureless=True, logging=True)

    # Load the trained weights from the checkpoint.
    saver = tf.train.Saver()
    saver.restore(sess, dir_best_model)

    # Run the last layer to obtain the embedding vectors.
    vec_ae = sess.run(model_ae.outputs, feed_dict=feed_dict_ae)
    vec_se = sess.run(model_se.outputs, feed_dict=feed_dict_se)

    # Free memory before the (large) similarity computation.
    print("清内存")
    sess.close()  # release the session's native resources before dropping it
    del saver
    del model_ae
    del model_se
    del model_func
    del feed_dict_ae
    del feed_dict_se
    del adj
    del ae_input
    del train
    # del test  # `test` is kept alive for the (disabled) evaluation below
    del support
    del sess
    gc.collect()

    # Optional evaluation (disabled): get_hits(vec_ae, test),
    # get_hits(vec_se, test), get_combine_hits(vec_se, vec_ae, FLAGS.beta, test),
    # and predict(...) similarity on the held-out `test` pairs.

    print("Predict New Align Orgs")
    start_time = time.time()
    predict_new(np.concatenate([vec_se * FLAGS.beta, vec_ae * (1.0 - FLAGS.beta)], axis=1))
    print("use time", time.time() - start_time)

    print("e" + str(FLAGS.epochs), "d" + str(FLAGS.dropout), "k" + str(FLAGS.k),
          "s" + str(FLAGS.seed), "lr" + str(FLAGS.learning_rate), "b" + str(FLAGS.beta))


if __name__ == '__main__':
    loadBestModel()
|