5 gadi atpakaļ · 567991c407
--- a/LoadBestModel.py
+++ b/LoadBestModel.py
@@ -6,112 +6,122 @@ import gc
 
				 import time
			
 
				 import os
			
 
				 
			
 
				-dir_best_model = os.getcwd()+"\\data1\\100000\\zh_en\\model.ckpt"
			
 
				-sess = tf.Session()
			
 
				-
			
 
				-# Define placeholders
			
 
				-num_supports = 1
			
 
				-ph_ae = {
			
 
				-    'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
			
 
				-    'features': tf.sparse_placeholder(tf.float32), #tf.placeholder(tf.float32),
			
 
				-    'dropout': tf.placeholder_with_default(0., shape=()),
			
 
				-    'num_features_nonzero': tf.placeholder_with_default(0, shape=())
			
 
				-}
			
 
				-ph_se = {
			
 
				-    'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
			
 
				-    'features': tf.placeholder(tf.float32),
			
 
				-    'dropout': tf.placeholder_with_default(0., shape=()),
			
 
				-    'num_features_nonzero': tf.placeholder_with_default(0, shape=())
			
 
				-}
			
 
				-
			
 
				-# some flags
			
 
				-flags = tf.app.flags
			
 
				-FLAGS = flags.FLAGS
			
 
				-flags.DEFINE_string('lang', 'zh_en', 'Dataset string.')  # 'zh_en', 'ja_en', 'fr_en'
			
 
				-flags.DEFINE_float('learning_rate', 20, 'Initial learning rate.')
			
 
				-flags.DEFINE_integer('epochs', 20, 'Number of epochs to train.')
			
 
				-flags.DEFINE_float('dropout', 0.3, 'Dropout rate (1 - keep probability).')
			
 
				-flags.DEFINE_float('gamma', 3.0, 'Hyper-parameter for margin based loss.')
			
 
				-flags.DEFINE_integer('k', 5, 'Number of negative samples for each positive seed.')
			
 
				-flags.DEFINE_float('beta', 0.3, 'Weight for structure embeddings.')
			
 
				-flags.DEFINE_integer('se_dim', 100, 'Dimension for SE.')
			
 
				-flags.DEFINE_integer('ae_dim', 100, 'Dimension for AE.')
			
 
				-flags.DEFINE_integer('seed', 5, 'Proportion of seeds, 3 means 30%')
			
 
				-
			
 
				-# data process
			
 
				-adj, ae_input, train, test = load_data(FLAGS.lang)
			
 
				-support = [preprocess_adj(adj)]
			
 
				-
			
 
				-# 把具体值赋给事先定义好的placeholder
			
 
				-feed_dict_ae = construct_feed_dict(ae_input, support, ph_ae)
			
 
				-feed_dict_ae.update({ph_ae['dropout']: FLAGS.dropout})
			
 
				-feed_dict_se = construct_feed_dict(1.0, support, ph_se)
			
 
				-feed_dict_se.update({ph_se['dropout']: FLAGS.dropout})
			
 
				-
			
 
				-# 负样本填充placeholder
			
 
				-t = 0
			
 
				-k = 0
			
 
				-e = ae_input[2][0]
			
 
				-L = np.ones((t, k))
			
 
				-neg_left = L.reshape((t * k,))
			
 
				-L = np.ones((t, k))
			
 
				-neg2_right = L.reshape((t * k,))
			
 
				-neg2_left = np.random.choice(e, t * k)
			
 
				-neg_right = np.random.choice(e, t * k)
			
 
				-feed_dict_ae.update({'neg_left:0': neg_left, 'neg_right:0': neg_right, 'neg2_left:0': neg2_left, 'neg2_right:0': neg2_right})
			
 
				-feed_dict_se.update({'neg_left:0': neg_left, 'neg_right:0': neg_right, 'neg2_left:0': neg2_left, 'neg2_right:0': neg2_right})
			
 
				-
			
 
				-# Create model
			
 
				-model_func = GCN_Align
			
 
				-# attribute embedding model
			
 
				-model_ae = model_func(ph_ae, input_dim=ae_input[2][1], output_dim=FLAGS.ae_dim, ILL=train, sparse_inputs=True, featureless=False, logging=True)
			
 
				-# structure embedding model
			
 
				-model_se = model_func(ph_se, input_dim=ae_input[2][0], output_dim=FLAGS.se_dim, ILL=train, sparse_inputs=False, featureless=True, logging=True)
			
 
				-
			
 
				-# load model
			
 
				-saver = tf.train.Saver()
			
 
				-saver.restore(sess, dir_best_model)
			
 
				-
			
 
				-# run the last layer, get vector
			
 
				-# print(len(feed_dict_ae))
			
 
				-# for i in feed_dict_ae.keys():
			
 
				-#     print(i)
			
 
				-vec_ae = sess.run(model_ae.outputs, feed_dict=feed_dict_ae)
			
 
				-vec_se = sess.run(model_se.outputs, feed_dict=feed_dict_se)
			
 
				-
			
 
				-# 清内存
			
 
				-print("清内存")
			
 
				-del saver
			
 
				-del model_ae
			
 
				-del model_se
			
 
				-del model_func
			
 
				-del feed_dict_ae
			
 
				-del feed_dict_se
			
 
				-del adj
			
 
				-del ae_input
			
 
				-del train
			
 
				-# del test
			
 
				-del support
			
 
				-del sess
			
 
				-gc.collect()
			
 
				-
			
 
				-# print("AE")
			
 
				-# get_hits(vec_ae, test)
			
 
				-# print("SE")
			
 
				-# get_hits(vec_se, test)
			
 
				-# print("SE+AE")
			
 
				-# get_combine_hits(vec_se, vec_ae, FLAGS.beta, test)
			
 
				-#
			
 
				-# calculate similarity
			
 
				-# print("AE Similarity")
			
 
				-# print(len(vec_ae), len(test))
			
 
				-# predict(vec_ae, test)
			
 
				-# print("SE Similarity")
			
 
				-# predict(vec_se, test)
			
 
				-# print("AE+SE Similarity")
			
 
				-# predict(np.concatenate([vec_se*FLAGS.beta, vec_ae*(1.0-FLAGS.beta)], axis=1), test)
			
 
				-
			
 
				-print("Predict New Align Orgs")
			
 
				-start_time = time.time()
			
 
				-predict_new(np.concatenate([vec_se*FLAGS.beta, vec_ae*(1.0-FLAGS.beta)], axis=1))
			
 
				-print("use time", time.time()-start_time)
			
 
				+
			
 
				+def loadBestModel():
			
 
				+
			
 
				+    dir_best_model = os.getcwd()+"\\data1\\100000\\zh_en\\model.ckpt"
			
 
				+    sess = tf.Session()
			
 
				+
			
 
				+    # Define placeholders
			
 
				+    num_supports = 1
			
 
				+    ph_ae = {
			
 
				+        'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
			
 
				+        'features': tf.sparse_placeholder(tf.float32), #tf.placeholder(tf.float32),
			
 
				+        'dropout': tf.placeholder_with_default(0., shape=()),
			
 
				+        'num_features_nonzero': tf.placeholder_with_default(0, shape=())
			
 
				+    }
			
 
				+    ph_se = {
			
 
				+        'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
			
 
				+        'features': tf.placeholder(tf.float32),
			
 
				+        'dropout': tf.placeholder_with_default(0., shape=()),
			
 
				+        'num_features_nonzero': tf.placeholder_with_default(0, shape=())
			
 
				+    }
			
 
				+
			
 
				+    # some flags
			
 
				+    flags = tf.app.flags
			
 
				+    FLAGS = flags.FLAGS
			
 
				+    flags.DEFINE_string('lang', 'zh_en', 'Dataset string.')  # 'zh_en', 'ja_en', 'fr_en'
			
 
				+    flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
			
 
				+    flags.DEFINE_integer('epochs', 500, 'Number of epochs to train.')
			
 
				+    flags.DEFINE_float('dropout', 0.3, 'Dropout rate (1 - keep probability).')
			
 
				+    flags.DEFINE_float('gamma', 3.0, 'Hyper-parameter for margin based loss.')
			
 
				+    flags.DEFINE_integer('k', 4, 'Number of negative samples for each positive seed.')
			
 
				+    flags.DEFINE_float('beta', 0.3, 'Weight for structure embeddings.')
			
 
				+    flags.DEFINE_integer('se_dim', 100, 'Dimension for SE.')
			
 
				+    flags.DEFINE_integer('ae_dim', 100, 'Dimension for AE.')
			
 
				+    flags.DEFINE_integer('seed', 9, 'Proportion of seeds, 3 means 30%')
			
 
				+
			
 
				+    # data process
			
 
				+    adj, ae_input, train, test = load_data(FLAGS.lang)
			
 
				+    support = [preprocess_adj(adj)]
			
 
				+
			
 
				+    # 把具体值赋给事先定义好的placeholder
			
 
				+    feed_dict_ae = construct_feed_dict(ae_input, support, ph_ae)
			
 
				+    feed_dict_ae.update({ph_ae['dropout']: FLAGS.dropout})
			
 
				+    feed_dict_se = construct_feed_dict(1.0, support, ph_se)
			
 
				+    feed_dict_se.update({ph_se['dropout']: FLAGS.dropout})
			
 
				+
			
 
				+    # 负样本填充placeholder
			
 
				+    t = 0
			
 
				+    k = 0
			
 
				+    e = ae_input[2][0]
			
 
				+    L = np.ones((t, k))
			
 
				+    neg_left = L.reshape((t * k,))
			
 
				+    L = np.ones((t, k))
			
 
				+    neg2_right = L.reshape((t * k,))
			
 
				+    neg2_left = np.random.choice(e, t * k)
			
 
				+    neg_right = np.random.choice(e, t * k)
			
 
				+    feed_dict_ae.update({'neg_left:0': neg_left, 'neg_right:0': neg_right, 'neg2_left:0': neg2_left, 'neg2_right:0': neg2_right})
			
 
				+    feed_dict_se.update({'neg_left:0': neg_left, 'neg_right:0': neg_right, 'neg2_left:0': neg2_left, 'neg2_right:0': neg2_right})
			
 
				+
			
 
				+    # Create model
			
 
				+    model_func = GCN_Align
			
 
				+    # attribute embedding model
			
 
				+    model_ae = model_func(ph_ae, input_dim=ae_input[2][1], output_dim=FLAGS.ae_dim, ILL=train, sparse_inputs=True, featureless=False, logging=True)
			
 
				+    # structure embedding model
			
 
				+    model_se = model_func(ph_se, input_dim=ae_input[2][0], output_dim=FLAGS.se_dim, ILL=train, sparse_inputs=False, featureless=True, logging=True)
			
 
				+
			
 
				+    # load model
			
 
				+    saver = tf.train.Saver()
			
 
				+    saver.restore(sess, dir_best_model)
			
 
				+
			
 
				+    # run the last layer, get vector
			
 
				+    # print(len(feed_dict_ae))
			
 
				+    # for i in feed_dict_ae.keys():
			
 
				+    #     print(i)
			
 
				+    vec_ae = sess.run(model_ae.outputs, feed_dict=feed_dict_ae)
			
 
				+    vec_se = sess.run(model_se.outputs, feed_dict=feed_dict_se)
			
 
				+
			
 
				+    # 清内存
			
 
				+    print("清内存")
			
 
				+    del saver
			
 
				+    del model_ae
			
 
				+    del model_se
			
 
				+    del model_func
			
 
				+    del feed_dict_ae
			
 
				+    del feed_dict_se
			
 
				+    del adj
			
 
				+    del ae_input
			
 
				+    del train
			
 
				+    # del test
			
 
				+    del support
			
 
				+    del sess
			
 
				+    gc.collect()
			
 
				+
			
 
				+    # print("AE")
			
 
				+    # get_hits(vec_ae, test)
			
 
				+    # print("SE")
			
 
				+    # get_hits(vec_se, test)
			
 
				+    # print("SE+AE")
			
 
				+    # get_combine_hits(vec_se, vec_ae, FLAGS.beta, test)
			
 
				+    #
			
 
				+    # calculate similarity
			
 
				+    # print("AE Similarity")
			
 
				+    # print(len(vec_ae), len(test))
			
 
				+    # predict(vec_ae, test)
			
 
				+    # print("SE Similarity")
			
 
				+    # predict(vec_se, test)
			
 
				+    # print("AE+SE Similarity")
			
 
				+    # predict(np.concatenate([vec_se*FLAGS.beta, vec_ae*(1.0-FLAGS.beta)], axis=1), test)
			
 
				+
			
 
				+    print("Predict New Align Orgs")
			
 
				+    start_time = time.time()
			
 
				+    predict_new(np.concatenate([vec_se*FLAGS.beta, vec_ae*(1.0-FLAGS.beta)], axis=1))
			
 
				+    print("use time", time.time()-start_time)
			
 
				+
			
 
				+    print("e"+str(FLAGS.epochs), "d"+str(FLAGS.dropout), "k"+str(FLAGS.k), "s"+str(FLAGS.seed),
			
 
				+          "lr"+str(FLAGS.learning_rate), "b"+str(FLAGS.beta))
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    loadBestModel()
			
--- a/data1/100000/zh_en/model.ckpt.data-00000-of-00001
+++ b/data1/100000/zh_en/model.ckpt.data-00000-of-00001
--- a/data1/100000/zh_en/model.ckpt.index
+++ b/data1/100000/zh_en/model.ckpt.index
--- a/data1/100000/zh_en/model.ckpt.meta
+++ b/data1/100000/zh_en/model.ckpt.meta
--- a/data1/100000/zh_en/ref_ent_ids_real_neg
+++ b/data1/100000/zh_en/ref_ent_ids_real_neg
@@ -5,7 +5,87 @@
 
				 1118944	1899166
			
 
				 5009972	10966414
			
 
				 681097	1118944
			
 
				+123692	969743
			
 
				+1791845	2750730
			
 
				+1882988	2856967
			
 
				+1772758	3357141
			
 
				+1782306	3459999
			
 
				+410264	1845197
			
 
				+1874888	45663924
			
 
				+1841100	45663924
			
 
				+94499	1818967
			
 
				+2156185	2156186
			
 
				+2079122	2297388
			
 
				+2183393	3127840
			
 
				+2151573	3516473
			
 
				+395503	2279239
			
 
				+2140475	4320364
			
 
				+1078878	1391869
			
 
				+35463	1438762
			
 
				+147416	4011641
			
 
				+701385	14732667
			
 
				+634863	1456775
			
 
				+728750	1434146
			
 
				+773767	1456775
			
 
				+980178	1394642
			
 
				+963586	1434146
			
 
				+973245	1349007
			
 
				+1075636	1536110
			
 
				+1223370	1561374
			
 
				+1570827	1570960
			
 
				+1570602	1570984
			
 
				+1570949	1570984
			
 
				+1570602	1570949
			
 
				+1570751	1570934
			
 
				 1431462	2903115
			
 
				+351981	7196326
			
 
				+58437	612829
			
 
				+585481	677644
			
 
				+619225	623843
			
 
				+728215	792093
			
 
				+666694	863853
			
 
				+800686	1094520
			
 
				+117138	836734
			
 
				+773767	1456775
			
 
				+179498	802656
			
 
				+775976	2680005
			
 
				+792184	2965226
			
 
				+3486057	7811352
			
 
				+779859	4522047
			
 
				+1055702	1055731
			
 
				+1055617	1055731
			
 
				+1055617	1055702
			
 
				+448841	1075636
			
 
				+637504	1075636
			
 
				+634315	1075636
			
 
				+993905	1040395
			
 
				+970116	1058212
			
 
				+1223370	1561374
			
 
				+1111975	1874888
			
 
				+391662	961145
			
 
				+987566	2382314
			
 
				+987566	2283218
			
 
				+973245	1349007
			
 
				+970116	1058212
			
 
				+514399	896938
			
 
				+666694	863853
			
 
				+3945130	9434411
			
 
				+728215	792093
			
 
				+465287	792184
			
 
				+580828	792184
			
 
				+779859	4522047
			
 
				+792184	2965226
			
 
				+117138	836734
			
 
				+664437	3671929
			
 
				+609710	1672693
			
 
				+701385	14732667
			
 
				+95522	580828
			
 
				+554566	852241
			
 
				+5167206	7704457
			
 
				+552838	1090125
			
 
				+5009972	10966414
			
 
				+55970	47756334
			
 
				+360649	47319034
			
 
				 15367749	34432513
			
 
				 1133449	14072570
			
 
				 147416	4011641
			
--- a/data1/Align/zh_en/ref_ent_ids_real
+++ b/data1/Align/zh_en/ref_ent_ids_real
@@ -1003,4 +1003,11 @@
 
				 81793	1379850
			
 
				 3181284	3833577
			
 
				 41185	33680479
			
 
				-3894976	40246738
			
 
				+3894976	40246738
			
 
				+615017	651213
			
 
				+655027	3768318
			
 
				+3600029	6374467
			
 
				+3691330	6175739
			
 
				+1009151	1670090
			
 
				+1580600	4243295
			
 
				+1823648	5611310
			
--- a/metrics.py
+++ b/metrics.py
@@ -198,6 +198,11 @@ def predict(vec, test_pair):
 
				 
			
 
				 def predict_new(vec):
			
 
				     dir_new_align = "C:\\Users\\admin\\Desktop\\Predict_Align_10w"
			
 
				+
			
 
				+    with open(dir_new_align, "r+", encoding='UTF-8') as f:
			
 
				+        f.truncate()
			
 
				+        f.close()
			
 
				+
			
 
				     # 读取Modelid 和 RealId 映射字典
			
 
				     id_list2 = file2Data(dir + "ent_ids_1")
			
 
				     map_dict = loadDict(dir + "ModelId2RealId")
			
@@ -423,14 +428,17 @@ def loadDict(filename):
 
				 def file2Data(filename):
			
 
				     with open(filename, 'r', encoding='UTF-8') as f:
			
 
				         _list = f.readlines()
			
 
				+        f.close()
			
 
				     return _list
			
 
				 
			
 
				 
			
 
				 def data2File(_list, filename):
			
 
				     with open(filename, 'w', encoding='UTF-8') as f:
			
 
				         f.writelines(_list)
			
 
				+        f.close()
			
 
				 
			
 
				 
			
 
				 def data2FileAppend(_list, filename):
			
 
				     with open(filename, 'a+', encoding='UTF-8') as f:
			
 
				-        f.writelines(_list)
			
 
				+        f.writelines(_list)
			
 
				+        f.close()
			
--- a/train.py
+++ b/train.py
@@ -8,6 +8,7 @@ from utils import *
 
				 from metrics import *
			
 
				 from models import GCN_Align
			
 
				 import os
			
 
				+from LoadBestModel import loadBestModel
			
 
				 
			
 
				 dir_best_model = os.getcwd()+"\\data1\\100000\\zh_en\\model.ckpt"
			
 
				 
			
@@ -20,7 +21,7 @@ tf.set_random_seed(seed)
 
				 flags = tf.app.flags
			
 
				 FLAGS = flags.FLAGS
			
 
				 flags.DEFINE_string('lang', 'zh_en', 'Dataset string.')  # 'zh_en', 'ja_en', 'fr_en'
			
 
				-flags.DEFINE_float('learning_rate', 20, 'Initial learning rate.')
			
 
				+flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
			
 
				 flags.DEFINE_integer('epochs', 500, 'Number of epochs to train.')
			
 
				 flags.DEFINE_float('dropout', 0.3, 'Dropout rate (1 - keep probability).')
			
 
				 flags.DEFINE_float('gamma', 3.0, 'Hyper-parameter for margin based loss.')
			
@@ -128,6 +129,10 @@ for epoch in range(FLAGS.epochs):
 
				         SE_train_loss = outs_se[1]
			
 
				         print("Save best Model!")
			
 
				 print("Optimization Finished!")
			
 
				+print("e"+str(FLAGS.epochs), "d"+str(FLAGS.dropout), "k"+str(FLAGS.k), "s"+str(FLAGS.seed),
			
 
				+      "lr"+str(FLAGS.learning_rate), "b"+str(FLAGS.beta))
			
 
				+# loadBestModel()
			
 
				+
			
 
				 
			
 
				 
			
 
				 # Testing