Pārlūkot izejas kodu

公司融合代码更新

Jiasheng 4 gadi atpakaļ
vecāks
revīzija
567991c407

+ 119 - 109
LoadBestModel.py

@@ -6,112 +6,122 @@ import gc
 import time
 import os
 
-dir_best_model = os.getcwd()+"\\data1\\100000\\zh_en\\model.ckpt"
-sess = tf.Session()
-
-# Define placeholders
-num_supports = 1
-ph_ae = {
-    'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
-    'features': tf.sparse_placeholder(tf.float32), #tf.placeholder(tf.float32),
-    'dropout': tf.placeholder_with_default(0., shape=()),
-    'num_features_nonzero': tf.placeholder_with_default(0, shape=())
-}
-ph_se = {
-    'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
-    'features': tf.placeholder(tf.float32),
-    'dropout': tf.placeholder_with_default(0., shape=()),
-    'num_features_nonzero': tf.placeholder_with_default(0, shape=())
-}
-
-# some flags
-flags = tf.app.flags
-FLAGS = flags.FLAGS
-flags.DEFINE_string('lang', 'zh_en', 'Dataset string.')  # 'zh_en', 'ja_en', 'fr_en'
-flags.DEFINE_float('learning_rate', 20, 'Initial learning rate.')
-flags.DEFINE_integer('epochs', 20, 'Number of epochs to train.')
-flags.DEFINE_float('dropout', 0.3, 'Dropout rate (1 - keep probability).')
-flags.DEFINE_float('gamma', 3.0, 'Hyper-parameter for margin based loss.')
-flags.DEFINE_integer('k', 5, 'Number of negative samples for each positive seed.')
-flags.DEFINE_float('beta', 0.3, 'Weight for structure embeddings.')
-flags.DEFINE_integer('se_dim', 100, 'Dimension for SE.')
-flags.DEFINE_integer('ae_dim', 100, 'Dimension for AE.')
-flags.DEFINE_integer('seed', 5, 'Proportion of seeds, 3 means 30%')
-
-# data process
-adj, ae_input, train, test = load_data(FLAGS.lang)
-support = [preprocess_adj(adj)]
-
-# 把具体值赋给事先定义好的placeholder
-feed_dict_ae = construct_feed_dict(ae_input, support, ph_ae)
-feed_dict_ae.update({ph_ae['dropout']: FLAGS.dropout})
-feed_dict_se = construct_feed_dict(1.0, support, ph_se)
-feed_dict_se.update({ph_se['dropout']: FLAGS.dropout})
-
-# 负样本填充placeholder
-t = 0
-k = 0
-e = ae_input[2][0]
-L = np.ones((t, k))
-neg_left = L.reshape((t * k,))
-L = np.ones((t, k))
-neg2_right = L.reshape((t * k,))
-neg2_left = np.random.choice(e, t * k)
-neg_right = np.random.choice(e, t * k)
-feed_dict_ae.update({'neg_left:0': neg_left, 'neg_right:0': neg_right, 'neg2_left:0': neg2_left, 'neg2_right:0': neg2_right})
-feed_dict_se.update({'neg_left:0': neg_left, 'neg_right:0': neg_right, 'neg2_left:0': neg2_left, 'neg2_right:0': neg2_right})
-
-# Create model
-model_func = GCN_Align
-# attribute embedding model
-model_ae = model_func(ph_ae, input_dim=ae_input[2][1], output_dim=FLAGS.ae_dim, ILL=train, sparse_inputs=True, featureless=False, logging=True)
-# structure embedding model
-model_se = model_func(ph_se, input_dim=ae_input[2][0], output_dim=FLAGS.se_dim, ILL=train, sparse_inputs=False, featureless=True, logging=True)
-
-# load model
-saver = tf.train.Saver()
-saver.restore(sess, dir_best_model)
-
-# run the last layer, get vector
-# print(len(feed_dict_ae))
-# for i in feed_dict_ae.keys():
-#     print(i)
-vec_ae = sess.run(model_ae.outputs, feed_dict=feed_dict_ae)
-vec_se = sess.run(model_se.outputs, feed_dict=feed_dict_se)
-
-# 清内存
-print("清内存")
-del saver
-del model_ae
-del model_se
-del model_func
-del feed_dict_ae
-del feed_dict_se
-del adj
-del ae_input
-del train
-# del test
-del support
-del sess
-gc.collect()
-
-# print("AE")
-# get_hits(vec_ae, test)
-# print("SE")
-# get_hits(vec_se, test)
-# print("SE+AE")
-# get_combine_hits(vec_se, vec_ae, FLAGS.beta, test)
-#
-# calculate similarity
-# print("AE Similarity")
-# print(len(vec_ae), len(test))
-# predict(vec_ae, test)
-# print("SE Similarity")
-# predict(vec_se, test)
-# print("AE+SE Similarity")
-# predict(np.concatenate([vec_se*FLAGS.beta, vec_ae*(1.0-FLAGS.beta)], axis=1), test)
-
-print("Predict New Align Orgs")
-start_time = time.time()
-predict_new(np.concatenate([vec_se*FLAGS.beta, vec_ae*(1.0-FLAGS.beta)], axis=1))
-print("use time", time.time()-start_time)
+
+def loadBestModel():
+    # Rebuild the attribute-embedding (AE) and structure-embedding (SE)
+    # GCN_Align graphs, restore their weights from the checkpoint at
+    # dir_best_model, evaluate both models' outputs, and hand the
+    # beta-weighted concatenation of the two to predict_new().
+    # Takes no arguments and has no return statement.
+
+    # Checkpoint location: built from the current working directory with
+    # hard-coded backslash separators.
+    dir_best_model = os.getcwd()+"\\data1\\100000\\zh_en\\model.ckpt"
+    sess = tf.Session()
+
+    # Define placeholders
+    # ph_ae feeds sparse features; ph_se feeds a dense 'features' placeholder.
+    num_supports = 1
+    ph_ae = {
+        'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
+        'features': tf.sparse_placeholder(tf.float32), #tf.placeholder(tf.float32),
+        'dropout': tf.placeholder_with_default(0., shape=()),
+        'num_features_nonzero': tf.placeholder_with_default(0, shape=())
+    }
+    ph_se = {
+        'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
+        'features': tf.placeholder(tf.float32),
+        'dropout': tf.placeholder_with_default(0., shape=()),
+        'num_features_nonzero': tf.placeholder_with_default(0, shape=())
+    }
+
+    # some flags
+    # NOTE(review): the flags are registered on every call. train.py imports
+    # this function and defines flags with the same names at module level, so
+    # calling loadBestModel() from there (or calling it twice) presumably
+    # raises a duplicate-flag error - confirm.
+    # NOTE(review): within this function only lang, dropout, beta, se_dim and
+    # ae_dim affect the computation; the rest are only echoed by the final
+    # print, unless GCN_Align reads them from the shared FLAGS - confirm.
+    flags = tf.app.flags
+    FLAGS = flags.FLAGS
+    flags.DEFINE_string('lang', 'zh_en', 'Dataset string.')  # 'zh_en', 'ja_en', 'fr_en'
+    flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
+    flags.DEFINE_integer('epochs', 500, 'Number of epochs to train.')
+    flags.DEFINE_float('dropout', 0.3, 'Dropout rate (1 - keep probability).')
+    flags.DEFINE_float('gamma', 3.0, 'Hyper-parameter for margin based loss.')
+    flags.DEFINE_integer('k', 4, 'Number of negative samples for each positive seed.')
+    flags.DEFINE_float('beta', 0.3, 'Weight for structure embeddings.')
+    flags.DEFINE_integer('se_dim', 100, 'Dimension for SE.')
+    flags.DEFINE_integer('ae_dim', 100, 'Dimension for AE.')
+    flags.DEFINE_integer('seed', 9, 'Proportion of seeds, 3 means 30%')
+
+    # data process
+    adj, ae_input, train, test = load_data(FLAGS.lang)
+    support = [preprocess_adj(adj)]
+
+    # Bind concrete values to the previously defined placeholders
+    # NOTE(review): dropout is fed as FLAGS.dropout (0.3) although this
+    # function only evaluates model outputs; confirm whether 0 was intended
+    # for inference.
+    feed_dict_ae = construct_feed_dict(ae_input, support, ph_ae)
+    feed_dict_ae.update({ph_ae['dropout']: FLAGS.dropout})
+    feed_dict_se = construct_feed_dict(1.0, support, ph_se)
+    feed_dict_se.update({ph_se['dropout']: FLAGS.dropout})
+
+    # Fill the negative-sample placeholders
+    # t = k = 0, so each of the four arrays below has length t * k = 0.
+    # They are fed by tensor name ('neg_left:0', ...); presumably those
+    # tensors are created inside GCN_Align and need a value even when no
+    # loss is evaluated - confirm.
+    t = 0
+    k = 0
+    e = ae_input[2][0]  # also used as input_dim of the SE model below
+    L = np.ones((t, k))
+    neg_left = L.reshape((t * k,))
+    L = np.ones((t, k))
+    neg2_right = L.reshape((t * k,))
+    neg2_left = np.random.choice(e, t * k)
+    neg_right = np.random.choice(e, t * k)
+    feed_dict_ae.update({'neg_left:0': neg_left, 'neg_right:0': neg_right, 'neg2_left:0': neg2_left, 'neg2_right:0': neg2_right})
+    feed_dict_se.update({'neg_left:0': neg_left, 'neg_right:0': neg_right, 'neg2_left:0': neg2_left, 'neg2_right:0': neg2_right})
+
+    # Create model
+    model_func = GCN_Align
+    # attribute embedding model
+    model_ae = model_func(ph_ae, input_dim=ae_input[2][1], output_dim=FLAGS.ae_dim, ILL=train, sparse_inputs=True, featureless=False, logging=True)
+    # structure embedding model
+    model_se = model_func(ph_se, input_dim=ae_input[2][0], output_dim=FLAGS.se_dim, ILL=train, sparse_inputs=False, featureless=True, logging=True)
+
+    # load model
+    # The Saver is created after both models have been built, then restores
+    # the checkpoint at dir_best_model into sess.
+    saver = tf.train.Saver()
+    saver.restore(sess, dir_best_model)
+
+    # run the last layer, get vector
+    # print(len(feed_dict_ae))
+    # for i in feed_dict_ae.keys():
+    #     print(i)
+    vec_ae = sess.run(model_ae.outputs, feed_dict=feed_dict_ae)
+    vec_se = sess.run(model_se.outputs, feed_dict=feed_dict_se)
+
+    # Free memory
+    # Drop the local references to everything except vec_ae, vec_se and test
+    # before predict_new() runs, then force a collection.
+    # NOTE(review): sess is deleted without an explicit sess.close().
+    print("清内存")
+    del saver
+    del model_ae
+    del model_se
+    del model_func
+    del feed_dict_ae
+    del feed_dict_se
+    del adj
+    del ae_input
+    del train
+    # del test
+    del support
+    del sess
+    gc.collect()
+
+    # print("AE")
+    # get_hits(vec_ae, test)
+    # print("SE")
+    # get_hits(vec_se, test)
+    # print("SE+AE")
+    # get_combine_hits(vec_se, vec_ae, FLAGS.beta, test)
+    #
+    # calculate similarity
+    # print("AE Similarity")
+    # print(len(vec_ae), len(test))
+    # predict(vec_ae, test)
+    # print("SE Similarity")
+    # predict(vec_se, test)
+    # print("AE+SE Similarity")
+    # predict(np.concatenate([vec_se*FLAGS.beta, vec_ae*(1.0-FLAGS.beta)], axis=1), test)
+
+    # Concatenate along axis 1 with SE scaled by beta and AE by (1 - beta),
+    # pass the result to predict_new(), and report the elapsed wall time.
+    print("Predict New Align Orgs")
+    start_time = time.time()
+    predict_new(np.concatenate([vec_se*FLAGS.beta, vec_ae*(1.0-FLAGS.beta)], axis=1))
+    print("use time", time.time()-start_time)
+
+    # Echo the flag values in effect: epochs, dropout, k, seed,
+    # learning_rate and beta.
+    print("e"+str(FLAGS.epochs), "d"+str(FLAGS.dropout), "k"+str(FLAGS.k), "s"+str(FLAGS.seed),
+          "lr"+str(FLAGS.learning_rate), "b"+str(FLAGS.beta))
+
+
+if __name__ == '__main__':  # run only when executed as a script; train.py imports loadBestModel without triggering it
+    loadBestModel()

BIN
data1/100000/zh_en/model.ckpt.data-00000-of-00001


BIN
data1/100000/zh_en/model.ckpt.index


BIN
data1/100000/zh_en/model.ckpt.meta


+ 80 - 0
data1/100000/zh_en/ref_ent_ids_real_neg

@@ -5,7 +5,87 @@
 1118944	1899166
 5009972	10966414
 681097	1118944
+123692	969743
+1791845	2750730
+1882988	2856967
+1772758	3357141
+1782306	3459999
+410264	1845197
+1874888	45663924
+1841100	45663924
+94499	1818967
+2156185	2156186
+2079122	2297388
+2183393	3127840
+2151573	3516473
+395503	2279239
+2140475	4320364
+1078878	1391869
+35463	1438762
+147416	4011641
+701385	14732667
+634863	1456775
+728750	1434146
+773767	1456775
+980178	1394642
+963586	1434146
+973245	1349007
+1075636	1536110
+1223370	1561374
+1570827	1570960
+1570602	1570984
+1570949	1570984
+1570602	1570949
+1570751	1570934
 1431462	2903115
+351981	7196326
+58437	612829
+585481	677644
+619225	623843
+728215	792093
+666694	863853
+800686	1094520
+117138	836734
+773767	1456775
+179498	802656
+775976	2680005
+792184	2965226
+3486057	7811352
+779859	4522047
+1055702	1055731
+1055617	1055731
+1055617	1055702
+448841	1075636
+637504	1075636
+634315	1075636
+993905	1040395
+970116	1058212
+1223370	1561374
+1111975	1874888
+391662	961145
+987566	2382314
+987566	2283218
+973245	1349007
+970116	1058212
+514399	896938
+666694	863853
+3945130	9434411
+728215	792093
+465287	792184
+580828	792184
+779859	4522047
+792184	2965226
+117138	836734
+664437	3671929
+609710	1672693
+701385	14732667
+95522	580828
+554566	852241
+5167206	7704457
+552838	1090125
+5009972	10966414
+55970	47756334
+360649	47319034
 15367749	34432513
 1133449	14072570
 147416	4011641

+ 8 - 1
data1/Align/zh_en/ref_ent_ids_real

@@ -1003,4 +1003,11 @@
 81793	1379850
 3181284	3833577
 41185	33680479
-3894976	40246738
+3894976	40246738
+615017	651213
+655027	3768318
+3600029	6374467
+3691330	6175739
+1009151	1670090
+1580600	4243295
+1823648	5611310

+ 9 - 1
metrics.py

@@ -198,6 +198,11 @@ def predict(vec, test_pair):
 
 def predict_new(vec):
     dir_new_align = "C:\\Users\\admin\\Desktop\\Predict_Align_10w"
+
+    with open(dir_new_align, "r+", encoding='UTF-8') as f:
+        f.truncate()
+        f.close()
+
     # 读取Modelid 和 RealId 映射字典
     id_list2 = file2Data(dir + "ent_ids_1")
     map_dict = loadDict(dir + "ModelId2RealId")
@@ -423,14 +428,17 @@ def loadDict(filename):
 def file2Data(filename):  # return all lines of the UTF-8 text file `filename` as a list
     with open(filename, 'r', encoding='UTF-8') as f:
         _list = f.readlines()
+        f.close()  # NOTE(review): redundant - the enclosing with-block already closes f on exit
     return _list
 
 
 def data2File(_list, filename):  # overwrite `filename` (UTF-8) with the strings in _list
     with open(filename, 'w', encoding='UTF-8') as f:
         f.writelines(_list)
+        f.close()  # NOTE(review): redundant - the enclosing with-block already closes f on exit
 
 
 def data2FileAppend(_list, filename):  # append the strings in _list to `filename` (UTF-8)
     with open(filename, 'a+', encoding='UTF-8') as f:
-        f.writelines(_list)
+        f.writelines(_list)
+        f.close()  # NOTE(review): redundant - the enclosing with-block already closes f on exit

+ 6 - 1
train.py

@@ -8,6 +8,7 @@ from utils import *
 from metrics import *
 from models import GCN_Align
 import os
+from LoadBestModel import loadBestModel
 
 dir_best_model = os.getcwd()+"\\data1\\100000\\zh_en\\model.ckpt"
 
@@ -20,7 +21,7 @@ tf.set_random_seed(seed)
 flags = tf.app.flags
 FLAGS = flags.FLAGS
 flags.DEFINE_string('lang', 'zh_en', 'Dataset string.')  # 'zh_en', 'ja_en', 'fr_en'
-flags.DEFINE_float('learning_rate', 20, 'Initial learning rate.')
+flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
 flags.DEFINE_integer('epochs', 500, 'Number of epochs to train.')
 flags.DEFINE_float('dropout', 0.3, 'Dropout rate (1 - keep probability).')
 flags.DEFINE_float('gamma', 3.0, 'Hyper-parameter for margin based loss.')
@@ -128,6 +129,10 @@ for epoch in range(FLAGS.epochs):
         SE_train_loss = outs_se[1]
         print("Save best Model!")
 print("Optimization Finished!")
+print("e"+str(FLAGS.epochs), "d"+str(FLAGS.dropout), "k"+str(FLAGS.k), "s"+str(FLAGS.seed),
+      "lr"+str(FLAGS.learning_rate), "b"+str(FLAGS.beta))
+# loadBestModel()
+
 
 
 # Testing