@@ -12,9 +12,9 @@ class DQN():
             n_features,
             learning_rate=0.001,
             reward_decay=0.9,
-            e_greedy=1,
+            e_greedy=1.0,
             replace_target_iter=300,
-            memory_size=600,
+            memory_size=500,
             batch_size=64,
             e_greedy_increment=None,
             output_graph=False
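For orientation, a sketch of constructing the agent with the revised defaults; the arguments before n_features (for example n_actions) sit outside the lines shown and are assumed here:

    agent = DQN(
        n_actions=2,        # assumed leading argument, not visible in this hunk
        n_features=6,
        e_greedy=1.0,       # new default value
        memory_size=500,    # new default replay-memory size
    )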
@@ -33,7 +33,7 @@ class DQN():
         # total learning step
         self.learn_step_counter = 0

-        # initialize zero memory [s, a, r, s_]
+        # initialize zero memory [s, a, r, time, s_]
         self.memory = np.zeros((self.memory_size, n_features * 2 + 3))

         # consist of [target_net, evaluate_net]
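The row now has to hold the elapsed time as well, which is what the n_features * 2 + 3 width accounts for. A minimal sketch (not part of the diff; pack_transition is a hypothetical helper) of packing one transition into the new [s, a, r, time, s_] layout:

    import numpy as np

    def pack_transition(s, a, r, delta_t, s_):
        # s and s_ each contribute n_features columns; a, r and delta_t take one column each
        return np.hstack((s, [a, r, delta_t], s_))

    row = pack_transition(np.zeros(4), a=1, r=0.5, delta_t=3, s_=np.ones(4))
    assert row.shape[0] == 4 * 2 + 3   # matches the memory row width for n_features = 4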
@@ -48,10 +48,11 @@ class DQN():
             # $ tensorboard --logdir=logs
             # tf.train.SummaryWriter soon be deprecated, use following
             tf.summary.FileWriter("logs/", self.sess.graph)
-
+        self.saver = tf.train.Saver(max_to_keep=10)
         self.sess.run(tf.global_variables_initializer())
         self.cost_his = []

+
     def _build_net(self):
         # ------------------ build evaluate_net ------------------
         self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
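With max_to_keep=10 the Saver retains only the ten most recent checkpoints. A sketch of reloading the newest one later, assuming the same "../model/dqn/" directory that learn() saves into further down (agent stands for a constructed DQN instance):

    ckpt = tf.train.latest_checkpoint("../model/dqn/")   # returns None if nothing has been saved yet
    if ckpt is not None:
        agent.saver.restore(agent.sess, ckpt)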
@@ -60,7 +61,7 @@ class DQN():
         with tf.variable_scope('eval_net'):
             # c_names(collections_names) are the collections to store variables
             c_names, n_l1, w_initializer, b_initializer = \
-                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \
+                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 24, \
                 tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)  # config of layers

             # first layer. collections is used later when assign to target net
@@ -154,13 +155,15 @@ class DQN():
         # gamma = gamma.reshape((self.batch_size,1))

         # q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
-        q_target[batch_index, eval_act_index] = reward + gamma * np.max(q_next, axis=1)
+        # q_target[batch_index, eval_act_index] = reward + gamma * np.max(q_next, axis=1)
+        q_target[batch_index, eval_act_index] = reward + gamma * q_next[batch_index, eval_act_index]

         # train eval network
         _, self.cost = self.sess.run([self._train_op, self.loss],
                                      feed_dict={self.s: batch_memory[:, :self.n_features],
                                                 self.q_target: q_target})
-
+        if self.learn_step_counter > 4000 and self.learn_step_counter % 100 == 0:
+            self.saver.save(self.sess, "../model/dqn/model.ckpt")
         self.cost_his.append(self.cost)

         # increasing epsilon
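As read from this hunk, gamma is a per-sample array of self.gamma ** travel_time (the commented-out reshape above hints at how it is prepared), and the target now bootstraps from the Q-value of the action that was actually taken rather than the max. A standalone numpy restatement with made-up numbers, purely illustrative:

    import numpy as np

    gamma_base = 0.9
    travel_time = np.array([1, 3])                    # elapsed time per sampled transition
    reward = np.array([1.0, 0.5])
    q_next = np.array([[0.2, 0.7], [0.4, 0.1]])       # Q-values of s_ for two actions
    eval_act_index = np.array([1, 0])                 # actions actually taken
    batch_index = np.arange(2)

    gamma = gamma_base ** travel_time                 # time-aware discount per sample
    target = reward + gamma * q_next[batch_index, eval_act_index]
    # target[0] = 1.0 + 0.9**1 * 0.7, target[1] = 0.5 + 0.9**3 * 0.4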
@@ -175,6 +178,30 @@ class DQN():
         plt.xlabel('training steps')
         plt.show()

-    def predict(self,model_path=None):
+    def predict(self, s, s_, r, delta_time, model_path=None):
         if model_path:
-            model = load(model_path)
+            self.saver.restore(self.sess, model_path)
+
+        q_now = self.sess.run(self.q_eval, {self.s: s})
+        q_next = self.sess.run(self.q_eval, {self.s: s_})
+        a_pai = self.gamma ** delta_time * q_next - q_now + r / delta_time * sum([self.gamma ** i for i in range(delta_time)])
+        return q_now, q_next, a_pai
+
+    def test(self):
+        test_num = 10
+        test_index = np.random.choice(self.memory_size, size=test_num)
+        test_memory = self.memory[test_index, :]
+
+        q_now = self.sess.run(self.q_eval, {self.s: test_memory[:, :self.n_features]})
+
+        q_next = self.sess.run(self.q_eval, {self.s: test_memory[:, -self.n_features:]})
+
+        reward = test_memory[:, self.n_features + 1]
+        # elapsed time between s and s_
+        travel_time = test_memory[:, self.n_features + 2]
+        gamma = np.array([self.gamma ** t for t in travel_time])
+        batch_index = np.arange(test_num, dtype=np.int32)
+        # a_pai = gamma * q_next[:, 1] - q_now[:, 1] + reward
+        a_pai = reward + gamma * q_next[:, 1]
+        return q_now, q_next, a_pai
+
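Finally, a hedged usage sketch of the new predict signature; the constructor arguments before n_features (for example n_actions) are outside the lines shown, and the checkpoint path is the one learn() writes, so every value here is illustrative only:

    import numpy as np

    agent = DQN(n_actions=2, n_features=6)            # assumed signature; only n_features is visible above
    s  = np.random.rand(1, 6)                         # current state, shape [1, n_features]
    s_ = np.random.rand(1, 6)                         # state observed delta_time steps later
    q_now, q_next, a_pai = agent.predict(s, s_, r=1.0, delta_time=3,
                                         model_path="../model/dqn/model.ckpt")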