  1. """
  2. Deep Deterministic Policy Gradient (DDPG), Reinforcement Learning.
  3. DDPG is Actor Critic based algorithm.
  4. Pendulum example.
  5. View more on my tutorial page: https://morvanzhou.github.io/tutorials/
  6. Using:
  7. tensorflow 1.0
  8. gym 0.8.0
  9. """
#######################################################################
# Copyright (C)
# 2016 - 2019 Pinard Liu(liujianping-ok@163.com)
# https://www.cnblogs.com/pinard
# Permission given to modify the code as long as you keep this
# declaration at the top
#######################################################################
## https://www.cnblogs.com/pinard/p/10345762.html ##
## Reinforcement Learning (16): Deep Deterministic Policy Gradient (DDPG) ##

import tensorflow as tf
import numpy as np
import gym
import time

##################### hyper parameters ####################

MAX_EPISODES = 2000
MAX_EP_STEPS = 200
LR_A = 0.001    # learning rate for actor
LR_C = 0.002    # learning rate for critic
GAMMA = 0.9     # reward discount
TAU = 0.01      # soft replacement
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32

RENDER = False
ENV_NAME = 'Pendulum-v0'
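
# Note: in Pendulum-v0 the observation is 3-dimensional (cos(theta), sin(theta), theta_dot)
# and the single torque action is bounded to [-2, 2], which is why the training loop below
# clips the noisy action to that range.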

############################### DDPG ####################################

class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound,):
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.pointer = 0
        self.sess = tf.Session()

        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')

        with tf.variable_scope('Actor'):
            self.a = self._build_a(self.S, scope='eval', trainable=True)
            a_ = self._build_a(self.S_, scope='target', trainable=False)
        with tf.variable_scope('Critic'):
            # self.a is fed with the actions stored in memory when computing q for the
            # critic's td_error; when updating the Actor, self.a comes from the Actor network
            q = self._build_c(self.S, self.a, scope='eval', trainable=True)
            q_ = self._build_c(self.S_, a_, scope='target', trainable=False)

        # networks parameters
        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target')

        # target net replacement
        self.soft_replace = [tf.assign(t, (1 - TAU) * t + TAU * e)
                             for t, e in zip(self.at_params + self.ct_params, self.ae_params + self.ce_params)]
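
        # Note: each call to learn() runs these ops once, nudging every target weight
        # toward its eval counterpart:
        #     theta_target <- (1 - TAU) * theta_target + TAU * theta_eval
        # With TAU = 0.01 the target networks track the eval networks slowly, which
        # stabilises the bootstrapped critic target below.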

        q_target = self.R + GAMMA * q_
        # in the feed_dict for the td_error, self.a should be fed with the actions in memory
        td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
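        # Note: q_target is the standard DDPG bootstrapped target
        #     y = r + GAMMA * Q_target(s', mu_target(s')),
        # and td_error is the mean squared error between y and Q_eval(s, a),
        # with a taken from the replay memory in learn().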
        self.ctrain = tf.train.AdamOptimizer(LR_C).minimize(td_error, var_list=self.ce_params)

        a_loss = - tf.reduce_mean(q)    # maximize the q
        self.atrain = tf.train.AdamOptimizer(LR_A).minimize(a_loss, var_list=self.ae_params)
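        # Note: minimising a_loss = -mean(Q_eval(s, mu_eval(s))) is gradient ascent on the
        # critic's value of the actor's own actions; restricting var_list to the actor's
        # eval parameters keeps this step from touching the critic.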

        self.sess.run(tf.global_variables_initializer())

    def choose_action(self, s):
        return self.sess.run(self.a, {self.S: s[np.newaxis, :]})[0]

    def learn(self):
        # soft target replacement
        self.sess.run(self.soft_replace)

        indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
        bt = self.memory[indices, :]
        # each stored row is [s | a | r | s_]; slice the batch back apart
        bs = bt[:, :self.s_dim]
        ba = bt[:, self.s_dim: self.s_dim + self.a_dim]
        br = bt[:, -self.s_dim - 1: -self.s_dim]
        bs_ = bt[:, -self.s_dim:]

        self.sess.run(self.atrain, {self.S: bs})
        self.sess.run(self.ctrain, {self.S: bs, self.a: ba, self.R: br, self.S_: bs_})

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % MEMORY_CAPACITY  # replace the old memory with new memory
        self.memory[index, :] = transition
        self.pointer += 1

    def _build_a(self, s, scope, trainable):
        with tf.variable_scope(scope):
            net = tf.layers.dense(s, 30, activation=tf.nn.relu, name='l1', trainable=trainable)
            a = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, name='a', trainable=trainable)
            return tf.multiply(a, self.a_bound, name='scaled_a')

    def _build_c(self, s, a, scope, trainable):
        with tf.variable_scope(scope):
            n_l1 = 30
            w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], trainable=trainable)
            w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], trainable=trainable)
            b1 = tf.get_variable('b1', [1, n_l1], trainable=trainable)
            net = tf.nn.relu(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            return tf.layers.dense(net, 1, trainable=trainable)  # Q(s,a)

############################### training ####################################

env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)

s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high

ddpg = DDPG(a_dim, s_dim, a_bound)

var = 3  # control exploration
t1 = time.time()
for episode in range(MAX_EPISODES):
    s = env.reset()
    ep_reward = 0
    for j in range(MAX_EP_STEPS):
        if RENDER:
            env.render()

        # Add exploration noise
        a = ddpg.choose_action(s)
        a = np.clip(np.random.normal(a, var), -2, 2)  # add randomness to action selection for exploration
        s_, r, done, info = env.step(a)

        ddpg.store_transition(s, a, r / 10, s_)

        if ddpg.pointer > MEMORY_CAPACITY:
            var *= .9995  # decay the action randomness
            ddpg.learn()

        s = s_
        ep_reward += r
        if j == MAX_EP_STEPS - 1:
            print('Episode:', episode, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var)
            # if ep_reward > -300: RENDER = True
            break

    if episode % 100 == 0:
        total_reward = 0
        for i in range(10):
            state = env.reset()
            for j in range(MAX_EP_STEPS):
                env.render()
                action = ddpg.choose_action(state)  # direct action for test
                state, reward, done, _ = env.step(action)
                total_reward += reward
                if done:
                    break
        ave_reward = total_reward / 10  # average over the 10 evaluation episodes
        print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)

print('Running time: ', time.time() - t1)