znj committed 4 years ago
commit e2ff3c677e
2 changed files with 52 additions and 13 deletions
  1. myDQN/DQN.py (+22 -11)
  2. myDQN/train.py (+30 -2)

+ 22 - 11
myDQN/DQN.py

@@ -1,9 +1,10 @@
 import numpy as np
 import pandas as pd
 import tensorflow as tf
+from entity import *
 
-np.random.seed(1)
-tf.set_random_seed(1)
+# np.random.seed(1)
+# tf.set_random_seed(1)
 
 class DQN():
     def __init__(self,
@@ -11,9 +12,9 @@ class DQN():
             n_features,
             learning_rate=0.001,
             reward_decay=0.9,
-            e_greedy=0.9,
+            e_greedy=1,
             replace_target_iter=300,
-            memory_size=800,
+            memory_size=600,
             batch_size=64,
             e_greedy_increment=None,
             output_graph=False
@@ -33,7 +34,7 @@ class DQN():
         self.learn_step_counter = 0
 
         # initialize zero memory [s, a, r, s_]
-        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))
+        self.memory = np.zeros((self.memory_size, n_features * 2 + 3))
 
         # consist of [target_net, evaluate_net]
         self._build_net()
@@ -55,7 +56,7 @@ class DQN():
         # ------------------ build evaluate_net ------------------
         self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
         self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss
-        # print(self.s)
+
         with tf.variable_scope('eval_net'):
             # c_names(collections_names) are the collections to store variables
             c_names, n_l1, w_initializer, b_initializer = \
@@ -97,11 +98,11 @@ class DQN():
                 b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                 self.q_next = tf.matmul(l1, w2) + b2
 
-    def store_transition(self, s, a, r, s_):
+    def store_transition(self, s, a, r,travel_time, s_):
         if not hasattr(self, 'memory_counter'):
             self.memory_counter = 0
 
-        transition = np.hstack((s, [a, r], s_))
+        transition = np.hstack((s, [a, r,travel_time], s_))
 
         # replace the old memory with new memory
         index = self.memory_counter % self.memory_size
@@ -147,11 +148,16 @@ class DQN():
         batch_index = np.arange(self.batch_size, dtype=np.int32)
         eval_act_index = batch_memory[:, self.n_features].astype(int)
         reward = batch_memory[:, self.n_features + 1]
+        # elapsed time interval: number of time steps between s and s_
+        travel_time = batch_memory[:, self.n_features + 2]
+        gamma = np.array([self.gamma ** t for t in travel_time])
+        # gamma = gamma.reshape((self.batch_size,1))
 
-        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
+        # q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
+        q_target[batch_index, eval_act_index] = reward + gamma * np.max(q_next, axis=1)
 
         # train eval network
-        _, self.cost,a = self.sess.run([self._train_op, self.loss,self.s],
+        _, self.cost = self.sess.run([self._train_op, self.loss],
                                      feed_dict={self.s: batch_memory[:, :self.n_features],
                                                 self.q_target: q_target})
 
@@ -163,7 +169,12 @@ class DQN():
 
     def plot_cost(self):
         import matplotlib.pyplot as plt
+        print("min_loss:",min(self.cost_his))
         plt.plot(np.arange(len(self.cost_his)), self.cost_his)
         plt.ylabel('Cost')
         plt.xlabel('training steps')
-        plt.show()
+        plt.show()
+
+    def predict(self,model_path=None):
+        if model_path:
+            model = load(model_path)
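
The substantive change in DQN.py is that every stored transition now carries the trip's travel_time (hence the memory width n_features * 2 + 3), and learn() discounts the bootstrapped value by gamma ** travel_time instead of a single fixed gamma, so rewards from longer trips are discounted more. A minimal sketch of that target computation, with made-up values and the assumed column layout [s, a, r, travel_time, s_]:

    import numpy as np

    n_features, base_gamma = 3, 0.9
    # one fake transition: s=(0.1,0.2,0.3), a=0, r=5.0, travel_time=2, s_=(0.4,0.5,0.6)
    batch_memory = np.array([[0.1, 0.2, 0.3, 0, 5.0, 2, 0.4, 0.5, 0.6]])
    q_eval = np.zeros((1, 2))            # stand-in for the eval net's output on s
    q_next = np.array([[1.0, 2.0]])      # stand-in for the target net's output on s_

    batch_index = np.arange(len(batch_memory))
    eval_act_index = batch_memory[:, n_features].astype(int)
    reward = batch_memory[:, n_features + 1]
    travel_time = batch_memory[:, n_features + 2]
    gamma = base_gamma ** travel_time    # per-transition discount, one value per row

    q_target = q_eval.copy()
    q_target[batch_index, eval_act_index] = reward + gamma * np.max(q_next, axis=1)
    print(q_target)                      # [[6.62 0.  ]]  i.e. 5.0 + 0.9**2 * 2.0

Because reward, gamma, and np.max(q_next, axis=1) are all 1-D arrays of length batch_size, plain broadcasting is enough for this indexing pattern; the reshape to (batch_size, 1) left commented out in the diff is not required here.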

+ 30 - 2
myDQN/train.py

@@ -4,14 +4,42 @@ import numpy as np
 import pandas as pd
 from entity import *
 
+n_actions = 2
+n_features = 3
+
+max_x = 50
+max_y = 50
+max_time = 144
 
 def train():
+    data = load('../train_data/train_data.pkl')
+    print("数据量:",sum(len(i) for i in data))
     step = 0
-    data = load('train_data/train_data.pkl')
-    print(data[0])
+    RL = DQN(n_actions,n_features)
+    for d in data:
+        for match in d:
+            s_x = match.driver.x / max_x
+            s_y = match.driver.y / max_y
+            s_time = match.order.order_time / max_time
+            _s_x = match.order.to_x / max_x
+            _s_y = match.order.to_y / max_y
+            _s_time = match.order.arrive_time / max_time
+            travel_time = match.order.travel_time
+            reward = match.money
+            RL.store_transition((s_x,s_y,s_time),0,reward,travel_time,(_s_x,_s_y,_s_time))
+
+            if (step > 200) and (step % 10 == 0):
+                RL.learn()
+            step += 1
+
+    RL.plot_cost()
+
+
+
 
 
 
 if __name__ == '__main__':
     train()
+
     pass
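
train.py now builds its replay data from the pre-processed train_data.pkl: driver coordinates are normalized by max_x/max_y, time slots by max_time, and each match is pushed into the replay memory via the extended store_transition(s, a, r, travel_time, s_). The entity module is not part of this commit, so the record shape is only implied by how train() reads it; a hedged sketch of that assumed layout (field names taken from the loop above, everything else guessed):

    from dataclasses import dataclass

    @dataclass
    class Driver:          # assumed: only the driver's position at pickup time is used
        x: int
        y: int

    @dataclass
    class Order:
        order_time: int    # pickup time slot; max_time = 144 hints at 10-minute slots per day
        arrive_time: int   # drop-off time slot
        travel_time: int   # slots spent on the trip, used as the discount exponent
        to_x: int
        to_y: int

    @dataclass
    class Match:           # one driver-order pairing; data is a list of lists of Match
        driver: Driver
        order: Order
        money: float       # fare paid, used as the reward

Under that layout each state is the normalized (x, y, time) triple at pickup and the next state is the normalized drop-off triple; note that the stored action is always 0 even though n_actions = 2, so only one action's Q-value ever receives a learning signal in this loop.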