znj committed 4 years ago
commit e2ff3c677e
2 changed files with 52 additions and 13 deletions
  1. myDQN/DQN.py (+22 -11)
  2. myDQN/train.py (+30 -2)

+ 22 - 11
myDQN/DQN.py

@@ -1,9 +1,10 @@
 import numpy as np
 import pandas as pd
 import tensorflow as tf
+from entity import *
 
-np.random.seed(1)
-tf.set_random_seed(1)
+# np.random.seed(1)
+# tf.set_random_seed(1)
 
 class DQN():
     def __init__(self,
@@ -11,9 +12,9 @@ class DQN():
             n_features,
             learning_rate=0.001,
             reward_decay=0.9,
-            e_greedy=0.9,
+            e_greedy=1,
             replace_target_iter=300,
-            memory_size=800,
+            memory_size=600,
             batch_size=64,
             e_greedy_increment=None,
             output_graph=False
@@ -33,7 +34,7 @@ class DQN():
         self.learn_step_counter = 0
 
         # initialize zero memory [s, a, r, s_]
-        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))
+        self.memory = np.zeros((self.memory_size, n_features * 2 + 3))
 
         # consist of [target_net, evaluate_net]
         self._build_net()
@@ -55,7 +56,7 @@ class DQN():
         # ------------------ build evaluate_net ------------------
         self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
         self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss
-        # print(self.s)
+
         with tf.variable_scope('eval_net'):
             # c_names(collections_names) are the collections to store variables
             c_names, n_l1, w_initializer, b_initializer = \
@@ -97,11 +98,11 @@ class DQN():
                 b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                 self.q_next = tf.matmul(l1, w2) + b2
 
-    def store_transition(self, s, a, r, s_):
+    def store_transition(self, s, a, r,travel_time, s_):
         if not hasattr(self, 'memory_counter'):
             self.memory_counter = 0
 
-        transition = np.hstack((s, [a, r], s_))
+        transition = np.hstack((s, [a, r,travel_time], s_))
 
         # replace the old memory with new memory
         index = self.memory_counter % self.memory_size
@@ -147,11 +148,16 @@ class DQN():
         batch_index = np.arange(self.batch_size, dtype=np.int32)
         eval_act_index = batch_memory[:, self.n_features].astype(int)
         reward = batch_memory[:, self.n_features + 1]
+        # elapsed time interval: number of time steps between s and s_
+        travel_time = batch_memory[:, self.n_features + 2]
+        gamma = np.array([self.gamma ** t for t in travel_time])
+        # gamma = gamma.reshape((self.batch_size,1))
 
-        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
+        # q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
+        q_target[batch_index, eval_act_index] = reward + gamma * np.max(q_next, axis=1)
 
         # train eval network
-        _, self.cost,a = self.sess.run([self._train_op, self.loss,self.s],
+        _, self.cost = self.sess.run([self._train_op, self.loss],
                                      feed_dict={self.s: batch_memory[:, :self.n_features],
                                                 self.q_target: q_target})
 
@@ -163,7 +169,12 @@ class DQN():
 
     def plot_cost(self):
         import matplotlib.pyplot as plt
+        print("min_loss:",min(self.cost_his))
         plt.plot(np.arange(len(self.cost_his)), self.cost_his)
         plt.ylabel('Cost')
         plt.xlabel('training steps')
-        plt.show()
+        plt.show()
+
+    def predict(self,model_path=None):
+        if model_path:
+            model = load(model_path)
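
The substantive change in DQN.py is that every stored transition now carries the trip's travel_time (hence the memory width n_features * 2 + 3), and learn() discounts the bootstrapped value by gamma ** travel_time instead of a single fixed gamma, so rewards from longer trips are discounted more. A minimal sketch of that target computation, with made-up values and the assumed column layout [s, a, r, travel_time, s_]:

    import numpy as np

    n_features, base_gamma = 3, 0.9
    # one fake transition: s=(0.1,0.2,0.3), a=0, r=5.0, travel_time=2, s_=(0.4,0.5,0.6)
    batch_memory = np.array([[0.1, 0.2, 0.3, 0, 5.0, 2, 0.4, 0.5, 0.6]])
    q_eval = np.zeros((1, 2))            # stand-in for the eval net's output on s
    q_next = np.array([[1.0, 2.0]])      # stand-in for the target net's output on s_

    batch_index = np.arange(len(batch_memory))
    eval_act_index = batch_memory[:, n_features].astype(int)
    reward = batch_memory[:, n_features + 1]
    travel_time = batch_memory[:, n_features + 2]
    gamma = base_gamma ** travel_time    # per-transition discount, one value per row

    q_target = q_eval.copy()
    q_target[batch_index, eval_act_index] = reward + gamma * np.max(q_next, axis=1)
    print(q_target)                      # [[6.62 0.  ]]  i.e. 5.0 + 0.9**2 * 2.0

Because reward, gamma, and np.max(q_next, axis=1) are all 1-D arrays of length batch_size, plain broadcasting is enough for this indexing pattern; the reshape to (batch_size, 1) left commented out in the diff is not required here.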

+ 30 - 2
myDQN/train.py

@@ -4,14 +4,42 @@ import numpy as np
 import pandas as pd
 from entity import *
 
+n_actions = 2
+n_features = 3
+
+max_x = 50
+max_y = 50
+max_time = 144
 
 def train():
+    data = load('../train_data/train_data.pkl')
+    print("数据量:",sum(len(i) for i in data))
     step = 0
-    data = load('train_data/train_data.pkl')
-    print(data[0])
+    RL = DQN(n_actions,n_features)
+    for d in data:
+        for match in d:
+            s_x = match.driver.x / max_x
+            s_y = match.driver.y / max_y
+            s_time = match.order.order_time / max_time
+            _s_x = match.order.to_x / max_x
+            _s_y = match.order.to_y / max_y
+            _s_time = match.order.arrive_time / max_time
+            travel_time = match.order.travel_time
+            reward = match.money
+            RL.store_transition((s_x,s_y,s_time),0,reward,travel_time,(_s_x,_s_y,_s_time))
+
+            if (step > 200) and (step % 10 == 0):
+                RL.learn()
+            step += 1
+
+    RL.plot_cost()
+
+
+
 
 
 
 if __name__ == '__main__':
     train()
+
     pass
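
train.py now builds its replay data from the pre-processed train_data.pkl: driver coordinates are normalized by max_x/max_y, time slots by max_time, and each match is pushed into the replay memory via the extended store_transition(s, a, r, travel_time, s_). The entity module is not part of this commit, so the record shape is only implied by how train() reads it; a hedged sketch of that assumed layout (field names taken from the loop above, everything else guessed):

    from dataclasses import dataclass

    @dataclass
    class Driver:          # assumed: only the driver's position at pickup time is used
        x: int
        y: int

    @dataclass
    class Order:
        order_time: int    # pickup time slot; max_time = 144 hints at 10-minute slots per day
        arrive_time: int   # drop-off time slot
        travel_time: int   # slots spent on the trip, used as the discount exponent
        to_x: int
        to_y: int

    @dataclass
    class Match:           # one driver-order pairing; data is a list of lists of Match
        driver: Driver
        order: Order
        money: float       # fare paid, used as the reward

Under that layout each state is the normalized (x, y, time) triple at pickup and the next state is the normalized drop-off triple; note that the stored action is always 0 even though n_actions = 2, so only one action's Q-value ever receives a learning signal in this loop.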