from myDQN.DQN import DQN
import tensorflow as tf
import numpy as np
import pandas as pd
from entity import *

# Environment / training hyper-parameters.
n_actions = 2    # 0 = order cancelled, 1 = order served
n_features = 3   # state = (x, y, time), each normalised to [0, 1]
train_step = 10  # run one learning step every `train_step` stored transitions
max_x = 50       # grid width, used to normalise x coordinates
max_y = 50       # grid height, used to normalise y coordinates
max_time = 144   # time slots per day, used to normalise times
gamma = 0.9      # per-time-slot discount factor for reward shaping


def train():
    """Train the DQN on recorded driver/order match data.

    Loads pickled match records via ``load`` (from ``entity``), turns each
    match into a (state, action, reward, duration, next_state) transition and
    stores it in the DQN's replay memory.  After a 200-transition warm-up,
    one learning step runs every ``train_step`` transitions; the outer loop
    stops once more than 20000 learning steps have been performed.  Finally
    the cost curve is plotted and the DQN's self-test result is printed.
    """
    data = load('../train_data/train_data.pkl')
    print("数据量:", sum(len(day) for day in data))

    step = 0       # transitions stored so far
    learn_num = 0  # learning steps performed so far
    RL = DQN(n_actions, n_features)

    for day in data:
        for match in day:
            # Current state: driver position and order time, normalised.
            s_x = match.driver.x / max_x
            s_y = match.driver.y / max_y
            s_time = match.order.order_time / max_time
            # Next state: order destination and arrival time, normalised.
            _s_x = match.order.to_x / max_x
            _s_y = match.order.to_y / max_y
            _s_time = match.order.arrive_time / max_time

            travel_time = match.order.travel_time
            # Reward shaping from the Didi dispatching paper: spread the fare
            # evenly over the trip and discount each time slot by gamma.
            # NOTE(review): raises ZeroDivisionError if travel_time == 0 —
            # confirm upstream data guarantees travel_time >= 1.
            reward = (match.money / travel_time) * sum(
                gamma ** i for i in range(travel_time)
            )

            # Action 0 = order was cancelled, 1 = order was served.
            action = 0 if match.is_cancel else 1

            RL.store_transition(
                [s_x, s_y, s_time], action, reward, travel_time,
                [_s_x, _s_y, _s_time],
            )

            # Learn every `train_step` transitions, after a 200-step warm-up
            # so the replay memory holds some data before the first update.
            if step > 200 and step % train_step == 0:
                RL.learn()
                learn_num += 1
            step += 1

        if learn_num > 20000:
            break

    RL.plot_cost()
    print(RL.test())


if __name__ == '__main__':
    train()