
DQN(Deep Q Network)

Tony Lim 2021. 2. 24. 13:29

The idea is to replace the conventional Q-table with a neural network that approximates the Q-values.
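To make the switch concrete, here is a minimal sketch (the sizes and names are illustrative assumptions, not from the post): tabular Q-learning looks values up in an array indexed by a discrete state, while DQN feeds the continuous state vector through a network that outputs one Q-value per action.

import numpy as np
import torch
import torch.nn as nn

# Tabular Q-learning: Q-values live in a table indexed by (state, action),
# so the state space must be discrete and small enough to enumerate.
n_states, n_actions = 16, 2
q_table = np.zeros((n_states, n_actions))
greedy_action = int(np.argmax(q_table[3]))   # row lookup for discrete state 3

# DQN: a network maps a continuous state vector to one Q-value per action,
# so states never have to be discretized or enumerated.
q_net = nn.Sequential(nn.Linear(4, 64), nn.ReLU(), nn.Linear(64, n_actions))
state = torch.rand(1, 4)                     # e.g. a CartPole observation
greedy_action = q_net(state).argmax(dim=1).item()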

DQN CartPole example

#!/usr/bin/env python

import numpy as np
import random
import datetime
import os
import gym

import torch
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn

env = gym.make("CartPole-v0")

algorithm = 'DQN'

state_size = 4
action_size = env.action_space.n

load_model = False
train_mode = True

batch_size = 32

discount_factor = 0.99
learning_rate = 0.00025

run_step = 40000
test_step = 10000

print_episode = 10
save_step = 20000

epsilon_init = 1.0
epsilon_min = 0.1

date_time = datetime.datetime.now().strftime("%Y%m%d-%H-%M-%S")

save_path = "./saved_models/" + date_time
load_path = "./saved_models/20210205-18-52-50_DQN"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class DQN(nn.Module):
    def __init__(self):
        super(DQN,self).__init__()
        self.fc1 = nn.Linear(state_size,512)
        self.fc2 = nn.Linear(512,512)
        self.fc3 = nn.Linear(512, action_size)

    def forward(self,x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

class DQNAgent():
    def __init__(self,model,optimizer):
        self.model = model
        self.optimizer = optimizer

        self.epsilon = epsilon_init

        if load_model:
            # map_location is an argument of torch.load, not of load_state_dict
            self.model.load_state_dict(torch.load(load_path + '/model.pth', map_location=device))
            print("Model is loaded from {}".format(load_path + '/model.pth'))

    def get_action(self,state):
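        # epsilon-greedy during training: random action with probability epsilon,
        # otherwise the action with the highest predicted Q-value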
        if train_mode:
            if self.epsilon > np.random.rand():
                return np.random.randint(0,action_size)
            else:
                with torch.no_grad():
                    Q = self.model(torch.FloatTensor(state).unsqueeze(0).to(device))
                    return np.argmax(Q.cpu().detach().numpy())
        else:
            with torch.no_grad():
                Q = self.model(torch.FloatTensor(state).unsqueeze(0).to(device))
                return np.argmax(Q.cpu().detach().numpy())

    def save_model(self,load_model, train_mode):
        if not load_model and train_mode:
            save_dir = save_path + "_" + algorithm  # e.g. ./saved_models/<datetime>_DQN, matching the load_path format
            os.makedirs(save_dir, exist_ok=True)
            torch.save(self.model.state_dict(), save_dir + '/model.pth')
            print("Save Model: {}".format(save_dir))

        elif load_model and train_mode:
            torch.save(self.model.state_dict(), load_path + '/model.pth')
            print("Save Model: {}".format(load_path))

    def train_model(self, state, action, reward, next_state, done):
        state = torch.Tensor(state).to(device)
        next_state = torch.Tensor(next_state).to(device)

        one_hot_action = torch.zeros(action_size).to(device)
        one_hot_action[action] = 1
        # Q-value of the action that was actually taken
        q = (self.model(state) * one_hot_action).sum()

        with torch.no_grad():
            max_Q = q.item()
            next_q = self.model(next_state)
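            # Bellman target: reward + discount_factor * max_a' Q(next_state, a');
            # (1 - done) drops the bootstrap term on terminal transitions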
            target_q = reward + next_q.max() * (discount_factor *(1-done))
        
        loss = F.smooth_l1_loss(q,target_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item(), max_Q

if __name__ == '__main__':
    model = DQN().to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    agent = DQNAgent(model,optimizer)
    model.train()

    step = 0
    episode = 0
    reward_list = []
    loss_list = []
    max_Q_list = []

    while step < run_step + test_step:
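        # the first run_step steps train with epsilon-greedy exploration;
        # the remaining test_step steps run the greedy policy in eval mode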
        state = env.reset()
        episode_rewards = 0
        done = False
        while not done:
            if step == run_step:
                train_mode = False
                model.eval()
            
            action = agent.get_action(state)
    
            next_state, reward, done, _ = env.step(action)
    
            episode_rewards  += reward
    
            if not train_mode:
                agent.epsilon = 0.0

            step += 1
    
            if train_mode:
                if agent.epsilon > epsilon_min:
                    agent.epsilon -= 1.0 / run_step
    
                loss, maxQ = agent.train_model(state, action, reward, next_state,done)
                loss_list.append(loss)
                max_Q_list.append(maxQ)
                
                if step % save_step == 0 and step != 0 and train_mode:
                    agent.save_model(load_model, train_mode)
    
            # advance to the next state only after the transition has been used for training
            state = next_state

        # episode bookkeeping runs once per episode, after the inner loop ends
        reward_list.append(episode_rewards)
        episode += 1

        if episode % print_episode == 0 and episode != 0:
            print("step: {} / episode: {} / reward: {:.2f} / loss: {:.4f} / maxQ: {:.2f} / epsilon: {:.4f}".format(
                step, episode, np.mean(reward_list), np.mean(loss_list), np.mean(max_Q_list), agent.epsilon))

            reward_list = []
            loss_list = []
            max_Q_list = []
    
    agent.save_model(load_model, train_mode)
    env.close()

However, the reward does not train to a sufficiently high level.
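One likely reason (my reading, not stated in the post) is that the network is updated on every consecutive transition, so training samples are strongly correlated and the bootstrap target moves together with the online network; note that batch_size is defined above but never used, since there is no minibatch sampling. Experience replay (together with a separate target network) is the usual remedy. A minimal sketch of such a buffer, assuming the same (state, action, reward, next_state, done) tuple used above:

import random
from collections import deque

# Hypothetical sketch, not part of the original post: store transitions and
# sample random minibatches so consecutive, correlated steps are not trained on back to back.
class ReplayBuffer:
    def __init__(self, capacity=50000):
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size=32):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)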
