0

I'm trying to get Breakout going based on a implemented version of Double Deep Q Network. My problem is that I don't see any improvement even after 6000 episodes. At first sight I also don't see any problems with the hyperparameter set up:

gamma = 0.99

learning_rate = 0.0001

epsilon = 1.0

epsilon_min = 0.1,

epsilon_decay = 0.995,

number of episodes = 30000

Here is the code for my agent:

class Agent():
    def __init__(self, state_size, action_size, gamma = 0.99, epsilon = 1.0, epsilon_min = 0.1,
                 epsilon_decay = 0.995, update_rate = 10000, input_dims=(88,80,1)):
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.update_rate = update_rate
        self.batch_size = 32
        
        #self.n_actions = n_actions
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen = 100000)
        
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.target_model.set_weights(self.model.get_weights())
        
        self.memory = ReplayBuffer(100000, input_dims)
        self.model.summary()
        
        
    def build_model(self):
        x = Sequential()
        x.add(Conv2D(32, kernel_size = (8,8), strides = 4, activation = 'relu', input_shape = self.state_size))
        x.add(Conv2D(64, kernel_size=(4,4), strides = 2, activation = 'relu'))
        x.add(Conv2D(128, (3,3), strides = 1, activation = 'relu'))
        x.add(Flatten())
        
        #fc layers
        x.add(Dense(128, activation = 'relu'))
        x.add(Dense(128, activation = 'relu'))
        x.add(Dense(64, activation = 'relu'))
        x.add(Dense(self.action_size, activation = 'linear'))
        
        x.compile(loss = 'mse', optimizer = RMSprop(lr=0.00025, rho=0.95, 
                                                        epsilon = None, decay = 0.0))
        
        return x
        

    def remember(self, state, action, reward, next_state, done):
        # storage of experience in replay memory
        #self.memory.append((state, action, reward, next_state, done))
        self.conv1 = nn.Conv2d(32,64,8,4)
        self.relu1 = nn.ReLU(inplace = True)
        self.conv2 = nn.Conv2d(64,128,4,2)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = nn.Conv2d(128,128,3,1)
        self.relu3 = nn.ReLU(inplace = True)
        self.fc4 = nn.Linear
        
        
    def epsilon_greedy(self, state):
        if random.rand() < self.epsilon:
            return rnd.randrange(self.action_size)
        act_values = self.model.predict(state)
        
        return np.argmax(act_values[0])

    
    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        if self.learn_step_counter % self.replace == 0:
            self.q_next.set_weights(self.q_eval.get_weights())

        states, actions, rewards, states_, dones = \
                                    self.memory.sample_buffer(self.batch_size)

        q_pred = self.q_eval(states)
        q_next = tf.math.reduce_max(self.q_next(states_), axis=1, keepdims=True).numpy()
        q_target = np.copy(q_pred)

        # improve on my solution!
        for idx, terminal in enumerate(dones):
            if terminal:
                q_next[idx] = 0.0
            q_target[idx, actions[idx]] = rewards[idx] + self.gamma*q_next[idx]

        self.q_eval.train_on_batch(states, q_target)

        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > \
                        self.eps_min else self.eps_min

        self.learn_step_counter += 1

and here the code for my main function:

#env = gym.make('MsPacman-v0')
env = gym.make('Breakout-v0')
env.reset()
        
state_size = (88,80,3)
action_size = env.action_space.n
episodes = 30000
batch_size = 32
skip_start = 90 
total_time = 0
all_reward = 0
blend = 4 # Number of images to blend
done = False
gamma = 0.99
        
agent = Agent(state_size, action_size, gamma,  epsilon = 1.0, epsilon_min = 0.1,
        epsilon_decay = 0.995, update_rate = 50)
        

l_episodes = []
scores = []
done = False
score = 0
lives = 5
for e in range(episodes):
    print('Episode:', e)
    done = False
    total_reward = 0
    game_score = 0
    #state = env.reset()
    state = process_frame(env.reset())
    images = deque(maxlen = blend)
    images.append(state)
    
    while not done:
        dead = False

        while not dead:
            env.render()
            
            action = agent.epsilon_greedy(state)
            next_state, reward, done, info = env.step(action)
            
            agent.remember(state,action,reward,next_state, done)
            
            state = next_state
            game_score += reward 
            
            agent.learn()
            
            
            dead = info['ale.lives']<lives
            lives = info['ale.lives']
            # When Pacman dies gives penalty of -100
            
            #if dead:
            fin_reward = game_score if not dead else -100
            #else:
            #    fin_reward = game_score
                
            print('Episode: {}, game_score: {}, fin reward : {}'.format(e, game_score, fin_reward))
            
            if lives == 1:
                done = True
                dead = True
        if done:
            scores.append(fin_reward)
            l_episodes.append(e)

Does anybody see what I'm doing wrong?

Best regards

Sunshine
  • 75
  • 7
  • I'm not sure if I've missed it, but are you trying to loop your decay of epsilon or is it only decaying until it reaches the minimum and then it stays as the minimum? – user14518362 Feb 09 '21 at 16:25

0 Answers0