I'm trying to get Breakout going based on a implemented version of Double Deep Q Network. My problem is that I don't see any improvement even after 6000 episodes. At first sight I also don't see any problems with the hyperparameter set up:
gamma = 0.99
learning_rate = 0.0001
epsilon = 1.0
epsilon_min = 0.1,
epsilon_decay = 0.995,
number of episodes = 30000
Here is the code for my agent:
class Agent():
def __init__(self, state_size, action_size, gamma = 0.99, epsilon = 1.0, epsilon_min = 0.1,
epsilon_decay = 0.995, update_rate = 10000, input_dims=(88,80,1)):
self.gamma = gamma
self.epsilon = epsilon
self.epsilon_min = epsilon_min
self.epsilon_decay = epsilon_decay
self.update_rate = update_rate
self.batch_size = 32
#self.n_actions = n_actions
self.state_size = state_size
self.action_size = action_size
self.memory = deque(maxlen = 100000)
self.model = self.build_model()
self.target_model = self.build_model()
self.target_model.set_weights(self.model.get_weights())
self.memory = ReplayBuffer(100000, input_dims)
self.model.summary()
def build_model(self):
x = Sequential()
x.add(Conv2D(32, kernel_size = (8,8), strides = 4, activation = 'relu', input_shape = self.state_size))
x.add(Conv2D(64, kernel_size=(4,4), strides = 2, activation = 'relu'))
x.add(Conv2D(128, (3,3), strides = 1, activation = 'relu'))
x.add(Flatten())
#fc layers
x.add(Dense(128, activation = 'relu'))
x.add(Dense(128, activation = 'relu'))
x.add(Dense(64, activation = 'relu'))
x.add(Dense(self.action_size, activation = 'linear'))
x.compile(loss = 'mse', optimizer = RMSprop(lr=0.00025, rho=0.95,
epsilon = None, decay = 0.0))
return x
def remember(self, state, action, reward, next_state, done):
# storage of experience in replay memory
#self.memory.append((state, action, reward, next_state, done))
self.conv1 = nn.Conv2d(32,64,8,4)
self.relu1 = nn.ReLU(inplace = True)
self.conv2 = nn.Conv2d(64,128,4,2)
self.relu2 = nn.ReLU(inplace=True)
self.conv3 = nn.Conv2d(128,128,3,1)
self.relu3 = nn.ReLU(inplace = True)
self.fc4 = nn.Linear
def epsilon_greedy(self, state):
if random.rand() < self.epsilon:
return rnd.randrange(self.action_size)
act_values = self.model.predict(state)
return np.argmax(act_values[0])
def learn(self):
if self.memory.mem_cntr < self.batch_size:
return
if self.learn_step_counter % self.replace == 0:
self.q_next.set_weights(self.q_eval.get_weights())
states, actions, rewards, states_, dones = \
self.memory.sample_buffer(self.batch_size)
q_pred = self.q_eval(states)
q_next = tf.math.reduce_max(self.q_next(states_), axis=1, keepdims=True).numpy()
q_target = np.copy(q_pred)
# improve on my solution!
for idx, terminal in enumerate(dones):
if terminal:
q_next[idx] = 0.0
q_target[idx, actions[idx]] = rewards[idx] + self.gamma*q_next[idx]
self.q_eval.train_on_batch(states, q_target)
self.epsilon = self.epsilon - self.eps_dec if self.epsilon > \
self.eps_min else self.eps_min
self.learn_step_counter += 1
and here the code for my main function:
#env = gym.make('MsPacman-v0')
env = gym.make('Breakout-v0')
env.reset()
state_size = (88,80,3)
action_size = env.action_space.n
episodes = 30000
batch_size = 32
skip_start = 90
total_time = 0
all_reward = 0
blend = 4 # Number of images to blend
done = False
gamma = 0.99
agent = Agent(state_size, action_size, gamma, epsilon = 1.0, epsilon_min = 0.1,
epsilon_decay = 0.995, update_rate = 50)
l_episodes = []
scores = []
done = False
score = 0
lives = 5
for e in range(episodes):
print('Episode:', e)
done = False
total_reward = 0
game_score = 0
#state = env.reset()
state = process_frame(env.reset())
images = deque(maxlen = blend)
images.append(state)
while not done:
dead = False
while not dead:
env.render()
action = agent.epsilon_greedy(state)
next_state, reward, done, info = env.step(action)
agent.remember(state,action,reward,next_state, done)
state = next_state
game_score += reward
agent.learn()
dead = info['ale.lives']<lives
lives = info['ale.lives']
# When Pacman dies gives penalty of -100
#if dead:
fin_reward = game_score if not dead else -100
#else:
# fin_reward = game_score
print('Episode: {}, game_score: {}, fin reward : {}'.format(e, game_score, fin_reward))
if lives == 1:
done = True
dead = True
if done:
scores.append(fin_reward)
l_episodes.append(e)
Does anybody see what I'm doing wrong?
Best regards