diff --git a/Code 3. Atari Game/1. Breakout/Breakout_DQN.py b/Code 3. Atari Game/1. Breakout/Breakout_DQN.py
index 33c993d1..a6c01c9a 100644
--- a/Code 3. Atari Game/1. Breakout/Breakout_DQN.py
+++ b/Code 3. Atari Game/1. Breakout/Breakout_DQN.py
@@ -2,7 +2,9 @@
 import pylab
 import random
 import numpy as np
+from skimage.color import rgb2gray
 from skimage.transform import resize
+from skimage.exposure import rescale_intensity
 from collections import deque
 from keras.layers import Dense, Flatten
 from keras.optimizers import Adam
@@ -13,25 +15,24 @@ class DQNAgent:
-    def __init__(self, state_size, action_size):
-        self.render = False
+    def __init__(self):
+        self.render = True
 
-        self.state_size = state_size
-        self.action_size = action_size
+        self.state_size = (80, 80, 4)
+        self.action_size = 6
 
-        self.epsilon = 1.0
-        self.epsilon_start = 1.0
-        self.epsilon_end = 0.1
-        self.epsilon_decay = 1000000
+        self.epsilon = 1
+        self.epsilon_start = 1
+        self.epsilon_end = 0.01
+        self.epsilon_decay = 50000.
         self.epsilon_decay_step = \
             (self.epsilon_start - self.epsilon_end) / self.epsilon_decay
 
         self.batch_size = 32
-        self.train_start = 50000
+        self.train_start = 3200
         self.update_target_rate = 10000
         self.discount_factor = 0.99
-        self.learning_rate = 0.00025
-        self.memory = deque(maxlen=1000000)
+        self.memory = deque(maxlen=50000)
 
         self.model = self.build_model()
         self.target_model = self.build_model()
@@ -48,7 +49,8 @@ def build_model(self):
         model.add(Flatten())
         model.add(Dense(512, activation='relu', kernel_initializer='glorot_uniform'))
         model.add(Dense(self.action_size, activation='linear'))
-        model.compile(loss='mse', optimizer=Adam(lr=0.00025, beta_1=0.95, beta_2=0.95, epsilon=0.01))
+        model.summary()
+        model.compile(loss='mse', optimizer=Adam(lr=0.001))
         return model
 
     def update_target_model(self):
@@ -58,38 +60,36 @@ def get_action(self, history):
         if np.random.rand() <= self.epsilon:
             return random.randrange(self.action_size)
         else:
-            q_value = self.model.predict(np.reshape(np.float32(history[:, :, 0:4] / 255.), [1, 84, 84, 4]))
+            q_value = self.model.predict(history)
             return np.argmax(q_value[0])
 
-    def replay_memory(self, history, action, reward, done):
-        self.memory.append((history, action, reward, done))
-        if self.epsilon > self.epsilon_end:
+    def replay_memory(self, history, action, reward, history1, done):
+        self.memory.append((history, action, reward, history1, done))
+        if len(self.memory) > 3200:
             self.epsilon -= self.epsilon_decay_step
-        # print(len(self.memory))
 
     def train_replay(self):
-        # if len(self.memory) < self.train_start:
-        # return
+        if len(self.memory) < self.train_start:
+            return
         batch_size = min(self.batch_size, len(self.memory))
         mini_batch = random.sample(self.memory, batch_size)
 
-        update_input = []
-        update_target = []
+        update_input = np.zeros((batch_size, self.state_size[0], self.state_size[1], self.state_size[2]))
+        update_target = np.zeros((batch_size, self.action_size))
 
        for i in range(batch_size):
-            history, action, reward, done = mini_batch[i]
+            history, action, reward, history1, done = mini_batch[i]
             history = np.float32(history/255.)
-            target = self.model.predict(np.reshape(history[:, :, 0:4], [1, 84, 84, 4]))[0]
+            target = self.model.predict(history)[0]
 
             if done:
                 target[action] = reward
             else:
-                target[action] = reward + self.discount_factor * np.amax(self.target_model.predict
-                                                                          (np.reshape(history[:, :, 1:5], [1, 84, 84, 4]))[0])
-            update_target.append(target)
-            update_input.append(np.reshape(history[:, :, 0:4], [84, 84, 4]))
+                target[action] = reward + self.discount_factor * np.amax(self.target_model.predict(history1)[0])
+            update_target[i] = target
+            update_input[i] = history
 
-        self.model.fit(np.array(update_input), np.array(update_target), batch_size=batch_size, epochs=1, verbose=0)
+        self.model.fit(update_input, update_target, batch_size=batch_size, epochs=1, verbose=0)
 
     def load_model(self, name):
         self.model.load_weights(name)
@@ -99,52 +99,54 @@ def save_model(self, name):
 
 
 def pre_processing(observe):
-    observe = np.uint8(np.dot(observe[:, :, :3], [0.299, 0.587, 0.114]))
-    print(observe[30, :])
-    observe = resize(observe, (110, 84), mode='reflect')
-    print(observe[30, :])
-    observe = observe[17:101, :]
+    observe = rgb2gray(observe)
+    observe = resize(observe, (80, 80), mode='constant')
+    observe = rescale_intensity(observe, out_range=(0, 255))
     return observe
 
 
 if __name__ == "__main__":
     env = gym.make('BreakoutDeterministic-v3')
-    env.seed(0)
-    state_size = (84, 84, 4)
-    action_size = env.action_space.n
-
-    agent = DQNAgent(state_size, action_size)
+    agent = DQNAgent()
 
     scores, episodes, global_step = [], [], 0
-    history = np.zeros([84, 84, 5], dtype=np.uint8)
 
     for e in range(EPISODES):
         done = False
+        dead = False
         score, start_live = 0, 5
         observe = env.reset()
         state = pre_processing(observe)
-        for i in range(4):
-            history[:, :, i] = state
+        history = np.stack((state, state, state, state), axis=2)
+        history = history.reshape(1, history.shape[0], history.shape[1], history.shape[2])
 
         while not done:
             if agent.render:
                 env.render()
+
             action = agent.get_action(history)
             next_observe, reward, done, info = env.step(action)
-            # next_state = pre_processing(next_observe)
+            next_state = pre_processing(next_observe)
+            next_state = np.reshape([next_state], (1, 80, 80, 1))
+            history1 = np.append(next_state, history[:, :, :, :3], axis=3)
+
             if start_live > info['ale.lives']:
-                done = True
+                dead = True
                 start_live = info['ale.lives']
                 reward = -1
 
-            history[:, :, 4] = next_state
-
-            agent.replay_memory(history[:, :, 0:5], action, reward, done)
+            agent.replay_memory(history, action, reward, history1, done)
             agent.train_replay()
-            history[:, :, 0:4] = history[:, :, 1:5]
             score += reward
 
+            if dead:
+                history = np.stack((next_state, next_state, next_state, next_state), axis=2)
+                history = np.reshape([history], (1, 80, 80, 4))
+                dead = False
+            else:
+                history = history1
+
             if global_step % agent.update_target_rate == 0:
                 agent.update_target_model()
diff --git a/Code 3. Atari Game/1. Breakout/other.py b/Code 3. Atari Game/1. Breakout/other.py
new file mode 100644
index 00000000..e69de29b
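Note (not part of the patch): the reworked pre_processing() and the frame-stacking bookkeeping introduced above can be exercised on their own, without gym or Keras. The sketch below is a minimal illustration of those steps under the assumption that a randomly generated 210x160x3 RGB array stands in for an Atari observation; everything outside the diff (the fake frame, the shape checks) is illustrative only.

# Minimal sketch of the new preprocessing + frame stacking (illustration only).
import numpy as np
from skimage.color import rgb2gray
from skimage.transform import resize
from skimage.exposure import rescale_intensity


def pre_processing(observe):
    # Same steps as the patched pre_processing(): grayscale, 80x80, 0..255 range.
    observe = rgb2gray(observe)
    observe = resize(observe, (80, 80), mode='constant')
    observe = rescale_intensity(observe, out_range=(0, 255))
    return observe


# A random RGB frame stands in for a gym Breakout observation (assumption).
frame = np.random.randint(0, 256, (210, 160, 3), dtype=np.uint8)
state = pre_processing(frame)

# Initial history: the same frame repeated four times, shaped (1, 80, 80, 4).
history = np.stack((state, state, state, state), axis=2)
history = history.reshape(1, 80, 80, 4)

# Next step: prepend the newest frame and drop the oldest, as in the main loop.
next_state = np.reshape([pre_processing(frame)], (1, 80, 80, 1))
history1 = np.append(next_state, history[:, :, :, :3], axis=3)
print(history.shape, history1.shape)  # (1, 80, 80, 4) (1, 80, 80, 4)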