diff --git a/Code 1. Grid World/7. Policy Gradient/Gridworld_PG.py b/Code 1. Grid World/7. Policy Gradient/Gridworld_PG.py
index 5bf590fd..ae52a1b4 100644
--- a/Code 1. Grid World/7. Policy Gradient/Gridworld_PG.py
+++ b/Code 1. Grid World/7. Policy Gradient/Gridworld_PG.py
@@ -41,7 +41,7 @@ def optimizer(self):
         # The core of Policy Gradient:
         # take the gradient of log(policy) * return and maximize it
         good_prob = K.sum(action * self.model.output, axis=1)
-        eligibility = K.log(good_prob) * discounted_rewards
+        eligibility = K.log(good_prob) * K.stop_gradient(discounted_rewards)
         loss = -K.sum(eligibility)
 
         optimizer = Adam(lr=self.learning_rate)
diff --git a/Code 2. Cartpole/1. DQN/save_graph/MountainCar_DQN_play.png b/Code 2. Cartpole/1. DQN/save_graph/MountainCar_DQN_play.png
new file mode 100644
index 00000000..6fd16b0a
Binary files /dev/null and b/Code 2. Cartpole/1. DQN/save_graph/MountainCar_DQN_play.png differ
diff --git a/Code 2. Cartpole/3. Dueling DQN/Cartpole_DuelingDQN.py b/Code 2. Cartpole/3. Dueling DQN/Cartpole_DuelingDQN.py
index 14cfeadb..c3833673 100644
--- a/Code 2. Cartpole/3. Dueling DQN/Cartpole_DuelingDQN.py
+++ b/Code 2. Cartpole/3. Dueling DQN/Cartpole_DuelingDQN.py
@@ -160,7 +160,7 @@ def save_model(self, name):
                 # every episode update the target model to be same with model
                 agent.update_target_model()
                 # every episode, plot the play time
-                score = score if score == 499 else score + 100
+                score = score if score == 500 else score + 100
                 scores.append(score)
                 episodes.append(e)
                 pylab.plot(episodes, scores, 'b')
diff --git a/Code 2. Cartpole/4. Policy Gradient/Cartpole_PolicyGradient.py b/Code 2. Cartpole/4. Policy Gradient/Cartpole_PolicyGradient.py
index 512d3ed6..9701136c 100644
--- a/Code 2. Cartpole/4. Policy Gradient/Cartpole_PolicyGradient.py
+++ b/Code 2. Cartpole/4. Policy Gradient/Cartpole_PolicyGradient.py
@@ -15,7 +15,7 @@ class PGAgent:
     def __init__(self, state_size, action_size):
         # if you want to see Cartpole learning, then change to True
-        self.render = True
+        self.render = False
 
         # get size of state and action
         self.state_size = state_size
@@ -144,7 +144,7 @@ def save_model(self, name):
                 scores.append(score)
                 episodes.append(e)
                 pylab.plot(episodes, scores, 'b')
-                # pylab.savefig("./save_graph/Cartpole_PG.png")
+                pylab.savefig("./save_graph/Cartpole_PG.png")
                 print("episode:", e, "  score:", score)
 
         # if the mean of scores of last 10 episode is bigger than 490
diff --git a/Code 2. Cartpole/4. Policy Gradient/save_graph/Cartpole_PG.png b/Code 2. Cartpole/4. Policy Gradient/save_graph/Cartpole_PG.png
index 796da0d4..720ab1ec 100644
Binary files a/Code 2. Cartpole/4. Policy Gradient/save_graph/Cartpole_PG.png and b/Code 2. Cartpole/4. Policy Gradient/save_graph/Cartpole_PG.png differ
diff --git a/Code 2. Cartpole/4. Policy Gradient/save_graph/Cartpole_PG1.png b/Code 2. Cartpole/4. Policy Gradient/save_graph/Cartpole_PG1.png
new file mode 100644
index 00000000..c9561504
Binary files /dev/null and b/Code 2. Cartpole/4. Policy Gradient/save_graph/Cartpole_PG1.png differ
diff --git a/Code 2. Cartpole/5. Actor-Critic/Cartpole_ActorCritic.py b/Code 2. Cartpole/5. Actor-Critic/Cartpole_ActorCritic.py
index a0ca4e66..8f455b62 100644
--- a/Code 2. Cartpole/5. Actor-Critic/Cartpole_ActorCritic.py
+++ b/Code 2. Cartpole/5. Actor-Critic/Cartpole_ActorCritic.py
@@ -64,7 +64,7 @@ def actor_optimizer(self):
         # The core of Policy Gradient:
         # take the gradient of log(policy) * return and maximize it
         good_prob = K.sum(action * self.actor.output, axis=1)
-        eligibility = K.log(good_prob + 1e-10) * advantages
+        eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(advantages)
         loss = -K.sum(eligibility)
 
         optimizer = Adam(lr=self.actor_lr)
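Note on the two policy-gradient changes above (Gridworld_PG.py and Cartpole_ActorCritic.py): wrapping the return/advantage in K.stop_gradient marks it as a constant in the loss, so only log(policy) is differentiated. When the advantage is fed in as a plain K.placeholder this mostly documents intent, but it matters whenever the advantage is computed inside the graph (e.g. from a critic's output). A minimal, self-contained sketch of the same REINFORCE-style update, assuming the Keras 2.0-era backend API used in this repository; the network size and variable names below are illustrative, not taken from these files:

    from keras import backend as K
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.optimizers import Adam

    state_size, action_size = 4, 2
    model = Sequential([Dense(24, input_dim=state_size, activation='relu'),
                        Dense(action_size, activation='softmax')])

    action = K.placeholder(shape=[None, action_size])       # one-hot actions actually taken
    discounted_rewards = K.placeholder(shape=[None, ])      # returns G_t, computed outside the graph

    good_prob = K.sum(action * model.output, axis=1)         # pi(a_t | s_t)
    # treat the return as a constant: gradients flow only through log(pi)
    eligibility = K.log(good_prob + 1e-10) * K.stop_gradient(discounted_rewards)
    loss = -K.sum(eligibility)

    updates = Adam(lr=0.001).get_updates(model.trainable_weights, [], loss)
    train = K.function([model.input, action, discounted_rewards], [], updates=updates)

Calling train([states, one_hot_actions, returns]) then performs one gradient step that increases the log-probability of actions in proportion to their returns.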
diff --git a/Code 3. Atari Game/1. Breakout/Breakout_DQN.py b/Code 3. Atari Game/1. Breakout/Breakout_DQN.py
index b3c26253..82b02821 100644
--- a/Code 3. Atari Game/1. Breakout/Breakout_DQN.py
+++ b/Code 3. Atari Game/1. Breakout/Breakout_DQN.py
@@ -5,7 +5,7 @@ from collections import deque
 from skimage.color import rgb2gray
 from skimage.transform import resize
-
+from keras import backend as K
 from keras.models import Sequential
 from keras.optimizers import RMSprop
 from keras.layers import Dense, Flatten
@@ -16,7 +16,7 @@ class DQNAgent:
     def __init__(self):
-        self.render = True
+        self.render = False
 
         self.state_size = (84, 84, 4)
         self.action_size = 6
@@ -33,7 +33,7 @@ def __init__(self):
         self.update_target_rate = 10000
         self.discount_factor = 0.99
         self.memory = deque(maxlen=400000)
-        self.no_op_steps = 30
+        self.not_move_steps = 30
         self.learning_rate = 0.00025
         self.momentum = 0.95
         self.min_gradient = 0.01
@@ -44,20 +44,26 @@ def __init__(self):
     def build_model(self):
         model = Sequential()
-        model.add(Conv2D(32, (8, 8), input_shape=self.state_size, activation='relu', strides=(4, 4),
-                         kernel_initializer='glorot_uniform'))
-        model.add(Conv2D(64, (4, 4), activation='relu', strides=(2, 2),
-                         kernel_initializer='glorot_uniform'))
-        model.add(Conv2D(64, (3, 3), activation='relu', strides=(1, 1),
-                         kernel_initializer='glorot_uniform'))
+        model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu', input_shape=self.state_size))
+        model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu'))
+        model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu'))
         model.add(Flatten())
-        model.add(Dense(512, activation='relu', kernel_initializer='glorot_uniform'))
+        model.add(Dense(512, activation='relu'))
         model.add(Dense(self.action_size))
         model.summary()
-        model.compile(loss='mse', optimizer=RMSprop(
-            lr=self.learning_rate, rho=self.momentum, epsilon=self.min_gradient))
         return model
 
+    def optimizer(self):
+        action = K.placeholder(shape=[None, self.action_size])
+        predicted_q_value = K.placeholder(shape=[None, ])
+
+        loss = K.mean(K.square(predicted_q_value - K.sum(self.model.output * action, axis=1)))
+
+        optimizer = RMSprop(lr=self.learning_rate, rho=self.momentum, epsilon=self.min_gradient)
+        updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
+        train = K.function([self.model.input, action, predicted_q_value], [], updates=updates)
+        return train
+
     def update_target_model(self):
         self.target_model.set_weights(self.model.get_weights())
 
@@ -123,7 +129,7 @@ def pre_processing(next_observe, observe):
         score, start_live = 0, 5
         observe = env.reset()
         next_observe = observe
-        for _ in range(random.randint(1, agent.no_op_steps)):
+        for _ in range(random.randint(1, agent.not_move_steps)):
            observe = next_observe
            next_observe, _, _, _ = env.step(1)
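Note on the new optimizer() in Breakout_DQN.py above: training now goes through a hand-built K.function fed with (states, one-hot actions, target Q-values) instead of model.compile. Atari DQN implementations commonly clip the TD error to [-1, 1] (a Huber-style loss) rather than using a plain squared error. A self-contained sketch of that variant and of one replay-style update, again assuming the Keras 2.0-era API; the small Dense network, batch data, and names here are illustrative, not taken from this file:

    import numpy as np
    from keras import backend as K
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.optimizers import RMSprop

    state_size, action_size, gamma = 8, 6, 0.99
    model = Sequential([Dense(32, input_dim=state_size, activation='relu'),
                        Dense(action_size)])
    target_model = Sequential([Dense(32, input_dim=state_size, activation='relu'),
                               Dense(action_size)])
    target_model.set_weights(model.get_weights())

    action = K.placeholder(shape=[None, action_size])    # one-hot actions
    target_q = K.placeholder(shape=[None, ])              # r + gamma * max_a' Q_target(s', a')

    q_value = K.sum(model.output * action, axis=1)         # Q(s, a) for the taken action
    error = K.abs(target_q - q_value)
    clipped = K.clip(error, 0.0, 1.0)                      # quadratic near zero, linear beyond 1
    loss = K.mean(0.5 * K.square(clipped) + (error - clipped))

    optimizer = RMSprop(lr=0.00025, rho=0.95, epsilon=0.01)
    updates = optimizer.get_updates(model.trainable_weights, [], loss)
    train = K.function([model.input, action, target_q], [loss], updates=updates)

    # one replay-style update with dummy data
    states = np.random.rand(4, state_size)
    actions = np.eye(action_size)[np.random.randint(action_size, size=4)]
    rewards, dones = np.random.rand(4), np.zeros(4)
    next_states = np.random.rand(4, state_size)
    targets = rewards + gamma * (1.0 - dones) * np.amax(target_model.predict(next_states), axis=1)
    train([states, actions, targets])

The error clipping keeps a single outlier transition from producing a huge gradient step, which is why it is the usual choice for Atari-scale DQN.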
diff --git a/Code 4. Mountain Car/1. DQN/MountainCar_DQN.py b/Code 4. Mountain Car/1. DQN/MountainCar_DQN.py
index 84802b76..0715bf6c 100644
--- a/Code 4. Mountain Car/1. DQN/MountainCar_DQN.py
+++ b/Code 4. Mountain Car/1. DQN/MountainCar_DQN.py
@@ -8,15 +8,15 @@
 from keras.optimizers import Adam
 from keras.models import Sequential
 
-EPISODES = 300
+EPISODES = 5000
 
-# this is DQN Agent for the Cartpole
+# this is DQN Agent for the MountainCar
 # it uses Neural Network to approximate q function
 # and replay memory & target q network
 class DQNAgent:
     def __init__(self, state_size, action_size):
-        # if you want to see Cartpole learning, then change to True
+        # if you want to see MountainCar learning, then change to True
         self.render = False
 
         # get size of state and action
@@ -26,13 +26,14 @@ def __init__(self, state_size, action_size):
         # these is hyper parameters for the DQN
         self.discount_factor = 0.99
         self.learning_rate = 0.001
+        self.epsilon = 1.0
-        self.epsilon_decay = 0.999
-        self.epsilon_min = 0.01
+        self.epsilon_decay = 0.99999
+        self.epsilon_min = 0.1
         self.batch_size = 64
-        self.train_start = 1000
+        self.train_start = 100000
         # create replay memory using deque
-        self.memory = deque(maxlen=2000)
+        self.memory = deque(maxlen=100000)
 
         # create main model and target model
         self.model = self.build_model()
@@ -45,8 +46,7 @@ def __init__(self, state_size, action_size):
     # state is input and Q Value of each action is output of network
     def build_model(self):
         model = Sequential()
-        model.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
-        model.add(Dense(24, activation='relu', kernel_initializer='he_uniform'))
+        model.add(Dense(64, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
         model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))
         model.summary()
         model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
@@ -67,13 +67,15 @@ def get_action(self, state):
     # save sample to the replay memory
     def replay_memory(self, state, action, reward, next_state, done):
         self.memory.append((state, action, reward, next_state, done))
-        if self.epsilon > self.epsilon_min:
-            self.epsilon *= self.epsilon_decay
 
     # pick samples randomly from replay memory (with batch_size)
    def train_replay(self):
         if len(self.memory) < self.train_start:
             return
+
+        if self.epsilon > self.epsilon_min:
+            self.epsilon *= self.epsilon_decay
+
         batch_size = min(self.batch_size, len(self.memory))
         mini_batch = random.sample(self.memory, batch_size)
@@ -109,33 +111,41 @@ def save_model(self, name):
 
 if __name__ == "__main__":
     # in case of CartPole-v1, you can play until 500 time step
-    env = gym.make('CartPole-v1')
+    env = gym.make('MountainCar-v0')
     # get size of state and action from environment
     state_size = env.observation_space.shape[0]
-    action_size = env.action_space.n
-
+    action_size = 2  # env.action_space.n
     agent = DQNAgent(state_size, action_size)
 
     scores, episodes = [], []
+    action_fake = 0
+    goal_position = 0.5
+    global_step = 0
 
     for e in range(EPISODES):
         done = False
         score = 0
         state = env.reset()
+        at_top = False
         state = np.reshape(state, [1, state_size])
-        # agent.load_model("./save_model/cartpole-master.h5")
 
         while not done:
             if agent.render:
                 env.render()
-
+            global_step += 1
             # get action for the current state and go one step in environment
             action = agent.get_action(state)
-            next_state, reward, done, info = env.step(action)
+            if action == 0:
+                action_fake = 0
+            if action == 1:
+                action_fake = 2
+
+            next_state, reward, done, info = env.step(action_fake)
             next_state = np.reshape(next_state, [1, state_size])
-            # if an action make the episode end, then gives penalty of -100
-            reward = reward if not done or score == 499 else -100
+            if next_state[0][0] >= goal_position:
+                reward = 100
+                at_top = True
 
             # save the sample to the replay memory
             agent.replay_memory(state, action, reward, next_state, done)
             # every time step do the training
@@ -144,24 +154,14 @@ def save_model(self, name):
             state = next_state
 
             if done:
-                env.reset()
-                # every episode update the target model to be same with model
-                agent.update_target_model()
-                # every episode, plot the play time
-                score = score if score == 500 else score + 100
                 scores.append(score)
                 episodes.append(e)
                 pylab.plot(episodes, scores, 'b')
-                pylab.savefig("./save_graph/Cartpole_DQN14.png")
+                pylab.savefig("./save_graph/MountainCar_DQN1.png")
                 print("episode:", e, "  score:", score, "  memory length:", len(agent.memory),
-                      "  epsilon:", agent.epsilon)
-
-        # if the mean of scores of last 10 episode is bigger than 490
-        # stop training
-        if np.mean(scores[-min(10, len(scores)):]) > 490:
-            sys.exit()
+                      "  global_step:", global_step, "  epsilon:", agent.epsilon, "  at_top:", at_top)
 
         # save the model
-        if e % 50 == 0:
-            agent.save_model("./save_model/Cartpole_DQN14.h5")
\ No newline at end of file
+        # if e % 50 == 0:
+        #     agent.save_model("./save_model/MountainCar_DQN.h5")
\ No newline at end of file
diff --git a/Code 4. Mountain Car/1. DQN/MountainCar_play.py b/Code 4. Mountain Car/1. DQN/MountainCar_play.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/Code 4. Mountain Car/1. DQN/save_graph/Cartpole_DQN.png b/Code 4. Mountain Car/1. DQN/save_graph/Cartpole_DQN.png
deleted file mode 100644
index 49114fd6..00000000
Binary files a/Code 4. Mountain Car/1. DQN/save_graph/Cartpole_DQN.png and /dev/null differ
diff --git a/Code 4. Mountain Car/1. DQN/save_graph/Cartpole_DQN14.png b/Code 4. Mountain Car/1. DQN/save_graph/Cartpole_DQN14.png
deleted file mode 100644
index 5c0f54d3..00000000
Binary files a/Code 4. Mountain Car/1. DQN/save_graph/Cartpole_DQN14.png and /dev/null differ
diff --git a/Code 4. Mountain Car/1. DQN/save_graph/MountainCar_DQN.png b/Code 4. Mountain Car/1. DQN/save_graph/MountainCar_DQN.png
new file mode 100644
index 00000000..36342e81
Binary files /dev/null and b/Code 4. Mountain Car/1. DQN/save_graph/MountainCar_DQN.png differ
diff --git a/Code 4. Mountain Car/1. DQN/save_graph/MountainCar_DQN1.png b/Code 4. Mountain Car/1. DQN/save_graph/MountainCar_DQN1.png
new file mode 100644
index 00000000..b91efb94
Binary files /dev/null and b/Code 4. Mountain Car/1. DQN/save_graph/MountainCar_DQN1.png differ
diff --git a/Code 4. Mountain Car/1. DQN/save_model/Cartpole.h5 b/Code 4. Mountain Car/1. DQN/save_model/Cartpole.h5
deleted file mode 100644
index 25268bae..00000000
Binary files a/Code 4. Mountain Car/1. DQN/save_model/Cartpole.h5 and /dev/null differ
diff --git a/Code 4. Mountain Car/1. DQN/save_model/Cartpole10.h5 b/Code 4. Mountain Car/1. DQN/save_model/Cartpole10.h5
deleted file mode 100644
index dc6f1d69..00000000
Binary files a/Code 4. Mountain Car/1. DQN/save_model/Cartpole10.h5 and /dev/null differ
diff --git a/Code 4. Mountain Car/1. DQN/save_model/Cartpole8.h5 b/Code 4. Mountain Car/1. DQN/save_model/Cartpole8.h5
deleted file mode 100644
index 02094b4e..00000000
Binary files a/Code 4. Mountain Car/1. DQN/save_model/Cartpole8.h5 and /dev/null differ
diff --git a/Code 4. Mountain Car/1. DQN/save_model/Cartpole_DQN1.h5 b/Code 4. Mountain Car/1. DQN/save_model/Cartpole_DQN1.h5
deleted file mode 100644
index ba846f85..00000000
Binary files a/Code 4. Mountain Car/1. DQN/save_model/Cartpole_DQN1.h5 and /dev/null differ
diff --git a/Code 4. Mountain Car/1. DQN/save_model/Cartpole_DQN13.h5 b/Code 4. Mountain Car/1. DQN/save_model/Cartpole_DQN13.h5
deleted file mode 100644
index c63a4dc6..00000000
Binary files a/Code 4. Mountain Car/1. DQN/save_model/Cartpole_DQN13.h5 and /dev/null differ
diff --git a/Code 4. Mountain Car/1. DQN/save_model/Cartpole_DQN14.h5 b/Code 4. Mountain Car/1. DQN/save_model/Cartpole_DQN14.h5
deleted file mode 100644
index d4d4bcd5..00000000
Binary files a/Code 4. Mountain Car/1. DQN/save_model/Cartpole_DQN14.h5 and /dev/null differ
diff --git a/Code 4. Mountain Car/1. DQN/save_model/Cartpole9.h5 b/Code 4. Mountain Car/1. DQN/save_model/MountainCar_DQN.h5
similarity index 53%
rename from Code 4. Mountain Car/1. DQN/save_model/Cartpole9.h5
rename to Code 4. Mountain Car/1. DQN/save_model/MountainCar_DQN.h5
index ce883e7d..53cf1b6c 100644
Binary files a/Code 4. Mountain Car/1. DQN/save_model/Cartpole9.h5 and b/Code 4. Mountain Car/1. DQN/save_model/MountainCar_DQN.h5 differ
diff --git a/Code 4. Mountain Car/2. DDPG/MountainCar_DDPG.py b/Code 4. Mountain Car/2. DDPG/MountainCar_DDPG.py
new file mode 100644
index 00000000..3916a1c0
--- /dev/null
+++ b/Code 4. Mountain Car/2. DDPG/MountainCar_DDPG.py
@@ -0,0 +1 @@
+# we will implement DDPG to mountain car
\ No newline at end of file
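Note on the MountainCar_DQN.py changes above: MountainCar-v0 gives -1 per step, truncates episodes at 200 steps, and its discrete actions are 0 = push left, 1 = no push, 2 = push right. The script drops the no-push action (mapping agent action 1 to env action 2) and adds +100 once the car's position reaches the goal at 0.5, which gives the otherwise sparse task an explicit success signal. One way to package the same remapping and shaping, shown purely as an illustrative sketch against the older gym API used in these scripts (the wrapper class below is hypothetical, not part of the repository):

    import gym

    class TwoActionMountainCar(gym.Wrapper):
        """Expose only {0: push left, 1: push right} and reward reaching the flag."""
        GOAL_POSITION = 0.5

        def __init__(self, env):
            super(TwoActionMountainCar, self).__init__(env)
            self.action_space = gym.spaces.Discrete(2)

        def step(self, action):
            env_action = 0 if action == 0 else 2          # skip the env's "no push" action (1)
            next_state, reward, done, info = self.env.step(env_action)
            if next_state[0] >= self.GOAL_POSITION:       # position component reached the goal
                reward = 100.0
            return next_state, reward, done, info

    env = TwoActionMountainCar(gym.make('MountainCar-v0'))

Keeping the remapping in a wrapper leaves the agent code identical to the CartPole version and makes the shaped reward easy to switch off when comparing against the unshaped task.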