diff --git a/agent.py b/agent.py
index d9376cd..24ed4b9 100644
--- a/agent.py
+++ b/agent.py
@@ -25,17 +25,26 @@ def random_action(self, board):
         legal_moves = board.legal_moves()
         return legal_moves[random.randint(0, len(legal_moves) - 1)]
 
-    def action(self, board, epsilon=0):
+    def softmax_action(self, board, beta):
+        action_value_vector = self.model.action_values(board)
+        legal_action_values = output_rep.get_legal_vals(board, action_value_vector)
+
+        legal_val_tensor = tf.constant([list(legal_action_values.values())])
+        sampled_ind = tf.random.categorical(tf.math.log(tf.nn.softmax(beta * legal_val_tensor)), 1)[0, 0]
+
+        return list(legal_action_values.keys())[sampled_ind]
+
+    def action(self, board, epsilon=0, beta=1):
         legal_moves = board.legal_moves()
         assert len(legal_moves) > 0, "No legal moves can be played."
 
-        greedy_move = self.greedy_action(board)
+        best_move = self.softmax_action(board, beta)
 
         # Exploration
         if random.random() < epsilon:
             move = self.random_action(board)
         else:
-            move = greedy_move
+            move = best_move
 
         return move
 
diff --git a/hof.py b/hof.py
index 002bc64..e0798cb 100644
--- a/hof.py
+++ b/hof.py
@@ -27,7 +27,7 @@ def store(self, model):
         self.basel += 1 / self.pop_size**2
 
     # Samples from the hall of fame with the provided method
-    def sample(self, method='limit-uniform', index=None):
+    def sample(self, method='uniform', index=None):
         if method == 'limit-uniform':  # Performs poorly. Do not use.
             threshold = random.random()*self.basel
 
@@ -49,4 +49,4 @@
         self.sample_history.append(ind)
 
         name = self.hof[ind]
-        return Model(self.mnk, "{}/{}".format(self.folder, name))
+        return Model(self.mnk, location="{}/{}".format(self.folder, name))
diff --git a/model.py b/model.py
index 9e510ec..560620a 100644
--- a/model.py
+++ b/model.py
@@ -5,6 +5,7 @@
 import numpy as np
 from keras.models import Sequential
 from keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
+from keras.regularizers import l2
 from tensorflow.keras.optimizers import SGD, Adam
 from state_representation import get_input_rep
 import output_representation as output_rep
@@ -33,17 +34,16 @@ def __init__(self, mnk, lr=0.001, location=None, model=None):
             self.model = model
             return
 
-        opt = Adam(learning_rate=lr)
+        self.opt = Adam(learning_rate=lr)
+        regularization = 0.0001
 
         self.model = Sequential()
-        self.model.add(Conv2D(filters=32, kernel_size=3, input_shape=(m, n, 2)))
+        self.model.add(Conv2D(filters=32, kernel_size=3, input_shape=(m, n, 2), kernel_regularizer=l2(regularization)))
         self.model.add(Flatten())
-        self.model.add(Dense(54, kernel_initializer='normal', activation='relu'))
-        self.model.add(Dense(54, kernel_initializer='normal', activation='relu'))
-        self.model.add(Dense(54, kernel_initializer='normal', activation='relu'))
-        self.model.add(Dense(mnk[0] * mnk[1], kernel_initializer='normal'))
+        self.model.add(Dense(128, kernel_initializer='normal', activation='relu', kernel_regularizer=l2(regularization)))
+        self.model.add(Dense(mnk[0] * mnk[1], kernel_initializer='normal', kernel_regularizer=l2(regularization)))
 
-        self.model.compile(loss='mean_squared_error', optimizer=opt)
+        self.model.compile(loss='mean_squared_error', optimizer=self.opt)
 
     @staticmethod
     def retrieve(location):
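The new `Agent.softmax_action` above replaces pure greedy selection with Boltzmann (softmax) exploration: legal moves are sampled with probability proportional to `exp(beta * Q)`, so higher-valued moves are favored and `beta` controls how sharply. A minimal NumPy sketch of the same sampling rule, assuming a dict that maps moves to values like the one `output_rep.get_legal_vals` returns (the helper name here is illustrative, not the repo's API):

```python
import numpy as np

def boltzmann_sample(legal_action_values, beta=1.0, rng=None):
    """Sample a move with probability proportional to exp(beta * value).

    legal_action_values: dict mapping move tuples to estimated action values.
    beta: inverse temperature; beta -> 0 approaches uniform random play,
          large beta approaches greedy play.
    """
    rng = rng or np.random.default_rng()
    moves = list(legal_action_values.keys())
    values = np.array(list(legal_action_values.values()), dtype=np.float64)

    logits = beta * values
    probs = np.exp(logits - logits.max())  # subtract the max for numerical stability
    probs /= probs.sum()

    return moves[rng.choice(len(moves), p=probs)]

# The highest-valued move is most likely, but not guaranteed, to be picked
print(boltzmann_sample({(0, 0): 0.9, (1, 1): 0.5, (2, 2): -0.2}, beta=2.0))
```

Note that `action` still layers epsilon-greedy on top, so a fraction `epsilon` of training moves remain fully random regardless of `beta`.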
diff --git a/plot.py b/plot.py
index c90ff6d..86c1a42 100644
--- a/plot.py
+++ b/plot.py
@@ -8,15 +8,14 @@
 
 
 class Diagnostics:
-    def __init__(self, run_length=100):
+    def __init__(self, run_length=50):
         self.run_length = run_length
         self.xo_outcomes = [[], [], []]
         self.model_outcomes = [[], [], []]
         self.rewards = []
-        self.reward_totals = []
+        self.reward_avg = []
         self.reward_deltas = []
         self.gating_indices = []
-        self.index = 0
 
     def update_xo(self, x_outcome, o_outcome):
         self.xo_outcomes[0].append(x_outcome)
@@ -29,31 +28,20 @@ def update_outcome(self, train_outcome, hof_outcome):
         self.model_outcomes[2].append(1 - train_outcome - hof_outcome)
 
     def update_reward(self, reward):
-        self.rewards.append(reward)
-        self.reward_totals.append(reward)
-        self.reward_deltas.append(reward)
-
-        if self.index > 0:
-            self.reward_totals[-1] += self.reward_totals[-2]
-            self.reward_deltas[-1] += self.reward_deltas[-2]
-
-        if self.index >= self.run_length:
-            self.reward_totals[-1] -= self.rewards[self.index - self.run_length]
-            self.reward_deltas[-1] -= 2 * self.rewards[self.index - self.run_length]
+        n = min(self.run_length, len(self.rewards))
 
-        if self.index >= 2 * self.run_length:
-            self.reward_deltas[-1] += self.rewards[self.index - 2 * self.run_length]
-
-        self.index += 1
+        self.rewards.append(reward)
+        self.reward_avg.append(np.mean(self.rewards[-n:]))
+        self.reward_deltas.append(np.mean(self.rewards[-(n//2):]) - np.mean(self.rewards[-n:-(n//2)]))
 
     def add_gate_ind(self):
-        self.gating_indices.append(self.index)
+        self.gating_indices.append(len(self.rewards) - 1)
 
     def get_recent_performance(self):
-        if self.index == 0:
+        if len(self.rewards) == 0:
             return 0, 0
 
-        return self.reward_totals[-1] / self.run_length, self.reward_deltas[-1] / self.run_length
+        return self.reward_avg[-1], self.reward_deltas[-1]
 
 
 def plot_wins(outcomes, model_name, players):
@@ -136,17 +124,17 @@ def save_plots(mnk, hof, model_name, diagnostics):
 
     # Graph and save each plot
-    plt.plot(range(diagnostics.index), np.array(diagnostics.reward_totals) / diagnostics.run_length)
+    plt.plot(range(len(diagnostics.rewards)), np.array(diagnostics.reward_avg))
     add_gating_markers(diagnostics.gating_indices)
-    plt.title("{}: Reward for {} diagnostic games".format(model_name, diagnostics.index+1))
+    plt.title("{}: Reward for {} diagnostic games".format(model_name, len(diagnostics.rewards)+1))
     plt.xlabel("Game #")
     plt.ylabel("Cumulative reward over previous {} games".format(diagnostics.run_length))
     plt.savefig("{}/Reward.png".format(plots_dir))
     plt.clf()
 
-    plt.plot(range(diagnostics.index), np.array(diagnostics.reward_deltas) / diagnostics.run_length)
+    plt.plot(range(len(diagnostics.rewards)), np.array(diagnostics.reward_deltas))
     add_gating_markers(diagnostics.gating_indices)
-    plt.title("{}: Cumulative reward derivative for {} diagnostic games".format(model_name, diagnostics.index+1))
+    plt.title("{}: Cumulative reward derivative for {} diagnostic games".format(model_name, len(diagnostics.rewards)+1))
     plt.xlabel("Game #")
     plt.ylabel("Difference in cumulative reward for previous two {} length runs".format(diagnostics.run_length))
     plt.savefig("{}/Improvement.png".format(plots_dir))
@@ -157,11 +145,11 @@ def save_plots(mnk, hof, model_name, diagnostics):
     plt.clf()
 
     plt.figure()
-    plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.xo_outcomes[0], run_length=diagnostics.run_length), label="X")
-    plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.xo_outcomes[1], run_length=diagnostics.run_length), label="O")
-    plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.xo_outcomes[2], run_length=diagnostics.run_length), label="Tie")
+    plt.plot(range(len(diagnostics.rewards)), get_moving_avg(diagnostics.xo_outcomes[0], run_length=diagnostics.run_length), label="X")
+    plt.plot(range(len(diagnostics.rewards)), get_moving_avg(diagnostics.xo_outcomes[1], run_length=diagnostics.run_length), label="O")
+    plt.plot(range(len(diagnostics.rewards)), get_moving_avg(diagnostics.xo_outcomes[2], run_length=diagnostics.run_length), label="Tie")
     plt.legend()
-    plt.title("{}: XO wins for {} diagnostic games".format(model_name, diagnostics.index + 1))
+    plt.title("{}: XO wins for {} diagnostic games".format(model_name, len(diagnostics.rewards) + 1))
     plt.xlabel("Game #")
     plt.ylabel("Proportion of wins averaged over previous {} games".format(diagnostics.run_length))
     add_gating_markers(diagnostics.gating_indices)
@@ -169,11 +157,11 @@
     plt.clf()
 
     plt.figure()
-    plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.model_outcomes[0], run_length=diagnostics.run_length), label="Best")
-    plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.model_outcomes[1], run_length=diagnostics.run_length), label="HOF")
-    plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.model_outcomes[2], run_length=diagnostics.run_length), label="Tie")
+    plt.plot(range(len(diagnostics.rewards)), get_moving_avg(diagnostics.model_outcomes[0], run_length=diagnostics.run_length), label="Best")
+    plt.plot(range(len(diagnostics.rewards)), get_moving_avg(diagnostics.model_outcomes[1], run_length=diagnostics.run_length), label="HOF")
+    plt.plot(range(len(diagnostics.rewards)), get_moving_avg(diagnostics.model_outcomes[2], run_length=diagnostics.run_length), label="Tie")
     plt.legend()
-    plt.title("{}: Model v Best wins for {} diagnostic games".format(model_name, diagnostics.index + 1))
+    plt.title("{}: Model v Best wins for {} diagnostic games".format(model_name, len(diagnostics.rewards) + 1))
     plt.xlabel("Game #")
     plt.ylabel("Proportion of wins averaged over previous {} games".format(diagnostics.run_length))
     add_gating_markers(diagnostics.gating_indices)
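The rewritten `update_reward` drops the incremental running-total bookkeeping and instead recomputes windowed statistics with NumPy on each call: `reward_avg` is the mean of the last `run_length` rewards, and `reward_deltas` compares the newer half of that window against the older half as a rough trend. A standalone sketch of the same statistics, with the window split the way the diff slices it (the function name is illustrative):

```python
import numpy as np

def windowed_stats(rewards, run_length=50):
    """Return (smoothed reward, improvement) over the most recent window.

    smoothed reward: mean of the last `run_length` rewards.
    improvement: mean of the newer half of the window minus the mean of
                 the older half, a rough estimate of the reward trend.
    """
    n = min(run_length, len(rewards))
    if n < 2:
        return (rewards[-1] if rewards else 0.0), 0.0

    window = np.asarray(rewards[-n:], dtype=np.float64)
    half = n // 2
    newer, older = window[n - half:], window[:n - half]
    return window.mean(), newer.mean() - older.mean()

# Rewards trending upward give a positive improvement estimate
print(windowed_stats([0, 0, 0, 1, 1, 1, 1, 1], run_length=8))  # (0.625, 0.75)
```

`get_recent_performance` then just reads back the latest pair, which is what the gating check in train.py consumes.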
diff --git a/replay_buffer.py b/replay_buffer.py
index 39971b7..0aff1ee 100644
--- a/replay_buffer.py
+++ b/replay_buffer.py
@@ -8,6 +8,10 @@ def __init__(self, capacity, batch_size):
         self.buffer = []
         self.index = 0
 
+    def clear(self):
+        self.buffer = []
+        self.index = 0
+
     def store(self, experience):
         if len(self.buffer) >= self.capacity:
             self.buffer[self.index] = experience
diff --git a/train.py b/train.py
index 5d6d909..2d8b272 100644
--- a/train.py
+++ b/train.py
@@ -20,7 +20,7 @@
 
 # Set cmd-line training arguments
 verbose, mcts, model_name = arg_parser(sys.argv)
-model_name = "new_model"
+verbose, model_name = False, "new_model"
 
 mnk = (3, 3, 3)
 
@@ -59,9 +59,9 @@ def get_corrected_action_values(model, lagging_model, state, action, next_state,
     if terminal:
         td_target = tf.constant(reward, dtype="float32", shape=(1, 1))
     else:
-        legal_slow_action_values = output_rep.get_legal_vals(next_board, lagging_model.action_values(next_board))
-        argmax_move = max(legal_slow_action_values, key=legal_slow_action_values.get)
-        td_target = model.action_values(next_board)[0][argmax_move[0] * next_board.n + argmax_move[1]]
+        action_vals = output_rep.get_legal_vals(next_board, model.action_values(next_board))
+        argmax_move = max(action_vals, key=action_vals.get)
+        td_target = lagging_model.action_values(next_board)[0][argmax_move[0] * next_board.n + argmax_move[1]]
 
     target_output[0][action[0] * n + action[1]] = td_target
     return target_output
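The `get_corrected_action_values` hunk flips which network plays which role in the bootstrap target: the online model now chooses the argmax over legal moves and the lagging model supplies the value of that move, the usual Double-DQN arrangement for curbing overestimation. A small self-contained sketch of that selection/evaluation split (the flat value arrays and index list are illustrative; note the diff folds rewards in only at terminal states rather than using the textbook `reward + gamma * ...` form shown here):

```python
import numpy as np

def double_dqn_target(online_vals, lagging_vals, legal_indices, reward, terminal, gamma=1.0):
    """Bootstrap target where the online net selects and the lagging net evaluates.

    online_vals, lagging_vals: flat arrays of action values for the next state.
    legal_indices: flattened indices (row * n + col) of legal moves in that state.
    """
    if terminal:
        return reward

    # Online network picks the best legal action...
    best = max(legal_indices, key=lambda i: online_vals[i])
    # ...but the lagging network's estimate of that action becomes the target,
    # which decouples action selection from action evaluation.
    return reward + gamma * lagging_vals[best]

online = np.array([0.1, 0.4, 0.9, 0.3])
lagging = np.array([0.2, 0.3, 0.6, 0.1])
print(double_dqn_target(online, lagging, [0, 2, 3], reward=0.0, terminal=False))  # 0.6
```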
@@ -91,7 +91,7 @@ def train_on_replays(model, lagging_model, batch):
     model.model.fit(states, target_outputs, epochs=1, batch_size=len(states), steps_per_epoch=1, callbacks=[lr_scheduler], verbose=False)
 
 
-def run_training_game(transitions, agent_train, agent_versing, lagging_model, replay_buffer, n_steps=1, model_update_freq=4, lagging_freq=100, start_at=5000, epsilon=0, mnk=(3, 3, 3), verbose=False):
+def run_training_game(transitions, agent_train, agent_versing, lagging_model, replay_buffer, n_steps=1, model_update_freq=4, lagging_freq=100, start_at=5000, epsilon=0, beta=1, mnk=(3, 3, 3), verbose=False):
     """Runs a training game with the provided agents.
 
     Args:
@@ -112,10 +112,10 @@ def run_training_game(transitions, agent_train, agent_versing, lagging_model, re
     while board.game_ongoing():
         # Select a move
         if board.player == agent_versing.player:
-            board.move(*agent_versing.action(board))
+            board.move(*agent_versing.greedy_action(board))
         else:
             transitions += 1
-            move = agent_train.action(board, epsilon)
+            move = agent_train.action(board, epsilon, beta)
 
             if len(state_queue) >= n_steps:
                 # Adds last action to replay buffer
@@ -143,22 +143,15 @@ def run_training_game(transitions, agent_train, agent_versing, lagging_model, re
     # Back up the terminal state value to the last actions chosen by training agent
     while len(state_queue) > 0:
         reward = agent_train.player * winner
-        if reward == 0:
-            reward = 0
 
         replay_buffer.store((*state_queue.pop(0), board.get_board(), reward, True))
 
-    if verbose:
-        print(board)
-
     return winner, game, transitions
 
 
-def train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon_i, epsilon_f, decay_period, buffer_size, n_steps, update_freq, lagging_freq, start_transition, model, lr):
-    diagnostics = Diagnostics()
+def train(hof, total_games, diagnostic_freq, run_length, resample_freq, hof_gate_freq, hof_wait_period, batch_size, epsilon, beta, buffer_size, n_steps, update_freq, lagging_freq, start_transition, model, lr):
+    diagnostics = Diagnostics(run_length=run_length)
     games = ["" for _ in range(total_games // diagnostic_freq * 2)]
 
-    epsilon_step = (epsilon_f - epsilon_i) / decay_period
-    epsilon = epsilon_i
 
     # Initialize hall of fame
     hof.store(model)
@@ -169,40 +162,47 @@ def train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch
     # Initialize lagging model
     lagging_model = Model(mnk, model=tf.keras.models.clone_model(model.model))
     transitions = 0
+    games_since_hof = 0
 
     try:
         for game in range(total_games):
-            epsilon += epsilon_step
+            games_since_hof += 1
+
            # Regularly choose a new HOF opponent
             if game % resample_freq == 0:
                 side_best = [-1, 1][random.random() > 0.5]
                 side_hof = side_best * -1
-                model_hof = hof.sample("uniform")
+                model_hof = hof.sample(index=game % hof.pop_size)
 
             # Initialize the agents
             agent_best = Agent(model, side_best)
             agent_hof = Agent(model_hof, side_hof)
 
             # Play game and train on its outcome
-            _, _, transitions = run_training_game(transitions, agent_best, agent_hof, lagging_model, replay_buffer, n_steps, update_freq, lagging_freq, start_transition, epsilon, mnk)
+            _, _, transitions = run_training_game(transitions, agent_best, agent_hof, lagging_model, replay_buffer, n_steps, update_freq, lagging_freq, start_transition, epsilon, beta, mnk)
 
             # Switch sides for next game
             side_hof *= -1
             side_best = side_hof * -1
 
             # Regularly attempt to add the model into HOF ("gating")
-            if game % hof_gate_freq == 0:
+            if game % hof_gate_freq == 0 and games_since_hof > hof_wait_period:
                 reward, improvement = diagnostics.get_recent_performance()
 
                 # Only add if reward is positive and improvement has plateaued
-                if reward > 0 and np.abs(improvement) < 0.025:
-                    epsilon = epsilon_i
-                    K.set_value(model.model.opt.learning_rate, lr)
+                if (reward > 0 and np.abs(improvement) < 0.05) or reward == 1:
+                    print("\nAdding model to HOF...")
                     hof.store(model)
-                    # Adds red line for when new models are added in plots
                     diagnostics.add_gate_ind()
 
+                    replay_buffer.clear()
+                    transitions = 0
+                    games_since_hof = 0
+                    K.set_value(model.opt.learning_rate, lr)
+
+                    print("Done.\n")
+
             if game % diagnostic_freq == 0:
                 print("Game: ", game)
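The gating hunk above now promotes the current model into the hall of fame only on the `hof_gate_freq` schedule, only after a cooldown of `hof_wait_period` games since the last promotion, and only when the smoothed diagnostic reward is positive and has flattened out (or is already perfect); on promotion it also clears the replay buffer and resets the learning rate. A condensed restatement of just the decision (the helper is illustrative; the `plateau_tol` default mirrors the diff's 0.05 threshold):

```python
def should_gate(reward, improvement, games_since_hof, hof_wait_period, plateau_tol=0.05):
    """Promote into the HOF when the model is winning but no longer improving.

    reward: smoothed diagnostic reward in [-1, 1].
    improvement: newer-half minus older-half of the reward window (trend estimate).
    """
    if games_since_hof <= hof_wait_period:
        return False  # still in the cooldown after the previous promotion

    winning_and_flat = reward > 0 and abs(improvement) < plateau_tol
    return winning_and_flat or reward == 1  # a perfect score gates immediately

# A positive, flat reward after the cooldown triggers a promotion
print(should_gate(reward=0.4, improvement=0.01, games_since_hof=3000, hof_wait_period=2500))  # True
```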
@@ -214,7 +214,7 @@ def train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch
                 model_hof = hof.sample(index=i)
 
                 diagnostic_winner, game_data = run_diagnostic(model, model_hof, 1)
-                games[game // diagnostic_freq * 2] = game_data
+                # games[game // diagnostic_freq * 2] = game_data
 
                 avg_win += diagnostic_winner
                 if diagnostic_winner == 1:
@@ -225,7 +225,7 @@
                     avg_hof += 1
 
                 diagnostic_winner, game_data = run_diagnostic(model, model_hof, -1)
-                games[game // diagnostic_freq * 2 + 1] = game_data
+                # games[game // diagnostic_freq * 2 + 1] = game_data
 
                 avg_win += -diagnostic_winner
                 if diagnostic_winner == 1:
@@ -239,6 +239,8 @@
             diagnostics.update_xo(avg_x / (hof.pop_size * 2), avg_o / (hof.pop_size * 2))
             diagnostics.update_outcome(avg_t / (hof.pop_size * 2), avg_hof / (hof.pop_size * 2))
 
+            print("Real Reward: {}, Smoothed Reward: {}, Improvement: {}".format(diagnostics.rewards[-1], *diagnostics.get_recent_performance()))
+
     except KeyboardInterrupt:
         print("\n=======================")
         print("Training interrupted.")
@@ -261,7 +263,7 @@ def run_diagnostic(model, model_hof, side_model):
     agent_model = Agent(model, side_model)
     agent_hof = Agent(model_hof, side_hof)
 
-    return run_game(agent_model, agent_hof, mnk=mnk, verbose=verbose)
+    return run_game(agent_model, agent_hof, mnk=mnk, verbose=False)
 
 
 # Deletes entries in HOF folder
print("\nTraining model: {}\n".format(model_name)) - model, diagnostics, games = train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon_i, epsilon_f, decay_period, buffer_size, n_steps, update_freq, lagging_freq, start_transition, Model(mnk, lr=lr), lr=lr) + model, diagnostics, games = train(hof, total_games, diagnostic_freq, run_length, resample_freq, hof_gate_freq, hof_wait_period, batch_size, epsilon, beta, buffer_size, n_steps, update_freq, lagging_freq, start_transition, Model(mnk, lr=lr), lr=lr) save_model(model, model_name) save_plots(mnk, hof, model_name, diagnostics) diff --git a/utils.py b/utils.py index 6cbcb87..05de858 100644 --- a/utils.py +++ b/utils.py @@ -14,17 +14,14 @@ def run_game(agent_train, agent_versing, mnk=(3, 3, 3), verbose=False): while board.game_ongoing(): # Select a move if board.player == agent_versing.player: - board.move(*agent_versing.action(board)) + board.move(*agent_versing.greedy_action(board)) else: - board.move(*agent_train.action(board)) + board.move(*agent_train.greedy_action(board)) # Store game for later analysis if verbose: game.append(board.__str__()) - if verbose: - print(board) - return board.who_won(), game @@ -40,4 +37,4 @@ def arg_parser(argv): else: present.append("Model__" + str(datetime.datetime.now())[:-7].replace(" ", "__")) - return tuple(present) \ No newline at end of file + return tuple(present)