diff --git a/agent.py b/agent.py
index d9376cd..24ed4b9 100644
--- a/agent.py
+++ b/agent.py
@@ -25,17 +25,26 @@ def random_action(self, board):
         legal_moves = board.legal_moves()
         return legal_moves[random.randint(0, len(legal_moves) - 1)]
 
-    def action(self, board, epsilon=0):
+    def softmax_action(self, board, beta):
+        action_value_vector = self.model.action_values(board)
+        legal_action_values = output_rep.get_legal_vals(board, action_value_vector)
+
+        legal_val_tensor = tf.constant([list(legal_action_values.values())])
+        sampled_ind = tf.random.categorical(tf.math.log(tf.nn.softmax(beta * legal_val_tensor)), 1)[0, 0]
+
+        return list(legal_action_values.keys())[sampled_ind]
+
+    def action(self, board, epsilon=0, beta=1):
         legal_moves = board.legal_moves()
         assert len(legal_moves) > 0, "No legal moves can be played."
 
-        greedy_move = self.greedy_action(board)
+        best_move = self.softmax_action(board, beta)
 
         # Exploration
         if random.random() < epsilon:
             move = self.random_action(board)
         else:
-            move = greedy_move
+            move = best_move
 
         return move
 
diff --git a/hof.py b/hof.py
index 002bc64..e0798cb 100644
--- a/hof.py
+++ b/hof.py
@@ -27,7 +27,7 @@ def store(self, model):
         self.basel += 1 / self.pop_size**2
 
     # Samples from the hall of fame with the provided method
-    def sample(self, method='limit-uniform', index=None):
+    def sample(self, method='uniform', index=None):
         if method == 'limit-uniform':  # Performs poorly. Do not use.
             threshold = random.random()*self.basel
 
@@ -49,4 +49,4 @@
         self.sample_history.append(ind)
 
         name = self.hof[ind]
-        return Model(self.mnk, "{}/{}".format(self.folder, name))
+        return Model(self.mnk, location="{}/{}".format(self.folder, name))
diff --git a/model.py b/model.py
index 9e510ec..560620a 100644
--- a/model.py
+++ b/model.py
@@ -5,6 +5,7 @@
 import numpy as np
 from keras.models import Sequential
 from keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
+from keras.regularizers import l2
 from tensorflow.keras.optimizers import SGD, Adam
 from state_representation import get_input_rep
 import output_representation as output_rep
@@ -33,17 +34,16 @@ def __init__(self, mnk, lr=0.001, location=None, model=None):
             self.model = model
             return
 
-        opt = Adam(learning_rate=lr)
+        self.opt = Adam(learning_rate=lr)
+        regularization = 0.0001
 
         self.model = Sequential()
-        self.model.add(Conv2D(filters=32, kernel_size=3, input_shape=(m, n, 2)))
+        self.model.add(Conv2D(filters=32, kernel_size=3, input_shape=(m, n, 2), kernel_regularizer=l2(regularization)))
         self.model.add(Flatten())
-        self.model.add(Dense(54, kernel_initializer='normal', activation='relu'))
-        self.model.add(Dense(54, kernel_initializer='normal', activation='relu'))
-        self.model.add(Dense(54, kernel_initializer='normal', activation='relu'))
-        self.model.add(Dense(mnk[0] * mnk[1], kernel_initializer='normal'))
+        self.model.add(Dense(128, kernel_initializer='normal', activation='relu', kernel_regularizer=l2(regularization)))
+        self.model.add(Dense(mnk[0] * mnk[1], kernel_initializer='normal', kernel_regularizer=l2(regularization)))
 
-        self.model.compile(loss='mean_squared_error', optimizer=opt)
+        self.model.compile(loss='mean_squared_error', optimizer=self.opt)
 
     @staticmethod
     def retrieve(location):
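The new `Agent.softmax_action` above replaces pure greedy selection with Boltzmann (softmax) exploration: legal moves are sampled with probability proportional to `exp(beta * Q)`, so higher-valued moves are favored and `beta` controls how sharply. A minimal NumPy sketch of the same sampling rule, assuming a dict that maps moves to values like the one `output_rep.get_legal_vals` returns (the helper name here is illustrative, not the repo's API):

```python
import numpy as np

def boltzmann_sample(legal_action_values, beta=1.0, rng=None):
    """Sample a move with probability proportional to exp(beta * value).

    legal_action_values: dict mapping move tuples to estimated action values.
    beta: inverse temperature; beta -> 0 approaches uniform random play,
          large beta approaches greedy play.
    """
    rng = rng or np.random.default_rng()
    moves = list(legal_action_values.keys())
    values = np.array(list(legal_action_values.values()), dtype=np.float64)

    logits = beta * values
    probs = np.exp(logits - logits.max())  # subtract the max for numerical stability
    probs /= probs.sum()

    return moves[rng.choice(len(moves), p=probs)]

# The highest-valued move is most likely, but not guaranteed, to be picked
print(boltzmann_sample({(0, 0): 0.9, (1, 1): 0.5, (2, 2): -0.2}, beta=2.0))
```

Note that `action` still layers epsilon-greedy on top, so a fraction `epsilon` of training moves remain fully random regardless of `beta`.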
diff --git a/plot.py b/plot.py
index c90ff6d..86c1a42 100644
--- a/plot.py
+++ b/plot.py
@@ -8,15 +8,14 @@
 
 
 class Diagnostics:
-    def __init__(self, run_length=100):
+    def __init__(self, run_length=50):
         self.run_length = run_length
         self.xo_outcomes = [[], [], []]
         self.model_outcomes = [[], [], []]
         self.rewards = []
-        self.reward_totals = []
+        self.reward_avg = []
         self.reward_deltas = []
         self.gating_indices = []
-        self.index = 0
 
     def update_xo(self, x_outcome, o_outcome):
         self.xo_outcomes[0].append(x_outcome)
@@ -29,31 +28,20 @@ def update_outcome(self, train_outcome, hof_outcome):
         self.model_outcomes[2].append(1 - train_outcome - hof_outcome)
 
     def update_reward(self, reward):
-        self.rewards.append(reward)
-        self.reward_totals.append(reward)
-        self.reward_deltas.append(reward)
-
-        if self.index > 0:
-            self.reward_totals[-1] += self.reward_totals[-2]
-            self.reward_deltas[-1] += self.reward_deltas[-2]
-
-        if self.index >= self.run_length:
-            self.reward_totals[-1] -= self.rewards[self.index - self.run_length]
-            self.reward_deltas[-1] -= 2 * self.rewards[self.index - self.run_length]
+        n = min(self.run_length, len(self.rewards))
 
-        if self.index >= 2 * self.run_length:
-            self.reward_deltas[-1] += self.rewards[self.index - 2 * self.run_length]
-
-        self.index += 1
+        self.rewards.append(reward)
+        self.reward_avg.append(np.mean(self.rewards[-n:]))
+        self.reward_deltas.append(np.mean(self.rewards[-(n//2):]) - np.mean(self.rewards[-n:-(n//2)]))
 
     def add_gate_ind(self):
-        self.gating_indices.append(self.index)
+        self.gating_indices.append(len(self.rewards) - 1)
 
     def get_recent_performance(self):
-        if self.index == 0:
+        if len(self.rewards) == 0:
             return 0, 0
 
-        return self.reward_totals[-1] / self.run_length, self.reward_deltas[-1] / self.run_length
+        return self.reward_avg[-1], self.reward_deltas[-1]
 
 
 def plot_wins(outcomes, model_name, players):
@@ -136,17 +124,17 @@ def save_plots(mnk, hof, model_name, diagnostics):
 
     # Graph and save each plot
-    plt.plot(range(diagnostics.index), np.array(diagnostics.reward_totals) / diagnostics.run_length)
+    plt.plot(range(len(diagnostics.rewards)), np.array(diagnostics.reward_avg))
     add_gating_markers(diagnostics.gating_indices)
-    plt.title("{}: Reward for {} diagnostic games".format(model_name, diagnostics.index+1))
+    plt.title("{}: Reward for {} diagnostic games".format(model_name, len(diagnostics.rewards)+1))
     plt.xlabel("Game #")
     plt.ylabel("Cumulative reward over previous {} games".format(diagnostics.run_length))
     plt.savefig("{}/Reward.png".format(plots_dir))
     plt.clf()
 
-    plt.plot(range(diagnostics.index), np.array(diagnostics.reward_deltas) / diagnostics.run_length)
+    plt.plot(range(len(diagnostics.rewards)), np.array(diagnostics.reward_deltas))
     add_gating_markers(diagnostics.gating_indices)
-    plt.title("{}: Cumulative reward derivative for {} diagnostic games".format(model_name, diagnostics.index+1))
+    plt.title("{}: Cumulative reward derivative for {} diagnostic games".format(model_name, len(diagnostics.rewards)+1))
     plt.xlabel("Game #")
     plt.ylabel("Difference in cumulative reward for previous two {} length runs".format(diagnostics.run_length))
     plt.savefig("{}/Improvement.png".format(plots_dir))
@@ -157,11 +145,11 @@ def save_plots(mnk, hof, model_name, diagnostics):
     plt.clf()
 
     plt.figure()
-    plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.xo_outcomes[0], run_length=diagnostics.run_length), label="X")
-    plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.xo_outcomes[1], run_length=diagnostics.run_length), label="O")
-    plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.xo_outcomes[2], run_length=diagnostics.run_length), label="Tie")
+    plt.plot(range(len(diagnostics.rewards)), get_moving_avg(diagnostics.xo_outcomes[0], run_length=diagnostics.run_length), label="X")
+    plt.plot(range(len(diagnostics.rewards)), get_moving_avg(diagnostics.xo_outcomes[1], run_length=diagnostics.run_length), label="O")
+    plt.plot(range(len(diagnostics.rewards)), get_moving_avg(diagnostics.xo_outcomes[2], run_length=diagnostics.run_length), label="Tie")
     plt.legend()
-    plt.title("{}: XO wins for {} diagnostic games".format(model_name, diagnostics.index + 1))
+    plt.title("{}: XO wins for {} diagnostic games".format(model_name, len(diagnostics.rewards) + 1))
     plt.xlabel("Game #")
     plt.ylabel("Proportion of wins averaged over previous {} games".format(diagnostics.run_length))
     add_gating_markers(diagnostics.gating_indices)
@@ -169,11 +157,11 @@
     plt.clf()
 
     plt.figure()
-    plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.model_outcomes[0], run_length=diagnostics.run_length), label="Best")
-    plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.model_outcomes[1], run_length=diagnostics.run_length), label="HOF")
-    plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.model_outcomes[2], run_length=diagnostics.run_length), label="Tie")
+    plt.plot(range(len(diagnostics.rewards)), get_moving_avg(diagnostics.model_outcomes[0], run_length=diagnostics.run_length), label="Best")
+    plt.plot(range(len(diagnostics.rewards)), get_moving_avg(diagnostics.model_outcomes[1], run_length=diagnostics.run_length), label="HOF")
+    plt.plot(range(len(diagnostics.rewards)), get_moving_avg(diagnostics.model_outcomes[2], run_length=diagnostics.run_length), label="Tie")
     plt.legend()
-    plt.title("{}: Model v Best wins for {} diagnostic games".format(model_name, diagnostics.index + 1))
+    plt.title("{}: Model v Best wins for {} diagnostic games".format(model_name, len(diagnostics.rewards) + 1))
     plt.xlabel("Game #")
     plt.ylabel("Proportion of wins averaged over previous {} games".format(diagnostics.run_length))
     add_gating_markers(diagnostics.gating_indices)
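The rewritten `update_reward` drops the incremental running-total bookkeeping and instead recomputes windowed statistics with NumPy on each call: `reward_avg` is the mean of the last `run_length` rewards, and `reward_deltas` compares the newer half of that window against the older half as a rough trend. A standalone sketch of the same statistics, with the window split the way the diff slices it (the function name is illustrative):

```python
import numpy as np

def windowed_stats(rewards, run_length=50):
    """Return (smoothed reward, improvement) over the most recent window.

    smoothed reward: mean of the last `run_length` rewards.
    improvement: mean of the newer half of the window minus the mean of
                 the older half, a rough estimate of the reward trend.
    """
    n = min(run_length, len(rewards))
    if n < 2:
        return (rewards[-1] if rewards else 0.0), 0.0

    window = np.asarray(rewards[-n:], dtype=np.float64)
    half = n // 2
    newer, older = window[n - half:], window[:n - half]
    return window.mean(), newer.mean() - older.mean()

# Rewards trending upward give a positive improvement estimate
print(windowed_stats([0, 0, 0, 1, 1, 1, 1, 1], run_length=8))  # (0.625, 0.75)
```

`get_recent_performance` then just reads back the latest pair, which is what the gating check in train.py consumes.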
diff --git a/replay_buffer.py b/replay_buffer.py
index 39971b7..0aff1ee 100644
--- a/replay_buffer.py
+++ b/replay_buffer.py
@@ -8,6 +8,10 @@ def __init__(self, capacity, batch_size):
         self.buffer = []
         self.index = 0
 
+    def clear(self):
+        self.buffer = []
+        self.index = 0
+
     def store(self, experience):
         if len(self.buffer) >= self.capacity:
             self.buffer[self.index] = experience
diff --git a/train.py b/train.py
index 5d6d909..2d8b272 100644
--- a/train.py
+++ b/train.py
@@ -20,7 +20,7 @@
 
 # Set cmd-line training arguments
 verbose, mcts, model_name = arg_parser(sys.argv)
-model_name = "new_model"
+verbose, model_name = False, "new_model"
 
 mnk = (3, 3, 3)
 
@@ -59,9 +59,9 @@ def get_corrected_action_values(model, lagging_model, state, action, next_state,
     if terminal:
         td_target = tf.constant(reward, dtype="float32", shape=(1, 1))
     else:
-        legal_slow_action_values = output_rep.get_legal_vals(next_board, lagging_model.action_values(next_board))
-        argmax_move = max(legal_slow_action_values, key=legal_slow_action_values.get)
-        td_target = model.action_values(next_board)[0][argmax_move[0] * next_board.n + argmax_move[1]]
+        action_vals = output_rep.get_legal_vals(next_board, model.action_values(next_board))
+        argmax_move = max(action_vals, key=action_vals.get)
+        td_target = lagging_model.action_values(next_board)[0][argmax_move[0] * next_board.n + argmax_move[1]]
 
     target_output[0][action[0] * n + action[1]] = td_target
     return target_output
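The `get_corrected_action_values` hunk flips which network plays which role in the bootstrap target: the online model now chooses the argmax over legal moves and the lagging model supplies the value of that move, the usual Double-DQN arrangement for curbing overestimation. A small self-contained sketch of that selection/evaluation split (the flat value arrays and index list are illustrative; note the diff folds rewards in only at terminal states rather than using the textbook `reward + gamma * ...` form shown here):

```python
import numpy as np

def double_dqn_target(online_vals, lagging_vals, legal_indices, reward, terminal, gamma=1.0):
    """Bootstrap target where the online net selects and the lagging net evaluates.

    online_vals, lagging_vals: flat arrays of action values for the next state.
    legal_indices: flattened indices (row * n + col) of legal moves in that state.
    """
    if terminal:
        return reward

    # Online network picks the best legal action...
    best = max(legal_indices, key=lambda i: online_vals[i])
    # ...but the lagging network's estimate of that action becomes the target,
    # which decouples action selection from action evaluation.
    return reward + gamma * lagging_vals[best]

online = np.array([0.1, 0.4, 0.9, 0.3])
lagging = np.array([0.2, 0.3, 0.6, 0.1])
print(double_dqn_target(online, lagging, [0, 2, 3], reward=0.0, terminal=False))  # 0.6
```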
@@ -91,7 +91,7 @@ def train_on_replays(model, lagging_model, batch):
     model.model.fit(states, target_outputs, epochs=1, batch_size=len(states), steps_per_epoch=1, callbacks=[lr_scheduler], verbose=False)
 
 
-def run_training_game(transitions, agent_train, agent_versing, lagging_model, replay_buffer, n_steps=1, model_update_freq=4, lagging_freq=100, start_at=5000, epsilon=0, mnk=(3, 3, 3), verbose=False):
+def run_training_game(transitions, agent_train, agent_versing, lagging_model, replay_buffer, n_steps=1, model_update_freq=4, lagging_freq=100, start_at=5000, epsilon=0, beta=1, mnk=(3, 3, 3), verbose=False):
     """Runs a training game with the provided agents.
 
     Args:
@@ -112,10 +112,10 @@ def run_training_game(transitions, agent_train, agent_versing, lagging_model, re
     while board.game_ongoing():
         # Select a move
         if board.player == agent_versing.player:
-            board.move(*agent_versing.action(board))
+            board.move(*agent_versing.greedy_action(board))
         else:
             transitions += 1
-            move = agent_train.action(board, epsilon)
+            move = agent_train.action(board, epsilon, beta)
 
             if len(state_queue) >= n_steps:
                 # Adds last action to replay buffer
@@ -143,22 +143,15 @@ def run_training_game(transitions, agent_train, agent_versing, lagging_model, re
     # Back up the terminal state value to the last actions chosen by training agent
     while len(state_queue) > 0:
         reward = agent_train.player * winner
-        if reward == 0:
-            reward = 0
 
         replay_buffer.store((*state_queue.pop(0), board.get_board(), reward, True))
 
-    if verbose:
-        print(board)
-
     return winner, game, transitions
 
 
-def train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon_i, epsilon_f, decay_period, buffer_size, n_steps, update_freq, lagging_freq, start_transition, model, lr):
-    diagnostics = Diagnostics()
+def train(hof, total_games, diagnostic_freq, run_length, resample_freq, hof_gate_freq, hof_wait_period, batch_size, epsilon, beta, buffer_size, n_steps, update_freq, lagging_freq, start_transition, model, lr):
+    diagnostics = Diagnostics(run_length=run_length)
     games = ["" for _ in range(total_games // diagnostic_freq * 2)]
 
-    epsilon_step = (epsilon_f - epsilon_i) / decay_period
-    epsilon = epsilon_i
 
     # Initialize hall of fame
     hof.store(model)
@@ -169,40 +162,47 @@ def train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch
     # Initialize lagging model
     lagging_model = Model(mnk, model=tf.keras.models.clone_model(model.model))
     transitions = 0
+    games_since_hof = 0
 
     try:
         for game in range(total_games):
-            epsilon += epsilon_step
+            games_since_hof += 1
+
            # Regularly choose a new HOF opponent
             if game % resample_freq == 0:
                 side_best = [-1, 1][random.random() > 0.5]
                 side_hof = side_best * -1
-                model_hof = hof.sample("uniform")
+                model_hof = hof.sample(index=game % hof.pop_size)
 
             # Initialize the agents
             agent_best = Agent(model, side_best)
             agent_hof = Agent(model_hof, side_hof)
 
             # Play game and train on its outcome
-            _, _, transitions = run_training_game(transitions, agent_best, agent_hof, lagging_model, replay_buffer, n_steps, update_freq, lagging_freq, start_transition, epsilon, mnk)
+            _, _, transitions = run_training_game(transitions, agent_best, agent_hof, lagging_model, replay_buffer, n_steps, update_freq, lagging_freq, start_transition, epsilon, beta, mnk)
 
             # Switch sides for next game
             side_hof *= -1
             side_best = side_hof * -1
 
             # Regularly attempt to add the model into HOF ("gating")
-            if game % hof_gate_freq == 0:
+            if game % hof_gate_freq == 0 and games_since_hof > hof_wait_period:
                 reward, improvement = diagnostics.get_recent_performance()
 
                 # Only add if reward is positive and improvement has plateaued
-                if reward > 0 and np.abs(improvement) < 0.025:
-                    epsilon = epsilon_i
-                    K.set_value(model.model.opt.learning_rate, lr)
+                if (reward > 0 and np.abs(improvement) < 0.05) or reward == 1:
+                    print("\nAdding model to HOF...")
                     hof.store(model)
-                    # Adds red line for when new models are added in plots
                     diagnostics.add_gate_ind()
 
+                    replay_buffer.clear()
+                    transitions = 0
+                    games_since_hof = 0
+                    K.set_value(model.opt.learning_rate, lr)
+
+                    print("Done.\n")
+
             if game % diagnostic_freq == 0:
                 print("Game: ", game)
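The gating hunk above now promotes the current model into the hall of fame only on the `hof_gate_freq` schedule, only after a cooldown of `hof_wait_period` games since the last promotion, and only when the smoothed diagnostic reward is positive and has flattened out (or is already perfect); on promotion it also clears the replay buffer and resets the learning rate. A condensed restatement of just the decision (the helper is illustrative; the `plateau_tol` default mirrors the diff's 0.05 threshold):

```python
def should_gate(reward, improvement, games_since_hof, hof_wait_period, plateau_tol=0.05):
    """Promote into the HOF when the model is winning but no longer improving.

    reward: smoothed diagnostic reward in [-1, 1].
    improvement: newer-half minus older-half of the reward window (trend estimate).
    """
    if games_since_hof <= hof_wait_period:
        return False  # still in the cooldown after the previous promotion

    winning_and_flat = reward > 0 and abs(improvement) < plateau_tol
    return winning_and_flat or reward == 1  # a perfect score gates immediately

# A positive, flat reward after the cooldown triggers a promotion
print(should_gate(reward=0.4, improvement=0.01, games_since_hof=3000, hof_wait_period=2500))  # True
```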
@@ -214,7 +214,7 @@ def train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch
                 model_hof = hof.sample(index=i)
 
                 diagnostic_winner, game_data = run_diagnostic(model, model_hof, 1)
-                games[game // diagnostic_freq * 2] = game_data
+                # games[game // diagnostic_freq * 2] = game_data
 
                 avg_win += diagnostic_winner
                 if diagnostic_winner == 1:
@@ -225,7 +225,7 @@
                     avg_hof += 1
 
                 diagnostic_winner, game_data = run_diagnostic(model, model_hof, -1)
-                games[game // diagnostic_freq * 2 + 1] = game_data
+                # games[game // diagnostic_freq * 2 + 1] = game_data
 
                 avg_win += -diagnostic_winner
                 if diagnostic_winner == 1:
@@ -239,6 +239,8 @@
             diagnostics.update_xo(avg_x / (hof.pop_size * 2), avg_o / (hof.pop_size * 2))
             diagnostics.update_outcome(avg_t / (hof.pop_size * 2), avg_hof / (hof.pop_size * 2))
 
+            print("Real Reward: {}, Smoothed Reward: {}, Improvement: {}".format(diagnostics.rewards[-1], *diagnostics.get_recent_performance()))
+
     except KeyboardInterrupt:
         print("\n=======================")
         print("Training interrupted.")
@@ -261,7 +263,7 @@ def run_diagnostic(model, model_hof, side_model):
     agent_model = Agent(model, side_model)
     agent_hof = Agent(model_hof, side_hof)
 
-    return run_game(agent_model, agent_hof, mnk=mnk, verbose=verbose)
+    return run_game(agent_model, agent_hof, mnk=mnk, verbose=False)
 
 
 # Deletes entries in HOF folder
print("\nTraining model: {}\n".format(model_name)) - model, diagnostics, games = train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon_i, epsilon_f, decay_period, buffer_size, n_steps, update_freq, lagging_freq, start_transition, Model(mnk, lr=lr), lr=lr) + model, diagnostics, games = train(hof, total_games, diagnostic_freq, run_length, resample_freq, hof_gate_freq, hof_wait_period, batch_size, epsilon, beta, buffer_size, n_steps, update_freq, lagging_freq, start_transition, Model(mnk, lr=lr), lr=lr) save_model(model, model_name) save_plots(mnk, hof, model_name, diagnostics) diff --git a/utils.py b/utils.py index 6cbcb87..05de858 100644 --- a/utils.py +++ b/utils.py @@ -14,17 +14,14 @@ def run_game(agent_train, agent_versing, mnk=(3, 3, 3), verbose=False): while board.game_ongoing(): # Select a move if board.player == agent_versing.player: - board.move(*agent_versing.action(board)) + board.move(*agent_versing.greedy_action(board)) else: - board.move(*agent_train.action(board)) + board.move(*agent_train.greedy_action(board)) # Store game for later analysis if verbose: game.append(board.__str__()) - if verbose: - print(board) - return board.who_won(), game @@ -40,4 +37,4 @@ def arg_parser(argv): else: present.append("Model__" + str(datetime.datetime.now())[:-7].replace(" ", "__")) - return tuple(present) \ No newline at end of file + return tuple(present)