Reorganized evaluation, added gating mechanism
Successful gating is plotted with red lines
fshcat committed Apr 19, 2022
1 parent 626a3f0 commit f29be17
Showing 4 changed files with 115 additions and 113 deletions.
2 changes: 1 addition & 1 deletion mnk.py
@@ -92,7 +92,7 @@ def num_legal_moves(self):

# Reshapes board into the form needed for the model
def get_board(self):
return self.board, self.player
return np.copy(self.board), self.player

def game_ongoing(self):
return not (self.player_has_lost() or (self.num_legal_moves() == 0))
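Returning np.copy(self.board) instead of the live array matters because stored transitions would otherwise alias the board and silently change as the game continues. A minimal illustration of the aliasing issue (toy arrays, not repository code):

import numpy as np

board = np.zeros((3, 3), dtype=np.int8)

aliased = board            # old behavior: the saved state is the live array
snapshot = np.copy(board)  # new behavior: an independent copy

board[0, 0] = 1            # a later move mutates the live board

print(aliased[0, 0])   # 1 -- the saved state changed along with the board
print(snapshot[0, 0])  # 0 -- the copy still reflects the state when it was saved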
33 changes: 10 additions & 23 deletions model.py
@@ -29,7 +29,7 @@ def __init__(self, mnk, location=None):
self.model = self.retrieve(location)
return

opt = SGD(learning_rate=0.001)
opt = SGD(learning_rate=0.01)

self.model = Sequential()
self.model.add(Flatten(input_shape=(m, n, 2)))
@@ -93,27 +93,6 @@ def action_values(self, board):

return self.model(get_input_rep(board.get_board()))

def scheduler(self, epoch, lr):
"""Returns an epsilon value as a function of the current epoch.
As a function of the epoch number, it returns a decreasing epsilon value
used in the Epsilon-Greedy Method.
Args:
epoch (int): Number of training epoch.
lr (???): ??? (Is this for the decay?)
Returns:
double: Epsilon value. Probability of choosing to explore.
"""
if epoch < 5000:
return 0.02
elif epoch < 15000:
return 0.01
elif epoch < 25000:
return 0.002
else:
return 0.001

def get_target(self, state, action, next_state):
m, n, k = self.mnk

@@ -144,4 +123,12 @@ def td_update(self, state, action, next_state):
"""
target_output = self.get_target(state, action, next_state)

self.model.fit(get_input_rep(state), target_output, batch_size=1, verbose=0)
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)
self.model.fit(get_input_rep(state), target_output, batch_size=1, verbose=0, callbacks=[lr_scheduler])


def scheduler(epoch, lr):
if lr > 0.0005:
return lr * tf.math.exp(-0.00005)
else:
return lr
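Since td_update and train_on_replays each call fit() for a single epoch, tf.keras.callbacks.LearningRateScheduler invokes the schedule once per update with the optimizer's current rate, so every update applies one multiplicative decay step. A standalone trace of that decay (numpy stands in for tf.math; the 0.01 starting rate and 0.0005 floor come from this commit):

import numpy as np

def scheduler(epoch, lr):
    # same rule as above: decay until the rate reaches the 0.0005 floor
    return lr * np.exp(-0.00005) if lr > 0.0005 else lr

lr = 0.01
for step in range(60000):
    lr = scheduler(0, lr)  # epoch index is always 0 for single-epoch fit() calls

# ln(0.01 / 0.0005) / 0.00005 ≈ 59,915 updates to reach the floor
print(round(lr, 6))  # ≈ 0.0005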
127 changes: 67 additions & 60 deletions plot.py
@@ -5,15 +5,57 @@
import random
import os

def plot_wins(outcomes, model_name, players):

class Diagnostics:
def __init__(self, run_length=100):
self.run_length = run_length
self.xo_outcomes = []
self.model_outcomes = []
self.reward_totals = []
self.reward_deltas = []
self.gating_indices = []
self.index = 0

def update_diagnostics(self, outcome, player):
self.xo_outcomes.append(outcome)

reward = outcome*player
self.model_outcomes.append(reward)

self.reward_totals.append(reward)
self.reward_deltas.append(reward)

if self.index > 0:
self.reward_totals[-1] += self.reward_totals[-2]
self.reward_deltas[-1] += self.reward_deltas[-2]

if self.index >= self.run_length:
self.reward_totals[-1] -= self.model_outcomes[self.index - self.run_length]
self.reward_deltas[-1] -= 2 * self.model_outcomes[self.index - self.run_length]

if self.index >= 2 * self.run_length:
self.reward_deltas[-1] += self.model_outcomes[self.index - 2 * self.run_length]

self.index += 1

def add_gate_ind(self):
self.gating_indices.append(self.index)

def get_recent_performance(self):
if self.index == 0:
return 0, 0

return self.reward_totals[-1], self.reward_deltas[-1]


def plot_wins(outcomes, model_name, players):
# We don't plot total wins for each player bc the graph would always increase, making performance evaluation harder.
# Instead, we plot runs: how many of the previous n games were won. This way, if a model begins performing worse, its line will decrease.

player1_wins, player2_wins, ties = [], [], []
run_totals = [0, 0, 0]
num_games = len(outcomes)
run_length = max(min(num_games // 100, 100), 1)
run_length = 100

for i, outcome in enumerate(outcomes):
if i < run_length:
@@ -38,64 +80,19 @@ def plot_wins(outcomes, model_name, players):
plt.ylabel("Wins out of previous {} games".format(run_length))


def plot_reward(outcomes, model_name):

# We don't plot total wins for each player bc the graph would always increase, making performance evaluation harder.
# Instead, we plot runs: how many of the previous n games were won. This way, if a model begins performing worse, its line will decrease.

run_totals = []
num_games = len(outcomes)
run_length = max(min(num_games // 100, 100), 1)
# Vertical lines where the model was gated
def add_gating_markers(gating_indices):
for ind in gating_indices:
plt.axvline(x=ind, c='red')

for i, outcome in enumerate(outcomes):
if i == 0:
run_totals.append(outcome)
elif i < run_length:
run_totals.append(run_totals[-1] + outcome)
else:
run_totals.append(run_totals[-1] + outcome - outcomes[i - run_length])

game = range(num_games)

plt.plot(game, run_totals)

plt.title("{}: Reward for {} diagnostic games".format(model_name, num_games))
plt.xlabel("Game #")
plt.ylabel("Cumulative reward over previous {} games".format(run_length))

def plot_improvement(outcomes, model_name):

# We don't plot total wins for each player bc the graph would always increase, making performance evaluation harder.
# Instead, we plot runs: how many of the previous n games were won. This way, if a model begins performing worse, its line will decrease.

run_deltas = []
num_games = len(outcomes)
run_length = max(min(num_games // 100, 100), 1)

for i, outcome in enumerate(outcomes):
if i == 0:
run_deltas.append(outcome)
elif i < run_length:
run_deltas.append(run_deltas[-1] + outcome)
elif i < 2 * run_length:
run_deltas.append(run_deltas[-1] + outcome - 2 * outcomes[i-run_length])
else:
run_deltas.append(run_deltas[-1] + outcome - 2 * outcomes[i-run_length] + outcomes[i-2*run_length])

game = range(num_games)

plt.plot(game, run_deltas)

plt.title("{}: Cumulative reward derivative for {} diagnostic games".format(model_name, num_games))
plt.xlabel("Game #")
plt.ylabel("Difference in cumulative reward for previous two {} length runs".format(run_length))

# Displays a histogram of the model iterations sampled from the hall of fame
def sample_histogram(sample_history, bins=100):
plt.hist(sample_history, bins)
plt.title("Sampling of Model Indices from HOF")
plt.show()


# 1v1 matrix for historical models: ideally, newer versions beating earlier ones
def winrate_matrix(mnk, num_games, step):
print("Calculating winrate matrix... (may take a few mins)")
@@ -115,7 +112,7 @@ def winrate_matrix(mnk, num_games, step):
return matrix


def save_plots(mnk, hof, model_name, winnersXO, winnersHOF):
def save_plots(mnk, hof, model_name, diagnostics):

# Create model's plots folder
plots_dir = "plots/{}".format(model_name)
@@ -124,29 +121,39 @@ def save_plots(mnk, hof, model_name, winnersXO, winnersHOF):

# Graph and save each plot
plt.figure()
plot_wins(winnersXO, model_name, ['X', 'O'])
plot_wins(diagnostics.xo_outcomes, model_name, ['X', 'O'])
add_gating_markers(diagnostics.gating_indices)
plt.savefig("{}/XO.png".format(plots_dir))
plt.clf()

plot_wins(winnersHOF, model_name, ["Best", "HOF"])
plot_wins(diagnostics.model_outcomes, model_name, ["Best", "HOF"])
add_gating_markers(diagnostics.gating_indices)
plt.savefig("{}/HOF.png".format(plots_dir))
plt.clf()

plot_reward(winnersHOF, model_name)
plt.plot(range(diagnostics.index), diagnostics.reward_totals)
add_gating_markers(diagnostics.gating_indices)
plt.title("{}: Reward for {} diagnostic games".format(model_name, diagnostics.index+1))
plt.xlabel("Game #")
plt.ylabel("Cumulative reward over previous {} games".format(diagnostics.run_length))
plt.savefig("{}/Reward.png".format(plots_dir))
plt.clf()

plot_improvement(winnersHOF, model_name)
plt.plot(range(diagnostics.index), diagnostics.reward_deltas)
add_gating_markers(diagnostics.gating_indices)
plt.title("{}: Cumulative reward derivative for {} diagnostic games".format(model_name, diagnostics.index+1))
plt.xlabel("Game #")
plt.ylabel("Difference in cumulative reward for previous two {} length runs".format(diagnostics.run_length))
plt.savefig("{}/Improvement.png".format(plots_dir))
plt.clf()

sample_histogram(hof.sample_history, 20)
plt.savefig("{}/Sampling.png".format(plots_dir))
plt.clf()

num_games = len(winnersXO)
step = max(1, num_games // 40)
matrix = winrate_matrix(mnk, num_games, step)
num_games = diagnostics.index
step = max(1, hof.pop_size // 40)
matrix = winrate_matrix(mnk, hof.pop_size, step)
plt.imshow(matrix, cmap="bwr")
plt.imsave("plots/{}/Matrix.png".format(model_name), matrix, cmap="bwr")
plt.clf()
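The Diagnostics bookkeeping above keeps two running quantities per game: reward_totals, the model's reward summed over the most recent run_length games, and reward_deltas, the difference between that window and the one before it. A short self-contained trace of the same arithmetic (toy rewards and a shortened window, not repository code):

run_length = 3
rewards = [1, 1, -1, 0, 1, 1, 1, -1]  # +1 win, 0 tie, -1 loss for the tracked model

totals, deltas = [], []
for i, r in enumerate(rewards):
    total = (totals[-1] if i > 0 else 0) + r
    delta = (deltas[-1] if i > 0 else 0) + r
    if i >= run_length:                # reward leaving the newest window
        total -= rewards[i - run_length]
        delta -= 2 * rewards[i - run_length]
    if i >= 2 * run_length:            # reward leaving the older window
        delta += rewards[i - 2 * run_length]
    totals.append(total)
    deltas.append(delta)

print(totals[-1])  # 1: sum of the last 3 rewards, 1 + 1 + (-1)
print(deltas[-1])  # 1: (last 3 games) - (previous 3 games) = 1 - 0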
66 changes: 37 additions & 29 deletions train.py
@@ -2,10 +2,11 @@

from mnk import Board
import random
import tensorflow as tf
import matplotlib.pyplot as plt
from agent import Agent
from model import Model
from plot import plot_wins, save_plots
from model import Model, scheduler
from plot import Diagnostics, save_plots
from hof import HOF
from replay_buffer import ReplayBuffer
from state_representation import get_input_rep
@@ -32,8 +33,9 @@ def train_on_replays(model, batch):

target_outputs = np.asarray(target_outputs)

# Theres a parameter for train_on_batch for sample weights. Use if we do importance sampling
model.model.fit(states, target_outputs, verbose=0)
# Theres a parameter for sample weights. Use if we do importance sampling
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)
model.model.fit(states, target_outputs, verbose=0, callbacks=[lr_scheduler])


def run_training_game(agent_train, agent_versing, replay_buffer, epsilon=0, mnk=(3, 3, 3), verbose=False):
@@ -62,8 +64,7 @@ def run_training_game(agent_train, agent_versing, replay_buffer, epsilon=0, mnk=
winner = board.who_won()

# Back up the terminal state value to the last action chosen by training agent
if winner != agent_train.player:
agent_train.model.td_update(state, action, board.get_board())
agent_train.model.td_update(state, action, board.get_board())

if verbose:
print(board)
Expand All @@ -73,10 +74,10 @@ def run_training_game(agent_train, agent_versing, replay_buffer, epsilon=0, mnk=

def main():
# Hyperparameter List
total_games = 50000
total_games = 100000
diagnostic_freq = 20
resample_freq = 10
hof_gate_freq = 2000
hof_gate_freq = 500
batch_size = 32
buffer_size = 4000
epsilon = 0.2 # probability with which a random move is chosen to play
@@ -85,10 +86,10 @@ def main():
hof = HOF(mnk, folder=hof_folder)

print("\nTraining model: {}\n".format(model_name))
model, winnersXO, winnersHOF, games = train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon, buffer_size, Model(mnk))
model, diagnostics, games = train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon, buffer_size, Model(mnk))

save_model(model, model_name)
save_plots(mnk, hof, model_name, winnersXO, winnersHOF)
save_plots(mnk, hof, model_name, diagnostics)
clear_hof(hof_folder)

# Can be used after looking at plot to analyze important milestones
@@ -106,9 +107,8 @@ def main():


def train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon, buffer_size, model):
winnersXO = [0 for _ in range(total_games//diagnostic_freq)]
winnersHOF = [0 for _ in range(total_games//diagnostic_freq)]
games = ["" for _ in range(total_games//diagnostic_freq)]
diagnostics = Diagnostics()
games = ["" for _ in range(total_games//diagnostic_freq * 2)]

# Initialize hall of fame
hof.store(model)
@@ -132,38 +132,46 @@

# Switch sides for next game
side_hof *= -1
side_best *= -1
side_best = side_hof * -1

# Gate the model for HOF
if game % hof_gate_freq == 0:
hof.gate(model)
reward, improvement = diagnostics.get_recent_performance()
if reward > 0 and np.abs(improvement) < 10:
hof.gate(model)
diagnostics.add_gate_ind()

if game % diagnostic_freq == 0:
print("Game: ", game)

# Resample hof so diagnostic is not biased towards last game played
temp_side_best = [-1, 1][random.random() > 0.5]
temp_side_hof = side_best * -1

temp_model_hof = hof.sample("uniform")
temp_agent_best = Agent(model, temp_side_best)
temp_agent_hof = Agent(temp_model_hof, temp_side_hof)

# Run a diagnostic (non-training, no exploration) game to collect data
diagnostic_winner, game_data = run_game(temp_agent_best, temp_agent_hof, mnk=mnk, verbose=verbose)
diagnostic_winner, game_data = run_diagnostic(model, hof, 1)
games[game // diagnostic_freq * 2] = game_data
diagnostics.update_diagnostics(diagnostic_winner, 1)

# Store data from diagnostic game for this batch
games[game//diagnostic_freq] = game_data
winnersXO[game//diagnostic_freq] = diagnostic_winner # X or O
winnersHOF[game//diagnostic_freq] = diagnostic_winner*side_best # Best or HOF
diagnostic_winner, game_data = run_diagnostic(model, hof, -1)
games[game // diagnostic_freq * 2 + 1] = game_data
diagnostics.update_diagnostics(diagnostic_winner, -1)

except KeyboardInterrupt:
print("\n=======================")
print("Training interrupted.")
print("=======================")

print("Training completed.")
return model, winnersXO, winnersHOF, games
return model, diagnostics, games


def run_diagnostic(model, hof, side_model):
side_hof = side_model * -1

model_hof = hof.sample("uniform")
agent_model = Agent(model, side_model)
agent_hof = Agent(model_hof, side_hof)

# Run a diagnostic (non-training, no exploration) game to collect data
return run_game(agent_model, agent_hof, mnk=mnk, verbose=verbose)


def clear_hof(folder):
if os.path.isdir(folder):
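The gating check in train.py above promotes the current model into the hall of fame only when its recent diagnostic reward is positive and the reward curve has flattened (|improvement| below 10). A standalone sketch of that decision rule; should_gate and its parameter names are illustrative, only the thresholds come from the commit:

import numpy as np

def should_gate(reward, improvement, reward_floor=0, plateau_width=10):
    # gate only a model that is winning and no longer improving rapidly
    return reward > reward_floor and np.abs(improvement) < plateau_width

print(should_gate(12, 3))   # True  -> model would be stored in the hall of fame
print(should_gate(12, 40))  # False -> still improving; hold off on gating
print(should_gate(-5, 2))   # False -> not yet outperforming its HOF opponents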
