A whole bunch of changes
- Double Q-learning
- Lagging (target) model
- Fixed win evaluation bug
- Added option for n-step TD targets
- Adam optimizer
- Epsilon annealing
- Learning-rate annealing
fshcat committed Nov 3, 2022
1 parent 2dd420f commit ea08cc6
Showing 9 changed files with 263 additions and 161 deletions.
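
The training loop itself is not among the files shown below, so here is only a minimal sketch, under assumed names (n_step_target, annealed_epsilon, online_model and target_model are illustrative, not taken from this repository), of how the headline changes fit together: double Q-learning with a lagging target model, n-step TD targets, and epsilon annealing.

# Illustrative sketch only: these helpers do not appear in this commit.
import numpy as np


def n_step_target(rewards, next_q_online, next_q_target, legal_mask, gamma=0.99):
    """Double-Q n-step TD target: discounted sum of the n observed rewards,
    then bootstrap with the action chosen by the online network but
    evaluated by the lagging (target) network."""
    n = len(rewards)
    g = sum((gamma ** i) * r for i, r in enumerate(rewards))
    masked = np.where(legal_mask, next_q_online, -np.inf)  # ignore illegal moves
    best_action = int(np.argmax(masked))                   # online net selects...
    return g + (gamma ** n) * next_q_target[best_action]   # ...lagging net evaluates


def annealed_epsilon(step, eps_start=1.0, eps_end=0.05, decay_steps=50_000):
    """Linear epsilon annealing for the exploration schedule."""
    frac = min(1.0, step / decay_steps)
    return eps_start + frac * (eps_end - eps_start)


# The lagging model is typically refreshed by copying weights every K updates:
#     target_model.set_weights(online_model.get_weights())

Evaluating the bootstrap action on a lagging copy of the network is what keeps the targets stable; the Adam optimizer and learning-rate annealing changes are visible in model.py below.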
39 changes: 6 additions & 33 deletions hof.py
@@ -26,13 +26,8 @@ def store(self, model):
self.pop_size += 1
self.basel += 1 / self.pop_size**2

# Gating method decides whether to add the model to the hall of fame
def gate(self, model):
# Simple gating method, stores model after every training episode
self.store(model)

# Samples from the hall of fame with the provided method
def sample(self, method='uniform'):
def sample(self, method='limit-uniform', index=None):
if method == 'limit-uniform': # Performs poorly. Do not use.
threshold = random.random()*self.basel

@@ -48,32 +43,10 @@ def sample(self, method='uniform'):
elif method == 'naive':
ind = self.pop_size-1

self.sample_history.append(ind)
if index is not None:
name = self.hof[index]
else:
self.sample_history.append(ind)
name = self.hof[ind]

name = self.hof[ind]
return Model(self.mnk, "{}/{}".format(self.folder, name))

''' === MOVED TO PLOT.PY LMK IF I CAN DELETE IT FROM HERE ===
# Displays a histogram of the model iterations sampled from the hall of fame
def sample_histogram(self, num=100):
pyplot.hist(self.sample_history, num)
pyplot.title("Sampling of Model Indices from HOF")
pyplot.show()
'''

''' === MOVED TO PLOT.PY LMK IF I CAN DELETE IT FROM HERE ===
# Displays a winrate matrix of the historical policies for the given player
def winrate_matrix(self, iterations):
matrix = []
for i in range (0, self.pop_size, iterations):
matrix.append([])
for j in range (0, self.pop_size, iterations):
model_i = Model("{}/{}".format(self.folder, self.hof[i]))
model_j = Model("{}/{}".format(self.folder, self.hof[j]))
value = run_game(Agent(model_i, 1), Agent(model_j, -1))[0]
matrix[-1].append(value)
pyplot.imshow(matrix, cmap="bwr")
pyplot.imsave("plots/Matrix.png", matrix, cmap="bwr")
'''
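
Assuming the hall-of-fame class in this file keeps the signature shown above, a hedged usage sketch of the new optional index argument (the class name HOF and its constructor arguments are assumptions):

# Hypothetical usage; only the sample() keyword arguments come from the diff above.
from hof import HOF

hof = HOF((3, 3, 3), folder="menagerie")
opponent = hof.sample(method="uniform")  # sampled index is appended to sample_history
latest = hof.sample(method="naive")      # always the most recently stored model
specific = hof.sample(index=0)           # direct lookup by index; sample_history untouched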
4 changes: 2 additions & 2 deletions mcts_demo.py
@@ -164,8 +164,8 @@ def rollout(board):
class Node:
def __init__(self, last_move = None):
self.last_move = last_move
self.q = 0
self.n = 0
self.q = 0 # Average of rewards from rollouts
self.n = 0 # Number of times node has been visited
self.children = []
self.isLeaf = True

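The new comments describe q as the average rollout reward and n as the visit count, which are exactly the statistics the standard UCT selection rule consumes. A minimal sketch (the helper name and exploration constant are illustrative, not taken from mcts_demo.py):

import math


def uct_score(child, parent_visits, c=1.414):
    """UCT value of a child node: exploit its average rollout reward (q),
    explore children that have rarely been visited (n)."""
    if child.n == 0:
        return float("inf")  # expand unvisited children first
    return child.q + c * math.sqrt(math.log(parent_visits) / child.n)


# Selection step, assuming node.children is non-empty:
#     best_child = max(node.children, key=lambda ch: uct_score(ch, node.n))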
60 changes: 15 additions & 45 deletions model.py
@@ -5,14 +5,14 @@
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import SGD, Adam
from state_representation import get_input_rep
import output_representation as output_rep
from mnk import Board


class Model:
def __init__(self, mnk, lr=0.001, location=None):
def __init__(self, mnk, lr=0.001, location=None, model=None):
"""Tic-Tac-Toe Game Evaluator Model.
Provides a Convolutional Neural Network that can be trained to evaluate different
board states, determining which player has the advantage at any given state.
@@ -29,13 +29,19 @@ def __init__(self, mnk, lr=0.001, location=None):
self.model = self.retrieve(location)
return

opt = SGD(learning_rate=lr)
if model is not None:
self.model = model
return

opt = Adam(learning_rate=lr)

self.model = Sequential()
self.model.add(Flatten(input_shape=(m, n, 2)))
self.model.add(Dense(24, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(24, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(mnk[0] * mnk[1], kernel_initializer='normal', activation='tanh'))
self.model.add(Conv2D(filters=32, kernel_size=3, input_shape=(m, n, 2)))
self.model.add(Flatten())
self.model.add(Dense(54, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(54, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(54, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(mnk[0] * mnk[1], kernel_initializer='normal'))

self.model.compile(loss='mean_squared_error', optimizer=opt)

@@ -90,45 +96,9 @@ def action_values(self, board):

return self.model(get_input_rep(board.get_board()))

def get_target(self, state, action, next_state):
m, n, k = self.mnk

# TODO: Is this actually necessary? Might be wasteful
start_board = Board(*self.mnk, state=state)
next_board = Board(*self.mnk, state=next_state)

prev_output = self.action_values(start_board)

# OPT 1: If this line is used, illegal actions will be ignored.
target_output = np.copy(prev_output)

# OPT 2: If this is used, illegal actions will be trained to have action value -1.
# target_output = np.full(shape=prev_output.shape, fill_value=-1, dtype='float32')
#
# for move in start_board.legal_moves():
# index = move[0] * m + move[1]
# target_output[0][index] = prev_output[0][index]

target_output[0][action[0] * n + action[1]] = self.state_value(next_board, player=state[1])
return target_output

# Performs training on a single sample
def td_update(self, state, action, next_state):
"""Performs a temporal difference update of the model.
Args:
state: Board representing the previous state of the game.
action: Move played after previous state.
next_state: Next state of the game after action was taken.
"""
target_output = self.get_target(state, action, next_state)

lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)
self.model.fit(get_input_rep(state), target_output, batch_size=1, verbose=0, callbacks=[lr_scheduler])


def scheduler(epoch, lr):
if lr > 0.0005:
return lr * tf.math.exp(-0.00005)
if lr > 0.0001:
return lr * tf.math.exp(-0.0009)
else:
return lr
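
The removed td_update attached this scheduler through a Keras LearningRateScheduler callback; where the updated training code hooks it in is not visible in this diff, so the following is only a sketch of the usual wiring (the batch arrays are placeholders):

# Sketch only: assumes scheduler() above is importable from model.py and that
# `net` is the underlying keras Sequential; `states`/`targets` are placeholders.
import tensorflow as tf
from model import scheduler

lr_callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

# Each single-epoch fit() call multiplies the current learning rate by
# exp(-0.0009) until it reaches the 1e-4 floor defined in scheduler().
# net.fit(states, targets, verbose=0, callbacks=[lr_callback])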
6 changes: 4 additions & 2 deletions play.py
@@ -6,8 +6,10 @@

board = mnk.Board(3, 3, 3)

assert len(sys.argv) == 2, "Please specify which model you would like to play against (ex: python3 play.py models/PedrosModel).\n Tab complete works!"
model = model.Model((3, 3, 3), sys.argv[1])
#assert len(sys.argv) == 2, "Please specify which model you would like to play against (ex: python3 play.py models/PedrosModel).\n Tab complete works!"
#model = model.Model((3, 3, 3), sys.argv[1])

model = model.Model((3, 3, 3), "new_model")

print("\n\n" + str(board))
current_player = input("\nWho plays first (Me/AI)? ")
93 changes: 61 additions & 32 deletions plot.py
@@ -3,25 +3,33 @@
from agent import Agent
from utils import run_game
import random
import numpy as np
import os


class Diagnostics:
def __init__(self, run_length=100):
self.run_length = run_length
self.xo_outcomes = []
self.model_outcomes = []
self.xo_outcomes = [[], [], []]
self.model_outcomes = [[], [], []]
self.rewards = []
self.reward_totals = []
self.reward_deltas = []
self.gating_indices = []
self.index = 0

def update_diagnostics(self, outcome, player):
self.xo_outcomes.append(outcome)
def update_xo(self, x_outcome, o_outcome):
self.xo_outcomes[0].append(x_outcome)
self.xo_outcomes[1].append(o_outcome)
self.xo_outcomes[2].append(1 - x_outcome - o_outcome)

reward = outcome*player
self.model_outcomes.append(reward)
def update_outcome(self, train_outcome, hof_outcome):
self.model_outcomes[0].append(train_outcome)
self.model_outcomes[1].append(hof_outcome)
self.model_outcomes[2].append(1 - train_outcome - hof_outcome)

def update_reward(self, reward):
self.rewards.append(reward)
self.reward_totals.append(reward)
self.reward_deltas.append(reward)

@@ -30,11 +38,11 @@ def update_diagnostics(self, outcome, player):
self.reward_deltas[-1] += self.reward_deltas[-2]

if self.index >= self.run_length:
self.reward_totals[-1] -= self.model_outcomes[self.index - self.run_length]
self.reward_deltas[-1] -= 2 * self.model_outcomes[self.index - self.run_length]
self.reward_totals[-1] -= self.rewards[self.index - self.run_length]
self.reward_deltas[-1] -= 2 * self.rewards[self.index - self.run_length]

if self.index >= 2 * self.run_length:
self.reward_deltas[-1] += self.model_outcomes[self.index - 2 * self.run_length]
self.reward_deltas[-1] += self.rewards[self.index - 2 * self.run_length]

self.index += 1

@@ -45,7 +53,7 @@ def get_recent_performance(self):
if self.index == 0:
return 0, 0

return self.reward_totals[-1], self.reward_deltas[-1]
return self.reward_totals[-1] / self.run_length, self.reward_deltas[-1] / self.run_length


def plot_wins(outcomes, model_name, players):
@@ -90,28 +98,35 @@ def add_gating_markers(gating_indices):
def sample_histogram(sample_history, bins=100):
plt.hist(sample_history, bins)
plt.title("Sampling of Model Indices from HOF")
plt.show()


# 1v1 matrix for historical models: ideally, newer versions beating earlier ones
def winrate_matrix(mnk, num_games, step):
print("Calculating winrate matrix... (may take a few mins)")
matrix = []
for i in range (0, num_games, step):
matrix.append([])
for j in range (0, num_games, step):
print("Calculating winrate matrix... (may take a while)")
matrix = np.zeros((num_games // step, num_games // step))
for i in range(0, num_games, step):
for j in range(0, num_games, step):
model_i = Model(mnk, "menagerie/{}".format(i))
model_j = Model(mnk, "menagerie/{}".format(j))

side_i = [-1, 1][random.random() > 0.5]
side_i = 1
side_j = side_i * -1

value = run_game(Agent(model_i, side_i), Agent(model_j, side_j))[0]
matrix[-1].append(value)
matrix[i // step, j // step] = value

return matrix


def get_moving_avg(data, run_length=50):
arr = []
for i in range(len(data)):
avg = sum(data[max(0, i - run_length):i+1]) / min(run_length, (i + 1))
arr.append(avg)

return arr


def save_plots(mnk, hof, model_name, diagnostics):

# Create model's plots folder
@@ -120,40 +135,54 @@ def save_plots(mnk, hof, model_name, diagnostics):
os.makedirs(plots_dir)

# Graph and save each plot
plt.figure()
plot_wins(diagnostics.xo_outcomes, model_name, ['X', 'O'])
add_gating_markers(diagnostics.gating_indices)
plt.savefig("{}/XO.png".format(plots_dir))
plt.clf()

plot_wins(diagnostics.model_outcomes, model_name, ["Best", "HOF"])
add_gating_markers(diagnostics.gating_indices)
plt.savefig("{}/HOF.png".format(plots_dir))
plt.clf()

plt.plot(range(diagnostics.index), diagnostics.reward_totals)
plt.plot(range(diagnostics.index), np.array(diagnostics.reward_totals) / diagnostics.run_length)
add_gating_markers(diagnostics.gating_indices)
plt.title("{}: Reward for {} diagnostic games".format(model_name, diagnostics.index+1))
plt.xlabel("Game #")
plt.ylabel("Cumulative reward over previous {} games".format(diagnostics.run_length))
plt.savefig("{}/Reward.png".format(plots_dir))
plt.clf()

plt.plot(range(diagnostics.index), diagnostics.reward_deltas)
plt.plot(range(diagnostics.index), np.array(diagnostics.reward_deltas) / diagnostics.run_length)
add_gating_markers(diagnostics.gating_indices)
plt.title("{}: Cumulative reward derivative for {} diagnostic games".format(model_name, diagnostics.index+1))
plt.xlabel("Game #")
plt.ylabel("Difference in cumulative reward for previous two {} length runs".format(diagnostics.run_length))
plt.savefig("{}/Improvement.png".format(plots_dir))
plt.clf()

sample_histogram(hof.sample_history, 20)
sample_histogram(hof.sample_history, hof.pop_size if hof.pop_size < 40 else 20)
plt.savefig("{}/Sampling.png".format(plots_dir))
plt.clf()

num_games = diagnostics.index
plt.figure()
plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.xo_outcomes[0], run_length=diagnostics.run_length), label="X")
plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.xo_outcomes[1], run_length=diagnostics.run_length), label="O")
plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.xo_outcomes[2], run_length=diagnostics.run_length), label="Tie")
plt.legend()
plt.title("{}: XO wins for {} diagnostic games".format(model_name, diagnostics.index + 1))
plt.xlabel("Game #")
plt.ylabel("Proportion of wins averaged over previous {} games".format(diagnostics.run_length))
add_gating_markers(diagnostics.gating_indices)
plt.savefig("{}/XO.png".format(plots_dir))
plt.clf()

plt.figure()
plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.model_outcomes[0], run_length=diagnostics.run_length), label="Best")
plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.model_outcomes[1], run_length=diagnostics.run_length), label="HOF")
plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.model_outcomes[2], run_length=diagnostics.run_length), label="Tie")
plt.legend()
plt.title("{}: Model v Best wins for {} diagnostic games".format(model_name, diagnostics.index + 1))
plt.xlabel("Game #")
plt.ylabel("Proportion of wins averaged over previous {} games".format(diagnostics.run_length))
add_gating_markers(diagnostics.gating_indices)
plt.savefig("{}/HOF.png".format(plots_dir))
plt.clf()

step = max(1, hof.pop_size // 40)
matrix = winrate_matrix(mnk, hof.pop_size, step)
plt.imshow(matrix, cmap="bwr")
plt.imsave("plots/{}/Matrix.png".format(model_name), matrix, cmap="bwr")
plt.clf()
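
As a quick reference for the new get_moving_avg helper driving the win-rate plots above, a worked example on a toy reward sequence (the input values are made up):

# Assumes get_moving_avg is importable from plot.py as defined in this diff.
from plot import get_moving_avg

rewards = [1, 1, 0, 0, 1, 0]
print(get_moving_avg(rewards, run_length=2))
# -> [1.0, 1.0, 1.0, 0.5, 0.5, 0.5]
# (once i >= run_length the slice spans run_length + 1 points, hence 1.0 at index 2)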

10 changes: 6 additions & 4 deletions replay_buffer.py
@@ -6,12 +6,14 @@ def __init__(self, capacity, batch_size):
self.capacity = capacity
self.batch_size = batch_size
self.buffer = []
self.index = 0

def store(self, experience):
self.buffer.append(experience)

if len(self.buffer) > self.capacity:
del self.buffer[0]
if len(self.buffer) >= self.capacity:
self.buffer[self.index] = experience
self.index = (self.index + 1) % self.capacity
else:
self.buffer.append(experience)

def sample(self):
if len(self.buffer) < self.batch_size:
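The store() logic above replaces the old delete-from-front behaviour with a ring buffer. A self-contained sketch of that pattern, with a uniform sample() whose exact form is not visible in this diff (this is not the repository's class, just the pattern):

import random


class RingReplayBuffer:
    """Minimal ring-buffer sketch of the pattern used above."""

    def __init__(self, capacity, batch_size):
        self.capacity = capacity
        self.batch_size = batch_size
        self.buffer = []
        self.index = 0  # next slot to overwrite once the buffer is full

    def store(self, experience):
        if len(self.buffer) >= self.capacity:
            # Overwrite the oldest experience in place: O(1), no list shifting.
            self.buffer[self.index] = experience
            self.index = (self.index + 1) % self.capacity
        else:
            self.buffer.append(experience)

    def sample(self):
        # Uniform sample without replacement; assumes enough experiences stored.
        return random.sample(self.buffer, self.batch_size)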
2 changes: 1 addition & 1 deletion state_representation.py
@@ -32,4 +32,4 @@ def get_input_rep(board, form="multiplanar-turnflipped"):
board_planes[i][j][0] = 1
elif board[i][j] == -1 * player:
board_planes[i][j][1] = 1
return np.copy(board_planes.reshape(1, m, n, 2))
return np.copy(np.expand_dims(board_planes, axis=0))
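
For the one-line change above, a quick equivalence check (shapes chosen arbitrarily): reshape(1, m, n, 2) and np.expand_dims(..., axis=0) produce the same array, but the latter does not need m and n in scope.

import numpy as np

m, n = 3, 3
board_planes = np.zeros((m, n, 2))
a = board_planes.reshape(1, m, n, 2)      # old form
b = np.expand_dims(board_planes, axis=0)  # new form
assert a.shape == b.shape == (1, m, n, 2)
assert np.array_equal(a, b)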