Training on batches from replay
fshcat committed Apr 8, 2022
1 parent a0d1e9e commit 0fd653e
Showing 4 changed files with 89 additions and 33 deletions.
35 changes: 18 additions & 17 deletions model.py
@@ -30,11 +30,9 @@ def __init__(self, mnk, location=None):
opt = SGD(learning_rate=0.02, momentum=0.0)

self.model = Sequential()
self.model.add(Conv2D(8, 3, activation='relu', padding="same", input_shape=(m, n, 2)))
self.model.add(Conv2D(8, 3, activation='relu', padding="same"))

self.model.add(Flatten())
self.model.add(Dense(8, kernel_initializer='normal', activation='relu', input_shape=(1, m * n * 2)))
self.model.add(Flatten(input_shape=(m, n, 2)))
self.model.add(Dense(16, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(16, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(mnk[0] * mnk[1], kernel_initializer='normal', activation='tanh'))

self.model.compile(loss='mean_squared_error', optimizer=opt)
@@ -114,19 +112,9 @@ def scheduler(self, epoch, lr):
else:
return 0.001

def td_update(self, state, action, next_state):
"""Performs a temporal difference update of the model.
Args:
board (Board): Board representing the current state of the game.
greedy_move ((int, int)): Move to be played. Defaults to None.
terminal (bool, optional): True if the current state of the game is terminal,
False otherwise. Defaults to False.
"""
def get_target(self, state, action, next_state):
m, n, k = self.mnk

callback = tf.keras.callbacks.LearningRateScheduler(self.scheduler)

start_board = Board(*self.mnk, state=state)
next_board = Board(*self.mnk, state=next_state)

@@ -138,5 +126,18 @@ def td_update(self, state, action, next_state):
target_output[0][index] = prev_output[0][index]

target_output[0][action[0] * m + action[1]] = self.state_value(next_board, player=state[1])
return target_output

def td_update(self, state, action, next_state):
"""Performs a temporal difference update of the model.
Args:
board (Board): Board representing the current state of the game.
greedy_move ((int, int)): Move to be played. Defaults to None.
terminal (bool, optional): True if the current state of the game is terminal,
False otherwise. Defaults to False.
"""
callback = tf.keras.callbacks.LearningRateScheduler(self.scheduler)
target_output = self.get_target(state, action, next_state)

self.model.fit(get_input_rep(start_board.get_board()), target_output, batch_size=1, verbose=0, callbacks=[callback])
self.model.fit(get_input_rep(state), target_output, batch_size=1, verbose=0, callbacks=[callback])
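
The refactor above splits the old td_update into get_target (which builds the training target) plus a small fit call, so the same target construction can be reused for replay batches in train.py. A minimal numpy sketch of that target construction, with a hypothetical td_target() standing in for the repo's Board/Model machinery:

import numpy as np

# Hypothetical standalone version of Model.get_target(): keep the model's
# current predictions everywhere except the cell that was played, which is
# replaced by the estimated value of the resulting state (the TD(0) target).
def td_target(prev_output, action, next_state_value, m):
    target = prev_output.copy()
    target[0][action[0] * m + action[1]] = next_state_value
    return target

prev = np.zeros((1, 9))                   # e.g. a 3x3 board, all predictions 0
print(td_target(prev, (1, 1), 0.5, m=3))  # only the entry for move (1, 1) becomes 0.5
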
12 changes: 6 additions & 6 deletions plot.py
@@ -13,7 +13,7 @@ def plot_wins(outcomes, model_name, players):
player1_wins, player2_wins, ties = [], [], []
run_totals = [0, 0, 0]
num_games = len(outcomes)
run_length = max(num_games // 10 , 1)
run_length = max(num_games // 20, 1)

for i, outcome in enumerate(outcomes):
if i < run_length:
@@ -44,14 +44,14 @@ def sample_histogram(sample_history, bins=100):
plt.show()

# 1v1 matrix for historical models: ideally, newer versions beating earlier ones
def winrate_matrix(num_games, step):
def winrate_matrix(mnk, num_games, step):
print("Calculating winrate matrix... (may take a few mins)")
matrix = []
for i in range (0, num_games, step):
matrix.append([])
for j in range (0, num_games, step):
model_i = Model("menagerie/{}".format(i))
model_j = Model("menagerie/{}".format(j))
model_i = Model(mnk, "menagerie/{}".format(i))
model_j = Model(mnk, "menagerie/{}".format(j))

side_i = [-1, 1][random.random() > 0.5]
side_j = side_i * -1
@@ -62,7 +62,7 @@ def winrate_matrix(num_games, step):
return matrix


def save_plots(hof, model_name, winnersXO, winnersHOF):
def save_plots(mnk, hof, model_name, winnersXO, winnersHOF):

# Create model's plots folder
plots_dir = "plots/{}".format(model_name)
@@ -85,7 +85,7 @@ def save_plots(hof, model_name, winnersXO, winnersHOF):

num_games = len(winnersXO)
step = max(1, num_games // 20)
matrix = winrate_matrix(num_games, step)
matrix = winrate_matrix(mnk, num_games, step)
plt.imshow(matrix, cmap="bwr")
plt.imsave("plots/{}/Matrix.png".format(model_name), matrix, cmap="bwr")
plt.clf()
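
Because Model now needs the board dimensions at construction time, the plotting helpers take an mnk argument as well. A hedged usage sketch of the updated winrate_matrix (assumes a populated menagerie/ folder of saved hall-of-fame models; num_games here is just an illustrative value):

import matplotlib.pyplot as plt
from plot import winrate_matrix

mnk = (3, 3, 3)
num_games = 100                   # hypothetical number of training games played
step = max(1, num_games // 20)    # same spacing save_plots() uses

matrix = winrate_matrix(mnk, num_games, step)  # board dims now passed through to Model
plt.imshow(matrix, cmap="bwr")
plt.show()
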
23 changes: 23 additions & 0 deletions replay_buffer.py
@@ -0,0 +1,23 @@
import random


class ReplayBuffer:
def __init__(self, capacity, batch_size):
self.capacity = capacity
self.batch_size = batch_size
self.buffer = []

def store(self, experience):
self.buffer.append(experience)

if len(self.buffer) > self.capacity:
del self.buffer[0]

def sample(self):
if len(self.buffer) < self.batch_size:
return self.buffer

return random.sample(self.buffer, self.batch_size)



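
The buffer is a plain FIFO: store() evicts the oldest experience once capacity is exceeded, and sample() returns the whole buffer until it holds at least batch_size experiences, after which it returns a uniform random sample. A quick illustration (not part of the commit):

from replay_buffer import ReplayBuffer

buffer = ReplayBuffer(capacity=3, batch_size=2)

for step in range(5):
    # experiences are (state, action, next_state) triples in train.py;
    # simple tuples stand in for them here
    buffer.store(("state{}".format(step), "action{}".format(step), "next{}".format(step)))

print(buffer.buffer)     # only the 3 most recent experiences survive
print(buffer.sample())   # 2 of them, chosen uniformly at random
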
52 changes: 42 additions & 10 deletions train.py
@@ -1,10 +1,14 @@
import numpy as np

from mnk import Board
import random
import matplotlib.pyplot as plt
from agent import Agent
from model import Model
from plot import plot_wins, save_plots
from hof import HOF
from replay_buffer import ReplayBuffer
from state_representation import get_input_rep
from utils import run_game, arg_parser
from save_model import save_model
import sys
@@ -13,10 +17,26 @@

# Set cmd-line training arguments
verbose, mcts, model_name = arg_parser(sys.argv)
model_name = "new_model"
mnk = (3, 3, 3)


def run_training_game(agent_train, agent_versing, epsilon=0, mnk=(3, 3, 3), verbose=False):
def train_on_replays(model, batch):
states = []
target_outputs = []
for experience in batch:
target_outputs.append(model.get_target(*experience))
states.append(get_input_rep(experience[0])[0])

states = np.asarray(states)

target_outputs = np.asarray(target_outputs)

# There's a parameter for train_on_batch for sample weights. Use it if we do importance sampling
model.model.fit(states, target_outputs, verbose=0)


def run_training_game(agent_train, agent_versing, replay_buffer, epsilon=0, mnk=(3, 3, 3), verbose=False):
board = Board(*mnk, hist_length=-1)
game = []
state, action = None, None
@@ -30,6 +50,8 @@ def run_training_game(agent_train, agent_versing, epsilon=0, mnk=(3, 3, 3), verb

if state is not None and action is not None:
agent_train.model.td_update(state, action, board.get_board())
replay_buffer.store((state, action, board.get_board()))
train_on_replays(agent_train.model, replay_buffer.sample())

state, action = board.get_board(), move
board.move(*move)
@@ -51,43 +73,53 @@ def run_training_game(agent_train, agent_versing, epsilon=0, mnk=(3, 3, 3), verb

def main():
# Hyperparameter List
num_batches = 20 # Total training games = num_batches * games_per_batch
games_per_batch = 5
num_cycles = 8000 # Total training games = num_cycles * games_per_cycle
games_per_cycle = 5
batch_size = 16
buffer_size = 1000
epsilon = 0.2 # Epsilon is the exploration factor: probability with which a random move is chosen to play

hof_folder = "menagerie" # Folder to store the hall-of-fame models
hof = HOF(mnk, folder=hof_folder)

print("\nTraining model: {}\n".format(model_name))
model, winnersXO, winnersHOF, games = train(hof, num_batches, games_per_batch, epsilon, Model(mnk))
model, winnersXO, winnersHOF, games = train(hof, num_cycles, games_per_cycle, batch_size, epsilon, buffer_size, Model(mnk))

save_model(model, model_name)
save_plots(hof, model_name, winnersXO, winnersHOF)
save_plots(mnk, hof, model_name, winnersXO, winnersHOF)
clear_hof(hof_folder)

# Can be used after looking at plot to analyze important milestones
ind = 0 # Put into a function
while ind != -1:
ind = int(input("Query a game: "))

if ind >= len(games):
print("Too large. Try again")
continue

for move in games[ind]:
print(move)
pass


def train(hof, num_batches, games_per_batch, epsilon, model):
def train(hof, num_cycles, games_per_cycle, batch_size, epsilon, buffer_size, model):
winnersXO = []
winnersHOF = []
games = []

# Initialize hall of fame
hof.store(model)

# Initialize replay buffer
replay_buffer = ReplayBuffer(buffer_size, batch_size)

try:
for batch_number in range(num_batches):
print("Batch:", batch_number, "(Games {}-{})".format(batch_number * games_per_batch + 1, (batch_number + 1) * games_per_batch))
for batch_number in range(num_cycles):
print("Batch:", batch_number, "(Games {}-{})".format(batch_number * games_per_cycle + 1, (batch_number + 1) * games_per_cycle))

# Runs a batch of games, after which we can play/save a diagnostic game to see if it improved and store current model to hof
for game in range(games_per_batch):
for game in range(games_per_cycle):

# Randomly assign sides (X or O) for game to be played
side_best = [-1, 1][random.random() > 0.5]
@@ -100,7 +132,7 @@ def train(hof, num_batches, games_per_batch, epsilon, model):
agent_hof = Agent(model_hof, side_hof)

# Play game and train on its outcome
run_training_game(agent_best, agent_hof, epsilon, mnk)
run_training_game(agent_best, agent_hof, replay_buffer, epsilon, mnk)

# Gate will determine if model is worthy, and store in hof only if it is (Currently, it just stores every game)
hof.gate(model)

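
Putting the pieces together, each move in run_training_game now triggers three things: the existing online TD update, storing the transition in the replay buffer, and an extra fit over a sampled batch of past transitions. A condensed sketch of that per-move step (illustrative only; `model` and `board` follow the interfaces shown in the diff, and train_on_replays is the helper added to train.py above):

def training_step(model, replay_buffer, board, move, state, action):
    # Only train once we have a complete (state, action, next_state) transition
    if state is not None and action is not None:
        model.td_update(state, action, board.get_board())         # online TD(0) update
        replay_buffer.store((state, action, board.get_board()))   # remember the transition
        train_on_replays(model, replay_buffer.sample())           # batched replay update
    state, action = board.get_board(), move                       # slide the transition window
    board.move(*move)                                             # then actually play the move
    return state, action
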