Commit
Commented various functions and variables
fshcat committed Oct 11, 2022
1 parent 4ad968a commit 2dd420f
Showing 5 changed files with 84 additions and 70 deletions.
2 changes: 1 addition & 1 deletion mnk.py
@@ -90,7 +90,7 @@ def legal_moves(self):
def num_legal_moves(self):
return len(self.legal_moves())

# Reshapes board into the form needed for the model
# Returns tuple of board and player
def get_board(self):
return np.copy(self.board), self.player
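For orientation, a minimal usage sketch of get_board as it now reads; the Board constructor arguments are assumptions based on how Board is called elsewhere in this diff, not part of the commit:

board = Board(3, 3, 3)                  # an m,n,k board, mirroring Board(*mnk, ...) in train.py
grid, player = board.get_board()        # copy of the grid plus the player to move
grid[0][0] = player                     # editing the returned copy leaves the board's own state untouched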

44 changes: 22 additions & 22 deletions model.py
@@ -12,7 +12,7 @@


class Model:
def __init__(self, mnk, location=None):
def __init__(self, mnk, lr=0.001, location=None):
"""Tic-Tac-Toe Game Evaluator Model.
Provides a Convolutional Neural Network that can be trained to evaluate different
board states, determining which player has the advantage at any given state.
@@ -29,7 +29,7 @@ def __init__(self, mnk, location=None):
self.model = self.retrieve(location)
return

opt = SGD(learning_rate=0.01)
opt = SGD(learning_rate=lr)

self.model = Sequential()
self.model.add(Flatten(input_shape=(m, n, 2)))
@@ -39,7 +39,8 @@ def __init__(self, mnk, location=None):

self.model.compile(loss='mean_squared_error', optimizer=opt)
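As a reading aid for the new signature, a hedged construction sketch; the mnk tuple, learning rate, and path are illustrative values, not taken from the commit:

mnk = (3, 3, 3)
model = Model(mnk, lr=0.001)                          # builds and compiles a fresh network with SGD(learning_rate=0.001)
loaded = Model(mnk, location="models/example_run")    # hypothetical path; loads a saved model instead of building one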

def retrieve(self, location):
@staticmethod
def retrieve(location):
"""Retrieves keras model located at the given path and returns it.
Args:
@@ -59,14 +60,15 @@ def save_to(self, location):
self.model.save(location)

def state_value(self, board, player):
"""Evaluates the state of the board and returns the advantage of the current player.
Changes 1 to mean the supplied player is at advantage, -1 disadvantage.
"""Evaluates the state of the board and returns the advantage of the given player.
1 means the supplied player is at advantage, -1 disadvantage.
Args:
board (Board): Board object to be evaluated.
player: Player being used as point of reference.
Returns:
tf.Tensor(1,1): Value indicating the advantage of the current player.
tf.Tensor(shape=(1,1)): Value indicating the advantage of the current player.
"""

if board.who_won() != 2:
@@ -77,49 +79,47 @@ def state_value(self, board, player):
return max(legal_action_values.values())
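A short usage sketch of state_value under the reworded docstring; the board setup and player attribute are assumptions drawn from the surrounding code:

board = Board(3, 3, 3)
value = model.state_value(board, player=board.player)   # tf.Tensor of shape (1, 1), roughly in [-1, 1]
print(float(value))                                      # > 0 suggests the given player is ahead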

def action_values(self, board):
"""Evaluates the advantage that the current player would have if he makes a
given move on the board. Returns the value of taking a move from the given
board state. Changes 1 to mean the supplied player would be at advantage, -1
disadvantage.
"""Returns the vector of action values for all actions in the current board state. This includes
illegal actions that cannot be taken.
Args:
board (Board): Board object where to make the move.
move ((int, int)): (x, y) coordinates of the move to be played.
board (Board): Board object representing current state.
Returns:
tf.Tensor(1,1): Value indicating the advantage the player who made the move
would have after making the move.
tf.Tensor(shape=(m * n)): Vector where entry i indicates the value of taking move i from the current state.
"""

return self.model(get_input_rep(board.get_board()))
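For orientation, a sketch of how the flattened action vector maps back to board coordinates; the (1, m * n) shape and the x * n + y indexing convention are inferred from get_target below:

values = model.action_values(board)                # tf.Tensor of shape (1, m * n)
m, n, k = 3, 3, 3
move = (1, 2)
value_of_move = values[0][move[0] * n + move[1]]   # value of playing at (x=1, y=2)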

def get_target(self, state, action, next_state):
m, n, k = self.mnk

# TODO: Is this actually necessary? Might be wasteful
start_board = Board(*self.mnk, state=state)
next_board = Board(*self.mnk, state=next_state)

prev_output = self.action_values(start_board)
# test leaving illegal action values alone (np.copy(prev_output) rather than fill -1)

# OPT 1: If this line is used, illegal actions will be ignored.
target_output = np.copy(prev_output)

#target_output = np.full(shape=prev_output.shape, fill_value=-1, dtype='float32')
# OPT 2: If this is used, illegal actions will be trained to have action value -1.
# target_output = np.full(shape=prev_output.shape, fill_value=-1, dtype='float32')
#
#for move in start_board.legal_moves():
# for move in start_board.legal_moves():
# index = move[0] * m + move[1]
# target_output[0][index] = prev_output[0][index]

target_output[0][action[0] * n + action[1]] = self.state_value(next_board, player=state[1])
return target_output

# Performs training on a single sample
def td_update(self, state, action, next_state):
"""Performs a temporal difference update of the model.
Args:
board (Board): Board representing the current state of the game.
greedy_move ((int, int)): Move to be played. Defaults to None.
terminal (bool, optional): True if the current state of the game is terminal,
False otherwise. Defaults to False.
state: Board representing the previous state of the game.
action: Move played after previous state.
next_state: Next state of the game after action was taken.
"""
target_output = self.get_target(state, action, next_state)

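To complement the rewritten td_update docstring, a hedged usage sketch; the placeholder move mirrors how run_training_game calls this method later in the diff:

prev_state = board.get_board()           # (grid copy, player to move) captured before acting
action = (1, 2)                          # (x, y) of the move that was then played
# ... the move is applied to `board` here ...
model.td_update(prev_state, action, board.get_board())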
5 changes: 0 additions & 5 deletions save_model.py

This file was deleted.

97 changes: 55 additions & 42 deletions train.py
@@ -10,8 +10,7 @@
from hof import HOF
from replay_buffer import ReplayBuffer
from state_representation import get_input_rep
from utils import run_game, arg_parser
from save_model import save_model
from utils import run_game, arg_parser, save_model
import sys
import os
import shutil
@@ -25,16 +24,18 @@
def train_on_replays(model, batch):
states = []
target_outputs = []

# Experiences are tuples (state, action, state')
for experience in batch:
target_outputs.append(model.get_target(*experience))
states.append(get_input_rep(experience[0])[0])

states = np.asarray(states)

target_outputs = np.asarray(target_outputs)

# There's a parameter for sample weights. Use it if we do importance sampling
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

model.model.fit(states, target_outputs, verbose=0, callbacks=[lr_scheduler])
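For context, a hedged sketch of how train_on_replays is fed; the buffer API is taken from the call in run_training_game below, and the batch contents are assumptions:

# Each experience is a (state, action, next_state) tuple, where a state is the
# (grid, player) pair returned by Board.get_board().
batch = replay_buffer.sample()           # e.g. a list of such tuples
train_on_replays(model, batch)           # one fit() call over the whole batch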


@@ -51,7 +52,10 @@ def run_training_game(agent_train, agent_versing, replay_buffer, epsilon=0, mnk=
move = agent_train.action(board, epsilon)

if state is not None and action is not None:
# Trains on only the last action
agent_train.model.td_update(state, action, board.get_board())

# Adds last action to replay buffer and trains on a batch
replay_buffer.store((state, action, board.get_board()))
train_on_replays(agent_train.model, replay_buffer.sample())

@@ -72,43 +76,9 @@ def run_training_game(agent_train, agent_versing, replay_buffer, epsilon=0, mnk=
return winner, game
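For reference, a hedged sketch of how the training loop might invoke this function; the agent names and epsilon value are placeholders, not lifted from the commit:

winner, game = run_training_game(agent_best, agent_hof, replay_buffer, epsilon=0.1, mnk=mnk)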


def main():
# Hyperparameter List
total_games = 100000
diagnostic_freq = 20
resample_freq = 10
hof_gate_freq = 500
batch_size = 32
buffer_size = 4000
epsilon = 0.2 # probability with which a random move is chosen to play

hof_folder = "menagerie" # Folder to store the hall-of-fame models
hof = HOF(mnk, folder=hof_folder)

print("\nTraining model: {}\n".format(model_name))
model, diagnostics, games = train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon, buffer_size, Model(mnk))

save_model(model, model_name)
save_plots(mnk, hof, model_name, diagnostics)
clear_hof(hof_folder)

# Can be used after looking at plot to analyze important milestones
ind = 0 # Put into a function
while ind != -1:
ind = int(input("Query a game: "))

if ind >= len(games):
print("Too large. Try again")
continue

for move in games[ind]:
print(move)
pass


def train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon, buffer_size, model):
diagnostics = Diagnostics()
games = ["" for _ in range(total_games//diagnostic_freq * 2)]
games = ["" for _ in range(total_games // diagnostic_freq * 2)]

# Initialize hall of fame
hof.store(model)
@@ -118,6 +88,7 @@ def train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch

try:
for game in range(total_games):
# Regularly choose a new HOF opponent
if game % resample_freq == 0:
side_best = [-1, 1][random.random() > 0.5]
side_hof = side_best * -1
@@ -134,17 +105,20 @@ def train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch
side_hof *= -1
side_best = side_hof * -1

# Gate the model for HOF
# Regularly attempt to add the model into HOF
if game % hof_gate_freq == 0:
reward, improvement = diagnostics.get_recent_performance()

# Only add if reward is positive and improvement has plateaued
if reward > 0 and np.abs(improvement) < 10:
hof.gate(model)
diagnostics.add_gate_ind()

if game % diagnostic_freq == 0:
print("Game: ", game)

# Run a diagnostic (non-training, no exploration) game to collect data
# Run diagnostic (non-training, no exploration) games to collect data
# One game is played as player 1, one as player 2
diagnostic_winner, game_data = run_diagnostic(model, hof, 1)
games[game // diagnostic_freq * 2] = game_data
diagnostics.update_diagnostics(diagnostic_winner, 1)
@@ -162,23 +136,62 @@ def train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch
return model, diagnostics, games


# Runs a diagnostic (non-training, no exploration) game to collect data
def run_diagnostic(model, hof, side_model):
side_hof = side_model * -1

model_hof = hof.sample("uniform")
agent_model = Agent(model, side_model)
agent_hof = Agent(model_hof, side_hof)

# Run a diagnostic (non-training, no exploration) game to collect data
return run_game(agent_model, agent_hof, mnk=mnk, verbose=verbose)


# Deletes entries in HOF folder
def clear_hof(folder):
if os.path.isdir(folder):
try:
shutil.rmtree(folder)
except:
print("Error while clearing HOF folder.")
print("Error while clearing HOF folder (Specified folder not found).")


def main():
# Hyperparameter List
diagnostic_freq = 20 # How often to run diagnostic games
resample_freq = 10 # How often to choose a new HOF opponent
hof_gate_freq = 500 # How often to gate a new model into the HOF

total_games = 1000000 # Total num of training games
batch_size = 32 # Batch size for training
lr = 0.001 # Learning rate for SGD
buffer_size = 4000 # Num of moves to store in replay buffer
epsilon = 0.1 # Probability with which a random move is chosen to play

hof_folder = "menagerie" # Folder to store the hall-of-fame models
hof = HOF(mnk, folder=hof_folder)

print("\nTraining model: {}\n".format(model_name))
model, diagnostics, games = train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon, buffer_size, Model(mnk, lr=lr))

save_model(model, model_name)
save_plots(mnk, hof, model_name, diagnostics)
clear_hof(hof_folder)

# Can be used after looking at plot to analyze important milestones
# TODO: Put into a function
ind = 0
while ind != -1:
ind = int(input("Query a game: "))

if ind >= len(games):
print("Too large. Try again")
continue

for move in games[ind]:
print(move)
pass


if __name__ == "__main__":
main()
6 changes: 6 additions & 0 deletions utils.py
@@ -2,6 +2,11 @@
import datetime


def save_model(model, model_name):
print("Saving trained model to models/{}".format(model_name))
model.save_to('models/{}'.format(model_name))


def run_game(agent_train, agent_versing, mnk=(3, 3, 3), verbose=False):
board = Board(*mnk, hist_length=-1)
game = []
@@ -21,6 +26,7 @@ def run_game(agent_train, agent_versing, mnk=(3, 3, 3), verbose=False):

return board.who_won(), game
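A hedged usage sketch of run_game, using the Agent construction shown in run_diagnostic above; the sides and mnk values are illustrative:

agent_a = Agent(model, 1)                # Agent(model, side), as in train.py's run_diagnostic
agent_b = Agent(model_hof, -1)
winner, game = run_game(agent_a, agent_b, mnk=(3, 3, 3), verbose=True)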


def arg_parser(argv):
possible_arguments = ["-v", "-mcts"]

