Changed everything to use q-network and refactored
td_update calls were consolidated to train.py, run_training_game and
run_game were split off, created option to make Board object with board
array
fshcat committed Feb 26, 2022
1 parent 503b601 commit a47c42c
Showing 8 changed files with 115 additions and 105 deletions.
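For orientation, here is a minimal sketch of the new call flow, stitched together from the diffs below. The Agent(model, side) constructor and the side value 1 are assumptions taken from how train.py uses the class; everything else mirrors the new signatures introduced in this commit.

from mnk import Board
from model import Model
from agent import Agent

model = Model((3, 3, 3))            # Model now takes the (m, n, k) tuple
board = Board(3, 3, 3)
agent = Agent(model, 1)             # assumed: Agent(model, side), as used in train.py

state = board.get_board()           # (board array, player to move)
move = agent.action(board, epsilon=0.2)   # the agent no longer mutates the board
board.move(*move)

# td_update is now called from the training loop, not from inside Agent.action
model.td_update(state, move, board.get_board())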
19 changes: 7 additions & 12 deletions agent.py
@@ -3,6 +3,8 @@
 import tensorflow as tf
 import random
 
+import output_representation as output_rep
+
 
 class Agent:
 
@@ -14,34 +16,27 @@ def greedy_action(self, board):
         legal_moves = board.legal_moves()
         assert len(legal_moves) > 0, "No legal moves can be played."
 
-        best_move = legal_moves[0]
-        max_evaluation = -1
-
-        for move in legal_moves:
-            val = self.model.action_value(board, move)
-            if val > max_evaluation:
-                best_move = move
-                max_evaluation = val
+        action_value_vector = self.model.action_values(board)
+        legal_action_values = output_rep.get_legal_vals(board, action_value_vector)
+        best_move = max(legal_action_values, key=legal_action_values.get)
 
         return best_move
 
     def random_action(self, board):
         legal_moves = board.legal_moves()
         return legal_moves[random.randint(0, len(legal_moves) - 1)]
 
-    def action(self, board, training=False, epsilon=0):
+    def action(self, board, epsilon=0):
         legal_moves = board.legal_moves()
         assert len(legal_moves) > 0, "No legal moves can be played."
 
         greedy_move = self.greedy_action(board)
-        if training:
-            self.model.td_update(board, greedy_move)
+
         # Exploration
         if random.random() < epsilon:
             move = self.random_action(board)
         else:
             move = greedy_move
 
-        board.move(*move)
         return move

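The rewritten greedy_action picks the argmax with the dict idiom max(d, key=d.get); a tiny illustration with made-up Q-values:

legal_action_values = {(0, 0): 0.12, (1, 2): 0.87, (2, 1): -0.30}   # made-up values
best_move = max(legal_action_values, key=legal_action_values.get)   # key with the largest value
assert best_move == (1, 2)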
2 changes: 1 addition & 1 deletion hof.py
@@ -51,7 +51,7 @@ def sample(self, method='uniform'):
         self.sample_history.append(ind)
 
         name = self.hof[ind]
-        return Model("{}/{}".format(self.folder, name))
+        return Model(self.mnk, "{}/{}".format(self.folder, name))
 
     ''' === MOVED TO PLOT.PY LMK IF I CAN DELETE IT FROM HERE ===
     # Displays a histogram of the model iterations sampled from the hall of fame
18 changes: 12 additions & 6 deletions mnk.py
@@ -5,20 +5,26 @@
 
 
 class Board:
-    def __init__(self, m, n, k, form="flatten", hist_length=-1):
+    def __init__(self, m, n, k, hist_length=-1, state=None):
+        if state is None:
+            self.board = np.zeros((m, n), dtype=int)
+            self.player, self.opponent = 1, -1
+        else:
+            self.board, self.player = state
+            self.opponent = self.player * -1
+
         self.m = m
         self.n = n
         self.k = k
-        self.form = form
         self.hist_length = hist_length
-        self.board = np.zeros((m, n), dtype=int)
         self.empty = 0
-        self.player = 1
-        self.opponent = -1
         self.board_history = []
         self.undo_buffer = np.zeros((m, n), dtype=int)
         self.move_history = []
 
+    def shape(self):
+        return self.m, self.n
+
     def history(self):
         return self.board_history
 
@@ -86,7 +92,7 @@ def num_legal_moves(self):
 
     # Reshapes board into the form needed for the model
     def get_board(self):
-        return (self.board, self.player)
+        return self.board, self.player
 
     def game_ongoing(self):
         return not (self.player_has_lost() or (self.num_legal_moves() == 0))
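The new state argument is the "option to make Board object with board array" mentioned in the commit message. A short sketch: the example position is made up, and state matches the (board, player) pair returned by get_board().

import numpy as np
from mnk import Board

fresh = Board(3, 3, 3)                      # zero-filled board, X (1) to move

position = np.array([[1, -1, 0],
                     [0,  1, 0],
                     [0,  0, 0]], dtype=int)
resumed = Board(3, 3, 3, state=(position, -1))   # resume with O (-1) to move
assert resumed.player == -1 and resumed.opponent == 1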
105 changes: 36 additions & 69 deletions model.py
@@ -1,13 +1,16 @@
 import mnk
 import tensorflow as tf
+import numpy as np
 from keras.models import Sequential
 from keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
 from tensorflow.keras.optimizers import SGD
 from state_representation import get_input_rep
+import output_representation as output_rep
+from mnk import Board
 
 
 class Model:
-    def __init__(self, location=None):
+    def __init__(self, mnk, location=None):
         """Tic-Tac-Toe Game Evaluator Model.
         Provides a Convolutional Neural Network that can be trained to evaluate different
         board states, determining which player has the advantage at any given state.
@@ -16,6 +19,8 @@ def __init__(self, location=None):
             location (str, optional): Path to where the model is located. If none
             is provided a new model is initialized. Defaults to None.
         """
+        self.mnk = mnk
+        m, n, k = mnk
 
         # If a location is provided, retrieve the model stored at that location
         if location is not None:
@@ -25,10 +30,12 @@
             opt = SGD(learning_rate=0.02, momentum=0.0)
 
             self.model = Sequential()
-            self.model.add(Conv2D(48, 3, activation='relu', input_shape=(3, 3, 2)))
+            self.model.add(Conv2D(8, 3, activation='relu', padding="same", input_shape=(m, n, 2)))
+            self.model.add(Conv2D(8, 3, activation='relu', padding="same"))
+
             self.model.add(Flatten())
-            self.model.add(Dense(27, kernel_initializer='normal', activation='relu', input_shape=(1, 18)))
-            self.model.add(Dense(1, kernel_initializer='normal', activation='tanh'))
+            self.model.add(Dense(8, kernel_initializer='normal', activation='relu', input_shape=(1, m * n * 2)))
+            self.model.add(Dense(mnk[0] * mnk[1], kernel_initializer='normal', activation='tanh'))
 
             self.model.compile(loss='mean_squared_error', optimizer=opt)
@@ -51,47 +58,7 @@ def save_to(self, location):
         """
         self.model.save(location)
 
-    def raw_value(self, board):
-        """Evaluates the players advantage of given a board state.
-        Given board state, the model evaluates and it returns a value in the range
-        (-1, 1) indicating which player has the advantage at the current state.
-        Values closer to 1 mean "X" advantage, -1 means "O" advantage.
-        Args:
-            board (Board): Board object to be evaluated.
-        Returns:
-            tf.Tensor(1,1): Value indicating which player has the advantage according
-            to the model. (advantage): (-1) "O" <--...0...--> "X" (+1)
-        """
-        if board.who_won() != 2:
-            return tf.constant(board.who_won(), dtype="float32", shape=(1, 1))
-        else:
-            return board.player*self.model(get_input_rep(board.get_board()))
-
-    def raw_action_value(self, board, move):
-        """Evaluates the players advantage if a given move was made on the board.
-        Given a board state and a move to be played, the model evaluates the board that
-        results from this move and returns a value in the range (-1, 1) indicating which
-        player has the advantage after the move. Values closer to 1 mean "X" advantage,
-        -1 means "O" advantage.
-        Args:
-            board (Board): Board object where to make the move.
-            move ((int, int)): (x, y) coordinates of the move to be played.
-        Returns:
-            tf.Tensor(1,1): Value indicating which player has the advantage after the move
-            according to the model. (advantage): (-1) "O" <--...0...--> "X" (+1)
-        """
-        board.move(*move)
-        val = self.raw_value(board)
-        board.undo_move()
-
-        return val
-
-
-    def state_value(self, board):
+    def state_value(self, board, player):
         """Evaluates the state of the board and returns the advantage of the current player.
         Changes 1 to mean the supplied player is at advantage, -1 disadvantage.
@@ -101,17 +68,15 @@ def state_value(self, board):
         Returns:
             tf.Tensor(1,1): Value indicating the advantage of the current player.
         """
-        if board.who_won() == 0:
-            return tf.constant(0, dtype="float32", shape=(1, 1))
-        elif board.who_won() == board.player:
-            return tf.constant(1, dtype="float32", shape=(1, 1))
-        elif board.who_won() == -1*board.player:
-            return tf.constant(-1, dtype="float32", shape=(1, 1))
+        if board.who_won() != 2:
+            return tf.constant(player * board.who_won(), dtype="float32", shape=(1, 1))
         else:
-            return self.model(get_input_rep(board.get_board()))
+            action_value_vector = self.action_values(board)
+            legal_action_values = output_rep.get_legal_vals(board, action_value_vector)
+            return max(legal_action_values.values())
 
     #
-    def action_value(self, board, move):
+    def action_values(self, board):
         """Evaluates the advantage that the current player would have if he makes a
         given move on the board. Returns the value of taking a move from the given
         board state. Changes 1 to mean the supplied player would be at advantage, -1
@@ -125,10 +90,8 @@ def action_value(self, board, move):
             tf.Tensor(1,1): Value indicating the advantage the player who made the move
             would have after making the move.
         """
-        board.move(*move)
-        val = self.state_value(board)
-        board.undo_move()
-        return val
+
+        return self.model(get_input_rep(board.get_board()))
 
     def scheduler(self, epoch, lr):
         """Returns an epsilon value as a function of the current epoch.
@@ -151,8 +114,7 @@ def scheduler(self, epoch, lr):
         else:
             return 0.001
 
-
-    def td_update(self, board, greedy_move=None, terminal=False):
+    def td_update(self, state, action, next_state):
         """Performs a temporal difference update of the model.
 
@@ -161,15 +123,20 @@
             terminal (bool, optional): True if the current state of the game is terminal,
             False otherwise. Defaults to False.
         """
-        # Ensures td_update is possible (agent has experienced 2 states)
-        if len(board.history()) < 3:
-            return
+        m, n, k = self.mnk
 
         callback = tf.keras.callbacks.LearningRateScheduler(self.scheduler)
-        if terminal:
-            assert board.who_won() != 2
-            assert greedy_move is None
-            self.model.fit(get_input_rep(board.history()[-2]), self.state_value(board), batch_size=1, verbose=0, callbacks=[callback])
-        else:
-            self.model.fit(get_input_rep(board.history()[-2]), self.action_value(board, greedy_move), batch_size=1, verbose=0, callbacks=[callback])
+
+        start_board = Board(*self.mnk, state=state)
+        next_board = Board(*self.mnk, state=next_state)
+
+        prev_output = self.action_values(start_board)
+        target_output = np.zeros(shape=prev_output.shape, dtype='float32')
+
+        for move in start_board.legal_moves():
+            index = move[0] * m + move[1]
+            target_output[0][index] = prev_output[0][index]
+
+        target_output[0][action[0] * m + action[1]] = self.state_value(next_board, player=state[1])
+
+        self.model.fit(get_input_rep(start_board.get_board()), target_output, batch_size=1, verbose=0, callbacks=[callback])
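The new td_update fits a full target vector instead of a single scalar: squares that are illegal in the start state are pushed toward 0, legal-but-untaken squares keep the network's current predictions (so they contribute no error), and the entry for the action actually taken becomes the value of the resulting state. A standalone sketch of just that target construction, with made-up numbers and no Keras dependency:

import numpy as np

def td_target(prev_q, legal_moves, action, next_state_value, m):
    # Mirrors the target built in td_update above for a 1 x (m*n) output.
    target = np.zeros_like(prev_q)                    # illegal squares -> 0
    for move in legal_moves:
        idx = move[0] * m + move[1]
        target[0][idx] = prev_q[0][idx]               # untaken legal squares keep their prediction
    target[0][action[0] * m + action[1]] = next_state_value  # chosen square gets the backed-up value
    return target

prev_q = np.array([[0.1, -0.2, 0.15, 0.3, 0.05, -0.1, 0.2, 0.0, 0.4]], dtype='float32')
legal = [(0, 2), (1, 1), (2, 0)]                      # indices 2, 4 and 6 on a 3x3 board
target = td_target(prev_q, legal, action=(1, 1), next_state_value=0.9, m=3)
# Result: index 2 -> 0.15 and index 6 -> 0.2 (copied predictions), index 4 -> 0.9 (taken action), rest 0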
14 changes: 14 additions & 0 deletions output_representation.py
@@ -0,0 +1,14 @@
+import tensorflow as tf
+import numpy as np
+
+
+def get_legal_vals(board, q_value_vector):
+    move_dict = {}
+    q_value_array = np.array(q_value_vector)[0]
+
+    for move in board.legal_moves():
+        move_dict[move] = q_value_array[move[0] * board.m + move[1]]
+
+    return move_dict
+
+
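A usage sketch for the new helper, assuming a 3,3,3 Board and a hand-written array standing in for the network's 1 x (m*n) output; board.move(row, col) follows the call pattern used elsewhere in this commit.

import numpy as np
from mnk import Board
from output_representation import get_legal_vals

board = Board(3, 3, 3)
board.move(1, 1)                          # occupy the center so it leaves legal_moves()

q_values = np.arange(9, dtype='float32').reshape(1, 9) / 10.0   # stand-in for model output

legal_vals = get_legal_vals(board, q_values)
# Keys are (row, col) moves; (1, 1) is absent because that square is taken.
# e.g. legal_vals[(0, 2)] == q_values[0][0 * board.m + 2] == 0.2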
2 changes: 1 addition & 1 deletion save_model.py
@@ -2,4 +2,4 @@
 
 def save_model(model, model_name):
     print("Saving trained model to models/{}".format(model_name))
-    model.save_to('models/{}'.format(model_name))
\ No newline at end of file
+    model.save_to('models/{}'.format(model_name))
43 changes: 38 additions & 5 deletions train.py
@@ -13,17 +13,50 @@
 verbose, mcts, model_name = arg_parser(sys.argv)
 mnk = (3, 3, 3)
 
-def main():
+
+def run_training_game(agent_train, agent_versing, epsilon=0, mnk=(3, 3, 3), verbose=False):
+    board = Board(*mnk, hist_length=-1)
+    game = []
+    state, action = None, None
+
+    while board.game_ongoing():
+        # Select a move
+        if board.player == agent_versing.player:
+            board.move(*agent_versing.action(board))
+        else:
+            move = agent_train.action(board, epsilon)
+
+            if state is not None and action is not None:
+                agent_train.model.td_update(state, action, board.get_board())
+
+            state, action = board.get_board(), move
+            board.move(*move)
+
+        # Store game for later analysis
+        game.append(board.__str__())
+
+    winner = board.who_won()
+
+    # Back up the terminal state value to the last action chosen by training agent
+    if winner != agent_train.player:
+        agent_train.model.td_update(state, action, board.get_board())
+
+    if verbose:
+        print(board)
+
+    return winner, game
+
+
+def main():
     # Hyperparameter List
-    num_batches = 20_000 # Total training games = num_batches * games_per_batch
+    num_batches = 20 # Total training games = num_batches * games_per_batch
     games_per_batch = 5
     epsilon = 0.2 # Epsilon is the exploration factor: probability with which a random move is chosen to play
 
     hof = HOF(mnk, folder="menagerie")
 
     print("\nTraining model: {}\n".format(model_name))
-    model, winnersXO, winnersHOF, games = train(hof, num_batches, games_per_batch, epsilon, Model())
+    model, winnersXO, winnersHOF, games = train(hof, num_batches, games_per_batch, epsilon, Model(mnk))
 
     save_model(model, model_name)
     save_plots(hof, model_name, winnersXO, winnersHOF)
@@ -63,7 +96,7 @@ def train(hof, num_batches, games_per_batch, epsilon, model):
             agent_hof = Agent(model_hof, side_hof)
 
             # Play game and train on its outcome
-            run_game(agent_best, agent_hof, epsilon, training=True)
+            run_training_game(agent_best, agent_hof, epsilon, mnk)
 
             # Gate will determine if model is worthy, and store in hof only if it is (Currently, it just stores every game)
             hof.gate(model)
@@ -76,7 +109,7 @@ def train(hof, num_batches, games_per_batch, epsilon, model):
         agent_hof = Agent(model_hof, side_hof)
 
         # Run a diagnostic (non-training, no exploration) game to collect data
-        diagnostic_winner, game_data = run_game(agent_best, agent_hof, 0, training=False, mnk=mnk, verbose=verbose)
+        diagnostic_winner, game_data = run_game(agent_best, agent_hof, mnk=mnk, verbose=verbose)
 
         # Store data from diagnostic game for this batch
         games.append(game_data)
17 changes: 6 additions & 11 deletions utils.py
@@ -1,30 +1,25 @@
 from mnk import Board
 import datetime
 
-def run_game(agent_train, agent_versing, epsilon=0, training=False, mnk=(3, 3, 3), verbose=False):
-    board = Board(*mnk, form="multiplanar-turnflipped", hist_length=-1)
+
+def run_game(agent_train, agent_versing, mnk=(3, 3, 3), verbose=False):
+    board = Board(*mnk, hist_length=-1)
     game = []
 
     while board.game_ongoing():
         # Select a move
         if board.player == agent_versing.player:
-            agent_versing.action(board)
+            board.move(*agent_versing.action(board))
         else:
-            agent_train.action(board, training, epsilon)
+            board.move(*agent_train.action(board))
 
         # Store game for later analysis
         game.append(board.__str__())
 
-    winner = board.who_won()
-
-    # Back up the terminal state value to the last action chosen by training agent
-    if winner != agent_train.player and training:
-        agent_train.model.td_update(board, terminal=True)
-
     if verbose:
         print(board)
 
-    return winner, game
+    return board.who_won(), game
 
 def arg_parser(argv):
     possible_arguments = ["-v", "-mcts"]
