
Commit ad027ee
fshcat committed Feb 9, 2022
1 parent ad1eab4 commit ad027ee
Showing 6 changed files with 44 additions and 50 deletions.
agent.py (2 changes: 1 addition & 1 deletion)
@@ -34,7 +34,7 @@ def action(self, board, training=False, epsilon=0):
        assert len(legal_moves) > 0, "No legal moves can be played."

        greedy_move = self.greedy_action(board)
-        if training and len(board.history()) >= (2 + (self.player == -1)):
+        if training:
            self.model.td_update(board, greedy_move)

        # Exploration
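Taken together with the model.py change below, this hunk means the agent now requests a TD update on every training move; the guard that at least two states have been seen moves into Model.td_update itself. For orientation, a minimal sketch of the epsilon-greedy selection the surrounding method performs, assuming Board exposes a legal_moves() helper as the assert above implies (illustrative only, not the repository's exact code):

import random

def choose_move(agent, board, epsilon, training):
    # Illustrative epsilon-greedy selection: usually play the greedy move,
    # but with probability epsilon explore a random legal move while training.
    greedy_move = agent.greedy_action(board)
    if training and random.random() < epsilon:
        return random.choice(board.legal_moves())  # assumed Board helper
    return greedy_move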
hof.py (14 changes: 2 additions & 12 deletions)
@@ -6,6 +6,7 @@
from model import Model
from agent import Agent
from mnk import Board
+from utils import run_game


class HOF:
@@ -67,21 +68,10 @@ def winrate_matrix(self, iterations):
                model_i = Model("{}/{}".format(self.folder, self.hof[i]))
                model_j = Model("{}/{}".format(self.folder, self.hof[j]))

-                value = self.run_game(Agent(model_i, 1), Agent(model_j, -1))
+                value = run_game(Agent(model_i, 1), Agent(model_j, -1))
                matrix[-1].append(value)
        pyplot.imshow(matrix, cmap="bwr")

-    def run_game(self, agent1, agent2):
-        board = Board(*self.mnk, form="multiplanar-2", hist_length=-1)
-
-        while board.game_ongoing():
-            if board.player == agent1.player:
-                agent1.action(board)
-            else:
-                agent2.action(board)
-
-        return board.who_won()




mnk.py (7 changes: 3 additions & 4 deletions)
@@ -34,7 +34,7 @@ def del_history(self):
        if self.hist_length == -1 or len(self.board_history) < self.hist_length:
            self.board_history.pop()
        else:
-            for i in range(0,len(self.board_history)-1, -1):
+            for i in range(0, len(self.board_history)-1, -1):
                self.board_history[i+1] = self.board_history[i]
            self.board_history[0] = self.undo_buffer
        self.undo_buffer = np.zeros((self.m, self.n), dtype=int)
@@ -93,7 +93,7 @@ def get_board(self):
                    elif self.board[i][j] == -1:
                        board_planes[i][j][1] = 1
            return np.copy(board_planes.reshape(1, self.m, self.n, 2))
-        elif self.form == "multiplanar-2":
+        elif self.form == "multiplanar-turnflipped":
            board_planes = np.zeros((self.m, self.n, 2), dtype=int)
            for i in range(self.m):
                for j in range(self.n):
@@ -103,9 +103,8 @@ def get_board(self):
                        board_planes[i][j][1] = 1
            return np.copy(board_planes.reshape(1, self.m, self.n, 2))

-
    def game_ongoing(self):
-        return not ( self.player_has_lost() or (self.num_legal_moves() == 0) )
+        return not (self.player_has_lost() or (self.num_legal_moves() == 0))

    # Converting numbers to their respective game values
    @staticmethod
model.py (17 changes: 11 additions & 6 deletions)
@@ -4,21 +4,22 @@
from keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
from tensorflow.keras.optimizers import SGD

+
class Model:

-    def __init__(self, location=False):
+    def __init__(self, location=None):

        # If a location is provided, retrieve the model stored at that location
-        if location != False:
+        if location is not None:
            self.model = self.retrieve(location)
            return

        opt = SGD(learning_rate=0.02, momentum=0.0)

        self.model = Sequential()
-        self.model.add(Conv2D(48, 3, activation='relu', input_shape=(3,3,2)))
+        self.model.add(Conv2D(48, 3, activation='relu', input_shape=(3, 3, 2)))
        self.model.add(Flatten())
-        self.model.add(Dense(27, kernel_initializer='normal', activation='relu', input_shape=(1,18)))
+        self.model.add(Dense(27, kernel_initializer='normal', activation='relu', input_shape=(1, 18)))
        self.model.add(Dense(1, kernel_initializer='normal', activation='tanh'))

        self.model.compile(loss='mean_squared_error', optimizer=opt)
@@ -73,11 +74,15 @@ def scheduler(self, epoch, lr):
            return 0.001

    # Performs a temporal difference update of the model
-    def td_update(self, board, greedy_move=(), terminal=False):
+    def td_update(self, board, greedy_move=None, terminal=False):
+        # Ensures td_update is possible (agent has experienced 2 states)
+        if len(board.history()) < 3:
+            return
+
        callback = tf.keras.callbacks.LearningRateScheduler(self.scheduler)
        if terminal:
            assert board.who_won() != 2
-            assert greedy_move == ()
+            assert greedy_move is None
            self.model.fit(board.history()[-2], self.state_value(board), batch_size=1, verbose=0, callbacks=[callback])
        else:
            self.model.fit(board.history()[-2], self.action_value(board, greedy_move), batch_size=1, verbose=0, callbacks=[callback])
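For readers unfamiliar with the update above: it is a TD(0)-style backup. The network's output for the state the agent acted from (board.history()[-2]) is regressed toward a bootstrapped target, namely the terminal outcome once the game has ended and the predicted value after the greedy move otherwise. A self-contained sketch of that target selection, with plain numbers standing in for the repository's state_value/action_value calls (assumption: gamma = 1 and rewards only at the end of the game):

def td_target(value_after_greedy, terminal_outcome, terminal):
    # Terminal games regress toward the actual game result;
    # ongoing games bootstrap from the value predicted after the greedy move.
    return terminal_outcome if terminal else value_after_greedy

# Example: mid-game, the greedy successor is valued at 0.3, so the previous
# state's value is nudged toward 0.3; after a loss it is nudged toward -1.
print(td_target(0.3, None, terminal=False))   # 0.3
print(td_target(None, -1, terminal=True))     # -1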
train.py (30 changes: 3 additions & 27 deletions)
@@ -6,32 +6,10 @@
from model import Model
from plot import plot_wins
from hof import HOF
+from utils import run_game

mnk = (3, 3, 3)

-# Runs a game from start to end
-def run_game(agent_train, agent_versing, epsilon, training):
-    board = Board(*mnk, form="multiplanar-2", hist_length=-1)
-    game = []
-
-    while board.game_ongoing():
-        # Select a move
-        if board.player == agent_versing.player:
-            agent_versing.action(board)
-        else:
-            agent_train.action(board, training, epsilon)
-
-        # Store game for later analysis
-        game.append(board.__str__())
-
-    winner = board.who_won()
-
-    # Back up the terminal state value to the last action chosen by training agent
-    if winner != agent_train.player and training:
-        agent_train.model.td_update(board, terminal=True)
-
-    return winner, game


def train(hof, loops, loop_length, epsilon, model):
    end_states = []
@@ -43,7 +21,7 @@ def train(hof, loops, loop_length, epsilon, model):
    model_hof = hof.sample()

    for loop in range(loops):
-        print("\n loop: ",loop)
+        print("\n loop: ", loop)

        side_best = [-1, 1][random.random() > 0.5]
        side_hof = side_best * -1
@@ -71,14 +49,13 @@ def train(hof, loops, loop_length, epsilon, model):
        agent_hof = Agent(model_hof, side_hof)

        # Run a diagnostic (non-training, no exploration) game to collect data
-        diagnostic_winner, game_data = run_game(agent_best, agent_hof, 0, training=False)
+        diagnostic_winner, game_data = run_game(agent_best, agent_hof, 0, training=False, mnk=mnk)

        # Store data from loop
        games.append(game_data)
        end_states.append(diagnostic_winner)
        victories.append(diagnostic_winner*side_best)
-

    return model, end_states, victories, games


@@ -113,7 +90,6 @@ def train(hof, loops, loop_length, epsilon, model):
hof.winrate_matrix(150)
plt.show()
-

ind = 0
while ind != -1:
    ind = int(input("Query a game: "))
utils.py (24 changes: 24 additions & 0 deletions)
@@ -0,0 +1,24 @@
from mnk import Board


def run_game(agent_train, agent_versing, epsilon=0, training=False, mnk=(3, 3, 3)):
    board = Board(*mnk, form="multiplanar-turnflipped", hist_length=-1)
    game = []

    while board.game_ongoing():
        # Select a move
        if board.player == agent_versing.player:
            agent_versing.action(board)
        else:
            agent_train.action(board, training, epsilon)

        # Store game for later analysis
        game.append(board.__str__())

    winner = board.who_won()

    # Back up the terminal state value to the last action chosen by training agent
    if winner != agent_train.player and training:
        agent_train.model.td_update(board, terminal=True)

    return winner, game
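This new helper consolidates the two run_game copies deleted from hof.py and train.py above. A usage sketch matching the call sites elsewhere in this diff (the saved-model paths below are hypothetical; Model(location) loads a stored model and Agent pairs it with a side of +1 or -1):

from agent import Agent
from model import Model
from utils import run_game

agent_best = Agent(Model("models/best_model"), 1)   # hypothetical saved-model path
agent_hof = Agent(Model("models/old_model"), -1)    # hypothetical saved-model path

# Diagnostic game: no exploration, no training updates, default 3x3x3 board
winner, game_data = run_game(agent_best, agent_hof, epsilon=0, training=False)
print(winner)          # result reported by Board.who_won()
print(len(game_data))  # number of board positions recorded during the game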
