Commit

Fixed play.py, reorganized training
fshcat committed Apr 14, 2022
1 parent 0fd653e commit 38257d0
Showing 6 changed files with 61 additions and 53 deletions.
1 change: 0 additions & 1 deletion agent.py
@@ -1,5 +1,4 @@
import mnk
import keras.models
import tensorflow as tf
import random

24 changes: 14 additions & 10 deletions model.py
@@ -1,3 +1,5 @@
from tensorflow.keras.optimizers import Adam

import mnk
import tensorflow as tf
import numpy as np
@@ -27,12 +29,12 @@ def __init__(self, mnk, location=None):
self.model = self.retrieve(location)
return

opt = SGD(learning_rate=0.02, momentum=0.0)
opt = SGD(learning_rate=0.001)

self.model = Sequential()
self.model.add(Flatten(input_shape=(m, n, 2)))
self.model.add(Dense(16, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(16, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(24, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(24, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(mnk[0] * mnk[1], kernel_initializer='normal', activation='tanh'))

self.model.compile(loss='mean_squared_error', optimizer=opt)
@@ -119,13 +121,16 @@ def get_target(self, state, action, next_state):
next_board = Board(*self.mnk, state=next_state)

prev_output = self.action_values(start_board)
target_output = np.zeros(shape=prev_output.shape, dtype='float32')
# test leaving illegal action values alone (np.copy(prev_output) rather than fill -1)
target_output = np.copy(prev_output)

for move in start_board.legal_moves():
index = move[0] * m + move[1]
target_output[0][index] = prev_output[0][index]
#target_output = np.full(shape=prev_output.shape, fill_value=-1, dtype='float32')
#
#for move in start_board.legal_moves():
# index = move[0] * m + move[1]
# target_output[0][index] = prev_output[0][index]

target_output[0][action[0] * m + action[1]] = self.state_value(next_board, player=state[1])
target_output[0][action[0] * n + action[1]] = self.state_value(next_board, player=state[1])
return target_output

def td_update(self, state, action, next_state):
@@ -137,7 +142,6 @@ def td_update(self, state, action, next_state):
terminal (bool, optional): True if the current state of the game is terminal,
False otherwise. Defaults to False.
"""
callback = tf.keras.callbacks.LearningRateScheduler(self.scheduler)
target_output = self.get_target(state, action, next_state)

self.model.fit(get_input_rep(state), target_output, batch_size=1, verbose=0, callbacks=[callback])
self.model.fit(get_input_rep(state), target_output, batch_size=1, verbose=0)
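
For context on the get_target change above: copying prev_output (instead of zero-filling and copying back only the legal moves) leaves every target entry except the played move equal to the network's own prediction, so under the mean-squared-error loss those entries contribute no gradient and only the played move's Q-value is pulled toward the next-state value. A minimal numeric sketch of that effect (hypothetical values, not repository code):

import numpy as np

prev_output = np.array([[0.5, -0.25, 0.75, 0.0]], dtype="float32")  # pretend Q-values for 4 cells
target_output = np.copy(prev_output)
target_output[0][2] = 0.25  # only the played move's target is overwritten (with the next-state value)

# d/d(pred) of the squared error is 2 * (pred - target); untouched entries give exactly 0
grad = 2 * (prev_output - target_output)
print(grad)  # [[0. 0. 1. 0.]]
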
2 changes: 1 addition & 1 deletion output_representation.py
@@ -7,7 +7,7 @@ def get_legal_vals(board, q_value_vector):
q_value_array = np.array(q_value_vector)[0]

for move in board.legal_moves():
move_dict[move] = q_value_array[move[0] * board.m + move[1]]
move_dict[move] = q_value_array[move[0] * board.n + move[1]]

return move_dict

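The "* m" -> "* n" change here (and the matching one in model.py's get_target above) fixes the row-major flattening stride: on a board with m rows and n columns, cell (row, col) maps to flat index row * n + col. A quick sketch with a hypothetical 2x3 board (not repository code) showing why the old stride only worked when the board was square:

m, n = 2, 3  # 2 rows, 3 columns -> flat indices 0..5

good = [r * n + c for r in range(m) for c in range(n)]
bad = [r * m + c for r in range(m) for c in range(n)]

print(good)  # [0, 1, 2, 3, 4, 5] -- every cell gets a unique index
print(bad)   # [0, 1, 2, 2, 3, 4] -- index 2 collides and index 5 is never produced
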
6 changes: 3 additions & 3 deletions play.py
@@ -4,10 +4,10 @@
import model
import sys

board = mnk.Board(3, 3, 3, form="flatten")
board = mnk.Board(3, 3, 3)

assert len(sys.argv) == 2, "Please specify which model you would like to play against (ex: python3 play.py models/PedrosModel).\n Tab complete works!"
model = model.Model(sys.argv[1])
model = model.Model((3, 3, 3), sys.argv[1])

print("\n\n" + str(board))
current_player = input("\nWho plays first (Me/AI)? ")
@@ -29,7 +29,7 @@
print("Invalid move! Try again")
current_player = "AI"
else:
agent.action(board, False, 0)
board.move(*agent.action(board))
current_player = "Me"

print(board)
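
With this fix, the AI's turn applies the move returned by the agent instead of expecting agent.action to mutate the board, and the Model is now constructed from the (m, n, k) tuple plus the saved-model path. A minimal sketch of the corrected call shape, assuming Agent.action(board) returns a (row, col) tuple (the names come from the diff; the return shape and the stand-in classes below are assumptions for illustration only):

# Minimal stand-ins to illustrate the call shape; the real classes live in mnk.py and agent.py
class FakeBoard:
    def move(self, row, col):
        print("playing", (row, col))

class FakeAgent:
    def action(self, board):
        return (1, 2)  # assumed (row, col) return

board, agent = FakeBoard(), FakeAgent()
board.move(*agent.action(board))  # the corrected call from play.py
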
2 changes: 1 addition & 1 deletion plot.py
@@ -84,7 +84,7 @@ def save_plots(mnk, hof, model_name, winnersXO, winnersHOF):
plt.clf()

num_games = len(winnersXO)
step = max(1, num_games // 20)
step = max(1, num_games // 40)
matrix = winrate_matrix(mnk, num_games, step)
plt.imshow(matrix, cmap="bwr")
plt.imsave("plots/{}/Matrix.png".format(model_name), matrix, cmap="bwr")
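
The denominator change roughly doubles the resolution of the win-rate matrix: num_games // 40 gives about 40 sampled checkpoints per run instead of ~20, and the max(1, ...) floor keeps very short runs working. A quick check of the arithmetic (assuming winrate_matrix evaluates every step-th game, which the call suggests but the diff does not show):

for num_games in (25, 1000, 100000):
    step = max(1, num_games // 40)
    print(num_games, step, len(range(0, num_games, step)))
# 25 1 25        (short run: every game)
# 1000 25 40
# 100000 2500 40
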
79 changes: 42 additions & 37 deletions train.py
@@ -73,17 +73,19 @@ def run_training_game(agent_train, agent_versing, replay_buffer, epsilon=0, mnk=

def main():
# Hyperparameter List
num_cycles = 8000 # Total training games = num_cycles * games_per_cycle
games_per_cycle = 5
batch_size = 16
buffer_size = 1000
epsilon = 0.2 # Epsilon is the exploration factor: probability with which a random move is chosen to play
total_games = 100000
diagnostic_freq = 20
resample_freq = 10
hof_gate_freq = 1000
batch_size = 32
buffer_size = 4000
epsilon = 0.2 # probability with which a random move is chosen to play

hof_folder = "menagerie" # Folder to store the hall-of-fame models
hof = HOF(mnk, folder=hof_folder)

print("\nTraining model: {}\n".format(model_name))
model, winnersXO, winnersHOF, games = train(hof, num_cycles, games_per_cycle, batch_size, epsilon, buffer_size, Model(mnk))
model, winnersXO, winnersHOF, games = train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon, buffer_size, Model(mnk))

save_model(model, model_name)
save_plots(mnk, hof, model_name, winnersXO, winnersHOF)
@@ -103,10 +105,10 @@ def main():
pass


def train(hof, num_cycles, games_per_cycle, batch_size, epsilon, buffer_size, model):
winnersXO = []
winnersHOF = []
games = []
def train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon, buffer_size, model):
winnersXO = [0 for _ in range(total_games//diagnostic_freq)]
winnersHOF = [0 for _ in range(total_games//diagnostic_freq)]
games = ["" for _ in range(total_games//diagnostic_freq)]

# Initialize hall of fame
hof.store(model)
@@ -115,42 +117,45 @@ def train(hof, num_cycles, games_per_cycle, batch_size, epsilon, buffer_size, mo
replay_buffer = ReplayBuffer(buffer_size, batch_size)

try:
for batch_number in range(num_cycles):
print("Batch:", batch_number, "(Games {}-{})".format(batch_number * games_per_cycle + 1, (batch_number + 1) * games_per_cycle))

# Runs a batch of games, after which we can play/save a diagnostic game to see if it improved and store current model to hof
for game in range(games_per_cycle):

# Randomly assign sides (X or O) for game to be played
for game in range(total_games):
if game % resample_freq == 0:
side_best = [-1, 1][random.random() > 0.5]
side_hof = side_best * -1

model_hof = hof.sample("uniform")

# Initialize the agents
agent_best = Agent(model, side_best)
agent_hof = Agent(model_hof, side_hof)

# Play game and train on its outcome
run_training_game(agent_best, agent_hof, replay_buffer, epsilon, mnk)
# Initialize the agents
agent_best = Agent(model, side_best)
agent_hof = Agent(model_hof, side_hof)

# Gate will determine if model is worthy, and store in hof only if it is (Currently, it just stores every game)
hof.gate(model)
# Play game and train on its outcome
run_training_game(agent_best, agent_hof, replay_buffer, epsilon, mnk)

# Switch sides and resample hof so diagnostic is not biased towards last game played
# Switch sides for next game
side_hof *= -1
side_best *= -1
side_hof = side_best * -1
model_hof = hof.sample("uniform")
agent_best = Agent(model, side_best)
agent_hof = Agent(model_hof, side_hof)

# Run a diagnostic (non-training, no exploration) game to collect data
diagnostic_winner, game_data = run_game(agent_best, agent_hof, mnk=mnk, verbose=verbose)
# Gate the model for HOF
if game % hof_gate_freq == 0:
hof.gate(model)

if game % diagnostic_freq == 0:
print("Game: ", game)

# Resample hof so diagnostic is not biased towards last game played
temp_side_best = [-1, 1][random.random() > 0.5]
temp_side_hof = side_best * -1

temp_model_hof = hof.sample("uniform")
temp_agent_best = Agent(model, temp_side_best)
temp_agent_hof = Agent(temp_model_hof, temp_side_hof)

# Run a diagnostic (non-training, no exploration) game to collect data
diagnostic_winner, game_data = run_game(temp_agent_best, temp_agent_hof, mnk=mnk, verbose=verbose)

# Store data from diagnostic game for this batch
games.append(game_data)
winnersXO.append(diagnostic_winner) # X or O
winnersHOF.append(diagnostic_winner*side_best) # Best or HOF
# Store data from diagnostic game for this batch
games[game//diagnostic_freq] = game_data
winnersXO[game//diagnostic_freq] = diagnostic_winner # X or O
winnersHOF[game//diagnostic_freq] = diagnostic_winner*side_best # Best or HOF

except KeyboardInterrupt:
print("\n=======================")
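
The train.py reorganization replaces the nested cycle/batch loops with one flat game counter and modulo-based schedules (resample_freq for re-picking sides and the hall-of-fame opponent, hof_gate_freq for gating models into the hall of fame, diagnostic_freq for evaluation games), and the diagnostic arrays are preallocated so each diagnostic game writes slot game // diagnostic_freq instead of appending. A condensed sketch of the new control flow (simplified: agent construction, the replay buffer, and the actual game-playing calls shown in the diff are elided):

import random

def train_sketch(total_games, diagnostic_freq, resample_freq, hof_gate_freq):
    # One slot per diagnostic game, written in place rather than appended
    winnersXO = [0 for _ in range(total_games // diagnostic_freq)]
    winnersHOF = [0 for _ in range(total_games // diagnostic_freq)]

    for game in range(total_games):
        if game % resample_freq == 0:
            # Re-pick sides (and, in the real code, re-sample a hall-of-fame opponent)
            side_best = [-1, 1][random.random() > 0.5]
            side_hof = side_best * -1

        # ... play one training game between the current model and the sampled opponent ...

        if game % hof_gate_freq == 0:
            pass  # hof.gate(model) would decide whether to store the current model

        if game % diagnostic_freq == 0:
            # Play a non-training, no-exploration game and record who won
            diagnostic_winner = 0  # placeholder for run_game's result
            winnersXO[game // diagnostic_freq] = diagnostic_winner               # X or O
            winnersHOF[game // diagnostic_freq] = diagnostic_winner * side_best  # best model or HOF

    return winnersXO, winnersHOF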