From ad027eec2da5091e629e1e6fdeff87f07381dabc Mon Sep 17 00:00:00 2001
From: fshcat
Date: Tue, 8 Feb 2022 19:51:10 -0500
Subject: [PATCH] See previous commit

---
 agent.py |  2 +-
 hof.py   | 14 ++------------
 mnk.py   |  7 +++----
 model.py | 17 +++++++++++------
 train.py | 30 +++---------------------------
 utils.py | 24 ++++++++++++++++++++++++
 6 files changed, 44 insertions(+), 50 deletions(-)

diff --git a/agent.py b/agent.py
index 047b1ff..f384547 100644
--- a/agent.py
+++ b/agent.py
@@ -34,7 +34,7 @@ def action(self, board, training=False, epsilon=0):
         assert len(legal_moves) > 0, "No legal moves can be played."
 
         greedy_move = self.greedy_action(board)
-        if training and len(board.history()) >= (2 + (self.player == -1)):
+        if training:
             self.model.td_update(board, greedy_move)
 
         # Exploration
diff --git a/hof.py b/hof.py
index 7f8220d..dfee6a6 100644
--- a/hof.py
+++ b/hof.py
@@ -6,6 +6,7 @@
 from model import Model
 from agent import Agent
 from mnk import Board
+from utils import run_game
 
 
 class HOF:
@@ -67,21 +68,10 @@ def winrate_matrix(self, iterations):
                 model_i = Model("{}/{}".format(self.folder, self.hof[i]))
                 model_j = Model("{}/{}".format(self.folder, self.hof[j]))
 
-                value = self.run_game(Agent(model_i, 1), Agent(model_j, -1))
+                value = run_game(Agent(model_i, 1), Agent(model_j, -1))
                 matrix[-1].append(value)
 
         pyplot.imshow(matrix, cmap="bwr")
 
-    def run_game(self, agent1, agent2):
-        board = Board(*self.mnk, form="multiplanar-2", hist_length=-1)
-
-        while board.game_ongoing():
-            if board.player == agent1.player:
-                agent1.action(board)
-            else:
-                agent2.action(board)
-
-        return board.who_won()
-
 
 
diff --git a/mnk.py b/mnk.py
index e74c987..90107bf 100644
--- a/mnk.py
+++ b/mnk.py
@@ -34,7 +34,7 @@ def del_history(self):
         if self.hist_length == -1 or len(self.board_history) < self.hist_length:
             self.board_history.pop()
         else:
-            for i in range(0,len(self.board_history)-1, -1):
+            for i in range(0, len(self.board_history)-1, -1):
                 self.board_history[i+1] = self.board_history[i]
             self.board_history[0] = self.undo_buffer
             self.undo_buffer = np.zeros((self.m, self.n), dtype=int)
@@ -93,7 +93,7 @@ def get_board(self):
                     elif self.board[i][j] == -1:
                         board_planes[i][j][1] = 1
             return np.copy(board_planes.reshape(1, self.m, self.n, 2))
-        elif self.form == "multiplanar-2":
+        elif self.form == "multiplanar-turnflipped":
            board_planes = np.zeros((self.m, self.n, 2), dtype=int)
            for i in range(self.m):
                for j in range(self.n):
@@ -103,9 +103,8 @@ def get_board(self):
                         board_planes[i][j][1] = 1
             return np.copy(board_planes.reshape(1, self.m, self.n, 2))
 
-
     def game_ongoing(self):
-        return not ( self.player_has_lost() or (self.num_legal_moves() == 0) )
+        return not (self.player_has_lost() or (self.num_legal_moves() == 0))
 
     # Converting numbers to their respective game values
     @staticmethod
diff --git a/model.py b/model.py
index ca2be18..a85e7bf 100644
--- a/model.py
+++ b/model.py
@@ -4,21 +4,22 @@
 from keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
 from tensorflow.keras.optimizers import SGD
 
+
 class Model:
 
-    def __init__(self, location=False):
+    def __init__(self, location=None):
         # If a location is provided, retrieve the model stored at that location
-        if location != False:
+        if location is not None:
             self.model = self.retrieve(location)
             return
 
         opt = SGD(learning_rate=0.02, momentum=0.0)
 
         self.model = Sequential()
-        self.model.add(Conv2D(48, 3, activation='relu', input_shape=(3,3,2)))
+        self.model.add(Conv2D(48, 3, activation='relu', input_shape=(3, 3, 2)))
         self.model.add(Flatten())
-        self.model.add(Dense(27, kernel_initializer='normal', activation='relu', input_shape=(1,18)))
+        self.model.add(Dense(27, kernel_initializer='normal', activation='relu', input_shape=(1, 18)))
         self.model.add(Dense(1, kernel_initializer='normal', activation='tanh'))
 
         self.model.compile(loss='mean_squared_error', optimizer=opt)
@@ -73,11 +74,15 @@ def scheduler(self, epoch, lr):
             return 0.001
 
     # Performs a temporal difference update of the model
-    def td_update(self, board, greedy_move=(), terminal=False):
+    def td_update(self, board, greedy_move=None, terminal=False):
+        # Ensures td_update is possible (agent has experienced 2 states)
+        if len(board.history()) < 3:
+            return
+
         callback = tf.keras.callbacks.LearningRateScheduler(self.scheduler)
         if terminal:
             assert board.who_won() != 2
-            assert greedy_move == ()
+            assert greedy_move is None
             self.model.fit(board.history()[-2], self.state_value(board), batch_size=1, verbose=0, callbacks=[callback])
         else:
             self.model.fit(board.history()[-2], self.action_value(board, greedy_move), batch_size=1, verbose=0, callbacks=[callback])
diff --git a/train.py b/train.py
index 1fcc308..d2a74b6 100644
--- a/train.py
+++ b/train.py
@@ -6,32 +6,10 @@
 from model import Model
 from plot import plot_wins
 from hof import HOF
+from utils import run_game
 
 mnk = (3, 3, 3)
 
-# Runs a game from start to end
-def run_game(agent_train, agent_versing, epsilon, training):
-    board = Board(*mnk, form="multiplanar-2", hist_length=-1)
-    game = []
-
-    while board.game_ongoing():
-        # Select a move
-        if board.player == agent_versing.player:
-            agent_versing.action(board)
-        else:
-            agent_train.action(board, training, epsilon)
-
-        # Store game for later analysis
-        game.append(board.__str__())
-
-    winner = board.who_won()
-
-    # Back up the terminal state value to the last action chosen by training agent
-    if winner != agent_train.player and training:
-        agent_train.model.td_update(board, terminal=True)
-
-    return winner, game
-
 
 def train(hof, loops, loop_length, epsilon, model):
     end_states = []
@@ -43,7 +21,7 @@ def train(hof, loops, loop_length, epsilon, model):
     model_hof = hof.sample()
 
     for loop in range(loops):
-        print("\n loop: ",loop)
+        print("\n loop: ", loop)
 
         side_best = [-1, 1][random.random() > 0.5]
         side_hof = side_best * -1
@@ -71,14 +49,13 @@ def train(hof, loops, loop_length, epsilon, model):
         agent_hof = Agent(model_hof, side_hof)
 
         # Run a diagnostic (non-training, no exploration) game to collect data
-        diagnostic_winner, game_data = run_game(agent_best, agent_hof, 0, training=False)
+        diagnostic_winner, game_data = run_game(agent_best, agent_hof, 0, training=False, mnk=mnk)
 
         # Store data from loop
        games.append(game_data)
        end_states.append(diagnostic_winner)
        victories.append(diagnostic_winner*side_best)
 
-
    return model, end_states, victories, games
 
 
@@ -113,7 +90,6 @@ def train(hof, loops, loop_length, epsilon, model):
    hof.winrate_matrix(150)
    plt.show()
 
-
    ind = 0
    while ind != -1:
        ind = int(input("Query a game: "))
diff --git a/utils.py b/utils.py
index e69de29..31475a4 100644
--- a/utils.py
+++ b/utils.py
@@ -0,0 +1,24 @@
+from mnk import Board
+
+
+def run_game(agent_train, agent_versing, epsilon=0, training=False, mnk=(3, 3, 3)):
+    board = Board(*mnk, form="multiplanar-turnflipped", hist_length=-1)
+    game = []
+
+    while board.game_ongoing():
+        # Select a move
+        if board.player == agent_versing.player:
+            agent_versing.action(board)
+        else:
+            agent_train.action(board, training, epsilon)
+
+        # Store game for later analysis
+        game.append(board.__str__())
+
+    winner = board.who_won()
+
+    # Back up the terminal state value to the last action chosen by training agent
+    if winner != agent_train.player and training:
+        agent_train.model.td_update(board, terminal=True)
+
+    return winner, game
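
For reviewers, a minimal usage sketch (not part of the patch): after this change both hof.py and train.py call the shared run_game helper from utils.py instead of keeping private copies. The snippet below only uses constructors and the signature visible in this diff, and assumes a fresh, untrained Model on the default 3x3 board.

# Illustrative sketch only -- not applied by this patch.
from agent import Agent
from model import Model
from utils import run_game

model = Model()              # fresh network; Model(location) would load a saved one
agent_x = Agent(model, 1)    # plays as player 1
agent_o = Agent(model, -1)   # opponent, plays as player -1

# One non-training, zero-exploration game on the default 3x3 board (k=3).
# run_game returns the winner and the list of board states (as strings).
winner, game = run_game(agent_x, agent_o, epsilon=0, training=False, mnk=(3, 3, 3))
print(winner)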