Commit
Commented various functions and variables
fshcat committed Oct 11, 2022
1 parent 4ad968a commit 2dd420f
Showing 5 changed files with 84 additions and 70 deletions.
2 changes: 1 addition & 1 deletion mnk.py
@@ -90,7 +90,7 @@ def legal_moves(self):
def num_legal_moves(self):
return len(self.legal_moves())

# Reshapes board into the form needed for the model
# Returns tuple of board and player
def get_board(self):
return np.copy(self.board), self.player
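For orientation, a minimal usage sketch of get_board as it now reads; the Board constructor arguments are assumptions based on how Board is called elsewhere in this diff, not part of the commit:

board = Board(3, 3, 3)                  # an m,n,k board, mirroring Board(*mnk, ...) in train.py
grid, player = board.get_board()        # copy of the grid plus the player to move
grid[0][0] = player                     # editing the returned copy leaves the board's own state untouched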

44 changes: 22 additions & 22 deletions model.py
@@ -12,7 +12,7 @@


class Model:
def __init__(self, mnk, location=None):
def __init__(self, mnk, lr=0.001, location=None):
"""Tic-Tac-Toe Game Evaluator Model.
Provides a Convolutional Neural Network that can be trained to evaluate different
board states, determining which player has the advantage at any given state.
@@ -29,7 +29,7 @@ def __init__(self, mnk, location=None):
self.model = self.retrieve(location)
return

opt = SGD(learning_rate=0.01)
opt = SGD(learning_rate=lr)

self.model = Sequential()
self.model.add(Flatten(input_shape=(m, n, 2)))
@@ -39,7 +39,8 @@ def __init__(self, mnk, location=None):

self.model.compile(loss='mean_squared_error', optimizer=opt)
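As a reading aid for the new signature, a hedged construction sketch; the mnk tuple, learning rate, and path are illustrative values, not taken from the commit:

mnk = (3, 3, 3)
model = Model(mnk, lr=0.001)                          # builds and compiles a fresh network with SGD(learning_rate=0.001)
loaded = Model(mnk, location="models/example_run")    # hypothetical path; loads a saved model instead of building one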

def retrieve(self, location):
@staticmethod
def retrieve(location):
"""Retrieves keras model located at the given path and returns it.
Args:
@@ -59,14 +60,15 @@ def save_to(self, location):
self.model.save(location)

def state_value(self, board, player):
"""Evaluates the state of the board and returns the advantage of the current player.
Changes 1 to mean the supplied player is at advantage, -1 disadvantage.
"""Evaluates the state of the board and returns the advantage of the given player.
1 means the supplied player is at advantage, -1 disadvantage.
Args:
board (Board): Board object to be evaluated.
player: Player being used as point of reference.
Returns:
tf.Tensor(1,1): Value indicating the advantage of the current player.
tf.Tensor(shape=(1,1)): Value indicating the advantage of the current player.
"""

if board.who_won() != 2:
@@ -77,49 +79,47 @@ def state_value(self, board, player):
return max(legal_action_values.values())
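A short usage sketch of state_value under the reworded docstring; the board setup and player attribute are assumptions drawn from the surrounding code:

board = Board(3, 3, 3)
value = model.state_value(board, player=board.player)   # tf.Tensor of shape (1, 1), roughly in [-1, 1]
print(float(value))                                      # > 0 suggests the given player is ahead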

def action_values(self, board):
"""Evaluates the advantage that the current player would have if he makes a
given move on the board. Returns the value of taking a move from the given
board state. Changes 1 to mean the supplied player would be at advantage, -1
disadvantage.
"""Returns the vector of action values for all actions in the current board state. This includes
illegal actions that cannot be taken.
Args:
board (Board): Board object where to make the move.
move ((int, int)): (x, y) coordinates of the move to be played.
board (Board): Board object representing current state.
Returns:
tf.Tensor(1,1): Value indicating the advantage the player who made the move
would have after making the move.
tf.Tensor(shape=(m * n)): Vector where entry i indicates the value of taking move i from the current state.
"""

return self.model(get_input_rep(board.get_board()))
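For orientation, a sketch of how the flattened action vector maps back to board coordinates; the (1, m * n) shape and the x * n + y indexing convention are inferred from get_target below:

values = model.action_values(board)                # tf.Tensor of shape (1, m * n)
m, n, k = 3, 3, 3
move = (1, 2)
value_of_move = values[0][move[0] * n + move[1]]   # value of playing at (x=1, y=2)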

def get_target(self, state, action, next_state):
m, n, k = self.mnk

# TODO: Is this actually necessary? Might be wasteful
start_board = Board(*self.mnk, state=state)
next_board = Board(*self.mnk, state=next_state)

prev_output = self.action_values(start_board)
# test leaving illegal action values alone (np.copy(prev_output) rather than fill -1)

# OPT 1: If this line is used, illegal actions will be ignored.
target_output = np.copy(prev_output)

#target_output = np.full(shape=prev_output.shape, fill_value=-1, dtype='float32')
# OPT 2: If this is used, illegal actions will be trained to have action value -1.
# target_output = np.full(shape=prev_output.shape, fill_value=-1, dtype='float32')
#
#for move in start_board.legal_moves():
# for move in start_board.legal_moves():
# index = move[0] * m + move[1]
# target_output[0][index] = prev_output[0][index]

target_output[0][action[0] * n + action[1]] = self.state_value(next_board, player=state[1])
return target_output

# Performs training on a single sample
def td_update(self, state, action, next_state):
"""Performs a temporal difference update of the model.
Args:
board (Board): Board representing the current state of the game.
greedy_move ((int, int)): Move to be played. Defaults to None.
terminal (bool, optional): True if the current state of the game is terminal,
False otherwise. Defaults to False.
state: Board representing the previous state of the game.
action: Move played after previous state.
next_state: Next state of the game after action was taken.
"""
target_output = self.get_target(state, action, next_state)

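To complement the rewritten td_update docstring, a hedged usage sketch; the placeholder move mirrors how run_training_game calls this method later in the diff:

prev_state = board.get_board()           # (grid copy, player to move) captured before acting
action = (1, 2)                          # (x, y) of the move that was then played
# ... the move is applied to `board` here ...
model.td_update(prev_state, action, board.get_board())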
5 changes: 0 additions & 5 deletions save_model.py

This file was deleted.

97 changes: 55 additions & 42 deletions train.py
@@ -10,8 +10,7 @@
from hof import HOF
from replay_buffer import ReplayBuffer
from state_representation import get_input_rep
from utils import run_game, arg_parser
from save_model import save_model
from utils import run_game, arg_parser, save_model
import sys
import os
import shutil
@@ -25,16 +24,18 @@
def train_on_replays(model, batch):
states = []
target_outputs = []

# Experiences are tuples (state, action, state')
for experience in batch:
target_outputs.append(model.get_target(*experience))
states.append(get_input_rep(experience[0])[0])

states = np.asarray(states)

target_outputs = np.asarray(target_outputs)

# There's a parameter for sample weights. Use it if we do importance sampling
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

model.model.fit(states, target_outputs, verbose=0, callbacks=[lr_scheduler])
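For context, a hedged sketch of how train_on_replays is fed; the buffer API is taken from the call in run_training_game below, and the batch contents are assumptions:

# Each experience is a (state, action, next_state) tuple, where a state is the
# (grid, player) pair returned by Board.get_board().
batch = replay_buffer.sample()           # e.g. a list of such tuples
train_on_replays(model, batch)           # one fit() call over the whole batch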


@@ -51,7 +52,10 @@ def run_training_game(agent_train, agent_versing, replay_buffer, epsilon=0, mnk=
move = agent_train.action(board, epsilon)

if state is not None and action is not None:
# Trains on only the last action
agent_train.model.td_update(state, action, board.get_board())

# Adds last action to replay buffer and trains on a batch
replay_buffer.store((state, action, board.get_board()))
train_on_replays(agent_train.model, replay_buffer.sample())

@@ -72,43 +76,9 @@ def run_training_game(agent_train, agent_versing, replay_buffer, epsilon=0, mnk=
return winner, game
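For reference, a hedged sketch of how the training loop might invoke this function; the agent names and epsilon value are placeholders, not lifted from the commit:

winner, game = run_training_game(agent_best, agent_hof, replay_buffer, epsilon=0.1, mnk=mnk)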


def main():
# Hyperparameter List
total_games = 100000
diagnostic_freq = 20
resample_freq = 10
hof_gate_freq = 500
batch_size = 32
buffer_size = 4000
epsilon = 0.2 # probability with which a random move is chosen to play

hof_folder = "menagerie" # Folder to store the hall-of-fame models
hof = HOF(mnk, folder=hof_folder)

print("\nTraining model: {}\n".format(model_name))
model, diagnostics, games = train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon, buffer_size, Model(mnk))

save_model(model, model_name)
save_plots(mnk, hof, model_name, diagnostics)
clear_hof(hof_folder)

# Can be used after looking at plot to analyze important milestones
ind = 0 # Put into a function
while ind != -1:
ind = int(input("Query a game: "))

if ind >= len(games):
print("Too large. Try again")
continue

for move in games[ind]:
print(move)
pass


def train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon, buffer_size, model):
diagnostics = Diagnostics()
games = ["" for _ in range(total_games//diagnostic_freq * 2)]
games = ["" for _ in range(total_games // diagnostic_freq * 2)]

# Initialize hall of fame
hof.store(model)
@@ -118,6 +88,7 @@ def train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch

try:
for game in range(total_games):
# Regularly choose a new HOF opponent
if game % resample_freq == 0:
side_best = [-1, 1][random.random() > 0.5]
side_hof = side_best * -1
@@ -134,17 +105,20 @@ def train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch
side_hof *= -1
side_best = side_hof * -1

# Gate the model for HOF
# Regularly attempt to add the model into HOF
if game % hof_gate_freq == 0:
reward, improvement = diagnostics.get_recent_performance()

# Only add if reward is positive and improvement has plateaued
if reward > 0 and np.abs(improvement) < 10:
hof.gate(model)
diagnostics.add_gate_ind()

if game % diagnostic_freq == 0:
print("Game: ", game)

# Run a diagnostic (non-training, no exploration) game to collect data
# Run diagnostic (non-training, no exploration) games to collect data
# One game is played as player 1, one as player 2
diagnostic_winner, game_data = run_diagnostic(model, hof, 1)
games[game // diagnostic_freq * 2] = game_data
diagnostics.update_diagnostics(diagnostic_winner, 1)
@@ -162,23 +136,62 @@ def train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch
return model, diagnostics, games


# Runs a diagnostic (non-training, no exploration) game to collect data
def run_diagnostic(model, hof, side_model):
side_hof = side_model * -1

model_hof = hof.sample("uniform")
agent_model = Agent(model, side_model)
agent_hof = Agent(model_hof, side_hof)

# Run a diagnostic (non-training, no exploration) game to collect data
return run_game(agent_model, agent_hof, mnk=mnk, verbose=verbose)


# Deletes entries in HOF folder
def clear_hof(folder):
if os.path.isdir(folder):
try:
shutil.rmtree(folder)
except:
print("Error while clearing HOF folder.")
print("Error while clearing HOF folder (Specified folder not found).")


def main():
# Hyperparameter List
diagnostic_freq = 20 # How often to run diagnostic games
resample_freq = 10 # How often to choose a new HOF opponent
hof_gate_freq = 500 # How often to gate a new model into the HOF

total_games = 1000000 # Total num of training games
batch_size = 32 # Batch size for training
lr = 0.001 # Learning rate for SGD
buffer_size = 4000 # Num of moves to store in replay buffer
epsilon = 0.1 # Probability with which a random move is chosen to play

hof_folder = "menagerie" # Folder to store the hall-of-fame models
hof = HOF(mnk, folder=hof_folder)

print("\nTraining model: {}\n".format(model_name))
model, diagnostics, games = train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon, buffer_size, Model(mnk, lr=lr))

save_model(model, model_name)
save_plots(mnk, hof, model_name, diagnostics)
clear_hof(hof_folder)

# Can be used after looking at plot to analyze important milestones
# TODO: Put into a function
ind = 0
while ind != -1:
ind = int(input("Query a game: "))

if ind >= len(games):
print("Too large. Try again")
continue

for move in games[ind]:
print(move)
pass


if __name__ == "__main__":
main()
6 changes: 6 additions & 0 deletions utils.py
@@ -2,6 +2,11 @@
import datetime


def save_model(model, model_name):
print("Saving trained model to models/{}".format(model_name))
model.save_to('models/{}'.format(model_name))


def run_game(agent_train, agent_versing, mnk=(3, 3, 3), verbose=False):
board = Board(*mnk, hist_length=-1)
game = []
@@ -21,6 +26,7 @@ def run_game(agent_train, agent_versing, mnk=(3, 3, 3), verbose=False):

return board.who_won(), game
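A hedged usage sketch of run_game, using the Agent construction shown in run_diagnostic above; the sides and mnk values are illustrative:

agent_a = Agent(model, 1)                # Agent(model, side), as in train.py's run_diagnostic
agent_b = Agent(model_hof, -1)
winner, game = run_game(agent_a, agent_b, mnk=(3, 3, 3), verbose=True)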


def arg_parser(argv):
possible_arguments = ["-v", "-mcts"]

