Changed everything to use q-network and refactored
td_update calls were consolidated to train.py, run_training_game and
run_game were split off, created option to make Board object with board
array
fshcat committed Feb 26, 2022
1 parent 503b601 commit a47c42c
Showing 8 changed files with 115 additions and 105 deletions.
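For orientation, here is a minimal sketch of the new call flow, stitched together from the diffs below. The Agent(model, side) constructor and the side value 1 are assumptions taken from how train.py uses the class; everything else mirrors the new signatures introduced in this commit.

from mnk import Board
from model import Model
from agent import Agent

model = Model((3, 3, 3))            # Model now takes the (m, n, k) tuple
board = Board(3, 3, 3)
agent = Agent(model, 1)             # assumed: Agent(model, side), as used in train.py

state = board.get_board()           # (board array, player to move)
move = agent.action(board, epsilon=0.2)   # the agent no longer mutates the board
board.move(*move)

# td_update is now called from the training loop, not from inside Agent.action
model.td_update(state, move, board.get_board())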
19 changes: 7 additions & 12 deletions agent.py
@@ -3,6 +3,8 @@
 import tensorflow as tf
 import random
 
+import output_representation as output_rep
+
 
 class Agent:
 
@@ -14,34 +16,27 @@ def greedy_action(self, board):
         legal_moves = board.legal_moves()
         assert len(legal_moves) > 0, "No legal moves can be played."
 
-        best_move = legal_moves[0]
-        max_evaluation = -1
-
-        for move in legal_moves:
-            val = self.model.action_value(board, move)
-            if val > max_evaluation:
-                best_move = move
-                max_evaluation = val
+        action_value_vector = self.model.action_values(board)
+        legal_action_values = output_rep.get_legal_vals(board, action_value_vector)
+        best_move = max(legal_action_values, key=legal_action_values.get)
 
         return best_move
 
     def random_action(self, board):
         legal_moves = board.legal_moves()
         return legal_moves[random.randint(0, len(legal_moves) - 1)]
 
-    def action(self, board, training=False, epsilon=0):
+    def action(self, board, epsilon=0):
         legal_moves = board.legal_moves()
         assert len(legal_moves) > 0, "No legal moves can be played."
 
         greedy_move = self.greedy_action(board)
-        if training:
-            self.model.td_update(board, greedy_move)
+
         # Exploration
         if random.random() < epsilon:
             move = self.random_action(board)
         else:
             move = greedy_move
 
-        board.move(*move)
         return move

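The rewritten greedy_action picks the argmax with the dict idiom max(d, key=d.get); a tiny illustration with made-up Q-values:

legal_action_values = {(0, 0): 0.12, (1, 2): 0.87, (2, 1): -0.30}   # made-up values
best_move = max(legal_action_values, key=legal_action_values.get)   # key with the largest value
assert best_move == (1, 2)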
2 changes: 1 addition & 1 deletion hof.py
@@ -51,7 +51,7 @@ def sample(self, method='uniform'):
         self.sample_history.append(ind)
 
         name = self.hof[ind]
-        return Model("{}/{}".format(self.folder, name))
+        return Model(self.mnk, "{}/{}".format(self.folder, name))
 
     ''' === MOVED TO PLOT.PY LMK IF I CAN DELETE IT FROM HERE ===
     # Displays a histogram of the model iterations sampled from the hall of fame
18 changes: 12 additions & 6 deletions mnk.py
@@ -5,20 +5,26 @@
 
 
 class Board:
-    def __init__(self, m, n, k, form="flatten", hist_length=-1):
+    def __init__(self, m, n, k, hist_length=-1, state=None):
+        if state is None:
+            self.board = np.zeros((m, n), dtype=int)
+            self.player, self.opponent = 1, -1
+        else:
+            self.board, self.player = state
+            self.opponent = self.player * -1
+
         self.m = m
         self.n = n
         self.k = k
-        self.form = form
         self.hist_length = hist_length
-        self.board = np.zeros((m, n), dtype=int)
         self.empty = 0
-        self.player = 1
-        self.opponent = -1
         self.board_history = []
         self.undo_buffer = np.zeros((m, n), dtype=int)
         self.move_history = []
 
+    def shape(self):
+        return self.m, self.n
+
     def history(self):
         return self.board_history
 
@@ -86,7 +92,7 @@ def num_legal_moves(self):
 
     # Reshapes board into the form needed for the model
     def get_board(self):
-        return (self.board, self.player)
+        return self.board, self.player
 
     def game_ongoing(self):
         return not (self.player_has_lost() or (self.num_legal_moves() == 0))
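The new state argument is the "option to make Board object with board array" mentioned in the commit message. A short sketch: the example position is made up, and state matches the (board, player) pair returned by get_board().

import numpy as np
from mnk import Board

fresh = Board(3, 3, 3)                      # zero-filled board, X (1) to move

position = np.array([[1, -1, 0],
                     [0,  1, 0],
                     [0,  0, 0]], dtype=int)
resumed = Board(3, 3, 3, state=(position, -1))   # resume with O (-1) to move
assert resumed.player == -1 and resumed.opponent == 1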
105 changes: 36 additions & 69 deletions model.py
@@ -1,13 +1,16 @@
 import mnk
 import tensorflow as tf
+import numpy as np
 from keras.models import Sequential
 from keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
 from tensorflow.keras.optimizers import SGD
 from state_representation import get_input_rep
+import output_representation as output_rep
+from mnk import Board
 
 
 class Model:
-    def __init__(self, location=None):
+    def __init__(self, mnk, location=None):
         """Tic-Tac-Toe Game Evaluator Model.
         Provides a Convolutional Neural Network that can be trained to evaluate different
         board states, determining which player has the advantage at any given state.
@@ -16,6 +19,8 @@ def __init__(self, location=None):
             location (str, optional): Path to where the model is located. If none
             is provided a new model is initialized. Defaults to None.
         """
+        self.mnk = mnk
+        m, n, k = mnk
 
         # If a location is provided, retrieve the model stored at that location
         if location is not None:
@@ -25,10 +30,12 @@
             opt = SGD(learning_rate=0.02, momentum=0.0)
 
             self.model = Sequential()
-            self.model.add(Conv2D(48, 3, activation='relu', input_shape=(3, 3, 2)))
+            self.model.add(Conv2D(8, 3, activation='relu', padding="same", input_shape=(m, n, 2)))
+            self.model.add(Conv2D(8, 3, activation='relu', padding="same"))
+
             self.model.add(Flatten())
-            self.model.add(Dense(27, kernel_initializer='normal', activation='relu', input_shape=(1, 18)))
-            self.model.add(Dense(1, kernel_initializer='normal', activation='tanh'))
+            self.model.add(Dense(8, kernel_initializer='normal', activation='relu', input_shape=(1, m * n * 2)))
+            self.model.add(Dense(mnk[0] * mnk[1], kernel_initializer='normal', activation='tanh'))
 
             self.model.compile(loss='mean_squared_error', optimizer=opt)
@@ -51,47 +58,7 @@ def save_to(self, location):
         """
         self.model.save(location)
 
-    def raw_value(self, board):
-        """Evaluates the players advantage of given a board state.
-        Given board state, the model evaluates and it returns a value in the range
-        (-1, 1) indicating which player has the advantage at the current state.
-        Values closer to 1 mean "X" advantage, -1 means "O" advantage.
-        Args:
-            board (Board): Board object to be evaluated.
-        Returns:
-            tf.Tensor(1,1): Value indicating which player has the advantage according
-            to the model. (advantage): (-1) "O" <--...0...--> "X" (+1)
-        """
-        if board.who_won() != 2:
-            return tf.constant(board.who_won(), dtype="float32", shape=(1, 1))
-        else:
-            return board.player*self.model(get_input_rep(board.get_board()))
-
-    def raw_action_value(self, board, move):
-        """Evaluates the players advantage if a given move was made on the board.
-        Given a board state and a move to be played, the model evaluates the board that
-        results from this move and returns a value in the range (-1, 1) indicating which
-        player has the advantage after the move. Values closer to 1 mean "X" advantage,
-        -1 means "O" advantage.
-        Args:
-            board (Board): Board object where to make the move.
-            move ((int, int)): (x, y) coordinates of the move to be played.
-        Returns:
-            tf.Tensor(1,1): Value indicating which player has the advantage after the move
-            according to the model. (advantage): (-1) "O" <--...0...--> "X" (+1)
-        """
-        board.move(*move)
-        val = self.raw_value(board)
-        board.undo_move()
-
-        return val
-
-
-    def state_value(self, board):
+    def state_value(self, board, player):
         """Evaluates the state of the board and returns the advantage of the current player.
         Changes 1 to mean the supplied player is at advantage, -1 disadvantage.
@@ -101,17 +68,15 @@ def state_value(self, board):
         Returns:
             tf.Tensor(1,1): Value indicating the advantage of the current player.
         """
-        if board.who_won() == 0:
-            return tf.constant(0, dtype="float32", shape=(1, 1))
-        elif board.who_won() == board.player:
-            return tf.constant(1, dtype="float32", shape=(1, 1))
-        elif board.who_won() == -1*board.player:
-            return tf.constant(-1, dtype="float32", shape=(1, 1))
+        if board.who_won() != 2:
+            return tf.constant(player * board.who_won(), dtype="float32", shape=(1, 1))
         else:
-            return self.model(get_input_rep(board.get_board()))
+            action_value_vector = self.action_values(board)
+            legal_action_values = output_rep.get_legal_vals(board, action_value_vector)
+            return max(legal_action_values.values())
 
     #
-    def action_value(self, board, move):
+    def action_values(self, board):
         """Evaluates the advantage that the current player would have if he makes a
         given move on the board. Returns the value of taking a move from the given
         board state. Changes 1 to mean the supplied player would be at advantage, -1
@@ -125,10 +90,8 @@ def action_value(self, board, move):
             tf.Tensor(1,1): Value indicating the advantage the player who made the move
             would have after making the move.
         """
-        board.move(*move)
-        val = self.state_value(board)
-        board.undo_move()
-        return val
+
+        return self.model(get_input_rep(board.get_board()))
 
     def scheduler(self, epoch, lr):
         """Returns an epsilon value as a function of the current epoch.
@@ -151,8 +114,7 @@ def scheduler(self, epoch, lr):
         else:
             return 0.001
 
-
-    def td_update(self, board, greedy_move=None, terminal=False):
+    def td_update(self, state, action, next_state):
         """Performs a temporal difference update of the model.
 
@@ -161,15 +123,20 @@
             terminal (bool, optional): True if the current state of the game is terminal,
             False otherwise. Defaults to False.
         """
-        # Ensures td_update is possible (agent has experienced 2 states)
-        if len(board.history()) < 3:
-            return
+        m, n, k = self.mnk
 
         callback = tf.keras.callbacks.LearningRateScheduler(self.scheduler)
-        if terminal:
-            assert board.who_won() != 2
-            assert greedy_move is None
-            self.model.fit(get_input_rep(board.history()[-2]), self.state_value(board), batch_size=1, verbose=0, callbacks=[callback])
-        else:
-            self.model.fit(get_input_rep(board.history()[-2]), self.action_value(board, greedy_move), batch_size=1, verbose=0, callbacks=[callback])
+
+        start_board = Board(*self.mnk, state=state)
+        next_board = Board(*self.mnk, state=next_state)
+
+        prev_output = self.action_values(start_board)
+        target_output = np.zeros(shape=prev_output.shape, dtype='float32')
+
+        for move in start_board.legal_moves():
+            index = move[0] * m + move[1]
+            target_output[0][index] = prev_output[0][index]
+
+        target_output[0][action[0] * m + action[1]] = self.state_value(next_board, player=state[1])
+
+        self.model.fit(get_input_rep(start_board.get_board()), target_output, batch_size=1, verbose=0, callbacks=[callback])
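The new td_update fits a full target vector instead of a single scalar: squares that are illegal in the start state are pushed toward 0, legal-but-untaken squares keep the network's current predictions (so they contribute no error), and the entry for the action actually taken becomes the value of the resulting state. A standalone sketch of just that target construction, with made-up numbers and no Keras dependency:

import numpy as np

def td_target(prev_q, legal_moves, action, next_state_value, m):
    # Mirrors the target built in td_update above for a 1 x (m*n) output.
    target = np.zeros_like(prev_q)                    # illegal squares -> 0
    for move in legal_moves:
        idx = move[0] * m + move[1]
        target[0][idx] = prev_q[0][idx]               # untaken legal squares keep their prediction
    target[0][action[0] * m + action[1]] = next_state_value  # chosen square gets the backed-up value
    return target

prev_q = np.array([[0.1, -0.2, 0.15, 0.3, 0.05, -0.1, 0.2, 0.0, 0.4]], dtype='float32')
legal = [(0, 2), (1, 1), (2, 0)]                      # indices 2, 4 and 6 on a 3x3 board
target = td_target(prev_q, legal, action=(1, 1), next_state_value=0.9, m=3)
# Result: index 2 -> 0.15 and index 6 -> 0.2 (copied predictions), index 4 -> 0.9 (taken action), rest 0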
14 changes: 14 additions & 0 deletions output_representation.py
@@ -0,0 +1,14 @@
+import tensorflow as tf
+import numpy as np
+
+
+def get_legal_vals(board, q_value_vector):
+    move_dict = {}
+    q_value_array = np.array(q_value_vector)[0]
+
+    for move in board.legal_moves():
+        move_dict[move] = q_value_array[move[0] * board.m + move[1]]
+
+    return move_dict
+
+
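A usage sketch for the new helper, assuming a 3,3,3 Board and a hand-written array standing in for the network's 1 x (m*n) output; board.move(row, col) follows the call pattern used elsewhere in this commit.

import numpy as np
from mnk import Board
from output_representation import get_legal_vals

board = Board(3, 3, 3)
board.move(1, 1)                          # occupy the center so it leaves legal_moves()

q_values = np.arange(9, dtype='float32').reshape(1, 9) / 10.0   # stand-in for model output

legal_vals = get_legal_vals(board, q_values)
# Keys are (row, col) moves; (1, 1) is absent because that square is taken.
# e.g. legal_vals[(0, 2)] == q_values[0][0 * board.m + 2] == 0.2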
2 changes: 1 addition & 1 deletion save_model.py
@@ -2,4 +2,4 @@
 
 def save_model(model, model_name):
     print("Saving trained model to models/{}".format(model_name))
-    model.save_to('models/{}'.format(model_name))
\ No newline at end of file
+    model.save_to('models/{}'.format(model_name))
43 changes: 38 additions & 5 deletions train.py
@@ -13,17 +13,50 @@
 verbose, mcts, model_name = arg_parser(sys.argv)
 mnk = (3, 3, 3)
 
-def main():
+
+def run_training_game(agent_train, agent_versing, epsilon=0, mnk=(3, 3, 3), verbose=False):
+    board = Board(*mnk, hist_length=-1)
+    game = []
+    state, action = None, None
+
+    while board.game_ongoing():
+        # Select a move
+        if board.player == agent_versing.player:
+            board.move(*agent_versing.action(board))
+        else:
+            move = agent_train.action(board, epsilon)
+
+            if state is not None and action is not None:
+                agent_train.model.td_update(state, action, board.get_board())
+
+            state, action = board.get_board(), move
+            board.move(*move)
+
+        # Store game for later analysis
+        game.append(board.__str__())
+
+    winner = board.who_won()
+
+    # Back up the terminal state value to the last action chosen by training agent
+    if winner != agent_train.player:
+        agent_train.model.td_update(state, action, board.get_board())
+
+    if verbose:
+        print(board)
+
+    return winner, game
+
+
+def main():
     # Hyperparameter List
-    num_batches = 20_000 # Total training games = num_batches * games_per_batch
+    num_batches = 20 # Total training games = num_batches * games_per_batch
     games_per_batch = 5
     epsilon = 0.2 # Epsilon is the exploration factor: probability with which a random move is chosen to play
 
     hof = HOF(mnk, folder="menagerie")
 
     print("\nTraining model: {}\n".format(model_name))
-    model, winnersXO, winnersHOF, games = train(hof, num_batches, games_per_batch, epsilon, Model())
+    model, winnersXO, winnersHOF, games = train(hof, num_batches, games_per_batch, epsilon, Model(mnk))
 
     save_model(model, model_name)
     save_plots(hof, model_name, winnersXO, winnersHOF)
@@ -63,7 +96,7 @@ def train(hof, num_batches, games_per_batch, epsilon, model):
             agent_hof = Agent(model_hof, side_hof)
 
             # Play game and train on its outcome
-            run_game(agent_best, agent_hof, epsilon, training=True)
+            run_training_game(agent_best, agent_hof, epsilon, mnk)
 
             # Gate will determine if model is worthy, and store in hof only if it is (Currently, it just stores every game)
             hof.gate(model)
@@ -76,7 +109,7 @@ def train(hof, num_batches, games_per_batch, epsilon, model):
         agent_hof = Agent(model_hof, side_hof)
 
         # Run a diagnostic (non-training, no exploration) game to collect data
-        diagnostic_winner, game_data = run_game(agent_best, agent_hof, 0, training=False, mnk=mnk, verbose=verbose)
+        diagnostic_winner, game_data = run_game(agent_best, agent_hof, mnk=mnk, verbose=verbose)
 
         # Store data from diagnostic game for this batch
         games.append(game_data)
17 changes: 6 additions & 11 deletions utils.py
@@ -1,30 +1,25 @@
 from mnk import Board
 import datetime
 
-def run_game(agent_train, agent_versing, epsilon=0, training=False, mnk=(3, 3, 3), verbose=False):
-    board = Board(*mnk, form="multiplanar-turnflipped", hist_length=-1)
+
+def run_game(agent_train, agent_versing, mnk=(3, 3, 3), verbose=False):
+    board = Board(*mnk, hist_length=-1)
     game = []
 
     while board.game_ongoing():
         # Select a move
         if board.player == agent_versing.player:
-            agent_versing.action(board)
+            board.move(*agent_versing.action(board))
         else:
-            agent_train.action(board, training, epsilon)
+            board.move(*agent_train.action(board))
 
         # Store game for later analysis
         game.append(board.__str__())
 
-    winner = board.who_won()
-
-    # Back up the terminal state value to the last action chosen by training agent
-    if winner != agent_train.player and training:
-        agent_train.model.td_update(board, terminal=True)
-
     if verbose:
         print(board)
 
-    return winner, game
+    return board.who_won(), game
 
 def arg_parser(argv):
     possible_arguments = ["-v", "-mcts"]
