A whole bunch of changes
- Double Q-learning
- Lagging (target) model
- Fixed win evaluation bug
- Added option for n-step TD targets
- Adam optimizer
- Epsilon annealing
- Learning-rate annealing
fshcat committed Nov 3, 2022
1 parent 2dd420f commit ea08cc6
Showing 9 changed files with 263 additions and 161 deletions.
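
The training loop itself is not among the files shown below, so here is only a minimal sketch, under assumed names (n_step_target, annealed_epsilon, online_model and target_model are illustrative, not taken from this repository), of how the headline changes fit together: double Q-learning with a lagging target model, n-step TD targets, and epsilon annealing.

# Illustrative sketch only: these helpers do not appear in this commit.
import numpy as np


def n_step_target(rewards, next_q_online, next_q_target, legal_mask, gamma=0.99):
    """Double-Q n-step TD target: discounted sum of the n observed rewards,
    then bootstrap with the action chosen by the online network but
    evaluated by the lagging (target) network."""
    n = len(rewards)
    g = sum((gamma ** i) * r for i, r in enumerate(rewards))
    masked = np.where(legal_mask, next_q_online, -np.inf)  # ignore illegal moves
    best_action = int(np.argmax(masked))                   # online net selects...
    return g + (gamma ** n) * next_q_target[best_action]   # ...lagging net evaluates


def annealed_epsilon(step, eps_start=1.0, eps_end=0.05, decay_steps=50_000):
    """Linear epsilon annealing for the exploration schedule."""
    frac = min(1.0, step / decay_steps)
    return eps_start + frac * (eps_end - eps_start)


# The lagging model is typically refreshed by copying weights every K updates:
#     target_model.set_weights(online_model.get_weights())

Evaluating the bootstrap action on a lagging copy of the network is what keeps the targets stable; the Adam optimizer and learning-rate annealing changes are visible in model.py below.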
39 changes: 6 additions & 33 deletions hof.py
@@ -26,13 +26,8 @@ def store(self, model):
self.pop_size += 1
self.basel += 1 / self.pop_size**2

# Gating method decides whether to add the model to the hall of fame
def gate(self, model):
# Simple gating method, stores model after every training episode
self.store(model)

# Samples from the hall of fame with the provided method
def sample(self, method='uniform'):
def sample(self, method='limit-uniform', index=None):
if method == 'limit-uniform': # Performs poorly. Do not use.
threshold = random.random()*self.basel

@@ -48,32 +43,10 @@ def sample(self, method='uniform'):
elif method == 'naive':
ind = self.pop_size-1

self.sample_history.append(ind)
if index is not None:
name = self.hof[index]
else:
self.sample_history.append(ind)
name = self.hof[ind]

name = self.hof[ind]
return Model(self.mnk, "{}/{}".format(self.folder, name))

''' === MOVED TO PLOT.PY LMK IF I CAN DELETE IT FROM HERE ===
# Displays a histogram of the model iterations sampled from the hall of fame
def sample_histogram(self, num=100):
pyplot.hist(self.sample_history, num)
pyplot.title("Sampling of Model Indices from HOF")
pyplot.show()
'''

''' === MOVED TO PLOT.PY LMK IF I CAN DELETE IT FROM HERE ===
# Displays a winrate matrix of the historical policies for the given player
def winrate_matrix(self, iterations):
matrix = []
for i in range (0, self.pop_size, iterations):
matrix.append([])
for j in range (0, self.pop_size, iterations):
model_i = Model("{}/{}".format(self.folder, self.hof[i]))
model_j = Model("{}/{}".format(self.folder, self.hof[j]))
value = run_game(Agent(model_i, 1), Agent(model_j, -1))[0]
matrix[-1].append(value)
pyplot.imshow(matrix, cmap="bwr")
pyplot.imsave("plots/Matrix.png", matrix, cmap="bwr")
'''
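
Assuming the hall-of-fame class in this file keeps the signature shown above, a hedged usage sketch of the new optional index argument (the class name HOF and its constructor arguments are assumptions):

# Hypothetical usage; only the sample() keyword arguments come from the diff above.
from hof import HOF

hof = HOF((3, 3, 3), folder="menagerie")
opponent = hof.sample(method="uniform")  # sampled index is appended to sample_history
latest = hof.sample(method="naive")      # always the most recently stored model
specific = hof.sample(index=0)           # direct lookup by index; sample_history untouched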
4 changes: 2 additions & 2 deletions mcts_demo.py
@@ -164,8 +164,8 @@ def rollout(board):
class Node:
def __init__(self, last_move = None):
self.last_move = last_move
self.q = 0
self.n = 0
self.q = 0 # Average of rewards from rollouts
self.n = 0 # Number of times node has been visited
self.children = []
self.isLeaf = True

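The new comments describe q as the average rollout reward and n as the visit count, which are exactly the statistics the standard UCT selection rule consumes. A minimal sketch (the helper name and exploration constant are illustrative, not taken from mcts_demo.py):

import math


def uct_score(child, parent_visits, c=1.414):
    """UCT value of a child node: exploit its average rollout reward (q),
    explore children that have rarely been visited (n)."""
    if child.n == 0:
        return float("inf")  # expand unvisited children first
    return child.q + c * math.sqrt(math.log(parent_visits) / child.n)


# Selection step, assuming node.children is non-empty:
#     best_child = max(node.children, key=lambda ch: uct_score(ch, node.n))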
60 changes: 15 additions & 45 deletions model.py
@@ -5,14 +5,14 @@
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import SGD, Adam
from state_representation import get_input_rep
import output_representation as output_rep
from mnk import Board


class Model:
def __init__(self, mnk, lr=0.001, location=None):
def __init__(self, mnk, lr=0.001, location=None, model=None):
"""Tic-Tac-Toe Game Evaluator Model.
Provides a Convolutional Neural Network that can be trained to evaluate different
board states, determining which player has the advantage at any given state.
@@ -29,13 +29,19 @@ def __init__(self, mnk, lr=0.001, location=None):
self.model = self.retrieve(location)
return

opt = SGD(learning_rate=lr)
if model is not None:
self.model = model
return

opt = Adam(learning_rate=lr)

self.model = Sequential()
self.model.add(Flatten(input_shape=(m, n, 2)))
self.model.add(Dense(24, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(24, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(mnk[0] * mnk[1], kernel_initializer='normal', activation='tanh'))
self.model.add(Conv2D(filters=32, kernel_size=3, input_shape=(m, n, 2)))
self.model.add(Flatten())
self.model.add(Dense(54, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(54, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(54, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(mnk[0] * mnk[1], kernel_initializer='normal'))

self.model.compile(loss='mean_squared_error', optimizer=opt)

@@ -90,45 +96,9 @@ def action_values(self, board):

return self.model(get_input_rep(board.get_board()))

def get_target(self, state, action, next_state):
m, n, k = self.mnk

# TODO: Is this actually necessary? Might be wasteful
start_board = Board(*self.mnk, state=state)
next_board = Board(*self.mnk, state=next_state)

prev_output = self.action_values(start_board)

# OPT 1: If this line is used, illegal actions will be ignored.
target_output = np.copy(prev_output)

# OPT 2: If this is used, illegal actions will be trained to have action value -1.
# target_output = np.full(shape=prev_output.shape, fill_value=-1, dtype='float32')
#
# for move in start_board.legal_moves():
# index = move[0] * m + move[1]
# target_output[0][index] = prev_output[0][index]

target_output[0][action[0] * n + action[1]] = self.state_value(next_board, player=state[1])
return target_output

# Performs training on a single sample
def td_update(self, state, action, next_state):
"""Performs a temporal difference update of the model.
Args:
state: Board representing the previous state of the game.
action: Move played after previous state.
next_state: Next state of the game after action was taken.
"""
target_output = self.get_target(state, action, next_state)

lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)
self.model.fit(get_input_rep(state), target_output, batch_size=1, verbose=0, callbacks=[lr_scheduler])


def scheduler(epoch, lr):
if lr > 0.0005:
return lr * tf.math.exp(-0.00005)
if lr > 0.0001:
return lr * tf.math.exp(-0.0009)
else:
return lr
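
The removed td_update attached this scheduler through a Keras LearningRateScheduler callback; where the updated training code hooks it in is not visible in this diff, so the following is only a sketch of the usual wiring (the batch arrays are placeholders):

# Sketch only: assumes scheduler() above is importable from model.py and that
# `net` is the underlying keras Sequential; `states`/`targets` are placeholders.
import tensorflow as tf
from model import scheduler

lr_callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

# Each single-epoch fit() call multiplies the current learning rate by
# exp(-0.0009) until it reaches the 1e-4 floor defined in scheduler().
# net.fit(states, targets, verbose=0, callbacks=[lr_callback])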
6 changes: 4 additions & 2 deletions play.py
@@ -6,8 +6,10 @@

board = mnk.Board(3, 3, 3)

assert len(sys.argv) == 2, "Please specify which model you would like to play against (ex: python3 play.py models/PedrosModel).\n Tab complete works!"
model = model.Model((3, 3, 3), sys.argv[1])
#assert len(sys.argv) == 2, "Please specify which model you would like to play against (ex: python3 play.py models/PedrosModel).\n Tab complete works!"
#model = model.Model((3, 3, 3), sys.argv[1])

model = model.Model((3, 3, 3), "new_model")

print("\n\n" + str(board))
current_player = input("\nWho plays first (Me/AI)? ")
93 changes: 61 additions & 32 deletions plot.py
@@ -3,25 +3,33 @@
from agent import Agent
from utils import run_game
import random
import numpy as np
import os


class Diagnostics:
def __init__(self, run_length=100):
self.run_length = run_length
self.xo_outcomes = []
self.model_outcomes = []
self.xo_outcomes = [[], [], []]
self.model_outcomes = [[], [], []]
self.rewards = []
self.reward_totals = []
self.reward_deltas = []
self.gating_indices = []
self.index = 0

def update_diagnostics(self, outcome, player):
self.xo_outcomes.append(outcome)
def update_xo(self, x_outcome, o_outcome):
self.xo_outcomes[0].append(x_outcome)
self.xo_outcomes[1].append(o_outcome)
self.xo_outcomes[2].append(1 - x_outcome - o_outcome)

reward = outcome*player
self.model_outcomes.append(reward)
def update_outcome(self, train_outcome, hof_outcome):
self.model_outcomes[0].append(train_outcome)
self.model_outcomes[1].append(hof_outcome)
self.model_outcomes[2].append(1 - train_outcome - hof_outcome)

def update_reward(self, reward):
self.rewards.append(reward)
self.reward_totals.append(reward)
self.reward_deltas.append(reward)

@@ -30,11 +38,11 @@ def update_diagnostics(self, outcome, player):
self.reward_deltas[-1] += self.reward_deltas[-2]

if self.index >= self.run_length:
self.reward_totals[-1] -= self.model_outcomes[self.index - self.run_length]
self.reward_deltas[-1] -= 2 * self.model_outcomes[self.index - self.run_length]
self.reward_totals[-1] -= self.rewards[self.index - self.run_length]
self.reward_deltas[-1] -= 2 * self.rewards[self.index - self.run_length]

if self.index >= 2 * self.run_length:
self.reward_deltas[-1] += self.model_outcomes[self.index - 2 * self.run_length]
self.reward_deltas[-1] += self.rewards[self.index - 2 * self.run_length]

self.index += 1

@@ -45,7 +53,7 @@ def get_recent_performance(self):
if self.index == 0:
return 0, 0

return self.reward_totals[-1], self.reward_deltas[-1]
return self.reward_totals[-1] / self.run_length, self.reward_deltas[-1] / self.run_length


def plot_wins(outcomes, model_name, players):
@@ -90,28 +98,35 @@ def add_gating_markers(gating_indices):
def sample_histogram(sample_history, bins=100):
plt.hist(sample_history, bins)
plt.title("Sampling of Model Indices from HOF")
plt.show()


# 1v1 matrix for historical models: ideally, newer versions beating earlier ones
def winrate_matrix(mnk, num_games, step):
print("Calculating winrate matrix... (may take a few mins)")
matrix = []
for i in range (0, num_games, step):
matrix.append([])
for j in range (0, num_games, step):
print("Calculating winrate matrix... (may take a while)")
matrix = np.zeros((num_games // step, num_games // step))
for i in range(0, num_games, step):
for j in range(0, num_games, step):
model_i = Model(mnk, "menagerie/{}".format(i))
model_j = Model(mnk, "menagerie/{}".format(j))

side_i = [-1, 1][random.random() > 0.5]
side_i = 1
side_j = side_i * -1

value = run_game(Agent(model_i, side_i), Agent(model_j, side_j))[0]
matrix[-1].append(value)
matrix[i // step, j // step] = value

return matrix


def get_moving_avg(data, run_length=50):
arr = []
for i in range(len(data)):
avg = sum(data[max(0, i - run_length):i+1]) / min(run_length, (i + 1))
arr.append(avg)

return arr


def save_plots(mnk, hof, model_name, diagnostics):

# Create model's plots folder
@@ -120,40 +135,54 @@ def save_plots(mnk, hof, model_name, diagnostics):
os.makedirs(plots_dir)

# Graph and save each plot
plt.figure()
plot_wins(diagnostics.xo_outcomes, model_name, ['X', 'O'])
add_gating_markers(diagnostics.gating_indices)
plt.savefig("{}/XO.png".format(plots_dir))
plt.clf()

plot_wins(diagnostics.model_outcomes, model_name, ["Best", "HOF"])
add_gating_markers(diagnostics.gating_indices)
plt.savefig("{}/HOF.png".format(plots_dir))
plt.clf()

plt.plot(range(diagnostics.index), diagnostics.reward_totals)
plt.plot(range(diagnostics.index), np.array(diagnostics.reward_totals) / diagnostics.run_length)
add_gating_markers(diagnostics.gating_indices)
plt.title("{}: Reward for {} diagnostic games".format(model_name, diagnostics.index+1))
plt.xlabel("Game #")
plt.ylabel("Cumulative reward over previous {} games".format(diagnostics.run_length))
plt.savefig("{}/Reward.png".format(plots_dir))
plt.clf()

plt.plot(range(diagnostics.index), diagnostics.reward_deltas)
plt.plot(range(diagnostics.index), np.array(diagnostics.reward_deltas) / diagnostics.run_length)
add_gating_markers(diagnostics.gating_indices)
plt.title("{}: Cumulative reward derivative for {} diagnostic games".format(model_name, diagnostics.index+1))
plt.xlabel("Game #")
plt.ylabel("Difference in cumulative reward for previous two {} length runs".format(diagnostics.run_length))
plt.savefig("{}/Improvement.png".format(plots_dir))
plt.clf()

sample_histogram(hof.sample_history, 20)
sample_histogram(hof.sample_history, hof.pop_size if hof.pop_size < 40 else 20)
plt.savefig("{}/Sampling.png".format(plots_dir))
plt.clf()

num_games = diagnostics.index
plt.figure()
plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.xo_outcomes[0], run_length=diagnostics.run_length), label="X")
plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.xo_outcomes[1], run_length=diagnostics.run_length), label="O")
plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.xo_outcomes[2], run_length=diagnostics.run_length), label="Tie")
plt.legend()
plt.title("{}: XO wins for {} diagnostic games".format(model_name, diagnostics.index + 1))
plt.xlabel("Game #")
plt.ylabel("Proportion of wins averaged over previous {} games".format(diagnostics.run_length))
add_gating_markers(diagnostics.gating_indices)
plt.savefig("{}/XO.png".format(plots_dir))
plt.clf()

plt.figure()
plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.model_outcomes[0], run_length=diagnostics.run_length), label="Best")
plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.model_outcomes[1], run_length=diagnostics.run_length), label="HOF")
plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.model_outcomes[2], run_length=diagnostics.run_length), label="Tie")
plt.legend()
plt.title("{}: Model v Best wins for {} diagnostic games".format(model_name, diagnostics.index + 1))
plt.xlabel("Game #")
plt.ylabel("Proportion of wins averaged over previous {} games".format(diagnostics.run_length))
add_gating_markers(diagnostics.gating_indices)
plt.savefig("{}/HOF.png".format(plots_dir))
plt.clf()

step = max(1, hof.pop_size // 40)
matrix = winrate_matrix(mnk, hof.pop_size, step)
plt.imshow(matrix, cmap="bwr")
plt.imsave("plots/{}/Matrix.png".format(model_name), matrix, cmap="bwr")
plt.clf()
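
As a quick reference for the new get_moving_avg helper driving the win-rate plots above, a worked example on a toy reward sequence (the input values are made up):

# Assumes get_moving_avg is importable from plot.py as defined in this diff.
from plot import get_moving_avg

rewards = [1, 1, 0, 0, 1, 0]
print(get_moving_avg(rewards, run_length=2))
# -> [1.0, 1.0, 1.0, 0.5, 0.5, 0.5]
# (once i >= run_length the slice spans run_length + 1 points, hence 1.0 at index 2)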

10 changes: 6 additions & 4 deletions replay_buffer.py
@@ -6,12 +6,14 @@ def __init__(self, capacity, batch_size):
self.capacity = capacity
self.batch_size = batch_size
self.buffer = []
self.index = 0

def store(self, experience):
self.buffer.append(experience)

if len(self.buffer) > self.capacity:
del self.buffer[0]
if len(self.buffer) >= self.capacity:
self.buffer[self.index] = experience
self.index = (self.index + 1) % self.capacity
else:
self.buffer.append(experience)

def sample(self):
if len(self.buffer) < self.batch_size:
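The store() logic above replaces the old delete-from-front behaviour with a ring buffer. A self-contained sketch of that pattern, with a uniform sample() whose exact form is not visible in this diff (this is not the repository's class, just the pattern):

import random


class RingReplayBuffer:
    """Minimal ring-buffer sketch of the pattern used above."""

    def __init__(self, capacity, batch_size):
        self.capacity = capacity
        self.batch_size = batch_size
        self.buffer = []
        self.index = 0  # next slot to overwrite once the buffer is full

    def store(self, experience):
        if len(self.buffer) >= self.capacity:
            # Overwrite the oldest experience in place: O(1), no list shifting.
            self.buffer[self.index] = experience
            self.index = (self.index + 1) % self.capacity
        else:
            self.buffer.append(experience)

    def sample(self):
        # Uniform sample without replacement; assumes enough experiences stored.
        return random.sample(self.buffer, self.batch_size)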
2 changes: 1 addition & 1 deletion state_representation.py
@@ -32,4 +32,4 @@ def get_input_rep(board, form="multiplanar-turnflipped"):
board_planes[i][j][0] = 1
elif board[i][j] == -1 * player:
board_planes[i][j][1] = 1
return np.copy(board_planes.reshape(1, m, n, 2))
return np.copy(np.expand_dims(board_planes, axis=0))
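
For the one-line change above, a quick equivalence check (shapes chosen arbitrarily): reshape(1, m, n, 2) and np.expand_dims(..., axis=0) produce the same array, but the latter does not need m and n in scope.

import numpy as np

m, n = 3, 3
board_planes = np.zeros((m, n, 2))
a = board_planes.reshape(1, m, n, 2)      # old form
b = np.expand_dims(board_planes, axis=0)  # new form
assert a.shape == b.shape == (1, m, n, 2)
assert np.array_equal(a, b)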