Reorganized evaluation, added gating mechanism
Successful gating is plotted with red lines
fshcat committed Apr 19, 2022
1 parent 626a3f0 commit f29be17
Showing 4 changed files with 115 additions and 113 deletions.
2 changes: 1 addition & 1 deletion mnk.py
@@ -92,7 +92,7 @@ def num_legal_moves(self):

# Reshapes board into the form needed for the model
def get_board(self):
return self.board, self.player
return np.copy(self.board), self.player

def game_ongoing(self):
return not (self.player_has_lost() or (self.num_legal_moves() == 0))
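Returning np.copy(self.board) instead of the live array matters because stored transitions would otherwise alias the board and silently change as the game continues. A minimal illustration of the aliasing issue (toy arrays, not repository code):

import numpy as np

board = np.zeros((3, 3), dtype=np.int8)

aliased = board            # old behavior: the saved state is the live array
snapshot = np.copy(board)  # new behavior: an independent copy

board[0, 0] = 1            # a later move mutates the live board

print(aliased[0, 0])   # 1 -- the saved state changed along with the board
print(snapshot[0, 0])  # 0 -- the copy still reflects the state when it was saved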
33 changes: 10 additions & 23 deletions model.py
@@ -29,7 +29,7 @@ def __init__(self, mnk, location=None):
self.model = self.retrieve(location)
return

opt = SGD(learning_rate=0.001)
opt = SGD(learning_rate=0.01)

self.model = Sequential()
self.model.add(Flatten(input_shape=(m, n, 2)))
@@ -93,27 +93,6 @@ def action_values(self, board):

return self.model(get_input_rep(board.get_board()))

def scheduler(self, epoch, lr):
"""Returns an epsilon value as a function of the current epoch.
As a function of the epoch number, it returns a decreasing epsilon value
used in the Epsilon-Greedy Method.
Args:
epoch (int): Number of training epoch.
lr (???): ??? (Is this for the decay?)
Returns:
double: Epsilon value. Probability of choosing to explore.
"""
if epoch < 5000:
return 0.02
elif epoch < 15000:
return 0.01
elif epoch < 25000:
return 0.002
else:
return 0.001

def get_target(self, state, action, next_state):
m, n, k = self.mnk

@@ -144,4 +123,12 @@ def td_update(self, state, action, next_state):
"""
target_output = self.get_target(state, action, next_state)

self.model.fit(get_input_rep(state), target_output, batch_size=1, verbose=0)
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)
self.model.fit(get_input_rep(state), target_output, batch_size=1, verbose=0, callbacks=[lr_scheduler])


def scheduler(epoch, lr):
if lr > 0.0005:
return lr * tf.math.exp(-0.00005)
else:
return lr
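Since td_update and train_on_replays each call fit() for a single epoch, tf.keras.callbacks.LearningRateScheduler invokes the schedule once per update with the optimizer's current rate, so every update applies one multiplicative decay step. A standalone trace of that decay (numpy stands in for tf.math; the 0.01 starting rate and 0.0005 floor come from this commit):

import numpy as np

def scheduler(epoch, lr):
    # same rule as above: decay until the rate reaches the 0.0005 floor
    return lr * np.exp(-0.00005) if lr > 0.0005 else lr

lr = 0.01
for step in range(60000):
    lr = scheduler(0, lr)  # epoch index is always 0 for single-epoch fit() calls

# ln(0.01 / 0.0005) / 0.00005 ≈ 59,915 updates to reach the floor
print(round(lr, 6))  # ≈ 0.0005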
127 changes: 67 additions & 60 deletions plot.py
@@ -5,15 +5,57 @@
import random
import os

def plot_wins(outcomes, model_name, players):

class Diagnostics:
def __init__(self, run_length=100):
self.run_length = run_length
self.xo_outcomes = []
self.model_outcomes = []
self.reward_totals = []
self.reward_deltas = []
self.gating_indices = []
self.index = 0

def update_diagnostics(self, outcome, player):
self.xo_outcomes.append(outcome)

reward = outcome*player
self.model_outcomes.append(reward)

self.reward_totals.append(reward)
self.reward_deltas.append(reward)

if self.index > 0:
self.reward_totals[-1] += self.reward_totals[-2]
self.reward_deltas[-1] += self.reward_deltas[-2]

if self.index >= self.run_length:
self.reward_totals[-1] -= self.model_outcomes[self.index - self.run_length]
self.reward_deltas[-1] -= 2 * self.model_outcomes[self.index - self.run_length]

if self.index >= 2 * self.run_length:
self.reward_deltas[-1] += self.model_outcomes[self.index - 2 * self.run_length]

self.index += 1

def add_gate_ind(self):
self.gating_indices.append(self.index)

def get_recent_performance(self):
if self.index == 0:
return 0, 0

return self.reward_totals[-1], self.reward_deltas[-1]


def plot_wins(outcomes, model_name, players):
# We don't plot total wins for each player bc the graph would always increase, making performance evaluation harder.
# Instead, we plot runs: how many of the previous n games were won. This way, if a model begins performing worse, its line will decrease.

player1_wins, player2_wins, ties = [], [], []
run_totals = [0, 0, 0]
num_games = len(outcomes)
run_length = max(min(num_games // 100, 100), 1)
run_length = 100

for i, outcome in enumerate(outcomes):
if i < run_length:
@@ -38,64 +80,19 @@ def plot_wins(outcomes, model_name, players):
plt.ylabel("Wins out of previous {} games".format(run_length))


def plot_reward(outcomes, model_name):

# We don't plot total wins for each player bc the graph would always increase, making performance evaluation harder.
# Instead, we plot runs: how many of the previous n games were won. This way, if a model begins performing worse, its line will decrease.

run_totals = []
num_games = len(outcomes)
run_length = max(min(num_games // 100, 100), 1)
# Vertical lines where the model was gated
def add_gating_markers(gating_indices):
for ind in gating_indices:
plt.axvline(x=ind, c='red')

for i, outcome in enumerate(outcomes):
if i == 0:
run_totals.append(outcome)
elif i < run_length:
run_totals.append(run_totals[-1] + outcome)
else:
run_totals.append(run_totals[-1] + outcome - outcomes[i - run_length])

game = range(num_games)

plt.plot(game, run_totals)

plt.title("{}: Reward for {} diagnostic games".format(model_name, num_games))
plt.xlabel("Game #")
plt.ylabel("Cumulative reward over previous {} games".format(run_length))

def plot_improvement(outcomes, model_name):

# We don't plot total wins for each player bc the graph would always increase, making performance evaluation harder.
# Instead, we plot runs: how many of the previous n games were won. This way, if a model begins performing worse, its line will decrease.

run_deltas = []
num_games = len(outcomes)
run_length = max(min(num_games // 100, 100), 1)

for i, outcome in enumerate(outcomes):
if i == 0:
run_deltas.append(outcome)
elif i < run_length:
run_deltas.append(run_deltas[-1] + outcome)
elif i < 2 * run_length:
run_deltas.append(run_deltas[-1] + outcome - 2 * outcomes[i-run_length])
else:
run_deltas.append(run_deltas[-1] + outcome - 2 * outcomes[i-run_length] + outcomes[i-2*run_length])

game = range(num_games)

plt.plot(game, run_deltas)

plt.title("{}: Cumulative reward derivative for {} diagnostic games".format(model_name, num_games))
plt.xlabel("Game #")
plt.ylabel("Difference in cumulative reward for previous two {} length runs".format(run_length))

# Displays a histogram of the model iterations sampled from the hall of fame
def sample_histogram(sample_history, bins=100):
plt.hist(sample_history, bins)
plt.title("Sampling of Model Indices from HOF")
plt.show()


# 1v1 matrix for historical models: ideally, newer versions beating earlier ones
def winrate_matrix(mnk, num_games, step):
print("Calculating winrate matrix... (may take a few mins)")
@@ -115,7 +112,7 @@ def winrate_matrix(mnk, num_games, step):
return matrix


def save_plots(mnk, hof, model_name, winnersXO, winnersHOF):
def save_plots(mnk, hof, model_name, diagnostics):

# Create model's plots folder
plots_dir = "plots/{}".format(model_name)
@@ -124,29 +121,39 @@ def save_plots(mnk, hof, model_name, winnersXO, winnersHOF):

# Graph and save each plot
plt.figure()
plot_wins(winnersXO, model_name, ['X', 'O'])
plot_wins(diagnostics.xo_outcomes, model_name, ['X', 'O'])
add_gating_markers(diagnostics.gating_indices)
plt.savefig("{}/XO.png".format(plots_dir))
plt.clf()

plot_wins(winnersHOF, model_name, ["Best", "HOF"])
plot_wins(diagnostics.model_outcomes, model_name, ["Best", "HOF"])
add_gating_markers(diagnostics.gating_indices)
plt.savefig("{}/HOF.png".format(plots_dir))
plt.clf()

plot_reward(winnersHOF, model_name)
plt.plot(range(diagnostics.index), diagnostics.reward_totals)
add_gating_markers(diagnostics.gating_indices)
plt.title("{}: Reward for {} diagnostic games".format(model_name, diagnostics.index+1))
plt.xlabel("Game #")
plt.ylabel("Cumulative reward over previous {} games".format(diagnostics.run_length))
plt.savefig("{}/Reward.png".format(plots_dir))
plt.clf()

plot_improvement(winnersHOF, model_name)
plt.plot(range(diagnostics.index), diagnostics.reward_deltas)
add_gating_markers(diagnostics.gating_indices)
plt.title("{}: Cumulative reward derivative for {} diagnostic games".format(model_name, diagnostics.index+1))
plt.xlabel("Game #")
plt.ylabel("Difference in cumulative reward for previous two {} length runs".format(diagnostics.run_length))
plt.savefig("{}/Improvement.png".format(plots_dir))
plt.clf()

sample_histogram(hof.sample_history, 20)
plt.savefig("{}/Sampling.png".format(plots_dir))
plt.clf()

num_games = len(winnersXO)
step = max(1, num_games // 40)
matrix = winrate_matrix(mnk, num_games, step)
num_games = diagnostics.index
step = max(1, hof.pop_size // 40)
matrix = winrate_matrix(mnk, hof.pop_size, step)
plt.imshow(matrix, cmap="bwr")
plt.imsave("plots/{}/Matrix.png".format(model_name), matrix, cmap="bwr")
plt.clf()
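The Diagnostics bookkeeping above keeps two running quantities per game: reward_totals, the model's reward summed over the most recent run_length games, and reward_deltas, the difference between that window and the one before it. A short self-contained trace of the same arithmetic (toy rewards and a shortened window, not repository code):

run_length = 3
rewards = [1, 1, -1, 0, 1, 1, 1, -1]  # +1 win, 0 tie, -1 loss for the tracked model

totals, deltas = [], []
for i, r in enumerate(rewards):
    total = (totals[-1] if i > 0 else 0) + r
    delta = (deltas[-1] if i > 0 else 0) + r
    if i >= run_length:                # reward leaving the newest window
        total -= rewards[i - run_length]
        delta -= 2 * rewards[i - run_length]
    if i >= 2 * run_length:            # reward leaving the older window
        delta += rewards[i - 2 * run_length]
    totals.append(total)
    deltas.append(delta)

print(totals[-1])  # 1: sum of the last 3 rewards, 1 + 1 + (-1)
print(deltas[-1])  # 1: (last 3 games) - (previous 3 games) = 1 - 0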
66 changes: 37 additions & 29 deletions train.py
@@ -2,10 +2,11 @@

from mnk import Board
import random
import tensorflow as tf
import matplotlib.pyplot as plt
from agent import Agent
from model import Model
from plot import plot_wins, save_plots
from model import Model, scheduler
from plot import Diagnostics, save_plots
from hof import HOF
from replay_buffer import ReplayBuffer
from state_representation import get_input_rep
@@ -32,8 +33,9 @@ def train_on_replays(model, batch):

target_outputs = np.asarray(target_outputs)

# Theres a parameter for train_on_batch for sample weights. Use if we do importance sampling
model.model.fit(states, target_outputs, verbose=0)
# Theres a parameter for sample weights. Use if we do importance sampling
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)
model.model.fit(states, target_outputs, verbose=0, callbacks=[lr_scheduler])


def run_training_game(agent_train, agent_versing, replay_buffer, epsilon=0, mnk=(3, 3, 3), verbose=False):
@@ -62,8 +64,7 @@ def run_training_game(agent_train, agent_versing, replay_buffer, epsilon=0, mnk=
winner = board.who_won()

# Back up the terminal state value to the last action chosen by training agent
if winner != agent_train.player:
agent_train.model.td_update(state, action, board.get_board())
agent_train.model.td_update(state, action, board.get_board())

if verbose:
print(board)
Expand All @@ -73,10 +74,10 @@ def run_training_game(agent_train, agent_versing, replay_buffer, epsilon=0, mnk=

def main():
# Hyperparameter List
total_games = 50000
total_games = 100000
diagnostic_freq = 20
resample_freq = 10
hof_gate_freq = 2000
hof_gate_freq = 500
batch_size = 32
buffer_size = 4000
epsilon = 0.2 # probability with which a random move is chosen to play
@@ -85,10 +86,10 @@ def main():
hof = HOF(mnk, folder=hof_folder)

print("\nTraining model: {}\n".format(model_name))
model, winnersXO, winnersHOF, games = train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon, buffer_size, Model(mnk))
model, diagnostics, games = train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon, buffer_size, Model(mnk))

save_model(model, model_name)
save_plots(mnk, hof, model_name, winnersXO, winnersHOF)
save_plots(mnk, hof, model_name, diagnostics)
clear_hof(hof_folder)

# Can be used after looking at plot to analyze important milestones
@@ -106,9 +107,8 @@ def main():


def train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon, buffer_size, model):
winnersXO = [0 for _ in range(total_games//diagnostic_freq)]
winnersHOF = [0 for _ in range(total_games//diagnostic_freq)]
games = ["" for _ in range(total_games//diagnostic_freq)]
diagnostics = Diagnostics()
games = ["" for _ in range(total_games//diagnostic_freq * 2)]

# Initialize hall of fame
hof.store(model)
@@ -132,38 +132,46 @@

# Switch sides for next game
side_hof *= -1
side_best *= -1
side_best = side_hof * -1

# Gate the model for HOF
if game % hof_gate_freq == 0:
hof.gate(model)
reward, improvement = diagnostics.get_recent_performance()
if reward > 0 and np.abs(improvement) < 10:
hof.gate(model)
diagnostics.add_gate_ind()

if game % diagnostic_freq == 0:
print("Game: ", game)

# Resample hof so diagnostic is not biased towards last game played
temp_side_best = [-1, 1][random.random() > 0.5]
temp_side_hof = side_best * -1

temp_model_hof = hof.sample("uniform")
temp_agent_best = Agent(model, temp_side_best)
temp_agent_hof = Agent(temp_model_hof, temp_side_hof)

# Run a diagnostic (non-training, no exploration) game to collect data
diagnostic_winner, game_data = run_game(temp_agent_best, temp_agent_hof, mnk=mnk, verbose=verbose)
diagnostic_winner, game_data = run_diagnostic(model, hof, 1)
games[game // diagnostic_freq * 2] = game_data
diagnostics.update_diagnostics(diagnostic_winner, 1)

# Store data from diagnostic game for this batch
games[game//diagnostic_freq] = game_data
winnersXO[game//diagnostic_freq] = diagnostic_winner # X or O
winnersHOF[game//diagnostic_freq] = diagnostic_winner*side_best # Best or HOF
diagnostic_winner, game_data = run_diagnostic(model, hof, -1)
games[game // diagnostic_freq * 2 + 1] = game_data
diagnostics.update_diagnostics(diagnostic_winner, -1)

except KeyboardInterrupt:
print("\n=======================")
print("Training interrupted.")
print("=======================")

print("Training completed.")
return model, winnersXO, winnersHOF, games
return model, diagnostics, games


def run_diagnostic(model, hof, side_model):
side_hof = side_model * -1

model_hof = hof.sample("uniform")
agent_model = Agent(model, side_model)
agent_hof = Agent(model_hof, side_hof)

# Run a diagnostic (non-training, no exploration) game to collect data
return run_game(agent_model, agent_hof, mnk=mnk, verbose=verbose)


def clear_hof(folder):
if os.path.isdir(folder):
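The gating check in train.py above promotes the current model into the hall of fame only when its recent diagnostic reward is positive and the reward curve has flattened (|improvement| below 10). A standalone sketch of that decision rule; should_gate and its parameter names are illustrative, only the thresholds come from the commit:

import numpy as np

def should_gate(reward, improvement, reward_floor=0, plateau_width=10):
    # gate only a model that is winning and no longer improving rapidly
    return reward > reward_floor and np.abs(improvement) < plateau_width

print(should_gate(12, 3))   # True  -> model would be stored in the hall of fame
print(should_gate(12, 40))  # False -> still improving; hold off on gating
print(should_gate(-5, 2))   # False -> not yet outperforming its HOF opponents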
