Training on batches from replay
fshcat committed Apr 8, 2022
1 parent a0d1e9e commit 0fd653e
Showing 4 changed files with 89 additions and 33 deletions.
35 changes: 18 additions & 17 deletions model.py
@@ -30,11 +30,9 @@ def __init__(self, mnk, location=None):
opt = SGD(learning_rate=0.02, momentum=0.0)

self.model = Sequential()
self.model.add(Conv2D(8, 3, activation='relu', padding="same", input_shape=(m, n, 2)))
self.model.add(Conv2D(8, 3, activation='relu', padding="same"))

self.model.add(Flatten())
self.model.add(Dense(8, kernel_initializer='normal', activation='relu', input_shape=(1, m * n * 2)))
self.model.add(Flatten(input_shape=(m, n, 2)))
self.model.add(Dense(16, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(16, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(mnk[0] * mnk[1], kernel_initializer='normal', activation='tanh'))

self.model.compile(loss='mean_squared_error', optimizer=opt)
@@ -114,19 +112,9 @@ def scheduler(self, epoch, lr):
else:
return 0.001

def td_update(self, state, action, next_state):
"""Performs a temporal difference update of the model.
Args:
board (Board): Board representing the current state of the game.
greedy_move ((int, int)): Move to be played. Defaults to None.
terminal (bool, optional): True if the current state of the game is terminal,
False otherwise. Defaults to False.
"""
def get_target(self, state, action, next_state):
m, n, k = self.mnk

callback = tf.keras.callbacks.LearningRateScheduler(self.scheduler)

start_board = Board(*self.mnk, state=state)
next_board = Board(*self.mnk, state=next_state)

@@ -138,5 +126,18 @@ def td_update(self, state, action, next_state):
target_output[0][index] = prev_output[0][index]

target_output[0][action[0] * m + action[1]] = self.state_value(next_board, player=state[1])
return target_output

def td_update(self, state, action, next_state):
"""Performs a temporal difference update of the model.
Args:
board (Board): Board representing the current state of the game.
greedy_move ((int, int)): Move to be played. Defaults to None.
terminal (bool, optional): True if the current state of the game is terminal,
False otherwise. Defaults to False.
"""
callback = tf.keras.callbacks.LearningRateScheduler(self.scheduler)
target_output = self.get_target(state, action, next_state)

self.model.fit(get_input_rep(start_board.get_board()), target_output, batch_size=1, verbose=0, callbacks=[callback])
self.model.fit(get_input_rep(state), target_output, batch_size=1, verbose=0, callbacks=[callback])
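
The refactor above splits the old td_update into get_target (which builds the training target) plus a small fit call, so the same target construction can be reused for replay batches in train.py. A minimal numpy sketch of that target construction, with a hypothetical td_target() standing in for the repo's Board/Model machinery:

import numpy as np

# Hypothetical standalone version of Model.get_target(): keep the model's
# current predictions everywhere except the cell that was played, which is
# replaced by the estimated value of the resulting state (the TD(0) target).
def td_target(prev_output, action, next_state_value, m):
    target = prev_output.copy()
    target[0][action[0] * m + action[1]] = next_state_value
    return target

prev = np.zeros((1, 9))                   # e.g. a 3x3 board, all predictions 0
print(td_target(prev, (1, 1), 0.5, m=3))  # only the entry for move (1, 1) becomes 0.5
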
12 changes: 6 additions & 6 deletions plot.py
@@ -13,7 +13,7 @@ def plot_wins(outcomes, model_name, players):
player1_wins, player2_wins, ties = [], [], []
run_totals = [0, 0, 0]
num_games = len(outcomes)
run_length = max(num_games // 10 , 1)
run_length = max(num_games // 20, 1)

for i, outcome in enumerate(outcomes):
if i < run_length:
@@ -44,14 +44,14 @@ def sample_histogram(sample_history, bins=100):
plt.show()

# 1v1 matrix for historical models: ideally, newer versions beating earlier ones
def winrate_matrix(num_games, step):
def winrate_matrix(mnk, num_games, step):
print("Calculating winrate matrix... (may take a few mins)")
matrix = []
for i in range (0, num_games, step):
matrix.append([])
for j in range (0, num_games, step):
model_i = Model("menagerie/{}".format(i))
model_j = Model("menagerie/{}".format(j))
model_i = Model(mnk, "menagerie/{}".format(i))
model_j = Model(mnk, "menagerie/{}".format(j))

side_i = [-1, 1][random.random() > 0.5]
side_j = side_i * -1
@@ -62,7 +62,7 @@ def winrate_matrix(num_games, step):
return matrix


def save_plots(hof, model_name, winnersXO, winnersHOF):
def save_plots(mnk, hof, model_name, winnersXO, winnersHOF):

# Create model's plots folder
plots_dir = "plots/{}".format(model_name)
@@ -85,7 +85,7 @@ def save_plots(hof, model_name, winnersXO, winnersHOF):

num_games = len(winnersXO)
step = max(1, num_games // 20)
matrix = winrate_matrix(num_games, step)
matrix = winrate_matrix(mnk, num_games, step)
plt.imshow(matrix, cmap="bwr")
plt.imsave("plots/{}/Matrix.png".format(model_name), matrix, cmap="bwr")
plt.clf()
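
Because Model now needs the board dimensions at construction time, the plotting helpers take an mnk argument as well. A hedged usage sketch of the updated winrate_matrix (assumes a populated menagerie/ folder of saved hall-of-fame models; num_games here is just an illustrative value):

import matplotlib.pyplot as plt
from plot import winrate_matrix

mnk = (3, 3, 3)
num_games = 100                   # hypothetical number of training games played
step = max(1, num_games // 20)    # same spacing save_plots() uses

matrix = winrate_matrix(mnk, num_games, step)  # board dims now passed through to Model
plt.imshow(matrix, cmap="bwr")
plt.show()
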
23 changes: 23 additions & 0 deletions replay_buffer.py
@@ -0,0 +1,23 @@
import random


class ReplayBuffer:
def __init__(self, capacity, batch_size):
self.capacity = capacity
self.batch_size = batch_size
self.buffer = []

def store(self, experience):
self.buffer.append(experience)

if len(self.buffer) > self.capacity:
del self.buffer[0]

def sample(self):
if len(self.buffer) < self.batch_size:
return self.buffer

return random.sample(self.buffer, self.batch_size)



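
The buffer is a plain FIFO: store() evicts the oldest experience once capacity is exceeded, and sample() returns the whole buffer until it holds at least batch_size experiences, after which it returns a uniform random sample. A quick illustration (not part of the commit):

from replay_buffer import ReplayBuffer

buffer = ReplayBuffer(capacity=3, batch_size=2)

for step in range(5):
    # experiences are (state, action, next_state) triples in train.py;
    # simple tuples stand in for them here
    buffer.store(("state{}".format(step), "action{}".format(step), "next{}".format(step)))

print(buffer.buffer)     # only the 3 most recent experiences survive
print(buffer.sample())   # 2 of them, chosen uniformly at random
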
52 changes: 42 additions & 10 deletions train.py
@@ -1,10 +1,14 @@
import numpy as np

from mnk import Board
import random
import matplotlib.pyplot as plt
from agent import Agent
from model import Model
from plot import plot_wins, save_plots
from hof import HOF
from replay_buffer import ReplayBuffer
from state_representation import get_input_rep
from utils import run_game, arg_parser
from save_model import save_model
import sys
@@ -13,10 +17,26 @@

# Set cmd-line training arguments
verbose, mcts, model_name = arg_parser(sys.argv)
model_name = "new_model"
mnk = (3, 3, 3)


def run_training_game(agent_train, agent_versing, epsilon=0, mnk=(3, 3, 3), verbose=False):
def train_on_replays(model, batch):
states = []
target_outputs = []
for experience in batch:
target_outputs.append(model.get_target(*experience))
states.append(get_input_rep(experience[0])[0])

states = np.asarray(states)

target_outputs = np.asarray(target_outputs)

# There's a parameter for train_on_batch for sample weights. Use it if we do importance sampling
model.model.fit(states, target_outputs, verbose=0)


def run_training_game(agent_train, agent_versing, replay_buffer, epsilon=0, mnk=(3, 3, 3), verbose=False):
board = Board(*mnk, hist_length=-1)
game = []
state, action = None, None
@@ -30,6 +50,8 @@ def run_training_game(agent_train, agent_versing, epsilon=0, mnk=(3, 3, 3), verb

if state is not None and action is not None:
agent_train.model.td_update(state, action, board.get_board())
replay_buffer.store((state, action, board.get_board()))
train_on_replays(agent_train.model, replay_buffer.sample())

state, action = board.get_board(), move
board.move(*move)
@@ -51,43 +73,53 @@ def run_training_game(agent_train, agent_versing, epsilon=0, mnk=(3, 3, 3), verb

def main():
# Hyperparameter List
num_batches = 20 # Total training games = num_batches * games_per_batch
games_per_batch = 5
num_cycles = 8000 # Total training games = num_cycles * games_per_cycle
games_per_cycle = 5
batch_size = 16
buffer_size = 1000
epsilon = 0.2 # Epsilon is the exploration factor: probability with which a random move is chosen to play

hof_folder = "menagerie" # Folder to store the hall-of-fame models
hof = HOF(mnk, folder=hof_folder)

print("\nTraining model: {}\n".format(model_name))
model, winnersXO, winnersHOF, games = train(hof, num_batches, games_per_batch, epsilon, Model(mnk))
model, winnersXO, winnersHOF, games = train(hof, num_cycles, games_per_cycle, batch_size, epsilon, buffer_size, Model(mnk))

save_model(model, model_name)
save_plots(hof, model_name, winnersXO, winnersHOF)
save_plots(mnk, hof, model_name, winnersXO, winnersHOF)
clear_hof(hof_folder)

# Can be used after looking at plot to analyze important milestones
ind = 0 # Put into a function
while ind != -1:
ind = int(input("Query a game: "))

if ind >= len(games):
print("Too large. Try again")
continue

for move in games[ind]:
print(move)
pass


def train(hof, num_batches, games_per_batch, epsilon, model):
def train(hof, num_cycles, games_per_cycle, batch_size, epsilon, buffer_size, model):
winnersXO = []
winnersHOF = []
games = []

# Initialize hall of fame
hof.store(model)

# Initialize replay buffer
replay_buffer = ReplayBuffer(buffer_size, batch_size)

try:
for batch_number in range(num_batches):
print("Batch:", batch_number, "(Games {}-{})".format(batch_number * games_per_batch + 1, (batch_number + 1) * games_per_batch))
for batch_number in range(num_cycles):
print("Batch:", batch_number, "(Games {}-{})".format(batch_number * games_per_cycle + 1, (batch_number + 1) * games_per_cycle))

# Runs a batch of games, after which we can play/save a diagnostic game to see if it improved and store current model to hof
for game in range(games_per_batch):
for game in range(games_per_cycle):

# Randomly assign sides (X or O) for game to be played
side_best = [-1, 1][random.random() > 0.5]
@@ -100,7 +132,7 @@ def train(hof, num_batches, games_per_batch, epsilon, model):
agent_hof = Agent(model_hof, side_hof)

# Play game and train on its outcome
run_training_game(agent_best, agent_hof, epsilon, mnk)
run_training_game(agent_best, agent_hof, replay_buffer, epsilon, mnk)

# Gate will determine if model is worthy, and store in hof only if it is (Currently, it just stores every game)
hof.gate(model)

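
Putting the pieces together, each move in run_training_game now triggers three things: the existing online TD update, storing the transition in the replay buffer, and an extra fit over a sampled batch of past transitions. A condensed sketch of that per-move step (illustrative only; `model` and `board` follow the interfaces shown in the diff, and train_on_replays is the helper added to train.py above):

def training_step(model, replay_buffer, board, move, state, action):
    # Only train once we have a complete (state, action, next_state) transition
    if state is not None and action is not None:
        model.td_update(state, action, board.get_board())         # online TD(0) update
        replay_buffer.store((state, action, board.get_board()))   # remember the transition
        train_on_replays(model, replay_buffer.sample())           # batched replay update
    state, action = board.get_board(), move                       # slide the transition window
    board.move(*move)                                             # then actually play the move
    return state, action
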