Commit

Fixed play.py, reorganized training
fshcat committed Apr 14, 2022
1 parent 0fd653e commit 38257d0
Showing 6 changed files with 61 additions and 53 deletions.
1 change: 0 additions & 1 deletion agent.py
@@ -1,5 +1,4 @@
import mnk
import keras.models
import tensorflow as tf
import random

24 changes: 14 additions & 10 deletions model.py
@@ -1,3 +1,5 @@
from tensorflow.keras.optimizers import Adam

import mnk
import tensorflow as tf
import numpy as np
@@ -27,12 +29,12 @@ def __init__(self, mnk, location=None):
self.model = self.retrieve(location)
return

opt = SGD(learning_rate=0.02, momentum=0.0)
opt = SGD(learning_rate=0.001)

self.model = Sequential()
self.model.add(Flatten(input_shape=(m, n, 2)))
self.model.add(Dense(16, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(16, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(24, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(24, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(mnk[0] * mnk[1], kernel_initializer='normal', activation='tanh'))

self.model.compile(loss='mean_squared_error', optimizer=opt)
@@ -119,13 +121,16 @@ def get_target(self, state, action, next_state):
next_board = Board(*self.mnk, state=next_state)

prev_output = self.action_values(start_board)
target_output = np.zeros(shape=prev_output.shape, dtype='float32')
# test leaving illegal action values alone (np.copy(prev_output) rather than fill -1)
target_output = np.copy(prev_output)

for move in start_board.legal_moves():
index = move[0] * m + move[1]
target_output[0][index] = prev_output[0][index]
#target_output = np.full(shape=prev_output.shape, fill_value=-1, dtype='float32')
#
#for move in start_board.legal_moves():
# index = move[0] * m + move[1]
# target_output[0][index] = prev_output[0][index]

target_output[0][action[0] * m + action[1]] = self.state_value(next_board, player=state[1])
target_output[0][action[0] * n + action[1]] = self.state_value(next_board, player=state[1])
return target_output

def td_update(self, state, action, next_state):
@@ -137,7 +142,6 @@ def td_update(self, state, action, next_state):
terminal (bool, optional): True if the current state of the game is terminal,
False otherwise. Defaults to False.
"""
callback = tf.keras.callbacks.LearningRateScheduler(self.scheduler)
target_output = self.get_target(state, action, next_state)

self.model.fit(get_input_rep(state), target_output, batch_size=1, verbose=0, callbacks=[callback])
self.model.fit(get_input_rep(state), target_output, batch_size=1, verbose=0)
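
For context on the get_target change above: copying prev_output (instead of zero-filling and copying back only the legal moves) leaves every target entry except the played move equal to the network's own prediction, so under the mean-squared-error loss those entries contribute no gradient and only the played move's Q-value is pulled toward the next-state value. A minimal numeric sketch of that effect (hypothetical values, not repository code):

import numpy as np

prev_output = np.array([[0.5, -0.25, 0.75, 0.0]], dtype="float32")  # pretend Q-values for 4 cells
target_output = np.copy(prev_output)
target_output[0][2] = 0.25  # only the played move's target is overwritten (with the next-state value)

# d/d(pred) of the squared error is 2 * (pred - target); untouched entries give exactly 0
grad = 2 * (prev_output - target_output)
print(grad)  # [[0. 0. 1. 0.]]
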
2 changes: 1 addition & 1 deletion output_representation.py
@@ -7,7 +7,7 @@ def get_legal_vals(board, q_value_vector):
q_value_array = np.array(q_value_vector)[0]

for move in board.legal_moves():
move_dict[move] = q_value_array[move[0] * board.m + move[1]]
move_dict[move] = q_value_array[move[0] * board.n + move[1]]

return move_dict

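The "* m" -> "* n" change here (and the matching one in model.py's get_target above) fixes the row-major flattening stride: on a board with m rows and n columns, cell (row, col) maps to flat index row * n + col. A quick sketch with a hypothetical 2x3 board (not repository code) showing why the old stride only worked when the board was square:

m, n = 2, 3  # 2 rows, 3 columns -> flat indices 0..5

good = [r * n + c for r in range(m) for c in range(n)]
bad = [r * m + c for r in range(m) for c in range(n)]

print(good)  # [0, 1, 2, 3, 4, 5] -- every cell gets a unique index
print(bad)   # [0, 1, 2, 2, 3, 4] -- index 2 collides and index 5 is never produced
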
6 changes: 3 additions & 3 deletions play.py
@@ -4,10 +4,10 @@
import model
import sys

board = mnk.Board(3, 3, 3, form="flatten")
board = mnk.Board(3, 3, 3)

assert len(sys.argv) == 2, "Please specify which model you would like to play against (ex: python3 play.py models/PedrosModel).\n Tab complete works!"
model = model.Model(sys.argv[1])
model = model.Model((3, 3, 3), sys.argv[1])

print("\n\n" + str(board))
current_player = input("\nWho plays first (Me/AI)? ")
@@ -29,7 +29,7 @@
print("Invalid move! Try again")
current_player = "AI"
else:
agent.action(board, False, 0)
board.move(*agent.action(board))
current_player = "Me"

print(board)
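
With this fix, the AI's turn applies the move returned by the agent instead of expecting agent.action to mutate the board, and the Model is now constructed from the (m, n, k) tuple plus the saved-model path. A minimal sketch of the corrected call shape, assuming Agent.action(board) returns a (row, col) tuple (the names come from the diff; the return shape and the stand-in classes below are assumptions for illustration only):

# Minimal stand-ins to illustrate the call shape; the real classes live in mnk.py and agent.py
class FakeBoard:
    def move(self, row, col):
        print("playing", (row, col))

class FakeAgent:
    def action(self, board):
        return (1, 2)  # assumed (row, col) return

board, agent = FakeBoard(), FakeAgent()
board.move(*agent.action(board))  # the corrected call from play.py
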
2 changes: 1 addition & 1 deletion plot.py
@@ -84,7 +84,7 @@ def save_plots(mnk, hof, model_name, winnersXO, winnersHOF):
plt.clf()

num_games = len(winnersXO)
step = max(1, num_games // 20)
step = max(1, num_games // 40)
matrix = winrate_matrix(mnk, num_games, step)
plt.imshow(matrix, cmap="bwr")
plt.imsave("plots/{}/Matrix.png".format(model_name), matrix, cmap="bwr")
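
The denominator change roughly doubles the resolution of the win-rate matrix: num_games // 40 gives about 40 sampled checkpoints per run instead of ~20, and the max(1, ...) floor keeps very short runs working. A quick check of the arithmetic (assuming winrate_matrix evaluates every step-th game, which the call suggests but the diff does not show):

for num_games in (25, 1000, 100000):
    step = max(1, num_games // 40)
    print(num_games, step, len(range(0, num_games, step)))
# 25 1 25        (short run: every game)
# 1000 25 40
# 100000 2500 40
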
79 changes: 42 additions & 37 deletions train.py
@@ -73,17 +73,19 @@ def run_training_game(agent_train, agent_versing, replay_buffer, epsilon=0, mnk=

def main():
# Hyperparameter List
num_cycles = 8000 # Total training games = num_cycles * games_per_cycle
games_per_cycle = 5
batch_size = 16
buffer_size = 1000
epsilon = 0.2 # Epsilon is the exploration factor: probability with which a random move is chosen to play
total_games = 100000
diagnostic_freq = 20
resample_freq = 10
hof_gate_freq = 1000
batch_size = 32
buffer_size = 4000
epsilon = 0.2 # probability with which a random move is chosen to play

hof_folder = "menagerie" # Folder to store the hall-of-fame models
hof = HOF(mnk, folder=hof_folder)

print("\nTraining model: {}\n".format(model_name))
model, winnersXO, winnersHOF, games = train(hof, num_cycles, games_per_cycle, batch_size, epsilon, buffer_size, Model(mnk))
model, winnersXO, winnersHOF, games = train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon, buffer_size, Model(mnk))

save_model(model, model_name)
save_plots(mnk, hof, model_name, winnersXO, winnersHOF)
@@ -103,10 +105,10 @@ def main():
pass


def train(hof, num_cycles, games_per_cycle, batch_size, epsilon, buffer_size, model):
winnersXO = []
winnersHOF = []
games = []
def train(hof, total_games, diagnostic_freq, resample_freq, hof_gate_freq, batch_size, epsilon, buffer_size, model):
winnersXO = [0 for _ in range(total_games//diagnostic_freq)]
winnersHOF = [0 for _ in range(total_games//diagnostic_freq)]
games = ["" for _ in range(total_games//diagnostic_freq)]

# Initialize hall of fame
hof.store(model)
@@ -115,42 +117,45 @@ def train(hof, num_cycles, games_per_cycle, batch_size, epsilon, buffer_size, mo
replay_buffer = ReplayBuffer(buffer_size, batch_size)

try:
for batch_number in range(num_cycles):
print("Batch:", batch_number, "(Games {}-{})".format(batch_number * games_per_cycle + 1, (batch_number + 1) * games_per_cycle))

# Runs a batch of games, after which we can play/save a diagnostic game to see if it improved and store current model to hof
for game in range(games_per_cycle):

# Randomly assign sides (X or O) for game to be played
for game in range(total_games):
if game % resample_freq == 0:
side_best = [-1, 1][random.random() > 0.5]
side_hof = side_best * -1

model_hof = hof.sample("uniform")

# Initialize the agents
agent_best = Agent(model, side_best)
agent_hof = Agent(model_hof, side_hof)

# Play game and train on its outcome
run_training_game(agent_best, agent_hof, replay_buffer, epsilon, mnk)
# Initialize the agents
agent_best = Agent(model, side_best)
agent_hof = Agent(model_hof, side_hof)

# Gate will determine if model is worthy, and store in hof only if it is (Currently, it just stores every game)
hof.gate(model)
# Play game and train on its outcome
run_training_game(agent_best, agent_hof, replay_buffer, epsilon, mnk)

# Switch sides and resample hof so diagnostic is not biased towards last game played
# Switch sides for next game
side_hof *= -1
side_best *= -1
side_hof = side_best * -1
model_hof = hof.sample("uniform")
agent_best = Agent(model, side_best)
agent_hof = Agent(model_hof, side_hof)

# Run a diagnostic (non-training, no exploration) game to collect data
diagnostic_winner, game_data = run_game(agent_best, agent_hof, mnk=mnk, verbose=verbose)
# Gate the model for HOF
if game % hof_gate_freq == 0:
hof.gate(model)

if game % diagnostic_freq == 0:
print("Game: ", game)

# Resample hof so diagnostic is not biased towards last game played
temp_side_best = [-1, 1][random.random() > 0.5]
temp_side_hof = side_best * -1

temp_model_hof = hof.sample("uniform")
temp_agent_best = Agent(model, temp_side_best)
temp_agent_hof = Agent(temp_model_hof, temp_side_hof)

# Run a diagnostic (non-training, no exploration) game to collect data
diagnostic_winner, game_data = run_game(temp_agent_best, temp_agent_hof, mnk=mnk, verbose=verbose)

# Store data from diagnostic game for this batch
games.append(game_data)
winnersXO.append(diagnostic_winner) # X or O
winnersHOF.append(diagnostic_winner*side_best) # Best or HOF
# Store data from diagnostic game for this batch
games[game//diagnostic_freq] = game_data
winnersXO[game//diagnostic_freq] = diagnostic_winner # X or O
winnersHOF[game//diagnostic_freq] = diagnostic_winner*side_best # Best or HOF

except KeyboardInterrupt:
print("\n=======================")
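
The train.py reorganization replaces the nested cycle/batch loops with one flat game counter and modulo-based schedules (resample_freq for re-picking sides and the hall-of-fame opponent, hof_gate_freq for gating models into the hall of fame, diagnostic_freq for evaluation games), and the diagnostic arrays are preallocated so each diagnostic game writes slot game // diagnostic_freq instead of appending. A condensed sketch of the new control flow (simplified: agent construction, the replay buffer, and the actual game-playing calls shown in the diff are elided):

import random

def train_sketch(total_games, diagnostic_freq, resample_freq, hof_gate_freq):
    # One slot per diagnostic game, written in place rather than appended
    winnersXO = [0 for _ in range(total_games // diagnostic_freq)]
    winnersHOF = [0 for _ in range(total_games // diagnostic_freq)]

    for game in range(total_games):
        if game % resample_freq == 0:
            # Re-pick sides (and, in the real code, re-sample a hall-of-fame opponent)
            side_best = [-1, 1][random.random() > 0.5]
            side_hof = side_best * -1

        # ... play one training game between the current model and the sampled opponent ...

        if game % hof_gate_freq == 0:
            pass  # hof.gate(model) would decide whether to store the current model

        if game % diagnostic_freq == 0:
            # Play a non-training, no-exploration game and record who won
            diagnostic_winner = 0  # placeholder for run_game's result
            winnersXO[game // diagnostic_freq] = diagnostic_winner               # X or O
            winnersHOF[game // diagnostic_freq] = diagnostic_winner * side_best  # best model or HOF

    return winnersXO, winnersHOF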