Bug fixes, improved diagnostics, new exploration method
fshcat committed Nov 15, 2022
1 parent eb26bfb commit 85a18ec
Showing 7 changed files with 90 additions and 88 deletions.
15 changes: 12 additions & 3 deletions agent.py
@@ -25,17 +25,26 @@ def random_action(self, board):
legal_moves = board.legal_moves()
return legal_moves[random.randint(0, len(legal_moves) - 1)]

def action(self, board, epsilon=0):
def softmax_action(self, board, beta):
action_value_vector = self.model.action_values(board)
legal_action_values = output_rep.get_legal_vals(board, action_value_vector)

legal_val_tensor = tf.constant([list(legal_action_values.values())])
sampled_ind = tf.random.categorical(tf.math.log(tf.nn.softmax(beta * legal_val_tensor)), 1)[0, 0]

return list(legal_action_values.keys())[sampled_ind]

def action(self, board, epsilon=0, beta=1):
legal_moves = board.legal_moves()
assert len(legal_moves) > 0, "No legal moves can be played."

greedy_move = self.greedy_action(board)
best_move = self.softmax_action(board, beta)

# Exploration
if random.random() < epsilon:
move = self.random_action(board)
else:
move = greedy_move
move = best_move

return move
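The new softmax_action implements Boltzmann exploration: legal moves are sampled with probability proportional to exp(beta * Q(move)), so higher-valued moves are favored without being chosen deterministically, and epsilon-greedy random play is still layered on top in action(). A minimal NumPy sketch of the same sampling rule, not part of the repo (the function and argument names are illustrative):

import numpy as np

def boltzmann_sample(legal_moves, action_values, beta=1.0):
    # Sample a move with probability proportional to exp(beta * value).
    # Larger beta approaches greedy selection; beta = 0 gives a uniform choice.
    logits = beta * np.asarray(action_values, dtype=np.float64)
    logits -= logits.max()                     # for numerical stability
    probs = np.exp(logits) / np.exp(logits).sum()
    return legal_moves[np.random.choice(len(legal_moves), p=probs)]

Here beta acts as an inverse temperature; passing log(softmax(beta * values)) to tf.random.categorical, as the commit does, samples from the same distribution.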

4 changes: 2 additions & 2 deletions hof.py
@@ -27,7 +27,7 @@ def store(self, model):
self.basel += 1 / self.pop_size**2

# Samples from the hall of fame with the provided method
def sample(self, method='limit-uniform', index=None):
def sample(self, method='uniform', index=None):
if method == 'limit-uniform': # Performs poorly. Do not use.
threshold = random.random()*self.basel

@@ -49,4 +49,4 @@ def sample(self, method='limit-uniform', index=None):
self.sample_history.append(ind)
name = self.hof[ind]

return Model(self.mnk, "{}/{}".format(self.folder, name))
return Model(self.mnk, location="{}/{}".format(self.folder, name))
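With the default switched to 'uniform' (the 'limit-uniform' branch is kept but flagged as performing poorly), sample() presumably draws any stored hall-of-fame model with equal probability before loading it through Model's location keyword. A tiny illustrative sketch of uniform sampling, not taken from the repo:

import random

def sample_uniform(hof_names):
    # Every stored hall-of-fame model name is equally likely to be drawn.
    ind = random.randint(0, len(hof_names) - 1)
    return ind, hof_names[ind]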
14 changes: 7 additions & 7 deletions model.py
@@ -5,6 +5,7 @@
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
from keras.regularizers import l2
from tensorflow.keras.optimizers import SGD, Adam
from state_representation import get_input_rep
import output_representation as output_rep
@@ -33,17 +34,16 @@ def __init__(self, mnk, lr=0.001, location=None, model=None):
self.model = model
return

opt = Adam(learning_rate=lr)
self.opt = Adam(learning_rate=lr)
regularization = 0.0001

self.model = Sequential()
self.model.add(Conv2D(filters=32, kernel_size=3, input_shape=(m, n, 2)))
self.model.add(Conv2D(filters=32, kernel_size=3, input_shape=(m, n, 2), kernel_regularizer=l2(regularization)))
self.model.add(Flatten())
self.model.add(Dense(54, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(54, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(54, kernel_initializer='normal', activation='relu'))
self.model.add(Dense(mnk[0] * mnk[1], kernel_initializer='normal'))
self.model.add(Dense(128, kernel_initializer='normal', activation='relu', kernel_regularizer=l2(regularization)))
self.model.add(Dense(mnk[0] * mnk[1], kernel_initializer='normal', kernel_regularizer=l2(regularization)))

self.model.compile(loss='mean_squared_error', optimizer=opt)
self.model.compile(loss='mean_squared_error', optimizer=self.opt)

@staticmethod
def retrieve(location):
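The model.py changes consolidate the three 54-unit hidden layers into a single 128-unit layer and add L2 weight decay to the convolutional, hidden, and output kernels. As a reminder, kernel_regularizer=l2(lam) adds lam times the sum of squared kernel weights for that layer to the training loss; a minimal standalone sketch, with layer sizes and input shape chosen purely for illustration:

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.regularizers import l2

lam = 0.0001   # same order as the commit's `regularization`
net = Sequential([
    Dense(128, activation='relu', kernel_regularizer=l2(lam), input_shape=(18,)),
    Dense(9, kernel_regularizer=l2(lam)),
])
net.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(0.001))
# net.losses now holds one lam * sum(w ** 2) penalty per regularized kernel;
# Keras adds these terms to the mean-squared-error objective during training.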
54 changes: 21 additions & 33 deletions plot.py
@@ -8,15 +8,14 @@


class Diagnostics:
def __init__(self, run_length=100):
def __init__(self, run_length=50):
self.run_length = run_length
self.xo_outcomes = [[], [], []]
self.model_outcomes = [[], [], []]
self.rewards = []
self.reward_totals = []
self.reward_avg = []
self.reward_deltas = []
self.gating_indices = []
self.index = 0

def update_xo(self, x_outcome, o_outcome):
self.xo_outcomes[0].append(x_outcome)
@@ -29,31 +28,20 @@ def update_outcome(self, train_outcome, hof_outcome):
self.model_outcomes[2].append(1 - train_outcome - hof_outcome)

def update_reward(self, reward):
self.rewards.append(reward)
self.reward_totals.append(reward)
self.reward_deltas.append(reward)

if self.index > 0:
self.reward_totals[-1] += self.reward_totals[-2]
self.reward_deltas[-1] += self.reward_deltas[-2]

if self.index >= self.run_length:
self.reward_totals[-1] -= self.rewards[self.index - self.run_length]
self.reward_deltas[-1] -= 2 * self.rewards[self.index - self.run_length]
n = min(self.run_length, len(self.rewards))

if self.index >= 2 * self.run_length:
self.reward_deltas[-1] += self.rewards[self.index - 2 * self.run_length]

self.index += 1
self.rewards.append(reward)
self.reward_avg.append(np.mean(self.rewards[-n:]))
self.reward_deltas.append(np.mean(self.rewards[-(n//2):]) - np.mean(self.rewards[-n:-(n//2)]))

def add_gate_ind(self):
self.gating_indices.append(self.index)
self.gating_indices.append(len(self.rewards) - 1)

def get_recent_performance(self):
if self.index == 0:
if len(self.rewards) == 0:
return 0, 0

return self.reward_totals[-1] / self.run_length, self.reward_deltas[-1] / self.run_length
return self.reward_avg[-1], self.reward_deltas[-1]


def plot_wins(outcomes, model_name, players):
@@ -136,17 +124,17 @@ def save_plots(mnk, hof, model_name, diagnostics):

# Graph and save each plot

plt.plot(range(diagnostics.index), np.array(diagnostics.reward_totals) / diagnostics.run_length)
plt.plot(range(len(diagnostics.rewards)), np.array(diagnostics.reward_avg))
add_gating_markers(diagnostics.gating_indices)
plt.title("{}: Reward for {} diagnostic games".format(model_name, diagnostics.index+1))
plt.title("{}: Reward for {} diagnostic games".format(model_name, len(diagnostics.rewards)+1))
plt.xlabel("Game #")
plt.ylabel("Cumulative reward over previous {} games".format(diagnostics.run_length))
plt.savefig("{}/Reward.png".format(plots_dir))
plt.clf()

plt.plot(range(diagnostics.index), np.array(diagnostics.reward_deltas) / diagnostics.run_length)
plt.plot(range(len(diagnostics.rewards)), np.array(diagnostics.reward_deltas))
add_gating_markers(diagnostics.gating_indices)
plt.title("{}: Cumulative reward derivative for {} diagnostic games".format(model_name, diagnostics.index+1))
plt.title("{}: Cumulative reward derivative for {} diagnostic games".format(model_name, len(diagnostics.rewards)+1))
plt.xlabel("Game #")
plt.ylabel("Difference in cumulative reward for previous two {} length runs".format(diagnostics.run_length))
plt.savefig("{}/Improvement.png".format(plots_dir))
@@ -157,23 +145,23 @@ def save_plots(mnk, hof, model_name, diagnostics):
plt.clf()

plt.figure()
plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.xo_outcomes[0], run_length=diagnostics.run_length), label="X")
plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.xo_outcomes[1], run_length=diagnostics.run_length), label="O")
plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.xo_outcomes[2], run_length=diagnostics.run_length), label="Tie")
plt.plot(range(len(diagnostics.rewards)), get_moving_avg(diagnostics.xo_outcomes[0], run_length=diagnostics.run_length), label="X")
plt.plot(range(len(diagnostics.rewards)), get_moving_avg(diagnostics.xo_outcomes[1], run_length=diagnostics.run_length), label="O")
plt.plot(range(len(diagnostics.rewards)), get_moving_avg(diagnostics.xo_outcomes[2], run_length=diagnostics.run_length), label="Tie")
plt.legend()
plt.title("{}: XO wins for {} diagnostic games".format(model_name, diagnostics.index + 1))
plt.title("{}: XO wins for {} diagnostic games".format(model_name, len(diagnostics.rewards) + 1))
plt.xlabel("Game #")
plt.ylabel("Proportion of wins averaged over previous {} games".format(diagnostics.run_length))
add_gating_markers(diagnostics.gating_indices)
plt.savefig("{}/XO.png".format(plots_dir))
plt.clf()

plt.figure()
plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.model_outcomes[0], run_length=diagnostics.run_length), label="Best")
plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.model_outcomes[1], run_length=diagnostics.run_length), label="HOF")
plt.plot(range(diagnostics.index), get_moving_avg(diagnostics.model_outcomes[2], run_length=diagnostics.run_length), label="Tie")
plt.plot(range(len(diagnostics.rewards)), get_moving_avg(diagnostics.model_outcomes[0], run_length=diagnostics.run_length), label="Best")
plt.plot(range(len(diagnostics.rewards)), get_moving_avg(diagnostics.model_outcomes[1], run_length=diagnostics.run_length), label="HOF")
plt.plot(range(len(diagnostics.rewards)), get_moving_avg(diagnostics.model_outcomes[2], run_length=diagnostics.run_length), label="Tie")
plt.legend()
plt.title("{}: Model v Best wins for {} diagnostic games".format(model_name, diagnostics.index + 1))
plt.title("{}: Model v Best wins for {} diagnostic games".format(model_name, len(diagnostics.rewards) + 1))
plt.xlabel("Game #")
plt.ylabel("Proportion of wins averaged over previous {} games".format(diagnostics.run_length))
add_gating_markers(diagnostics.gating_indices)
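Diagnostics now recomputes its statistics directly from the stored rewards instead of maintaining incremental running totals: reward_avg is the mean of roughly the last run_length rewards (default now 50), and reward_deltas is the difference between the means of the newer and older halves of that window, a rough improvement signal; get_recent_performance simply returns the latest entry of each. A small self-contained sketch of that computation, with names chosen for illustration rather than taken from the repo:

import numpy as np

def windowed_reward_stats(rewards, run_length=50):
    # Mean reward over the most recent window, plus the difference between
    # the newer and older halves of that window (a crude trend estimate).
    n = min(run_length, len(rewards))
    if n < 2:
        return (float(rewards[-1]) if rewards else 0.0), 0.0
    window = np.asarray(rewards[-n:], dtype=np.float64)
    newer = window[-(n // 2):]
    older = window[-n:-(n // 2)]
    return float(window.mean()), float(newer.mean() - older.mean())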
4 changes: 4 additions & 0 deletions replay_buffer.py
@@ -8,6 +8,10 @@ def __init__(self, capacity, batch_size):
self.buffer = []
self.index = 0

def clear(self):
self.buffer = []
self.index = 0

def store(self, experience):
if len(self.buffer) >= self.capacity:
self.buffer[self.index] = experience
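The new clear() resets both the backing list and the write cursor, so after a reset store() appends into an empty buffer again rather than overwriting old slots. A quick usage sketch, assuming store() advances self.index modulo the capacity (that cursor update falls outside the visible hunk) and using a placeholder experience layout:

from replay_buffer import ReplayBuffer

buf = ReplayBuffer(capacity=1000, batch_size=32)

# Fill with a few placeholder experiences (the tuple layout here is illustrative).
for step in range(10):
    buf.store(("state", "action", float(step), "next_state"))

buf.clear()   # drop all stored experiences and rewind the write cursor
assert buf.buffer == [] and buf.index == 0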