train.py
# TODO: PLOT LOSS CURVES
import tensorflow as tf
import numpy as np
import mnk
import random
from agent import Agent
from model import modelXO
from plot import plot_wins
from hof import HOF
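
# Overview (added note): trains modelXO through self-play on an (m, n, k)
# game. The learning agent plays against opponents sampled from a "hall of
# fame" (HOF) of its own earlier checkpoints, and board evaluations are
# backed up TD-style after each move and again at the terminal state.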

m, n, k = 3, 3, 3

hof = HOF("menagerie")
hof.store(modelXO, "init")
modelHOF = hof.sample_hof()

hof_freq = 10  # how often to save the model to the HOF
hof_duration = 2  # how long to keep using the same HOF model before loading a new one

num_games = 100000
epsilon = 0.15  # exploration constant
decay_freq = 30  # number of games between each epsilon decrement
decay_factor = 0.0005  # how much to decrease by
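# (With these settings epsilon reaches 0 after 0.15 / 0.0005 = 300
# decrements, i.e. about 300 * 30 = 9,000 games; the update in the
# training loop below clamps it there.)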

print_freq = 100  # number of games between each print

end_states = []
victories = []
stored_games = []

for game in range(num_games):
    board = mnk.Board(m, n, k, flatten=False, hist_length=-1)

    # Decrease exploration over time (clamped so epsilon never goes negative)
    if game % decay_freq == 0 and game != 0:
        epsilon = max(epsilon - decay_factor, 0)

    # Choose a hall of fame model by sampling
    if game % hof_duration == 0 and game != 0:
        modelHOF = hof.sample_hof()

    # Determine who will play as X and O
    sideT = [-1, 1][random.random() > 0.5]
    sideHOF = [None, -1, 1][sideT]
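    # (Negative indexing makes sideHOF the opposite of sideT: sideT == -1
    # selects the last element, 1, while sideT == 1 selects -1.)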

    # Initialize agents
    agentT = Agent(board, modelXO, sideT)
    agentHOF = Agent(board, modelHOF, sideHOF)

    move = 1

    # Gameplay loop
    while board.game_ongoing():
        # Select a move
        if board.player == sideHOF:
            agentHOF.action(epsilon)
        else:
            agentT.action(epsilon)

        # Back up the current board evaluation to the last action chosen by the current agent
        if move > 2:
            evaluation = modelXO(board.get_board())
            modelXO.fit(board.history()[-3], evaluation, batch_size=1, verbose=0)
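        # (Assuming board.history() lists past board states, [-3] is the state
        # this agent produced two plies ago; its training target is
        # bootstrapped from the evaluation of the current position, a
        # TD(0)-style update.)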

        move += 1

    if game % print_freq == 0:
        print(board)

    # Back up the terminal state value to the last actions chosen by either agent
    terminal_eval = tf.constant(board.who_won(), dtype="float32", shape=(1, 1))
    modelXO.fit(board.history()[-3], terminal_eval, batch_size=1, verbose=0)
    modelXO.fit(board.history()[-2], terminal_eval, batch_size=1, verbose=0)
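    # (As used in the prints below, who_won() is 0 for a tie, 1 for an X win,
    # and -1 for an O win; both agents' last pre-terminal states receive this
    # outcome as their value target.)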

    # Occasionally save the current model to the hall of fame
    if game % hof_freq == 0 and game != 0:
        hof.store(modelXO, game)

    end_states.append(board.who_won())
    victories.append(board.who_won() * sideT)

    if game % 10 == 0:
        print("Game {} goes to {} ({})".format(
            game,
            ["tie", "best", "hof"][board.who_won() * sideT],
            ["Tie", "X", "O"][board.who_won()],
        ))

print("Training complete.")
print("Saving trained model to models/modelXO and chart to plots folder")
plot_wins(end_states, run_length=50, labels=['X', 'O'])
plot_wins(victories, run_length=50, labels=["Best", "HOF"])
modelXO.save('models/modelXO')