''' Reinforcement learning of the Towers of Hanoi game.
Reference: Watkins and Dayan, "Q-Learning", Machine Learning, 8, 279-292 (1992).'''
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
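
# The game is treated as a deterministic Markov decision process: each of the
# 3**N disc configurations is a state, each legal (from_peg, to_peg) move leads
# to a unique successor state, and the agent is rewarded for reaching the
# configuration with all discs on the last peg.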

class TowersOfHanoi:
    def __init__(self, state):
        # A "state" is a tuple of length N (the number of discs) whose elements
        # are peg indices in [0, 1, 2]; disc 0 is the smallest disc
        self.state = state
        self.discs = len(self.state)

    def discs_on_peg(self, peg):
        return [disc for disc in range(self.discs) if self.state[disc] == peg]

    def move_allowed(self, move):
        # A move (from_peg, to_peg) is allowed if the source peg is non-empty
        # and its top disc is smaller than the top disc of the target peg
        discs_from = self.discs_on_peg(move[0])
        discs_to = self.discs_on_peg(move[1])
        if not discs_from:
            return False
        return min(discs_from) < min(discs_to) if discs_to else True

    def get_moved_state(self, move):
        # Return the state resulting from an allowed move, or None otherwise
        if not self.move_allowed(move):
            return None
        disc_to_move = min(self.discs_on_peg(move[0]))  # The top disc has the smallest index
        moved_state = list(self.state)
        moved_state[disc_to_move] = move[1]
        return tuple(moved_state)
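
# Example: with both discs of a 2-disc game on peg 0, moving the top disc to
# peg 1 turns state (0, 0) into (1, 0):
#   TowersOfHanoi(state=(0, 0)).get_moved_state((0, 1))  # -> (1, 0)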

# Generates the reward matrix for the Towers of Hanoi game: R[i, j] is -inf if
# there is no legal move from state i to state j, 0 for a legal move, and 100
# for a legal move into the final state
def generate_reward_matrix(N):  # N is the number of discs
    states = list(itertools.product(range(3), repeat=N))
    state_index = {state: i for i, state in enumerate(states)}
    moves = list(itertools.permutations(range(3), 2))
    final_state = tuple([2]*N)  # Define the final state as all discs being on the last peg
    R = np.full((len(states), len(states)), -np.inf)
    for i, state in enumerate(states):
        tower = TowersOfHanoi(state=state)
        for move in moves:
            if tower.move_allowed(move):
                j = state_index[tower.get_moved_state(move)]
                # Reward moves leading to the final state
                R[i, j] = 100 if j == state_index[final_state] else 0
    return R
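
# For N = 1, for instance, the states are [(0,), (1,), (2,)], every move between
# distinct pegs is legal, and generate_reward_matrix(1) returns
#   [[-inf,   0., 100.],
#    [  0., -inf, 100.],
#    [  0.,   0., -inf]]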

def learn_Q(R, gamma=0.8, alpha=1.0, N_episodes=1000):
    '''One-step Q-learning (Watkins & Dayan, 1992). An "action" taken in state s
    is identified with the state s' it leads to, so Q is indexed by state pairs
    and updated according to
        Q(s, s') <- (1 - alpha)*Q(s, s') + alpha*(R(s, s') + gamma*max_s'' Q(s', s''))
    Each episode consists of a single randomly chosen transition.'''
    Q = np.zeros(R.shape)
    states = list(range(R.shape[0]))
    for n in range(N_episodes):
        state = np.random.choice(states)  # Randomly select the initial state
        next_states = np.where(R[state, :] >= 0)[0]  # List the possible next states
        next_state = np.random.choice(next_states)  # Randomly select one of them
        V = np.max(Q[next_state, :])  # Maximum Q-value of the states accessible from the next state
        Q[state, next_state] = (1-alpha)*Q[state, next_state] + alpha*(R[state, next_state] + gamma*V)  # Update Q-values
    if np.max(Q) > 0:
        Q /= np.max(Q)  # Normalize Q to its maximum value
    return Q
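
# For the 2-disc game, for instance, there are 3**2 = 9 states, so
# learn_Q(generate_reward_matrix(2)) returns a 9x9 array of normalized Q-values.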

def get_policy(Q, R):
    '''For each state, list the allowed next states with the highest Q-value
    (several, if the maximum is tied).'''
    Q_allowed = pd.DataFrame(Q)[pd.DataFrame(R) >= 0].values  # NaN for disallowed moves
    policy = []
    for row in Q_allowed:
        best = np.nanmax(row)  # Highest Q-value among the allowed moves
        policy.append(list(np.flatnonzero(row == best)))
    return policy
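
# policy[i] is the list of greedy successor states of state i; play() below
# breaks ties between them at random.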

def play(policy):
    '''Follow the policy from the start state (all discs on peg 0, the first
    state in itertools.product order) to the end state (all discs on peg 2,
    the last state); return the number of moves taken.'''
    start_state = 0
    end_state = len(policy) - 1
    state = start_state
    moves = 0
    while state != end_state:
        state = np.random.choice(policy[state])  # Break ties at random
        moves += 1
    return moves

def play_average(policy, play_times=100):
    '''Play the game repeatedly; return the mean and standard deviation of the
    number of moves taken.'''
    moves = np.zeros(play_times)
    for n in range(play_times):
        moves[n] = play(policy)
    return np.mean(moves), np.std(moves)
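
# A fully trained policy solves the N-disc game in the optimal 2**N - 1 moves,
# whereas an untrained one performs a random walk through the state space, so
# the average number of moves measures the quality of the policy.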

def Q_performance(R, episodes, play_times=100):
    '''Train Q for each episode count in episodes and measure the performance
    of the resulting policy.'''
    means = np.zeros(len(episodes))
    stds = np.zeros(len(episodes))
    for n, N_episodes in enumerate(episodes):
        Q = learn_Q(R, N_episodes=N_episodes)
        policy = get_policy(Q, R)
        means[n], stds[n] = play_average(policy, play_times)
    return means, stds

def Q_performance_average(R, episodes, learn_times=100, play_times=100):
    '''Average the results of Q_performance over several independent training runs.'''
    means_times = np.zeros((learn_times, len(episodes)))
    stds_times = np.zeros((learn_times, len(episodes)))
    for n in range(learn_times):
        means_times[n, :], stds_times[n, :] = Q_performance(R, episodes, play_times=play_times)
    means_averaged = np.mean(means_times, axis=0)
    stds_averaged = np.mean(stds_times, axis=0)
    return means_averaged, stds_averaged

def plot_results(episodes, means_averaged, stds_averaged, N, block=False):
    '''Plot the average number of moves against the number of training episodes,
    with a band of one standard deviation around the mean.'''
    plt.figure()
    plt.loglog(episodes, means_averaged, 'b.-', label='Average performance')
    plt.loglog(episodes, means_averaged + stds_averaged, 'b', alpha=0.5)
    plt.loglog(episodes, means_averaged - stds_averaged, 'b', alpha=0.5)
    plt.fill_between(episodes, means_averaged - stds_averaged, means_averaged + stds_averaged,
                     facecolor='blue', alpha=0.5)
    optimum_moves = 2**N - 1  # The N-disc game cannot be solved in fewer moves
    plt.axhline(y=optimum_moves, color='g', label='Optimum (=%s moves)' % optimum_moves)
    plt.xlabel('Number of training episodes')
    plt.ylabel('Number of moves')
    plt.grid(True, which='both')
    plt.title('Q-learning the Towers of Hanoi game with %s discs' % N)
    plt.legend()
    plt.show(block=block)

if __name__ == '__main__':
    N = 2  # Number of discs in the Towers of Hanoi game
    R = generate_reward_matrix(N)
    episodes = [0, 1, 10, 30, 60, 100, 300, 600, 1000, 3000]
    means_averaged, stds_averaged = Q_performance_average(R, episodes, learn_times=100, play_times=100)
    plot_results(episodes, means_averaged, stds_averaged, N)

    N = 3  # Number of discs in the Towers of Hanoi game
    R = generate_reward_matrix(N)
    episodes = [0, 1, 10, 30, 100, 300, 600, 1000, 3000, 6000]
    means_averaged, stds_averaged = Q_performance_average(R, episodes, learn_times=10, play_times=10)
    plot_results(episodes, means_averaged, stds_averaged, N)

    N = 4  # Number of discs in the Towers of Hanoi game
    R = generate_reward_matrix(N)
    episodes = [1, 10, 100, 200, 300, 1000, 2000, 3000, 6000, 10000, 30000, 60000]
    means_averaged, stds_averaged = Q_performance_average(R, episodes, learn_times=10, play_times=10)
    plot_results(episodes, means_averaged, stds_averaged, N, block=True)