forked from lazyprogrammer/machine_learning_examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmonte_carlo_random.py
131 lines (115 loc) · 3.77 KB
/
monte_carlo_random.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
# https://www.udemy.com/artificial-intelligence-reinforcement-learning-in-python
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future
import numpy as np
from grid_world import standard_grid, negative_grid
from iterative_policy_evaluation import print_values, print_policy
# Small threshold constant; nothing in the visible part of this file reads it
# (presumably kept for scripts that import it from here; verify before removing)
SMALL_ENOUGH = 1e-3
# Discount factor that play_game applies when accumulating returns backwards
GAMMA = 0.9
# Action labels that random_action draws its substitute actions from
# (presumably Up / Down / Left / Right; the grid module defines their meaning)
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')
# NOTE: this is only policy evaluation, not optimization
def random_action(a):
    """Return the action actually taken when the policy asks for `a`.

    With probability 0.5 the requested action `a` is returned unchanged.
    Otherwise one of the remaining entries of ALL_POSSIBLE_ACTIONS is drawn
    with np.random.choice, i.e. each other action has probability 0.5/3.

    Raises ValueError (from list.remove) if `a` is not one of
    ALL_POSSIBLE_ACTIONS and the "other action" branch is taken.
    """
    # keep the requested action half of the time
    if np.random.random() < 0.5:
        return a

    # otherwise pick among every action except the requested one
    alternatives = list(ALL_POSSIBLE_ACTIONS)
    alternatives.remove(a)
    return np.random.choice(alternatives)
def play_game(grid, policy):
    """Play one episode and return the visited states with their returns.

    The episode starts from a state drawn at random from the keys of
    grid.actions: a fixed policy alone would never reach some states, yet we
    still want samples for them. At each step the action prescribed by
    `policy` for the current state is passed through random_action before
    being handed to grid.move, whose result is used as the reward.

    Returns a list of (state, G) tuples in the order the states were visited,
    where G is the GAMMA-discounted sum of the rewards collected after
    leaving that state. The final state of the episode is not included.
    """
    # pick the random starting position
    candidates = list(grid.actions.keys())
    grid.set_state(candidates[np.random.choice(len(candidates))])

    # roll the episode forward, recording (state, reward received on arrival)
    state = grid.current_state()
    trajectory = [(state, 0)]
    while not grid.game_over():
        action = random_action(policy[state])
        reward = grid.move(action)
        state = grid.current_state()
        trajectory.append((state, reward))

    # walk the trajectory backwards, accumulating the discounted return
    G = 0
    visited = []
    for position, (state, reward) in enumerate(reversed(trajectory)):
        # position 0 is the last state reached: its value is 0 by definition,
        # so it gets no entry; every earlier state is paired with the return
        # accumulated from the steps that followed it
        if position > 0:
            visited.append((state, G))
        G = reward + GAMMA * G

    # put the result back into the order the states were visited
    visited.reverse()
    return visited
if __name__ == '__main__':
    # Script entry point: estimate V(s) for a fixed policy with first-visit
    # Monte Carlo, sampling episodes through play_game.

    # use the standard grid again (0 for every step) so that we can compare
    # to iterative policy evaluation
    grid = standard_grid()

    # print rewards
    print("rewards:")
    print_values(grid.rewards, grid)

    # state -> action
    # found by policy_iteration_random on standard_grid
    # MC method won't get exactly this, but should be close
    # values:
    # ---------------------------
    #  0.43|  0.56|  0.72|  0.00|
    # ---------------------------
    #  0.33|  0.00|  0.21|  0.00|
    # ---------------------------
    #  0.25|  0.18|  0.11| -0.17|
    # policy:
    # ---------------------------
    #   R  |   R  |   R  |      |
    # ---------------------------
    #   U  |      |   U  |      |
    # ---------------------------
    #   U  |   L  |   U  |   L  |
    policy = {
        (2, 0): 'U',
        (1, 0): 'U',
        (0, 0): 'R',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'U',
        (2, 1): 'L',
        (2, 2): 'U',
        (2, 3): 'L',
    }

    # initialize V(s) and returns
    V = {}
    returns = {}  # dictionary of state -> list of returns we've received
    for s in grid.all_states():
        if s in grid.actions:
            returns[s] = []
        else:
            # terminal state or state we can't otherwise get to
            V[s] = 0

    # sample a fixed number of episodes (there is no convergence test)
    n_episodes = 5000
    for _ in range(n_episodes):
        # generate an episode using pi
        states_and_returns = play_game(grid, policy)
        seen_states = set()
        for s, G in states_and_returns:
            # "first-visit" MC policy evaluation: only the first occurrence
            # of a state within an episode contributes a sample
            if s not in seen_states:
                returns[s].append(G)
                seen_states.add(s)

    # Average once per state after sampling. This yields the same V as
    # re-averaging the whole list on every visit, without the repeated passes
    # over an ever-growing list. States that were never visited get no entry.
    for s, sampled in returns.items():
        if sampled:
            V[s] = np.mean(sampled)

    print("values:")
    print_values(V, grid)
    print("policy:")
    print_policy(policy, grid)