<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<title>REINFORCEjs: Gridworld with Dynamic Programming</title>
<meta name="description" content="">
<meta name="author" content="">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<!-- jquery and jqueryui -->
<script src="https://code.jquery.com/jquery-2.1.3.min.js"></script>
<link href="external/jquery-ui.min.css" rel="stylesheet">
<script src="external/jquery-ui.min.js"></script>
<!-- bootstrap -->
<script src="http://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script>
<link href="http://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css" rel="stylesheet">
<!-- d3js -->
<script type="text/javascript" src="external/d3.min.js"></script>
<!-- markdown -->
<script type="text/javascript" src="external/marked.js"></script>
<script type="text/javascript" src="external/highlight.pack.js"></script>
<link rel="stylesheet" href="external/highlight_default.css">
<script>hljs.initHighlightingOnLoad();</script>
<!-- mathjax: nvm now loading dynamically
<script type="text/javascript" src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
-->
<!-- rljs -->
<script type="text/javascript" src="lib/rl.js"></script>
<!-- GA -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-3698471-24', 'auto');
ga('send', 'pageview');
</script>
<style>
#wrap {
width:800px;
margin-left: auto;
margin-right: auto;
}
body {
font-family: Arial, "Helvetica Neue", Helvetica, sans-serif;
}
#draw {
margin-left: 100px;
}
#exp {
margin-top: 20px;
font-size: 16px;
}
svg {
cursor: pointer;
}
h2 {
text-align: center;
font-size: 30px;
}
#rewardui {
font-weight: bold;
font-size: 16px;
}
</style>
<script type="application/javascript">
// Gridworld
var Gridworld = function(){
this.Rarr = null; // reward array
this.T = null; // cell types, 0 = normal, 1 = cliff
this.reset()
}
Gridworld.prototype = {
reset: function() {
// hardcoding one gridworld for now
this.gh = 10;
this.gw = 10;
this.gs = this.gh * this.gw; // number of states
// specify some rewards
var Rarr = R.zeros(this.gs);
var T = R.zeros(this.gs);
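// note: states are indexed as s = x*this.gh + y (see xytos below), so e.g. state 55 is the cell at x=5, y=5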
Rarr[55] = 1;
Rarr[54] = -1;
//Rarr[63] = -1;
Rarr[64] = -1;
Rarr[65] = -1;
Rarr[85] = -1;
Rarr[86] = -1;
Rarr[37] = -1;
Rarr[33] = -1;
//Rarr[77] = -1;
Rarr[67] = -1;
Rarr[57] = -1;
// make some cliffs
for(var q=0;q<8;q++) { var off = (q+1)*this.gh+2; T[off] = 1; Rarr[off] = 0; }
for(var q=0;q<6;q++) { var off = 4*this.gh+q+2; T[off] = 1; Rarr[off] = 0; }
T[5*this.gh+2] = 0; Rarr[5*this.gh+2] = 0; // make a hole
this.Rarr = Rarr;
this.T = T;
},
reward: function(s,a,ns) {
// reward of being in s, taking action a, and ending up in ns
return this.Rarr[s];
},
nextStateDistribution: function(s,a) {
// given (s,a) return distribution over s' (in sparse form)
if(this.T[s] === 1) {
// cliff! oh no!
// var ns = 0; // reset to state zero (start)
} else if(s === 55) {
// agent wins! teleport to start
var ns = this.startState();
while(this.T[ns] === 1) {
var ns = this.randomState();
}
} else {
// ordinary space
var nx, ny;
var x = this.stox(s);
var y = this.stoy(s);
if(a === 0) {nx=x-1; ny=y;}
if(a === 1) {nx=x; ny=y-1;}
if(a === 2) {nx=x; ny=y+1;}
if(a === 3) {nx=x+1; ny=y;}
var ns = nx*this.gh+ny;
if(this.T[ns] === 1) {
// actually never mind, this is a wall. reset the agent
var ns = s;
}
}
// gridworld is deterministic, so return only a single next state
return ns;
},
sampleNextState: function(s,a) {
// gridworld is deterministic, so this is easy
var ns = this.nextStateDistribution(s,a);
var r = this.Rarr[s]; // observe the raw reward of being in s, taking a, and ending up in ns
r -= 0.01; // every step takes a bit of negative reward
var out = {'ns':ns, 'r':r};
if(s === 55 && ns === 0) {
// episode is over
out.reset_episode = true;
}
return out;
},
allowedActions: function(s) {
var x = this.stox(s);
var y = this.stoy(s);
var as = [];
if(x > 0) { as.push(0); }
if(y > 0) { as.push(1); }
if(y < this.gh-1) { as.push(2); }
if(x < this.gw-1) { as.push(3); }
return as;
},
randomState: function() { return Math.floor(Math.random()*this.gs); },
startState: function() { return 0; },
getNumStates: function() { return this.gs; },
getMaxNumActions: function() { return 4; },
// private functions
stox: function(s) { return Math.floor(s/this.gh); },
stoy: function(s) { return s % this.gh; },
xytos: function(x,y) { return x*this.gh + y; },
}
// ------
// UI
// ------
var rs = {};
var trs = {};
var tvs = {};
var pas = {};
var cs = 60; // cell size
var initGrid = function() {
var d3elt = d3.select('#draw');
d3elt.html('');
rs = {};
trs = {};
tvs = {};
pas = {};
var gh= env.gh; // height in cells
var gw = env.gw; // width in cells
var gs = env.gs; // total number of cells
var w = 600;
var h = 600;
svg = d3elt.append('svg').attr('width', w).attr('height', h)
.append('g').attr('transform', 'scale(1)');
// define a marker for drawing arrowheads
svg.append("defs").append("marker")
.attr("id", "arrowhead")
.attr("refX", 3)
.attr("refY", 2)
.attr("markerWidth", 3)
.attr("markerHeight", 4)
.attr("orient", "auto")
.append("path")
.attr("d", "M 0,0 V 4 L3,2 Z");
for(var y=0;y<gh;y++) {
for(var x=0;x<gw;x++) {
var xcoord = x*cs;
var ycoord = y*cs;
var s = env.xytos(x,y);
var g = svg.append('g');
// click callback for group
g.on('click', function(ss) {
return function() { cellClicked(ss); } // close over s
}(s));
// set up cell rectangles
var r = g.append('rect')
.attr('x', xcoord)
.attr('y', ycoord)
.attr('height', cs)
.attr('width', cs)
.attr('fill', '#FFF')
.attr('stroke', 'black')
.attr('stroke-width', 2);
rs[s] = r;
// reward text
var tr = g.append('text')
.attr('x', xcoord + 5)
.attr('y', ycoord + 55)
.attr('font-size', 10)
.text('');
trs[s] = tr;
// skip rest for cliffs
if(env.T[s] === 1) { continue; }
// value text
var tv = g.append('text')
.attr('x', xcoord + 5)
.attr('y', ycoord + 20)
.text('');
tvs[s] = tv;
// policy arrows
pas[s] = []
for(var a=0;a<4;a++) {
var pa = g.append('line')
.attr('x1', xcoord)
.attr('y1', ycoord)
.attr('x2', xcoord)
.attr('y2', ycoord)
.attr('stroke', 'black')
.attr('stroke-width', '2')
.attr("marker-end", "url(#arrowhead)");
pas[s].push(pa);
}
}
}
}
var drawGrid = function() {
var gh= env.gh; // height in cells
var gw = env.gw; // width in cells
var gs = env.gs; // total number of cells
// updates the grid with current state of world/agent
for(var y=0;y<gh;y++) {
for(var x=0;x<gw;x++) {
var xcoord = x*cs;
var ycoord = y*cs;
var r=255,g=255,b=255;
var s = env.xytos(x,y);
var vv = agent.V[s];
var ms = 100;
if(vv > 0) { g = 255; r = 255 - vv*ms; b = 255 - vv*ms; }
if(vv < 0) { g = 255 + vv*ms; r = 255; b = 255 + vv*ms; }
var vcol = 'rgb('+Math.floor(r)+','+Math.floor(g)+','+Math.floor(b)+')';
if(env.T[s] === 1) { vcol = "#AAA"; } // cliff cells are drawn gray
// update colors of rectangles based on value
var r = rs[s];
if(s === selected) {
// highlight selected cell
r.attr('fill', '#FF0');
} else {
r.attr('fill', vcol);
}
// write reward texts
var rv = env.Rarr[s];
var tr = trs[s];
if(rv !== 0) {
tr.text('R ' + rv.toFixed(1))
}
// skip rest for cliff
if(env.T[s] === 1) continue;
// write value
var tv = tvs[s];
tv.text(agent.V[s].toFixed(2));
// update policy arrows
var paa = pas[s];
for(var a=0;a<4;a++) {
var pa = paa[a];
var prob = agent.P[a*gs+s];
if(prob === 0) { pa.attr('visibility', 'hidden'); }
else { pa.attr('visibility', 'visible'); }
var ss = cs/2 * prob * 0.9;
var nx, ny;
if(a === 0) {nx=-ss; ny=0;}
if(a === 1) {nx=0; ny=-ss;}
if(a === 2) {nx=0; ny=ss;}
if(a === 3) {nx=ss; ny=0;}
pa.attr('x1', xcoord+cs/2)
.attr('y1', ycoord+cs/2)
.attr('x2', xcoord+cs/2+nx)
.attr('y2', ycoord+cs/2+ny);
}
}
}
}
var selected = -1;
var cellClicked = function(s) {
if(s === selected) {
selected = -1; // toggle off
$("#creward").html('(select a cell)');
} else {
selected = s;
$("#creward").html(env.Rarr[s].toFixed(2));
$("#rewardslider").slider('value', env.Rarr[s]);
}
drawGrid(); // redraw
}
var updatePolicy = function() {
agent.updatePolicy();
drawGrid();
}
var evaluatePolicy = function() {
agent.evaluatePolicy();
drawGrid();
}
var sid = -1;
var runValueIteration = function() {
if(sid === -1) {
sid = setInterval(function(){
agent.evaluatePolicy();
agent.updatePolicy();
drawGrid();
}, 100);
} else {
clearInterval(sid);
sid = -1;
}
}
function resetAll() {
env.reset();
agent.reset();
drawGrid();
}
var agent, env;
function start() {
env = new Gridworld(); // create environment
agent = new RL.DPAgent(env, {'gamma':0.9}); // create an agent, yay!
initGrid();
drawGrid();
$("#rewardslider").slider({
min: -5,
max: 5.1,
value: 0,
step: 0.1,
slide: function(event, ui) {
if(selected >= 0) {
env.Rarr[selected] = ui.value;
$("#creward").html(ui.value.toFixed(2));
drawGrid();
} else {
$("#creward").html('(select a cell)');
}
}
});
// syntax highlighting
//marked.setOptions({highlight:function(code){ return hljs.highlightAuto(code).value; }});
$(".md").each(function(){
$(this).html(marked($(this).html()));
});
renderJax();
}
var jaxrendered = false;
function renderJax() {
if(jaxrendered) { return; }
(function () {
var script = document.createElement("script");
script.type = "text/javascript";
script.src = "http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
document.getElementsByTagName("head")[0].appendChild(script);
jaxrendered = true;
})();
}
</script>
</head>
<body onload="start();">
<a href="https://github.com/karpathy/reinforcejs"><img style="position: absolute; top: 0; right: 0; border: 0;" src="https://camo.githubusercontent.com/38ef81f8aca64bb9a64448d0d70f1308ef5341ab/68747470733a2f2f73332e616d617a6f6e6177732e636f6d2f6769746875622f726962626f6e732f666f726b6d655f72696768745f6461726b626c75655f3132313632312e706e67" alt="Fork me on GitHub" data-canonical-src="https://s3.amazonaws.com/github/ribbons/forkme_right_darkblue_121621.png"></a>
<!-- content -->
<div id="wrap">
<div id="mynav" style="border-bottom:1px solid #999; padding-bottom: 10px; margin-bottom:50px;">
<div>
<img src="loop.svg" style="width:50px;height:50px;float:left;">
<h1 style="font-size:50px;">REINFORCE<span style="color:#058;">js</span></h1>
</div>
<ul class="nav nav-pills">
<li role="presentation"><a href="index.html">About</a></li>
<li role="presentation" class="active"><a href="gridworld_dp.html">GridWorld: DP</a></li>
<li role="presentation"><a href="gridworld_td.html">GridWorld: TD</a></li>
<li role="presentation"><a href="puckworld.html">PuckWorld: DQN</a></li>
<li role="presentation"><a href="waterworld.html">WaterWorld: DQN</a></li>
</ul>
</div>
<h2>GridWorld: Dynamic Programming Demo</h2>
<div style="text-align:center;">
<button class="btn btn-warning" onclick="evaluatePolicy()" style="width:220px;height:50px;margin-bottom:5px;">Policy Evaluation (one sweep)</button>
<button class="btn btn-warning" onclick="updatePolicy()" style="width:170px;height:50px;margin-bottom:5px;">Policy Update</button>
<button class="btn btn-success" onclick="runValueIteration()" style="width:170px;height:50px;margin-bottom:5px;">Toggle Value Iteration</button>
<button class="btn btn-danger" onclick="resetAll()" style="width:170px;height:50px;margin-bottom:5px;">Reset</button>
</div>
<br>
<div id="draw"></div>
<br>
<div id="rewardui">Cell reward: <span id="creward">(select a cell)</span> <div id="rewardslider"></div></div>
<hr>
<div id="exp" class="md">
### Setup
This is **Gridworld**, a toy environment that is commonly used in the Reinforcement Learning literature. In this particular case:
- **State space**: GridWorld has 10x10 = 100 distinct states. The start state is the top left cell. The gray cells are walls and cannot be moved to.
- **Actions**: The agent can choose from up to 4 actions to move around: up, down, left and right. Actions that would take the agent off the grid are not available.
- **Environment Dynamics**: GridWorld is deterministic: every (state, action) pair always leads to the same next state.
- **Rewards**: The agent receives +1 reward when it is in the center square (the one that shows R 1.0), and -1 reward in a few states (shown as R -1.0). The state with +1.0 reward is the goal state and resets the agent back to the start.
In other words, this is a deterministic, finite Markov Decision Process (MDP), and as always the goal is to find an agent policy (shown here by arrows) that maximizes the future discounted reward. My favorite part is letting Value Iteration converge, then changing the cell rewards and watching the policy adjust.
**Interface**. The color of the cells (initially all white) shows the current estimate of the Value (discounted reward) of that state under the current policy. Note that you can select any cell and change its reward with the *Cell reward* slider.
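For reference, the *Cell reward* slider simply writes into the environment's reward array and redraws the grid. Roughly, from this page's own UI code (`selected` is the index of the clicked cell):
<pre><code class="js">
// what moving the Cell reward slider does, roughly:
env.Rarr[selected] = ui.value; // overwrite the reward of the selected state
drawGrid();                    // redraw so the colors and texts reflect the change
</code></pre>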
### Dynamic Programming
An interested reader should refer to **Richard Sutton's Free Online Book on Reinforcement Learning**, in this particular case [Chapter 4](http://webdocs.cs.ualberta.ca/~sutton/book/ebook/node40.html). Briefly, an agent interacts with the environment based on its policy \\(\pi(a \mid s)\\). This is a function from states \\(s\\) to actions \\(a\\), or more generally to a distribution over the possible actions. After every action, the agent also receives a reward \\(r\\) from the environment. The interaction between the agent and the environment hence takes the form \\(s\_t, a\_t, r\_t, s\_{t+1}, a\_{t+1}, r\_{t+1}, s\_{t+2}, \ldots \\), indexed by \\(t\\) (time), and our goal is to find the policy \\(\pi^\*\\) that maximizes the amount of reward collected over time. In the DP approach, it is helpful to define a **Value function** \\(V^\pi(s)\\) that gives the expected discounted sum of rewards when starting from state \\(s\\) and then interacting with the environment according to \\(\pi\\):
$$
V^\pi(s) = E \\{ r\_t + \gamma r\_{t+1} + \gamma^2 r\_{t+2} + \ldots \mid s\_t = s \\}
$$
Note that the expectation is over both 1. the agent's (in general) stochastic policy and 2. the environment dynamics; that is, how the environment evolves when the agent takes an action \\(a\\) in state \\(s\\), and what reward is obtained in that case. The constant \\(\gamma\\) is a discount factor that gives more weight to earlier rewards than to later ones. We can start writing out the expectation: to get the value of state \\(s\\), we sum over all the actions the agent could take, then over all the ways the environment could respond, and so on back and forth. When you do this, you'll find that:
$$
V^\pi(s) = \sum\_{a} \pi(s,a) \sum\_{s'} \mathcal{P}\_{ss'}^{a} \left[ \mathcal{R}\_{ss'}^{a} + \gamma V^\pi(s') \right]
$$
In the above equation \\(\mathcal{P}\_{ss'}^{a}\\) and \\(\mathcal{R}\_{ss'}^{a}\\) are fixed constants specific to the environment: they give the probability of the next state \\(s'\\) given that the agent took action \\(a\\) in state \\(s\\), and the expected reward for that transition, respectively. This equation is called the **Bellman Equation** and, interestingly, it describes a recursive relationship that the value function for a given policy must satisfy.
With our goal of finding the optimal policy \\(\pi^\*(s,a)\\) that gets the most Value from all states, our strategy will be to follow the **Policy Iteration** scheme: we start out with some diffuse initial policy and evaluate the Value function of every state under that policy by turning the Bellman equation into an update. The value function effectively diffuses the rewards backwards through the environment dynamics and the agent's expected actions under its current policy. Once we have estimated the values of all states, we update the policy to be greedy with respect to the Value function. We then re-estimate the Value of each state, and continue iterating until we converge to the optimal policy (the process can be shown to converge). These two steps can be controlled by the buttons in the interface:
- The **Policy Evaluation (one sweep)** button turns the Bellman equation into a synchronous update and performs a single step of Value Function estimation. Intuitively, this update diffuses the raw Rewards at each state to other nearby states through 1. the dynamics of the environment and 2. the current policy.
- The **Policy Update** button iterates over all states and updates the policy at each state to take the action that leads to the state with the best Value (integrating over the next state distribution of the environment for each action).
- The **Value Iteration** button starts a timer that presses the two buttons in turns (see the sketch below). In particular, note that Value Iteration doesn't wait for the Value function to be fully estimated; only a single synchronous sweep of the Bellman update is carried out before each policy update. In full policy iteration there would be many sweeps (until convergence) of backups before the policy is updated.
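For reference, toggling **Value Iteration** is roughly equivalent to running the loop below. The demo uses a timer rather than a plain loop so the grid can redraw between steps; the cutoff of 100 iterations here is arbitrary:
<pre><code class="js">
// roughly what the Value Iteration toggle does (the demo runs this on a timer)
for(var k=0;k < 100;k++) { // or: repeat until the values stop changing
  agent.evaluatePolicy(); // one synchronous sweep of the Bellman backup
  agent.updatePolicy();   // make the policy greedy w.r.t. the updated values
}
drawGrid(); // update the visualization
</code></pre>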
### Sketch of the Code
The goal of **Policy Evaluation** is to update the value of every state by diffusing the rewards backwards through the dynamics of the world and current policy (this is called a *backup*). The code looks like this:
<pre><code class="js">
evaluatePolicy: function() {
// perform a synchronous update of the value function
var Vnew = zeros(this.ns); // initialize new value function array for each state
for(var s=0;s < this.ns;s++) {
var v = 0.0;
var poss = this.env.allowedActions(s); // fetch all possible actions
for(var i=0,n=poss.length;i < n;i++) {
var a = poss[i];
var prob = this.P[a*this.ns+s]; // probability of taking action under current policy
var ns = this.env.nextStateDistribution(s,a); // look up the next state
var rs = this.env.reward(s,a,ns); // get reward for s->a->ns transition
v += prob * (rs + this.gamma * this.V[ns]);
}
Vnew[s] = v;
}
this.V = Vnew; // swap
},
</code></pre>
Here, we see `this.ns` (the number of states), `this.P` which stores the current policy, and `this.env`, which is a pointer to the Environment object. The policy array is one-dimensional in this implementation, but it stores the probability of taking any action in any state, so I'm using funny indexing (`this.P[a*this.ns+s]`) to avoid dealing with 2D arrays in Javascript. The code implements the synchronous Value backup for each state:
\begin{align}
V^\pi(s) & = E\_\pi \\{ r\_t + \\gamma r\_{t+1} + \\gamma^2 r\_{t+2} + \\ldots \\mid s\_t = s \\} \\\\
& = E\_\pi \\{ r\_t + \\gamma V^\pi(s\_{t+1}) \\mid s\_t = s \\} \\\\
& = \sum\_a \pi(s,a) \sum\_{s'} \mathcal{P}\_{ss'}^a \left[ \mathcal{R}\_{ss'}^a + \\gamma V(s') \right]
\end{align}
However, note that in the code we only take the expectation over the policy's actions (the first sum above); the second sum is not evaluated because this Gridworld is deterministic (i.e. `ns` in the code is just a single integer identifying the next state). Hence, the entire probability mass is concentrated on a single next state and no expectation over next states is needed; in a stochastic environment we would also have to sum over the possible next states, as sketched below.
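For comparison, here is a sketch of what the stochastic backup for a single state might look like, assuming a hypothetical `env.nextStateProbabilities(s,a)` that returns a list of `{ns, p}` pairs (this method is not part of the library; it is only shown to illustrate the extra sum):
<pre><code class="js">
// hypothetical stochastic backup for one state s (illustration only)
var v = 0.0;
var poss = this.env.allowedActions(s);
for(var i=0,n=poss.length;i < n;i++) {
  var a = poss[i];
  var prob = this.P[a*this.ns+s]; // pi(a|s) under the current policy
  // assumed to return [{ns: next state, p: its probability}, ...]
  var trans = this.env.nextStateProbabilities(s,a);
  for(var j=0;j < trans.length;j++) {
    var rs = this.env.reward(s,a,trans[j].ns);
    v += prob * trans[j].p * (rs + this.gamma * this.V[trans[j].ns]);
  }
}
Vnew[s] = v;
</code></pre>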
Once the Value function has been re-estimated, we can perform a **Policy Update**, making the policy greedy with respect to the estimated Value in each state. This is a very simple process, but the code below is a little bloated because we handle ties between actions carefully and distribute the policy probabilities equally over all winning actions:
<pre><code class="js">
updatePolicy: function() {
// update policy to be greedy w.r.t. learned Value function
// iterate over all states...
for(var s=0;s < this.ns;s++) {
var poss = this.env.allowedActions(s);
// compute value of taking each allowed action
var vmax, nmax;
var vs = [];
for(var i=0,n=poss.length;i < n;i++) {
var a = poss[i];
// compute the value of taking action a
var ns = this.env.nextStateDistribution(s,a);
var rs = this.env.reward(s,a,ns);
var v = rs + this.gamma * this.V[ns];
// bookkeeping: store it and maintain max
vs.push(v);
if(i === 0 || v > vmax) { vmax = v; nmax = 1; }
else if(v === vmax) { nmax += 1; }
}
// update policy smoothly across all argmaxy actions
for(var i=0,n=poss.length;i < n;i++) {
var a = poss[i];
this.P[a*this.ns+s] = (vs[i] === vmax) ? 1.0/nmax : 0.0;
}
}
},
</code></pre>
Here, we are computing the action value function at each state \\(Q(s,a)\\), which measures how much expected reward we would get by taking the action \\(a\\) in the state \\(s\\). The function has the form:
\begin{align}
Q^\pi (s,a) &= E\_\pi \\{ r\_t + \\gamma V^\pi(s\_{t+1}) \\mid s\_t = s, a\_t = a \\} \\\\
&= \sum\_{s'} \mathcal{P}\_{ss'}^a \left[ \mathcal{R}\_{ss'}^a + \\gamma V^\pi(s') \right]
\end{align}
In our Gridworld example, we are looping over all states and evaluating the Q function for each of the (up to) four possible actions. Then we update the policy to take the argmaxy actions at each state. That is,
$$
\pi'(s) = \arg\max_a Q(s,a)
$$
It can be shown (see Richard Sutton's book) that performing these updates over and over again is guaranteed to converge to the optimal policy. In this Gridworld example, this corresponds to arrows that perfectly guide the agent to the terminal state where it gets reward +1.
### REINFORCEjs API use of DP
If you'd like to use the REINFORCEjs Dynamic Programming for your MDP, you have to define an environment object `env` that has a few methods that the DP agent will need:
- `env.getNumStates()` returns an integer giving the total number of states
- `env.getMaxNumActions()` returns an integer giving the maximum number of actions available in any state
- `env.allowedActions(s)` takes an integer state `s` and returns a list of available actions, which should be integers from zero to `maxNumActions - 1`
- `env.nextStateDistribution(s,a)`, which is something of a misnomer: right now the library assumes deterministic MDPs that have a single unique next state for every (state, action) pair, so the function should return a single integer that identifies the next state of the world
- `env.reward(s,a,ns)`, which returns a float for the reward achieved by the agent for the `s`, `a`, `ns` transition. In the simplest case, the reward is usually based only on the state `s` (see the minimal sketch below).
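For instance, a minimal environment satisfying this interface might look like the sketch below: a hypothetical 5-state chain (a toy illustration, not part of the library) in which action 0 steps left, action 1 steps right, and being in the rightmost state yields a reward of 1:
<pre><code class="js">
// a minimal 5-state chain environment (hypothetical example)
var ChainEnv = function() { this.ns = 5; }
ChainEnv.prototype = {
  getNumStates: function() { return this.ns; },
  getMaxNumActions: function() { return 2; },
  allowedActions: function(s) {
    var as = [];
    if(s > 0) { as.push(0); }           // 0 = step left
    if(s < this.ns - 1) { as.push(1); } // 1 = step right
    return as;
  },
  nextStateDistribution: function(s,a) {
    return a === 0 ? s - 1 : s + 1; // deterministic next state
  },
  reward: function(s,a,ns) {
    return s === this.ns - 1 ? 1.0 : 0.0; // +1 for being in the rightmost state
  },
}
</code></pre>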
See the GridWorld environment in this demo's source code for a complete example. Creating and training the agent then looks something like:
<pre><code class="js">
// create environment
env = new Gridworld();
// create the agent, yay! Discount factor 0.9
agent = new RL.DPAgent(env, {'gamma':0.9});
// call this function repeatedly until convergence:
agent.learn();
// once trained, get the agent's behavior with:
var action = agent.act(); // returns the index of the chosen action
</code></pre>
### Pros and Cons
In practice you'll rarely see people use Dynamic Programming to solve Reinforcement Learning problems. There are numerous reasons for this, but the two biggest ones are probably that:
- It's not obvious how one can extend this to continuous actions and states
- To calculate these updates one must have access to the environment model \\(P\_{ss'}^a\\), which is in practice almost never available. The best we can usually hope for is to get samples from this distribution by having an agent interact with the environment and collect experience, which we can use to approximately evaluate the expectations by sampling. More on this in TD methods!
However, the nice parts are that:
- Everything is mathematically exact, expressible and analyzable.
- If your problem is relatively small (maybe a few thousand states and up to a few tens or hundreds of actions), DP methods might be the best solution for you because they are the most stable and safest, and come with the simplest convergence guarantees.
</div>
</div>
<br><br><br><br><br>
</body>
</html>