Commit 2e7f5a8
LazyDok committed Jan 26, 2018
1 parent 567331a
Showing 6 changed files with 163 additions and 5 deletions.
@@ -0,0 +1,149 @@
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch
import gym
from torch.autograd import Variable
import random

HIDDEN_LAYER = 24  # NN hidden layer size
LR = 0.01
GAMMA = 0.99  # reward discount factor

INPUT_SIZE = 4   # CartPole state: cart position/velocity, pole angle/velocity
OUTPUT_SIZE = 2  # two discrete actions: push cart left or right

ENV = gym.make('CartPole-v0').unwrapped  # unwrapped: remove the 200-step TimeLimit
HISTORY = []  # total reward per episode

class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.l1 = nn.Linear(INPUT_SIZE, HIDDEN_LAYER)
        # nn.init.xavier_uniform(self.l1.weight)
        self.l2 = nn.Linear(HIDDEN_LAYER, OUTPUT_SIZE)
        # nn.init.xavier_uniform(self.l2.weight)

    def forward(self, x):
        x = F.relu(self.l1(x))
        x = F.softmax(self.l2(x), dim=1)  # action probabilities
        return x
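
# The network is the stochastic policy pi(a|s): forward() returns a
# probability for each of the two actions given a 4-dim state, and
# run_episode() below samples actions from that distribution.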

model = Network()

use_cuda = torch.cuda.is_available()
if use_cuda:
    model.cuda()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor

optim = torch.optim.Adam(model.parameters(), lr=LR)

def discount_rewards(r):
    # standard discounted-return computation: walk backwards through the
    # episode accumulating running_add = r[t] + GAMMA * running_add
    discounted_r = torch.zeros(r.size())
    running_add = 0
    for t in reversed(range(len(r))):
        running_add = running_add * GAMMA + r[t]
        discounted_r[t] = running_add

    return discounted_r
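
# Worked example: for rewards [1, 1, 1] and GAMMA = 0.99 the discounted
# returns are [1 + 0.99*1.99, 1 + 0.99*1, 1] = [2.9701, 1.99, 1.0].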

def run_episode(net, e, env):
    state = env.reset()
    reward_sum = 0
    xs = FloatTensor([])
    ys = FloatTensor([])
    rewards = FloatTensor([])
    steps = 0

    while True:
        # env.render()

        x = FloatTensor([state])
        xs = torch.cat([xs, x])

        action_prob = net(Variable(x))

        # sample an action according to the predicted probabilities
        action = 0 if random.random() < action_prob.data[0][0] else 1

        # one-hot encoding of the action actually taken
        y = FloatTensor([[1, 0]] if action == 0 else [[0, 1]])
        ys = torch.cat([ys, y])

        state, reward, done, _ = env.step(action)
        rewards = torch.cat([rewards, FloatTensor([[reward]])])
        reward_sum += reward
        steps += 1

        if done or steps >= 500:
            adv = discount_rewards(rewards)
            # adv = (adv - adv.mean())
            # normalize returns to zero mean / unit variance to reduce gradient variance
            adv = (adv - adv.mean())/(adv.std() + 1e-7)
            # print(adv)
            loss = learn(xs, ys, adv)
            HISTORY.append(reward_sum)
            print("[Episode {:>5}] steps: {:>5} loss: {:>5}".format(e, steps, loss))
            if sum(HISTORY[-5:])/5 > 490:
                return True
            else:
                return False
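
# learn() implements the REINFORCE surrogate loss: with one-hot labels y,
# -sum(y * log pi(x)) is the negative log-probability of the taken action,
# and weighting it by the normalized discounted return A gives a loss whose
# gradient is -A * grad log pi(a|s).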

def learn(x, y, adv):
    # Loss: mean over the batch of sum_i -A_i * log p(y_i | x_i); the one-hot
    # "fake label" y is needed so autodiff picks out the log-probability of
    # the action that was actually taken
    action_pred = model(Variable(x))
    y = Variable(y)
    adv = Variable(adv.type(FloatTensor))  # was unconditionally .cuda(); this also runs on CPU
    # print(action_pred)
    log_lik = -y * torch.log(action_pred)
    # print(y)
    log_lik_adv = log_lik * adv
    # print(torch.sum(log_lik_adv, 1))
    loss = torch.sum(log_lik_adv, 1).mean()

    optim.zero_grad()
    loss.backward()
    optim.step()

    return loss.data[0]
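
# In expectation this matches the policy-gradient theorem,
# grad J(theta) = E[ grad log pi(a|s) * R ], with the normalized
# discounted return standing in for R.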

for e in range(10000):
    complete = run_episode(model, e, ENV)

    if complete:
        print('complete...!')
        break
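
# Stopping criterion: run_episode caps episodes at 500 steps, so an average
# reward above 490 over the last five episodes means the pole stayed up for
# nearly the whole episode each time.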

import matplotlib.pyplot as plt
from moviepy.editor import ImageSequenceClip

def botPlay(env):
    # replay the trained policy greedily (argmax instead of sampling)
    # and record the rendered frames
    state = env.reset()
    steps = 0
    frames = []
    while True:
        frame = env.render(mode='rgb_array')
        frames.append(frame)
        action = torch.max(model(Variable(FloatTensor([state]))), 1)[1].data[0]
        next_state, reward, done, _ = env.step(action)

        state = next_state
        steps += 1

        if done or steps >= 1000:
            break

    clip = ImageSequenceClip(frames, fps=20)
    clip.write_gif('4_policy_gradient_play.gif', fps=20)

def plot_durations(d):
    plt.figure(2)
    plt.clf()
    plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(d)

    plt.savefig('4_policy_gradient_score.png')

botPlay(ENV)
plot_durations(HISTORY)