Commit

initial commit
LazyDok committed Jan 26, 2018
1 parent 567331a commit 2e7f5a8
Showing 6 changed files with 163 additions and 5 deletions.
4 changes: 2 additions & 2 deletions 2_double_dqn.py
@@ -74,7 +74,7 @@ def forward(self, x):
# plt.ylabel('Duration')
# plt.plot(d)
#
# plt.savefig('3_dueling_dqn_score.png')
# plt.savefig('4_policy_gradient_score.png')

def select_action(state, train=True):
global steps_done
@@ -172,7 +172,7 @@ def botPlay():
break

clip = ImageSequenceClip(frames, fps=20)
clip.write_gif('3_dueling_dqn_play.gif', fps=20)
clip.write_gif('4_policy_gradient_play.gif', fps=20)

for e in range(EPISODES):
complete = run_episode(e, env)
4 changes: 2 additions & 2 deletions 3_dueling_dqn.py
@@ -73,7 +73,7 @@ def plot_durations(d):
plt.ylabel('Duration')
plt.plot(d)

plt.savefig('3_dueling_dqn_score.png')
plt.savefig('4_policy_gradient_score.png')

def select_action(state, train=True):
global steps_done
@@ -171,7 +171,7 @@ def botPlay():
break

clip = ImageSequenceClip(frames, fps=20)
clip.write_gif('3_dueling_dqn_play.gif', fps=20)
clip.write_gif('4_policy_gradient_play.gif', fps=20)

for e in range(EPISODES):
complete = run_episode(e, env)
149 changes: 149 additions & 0 deletions 4_policy_gradient.py
@@ -0,0 +1,149 @@
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch
import gym
from torch.autograd import Variable
import random

HIDDEN_LAYER = 24 # NN hidden layer size
LR = 0.01
GAMMA = 0.99

INPUT_SIZE = 4
OUTPUT_SIZE = 2

ENV = gym.make('CartPole-v0').unwrapped
HISTORY = []

class Network(nn.Module):
def __init__(self):
super(Network, self).__init__()
self.l1 = nn.Linear(INPUT_SIZE, HIDDEN_LAYER)
# nn.init.xavier_uniform(self.l1.weight)
self.l2 = nn.Linear(HIDDEN_LAYER, OUTPUT_SIZE)
# nn.init.xavier_uniform(self.l2.weight)
def forward(self, x):
x = F.relu(self.l1(x))
x = F.softmax(self.l2(x))
return x
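# For CartPole-v0 the input x is the 4-dimensional observation
# (cart position, cart velocity, pole angle, pole angular velocity) and the
# output is a length-2 vector of action probabilities (push left / push right).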

model = Network()

use_cuda = torch.cuda.is_available()
if use_cuda:
model.cuda()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor

optim = torch.optim.Adam(model.parameters(), lr=LR)

def discount_rewards(r):
discounted_r = torch.zeros(r.size())
running_add = 0
for t in reversed(range(len(r))):
running_add = running_add * GAMMA + r[t]
discounted_r[t] = running_add

return discounted_r
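# Worked example: with GAMMA = 0.99 and rewards [1, 1, 1] the discounted
# returns are [1 + 0.99*(1 + 0.99*1), 1 + 0.99*1, 1] = [2.9701, 1.99, 1.0].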

def run_episode(net, e, env):
state = env.reset()
reward_sum = 0
xs = FloatTensor([])
ys = FloatTensor([])
rewards = FloatTensor([])
steps = 0

while True:
# env.render()

x = FloatTensor([state])
xs = torch.cat([xs, x])

action_prob = net(Variable(x))

# sample an action according to the predicted probabilities (action_prob[0][0] is P(action 0))
action = 0 if random.random() < action_prob.data[0][0] else 1

y = FloatTensor([[1, 0]] if action == 0 else [[0, 1]])
ys = torch.cat([ys, y])

state, reward, done, _ = env.step(action)
rewards = torch.cat([rewards, FloatTensor([[reward]])])
reward_sum += reward
steps += 1

if done or steps >= 500:
adv = discount_rewards(rewards)
# adv = (adv - adv.mean())
adv = (adv - adv.mean())/(adv.std() + 1e-7)
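# normalizing the returns to zero mean / unit variance keeps the gradient scale
# roughly constant across episodes (a common variance-reduction trick)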
# print(adv)
loss = learn(xs, ys, adv)
HISTORY.append(reward_sum)
print("[Episode {:>5}] steps: {:>5} loss: {:>5}".format(e, steps, loss))
if sum(HISTORY[-5:])/5 > 490:
return True
else:
return False

def learn(x, y, adv):
# Loss function: -∑ Ai * log p(yi|xi), built with a one-hot "fake" label y so autodiff can be used
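# Because y is one-hot, summing y * log(action_pred) over dim 1 picks out log p(chosen action | state);
# minimizing the mean of -adv * log p is the standard REINFORCE policy-gradient update.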
action_pred = model(Variable(x))
y = Variable(y, requires_grad=True)
adv = Variable(adv).cuda() if use_cuda else Variable(adv)  # only move to the GPU when CUDA is available
# print(action_pred)
log_lik = -y * torch.log(action_pred)
# print(y)
log_lik_adv = log_lik * adv
# print(torch.sum(log_lik_adv, 1))
loss = torch.sum(log_lik_adv, 1).mean()

optim.zero_grad()
loss.backward()
optim.step()

return loss.data[0]


for e in range(10000):
complete = run_episode(model, e, ENV)

if complete:
print('complete...!')
break

import matplotlib.pyplot as plt
from moviepy.editor import ImageSequenceClip

def botPlay(env):
state = env.reset()
steps = 0
frames = []
while True:
frame = env.render(mode='rgb_array')
frames.append(frame)
action = torch.max(model(Variable(FloatTensor([state]))), 1)[1].data[0]
next_state, reward, done, _ = env.step(action)

state = next_state
steps += 1

if done or steps >= 1000:
break

clip = ImageSequenceClip(frames, fps=20)
clip.write_gif('4_policy_gradient_play.gif', fps=20)

def plot_durations(d):
plt.figure(2)
plt.clf()
plt.title('Training...')
plt.xlabel('Episode')
plt.ylabel('Duration')
plt.plot(d)

plt.savefig('4_policy_gradient_score.png')

botPlay(ENV)
plot_durations(HISTORY)
11 changes: 10 additions & 1 deletion README.md
@@ -1,5 +1,5 @@
# RL-Pytorch-Cartpole
Reinforcement Learning tutorial with pytorch
Reinforcement Learning tutorial by pytorch

> Implemented algorithms:
@@ -33,3 +33,12 @@ Playing :
Playing :

![alt text](/img/3_dueling_dqn_play.gif "Playing")

> Policy Gradient
More stable, faster (no replay memory needed), and simpler (no separate exploration policy to tune).
![alt text](/img/4_policy_gradient_score.png "Learning")

Playing :

![alt text](/img/4_policy_gradient_play.gif "Playing")
Binary file added img/4_policy_gradient_play.gif
Binary file added img/4_policy_gradient_score.png
