Commit

initial commit
lukau2357 committed May 7, 2022
0 parents commit aeca317
Showing 7 changed files with 499 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
**/__pycache__
# Reward history
**/history
# Model weights, too big for a GitHub repository
**/saved
11 changes: 11 additions & 0 deletions README.md
@@ -0,0 +1,11 @@
## Project description
Implementation of TD3 (Twin Delayed DDPG), a reinforcement learning algorithm ([original publication](https://arxiv.org/pdf/1802.09477.pdf)) that is particularly well suited to problems with continuous state and action spaces.

The algorithm was tested on the [BipedalWalker-v3](https://gym.openai.com/envs/BipedalWalker-v2/) environment (the official documentation still lists v2 as the latest version, but v2 is deprecated). We trained the agent on a high-performance GPU with CUDA, and after 550 episodes we obtained the following results:
![walk_demo](https://drive.google.com/uc?id=1y0_Z9uhuqt7hOb3m1wWrZzy1cKKR6NfV)

Project dependencies are listed in the `requirements.txt` file.
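
As a rough illustration, the agent can be driven by a loop along the following lines. This is only a sketch: the hyperparameters are placeholders rather than the values used for the results above, the `env.name` assignment reflects an assumption about how the (not shown) entry-point script names the environment, and the real script may differ.

```python
import gym
from Agent import Agent

env = gym.make("BipedalWalker-v3")
env.name = "BipedalWalker-v3"  # Agent uses env.name to build checkpoint file names
agent = Agent(env, learningRate=1e-3, gamma=0.99, tau=0.005, shouldLoad=False)

for episode in range(550):
    state = env.reset()
    done = False
    step = 0
    while not done:
        action = agent.getNoisyAction(state, sigma=0.1)   # exploration noise
        nextState, reward, done, _ = env.step(action)
        agent.buffer.store(state, action, reward, nextState, done)
        if agent.buffer.currentSize >= 100:
            # delay the actor/target updates to every other critic update (TD3)
            agent.update(miniBatchSize=100, trainingSigma=0.2,
                         trainingClip=0.5, updatePolicy=step % 2 == 0)
        state = nextState
        step += 1
    agent.save()
```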

## Todo
- To properly estimate how well TD3 performs on this environment, we plan to repeat the training process 10-60 times and, from those runs, estimate the uncertainty of the reward obtained for each agent-episode pair by constructing simple 95% confidence intervals for these quantities (a sketch of this computation follows the list).
- The paper and presentation for this project are still in progress.
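
A minimal sketch of the planned confidence-interval computation, assuming the episode rewards from the repeated runs are collected into a NumPy array of shape `(numRuns, numEpisodes)` (the array name and shape are assumptions, not something produced by the current code):

```python
import numpy as np

def rewardConfidenceIntervals(rewards: np.ndarray, z: float = 1.96):
    """rewards: shape (numRuns, numEpisodes). Returns the per-episode mean and a
    simple normal-approximation 95% confidence interval (z = 1.96)."""
    numRuns = rewards.shape[0]
    means = rewards.mean(axis=0)
    # standard error of the mean across independent training runs
    sems = rewards.std(axis=0, ddof=1) / np.sqrt(numRuns)
    return means, means - z * sems, means + z * sems
```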
107 changes: 107 additions & 0 deletions requirements.txt
@@ -0,0 +1,107 @@
# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: win-64
blas=1.0=mkl
bottleneck=1.3.4=py39h080aedc_0
box2d-py=2.3.8=py39h415ef7b_5
brotli=1.0.9=ha925a31_2
ca-certificates=2022.4.26=haa95532_0
cairo=1.16.0=hb19e0ff_1008
certifi=2021.10.8=py39haa95532_2
cloudpickle=2.0.0=pyhd3eb1b0_0
cudatoolkit=11.3.1=h59b6b97_2
cycler=0.11.0=pyhd3eb1b0_0
expat=2.4.8=h39d44d4_0
ffmpeg=4.2.2=he774522_0
fontconfig=2.14.0=hce3cb01_0
fonttools=4.25.0=pyhd3eb1b0_0
freetype=2.10.4=hd328e21_0
fribidi=1.0.10=h62dcd97_0
future=0.18.2=py39haa95532_1
getopt-win32=0.1=h8ffe710_0
gettext=0.19.8.1=ha2e2712_1008
graphviz=2.47.1=h8277ce1_0
gts=0.7.6=h7c369d9_2
gym=0.21.0=py39h832f523_2
icu=68.1=h6c2663c_0
intel-openmp=2021.4.0=haa95532_3556
jpeg=9e=h2bbff1b_0
kiwisolver=1.3.2=py39hd77b12b_0
libclang=11.1.0=default_h5c34c98_1
libffi=3.4.2=h604cdb4_1
libgd=2.3.2=h138e682_0
libglib=2.68.4=h3be07f2_1
libiconv=1.16=h2bbff1b_2
libpng=1.6.37=h2a8f88b_0
libtiff=4.2.0=hd0e1b90_0
libuv=1.40.0=he774522_0
libwebp=1.2.2=h2bbff1b_0
libwebp-base=1.2.2=h2bbff1b_0
libxcb=1.13=hcd874cb_1004
lz4-c=1.9.3=h2bbff1b_1
m2w64-gcc-libgfortran=5.3.0=6
m2w64-gcc-libs=5.3.0=7
m2w64-gcc-libs-core=5.3.0=7
m2w64-gmp=6.1.0=2
m2w64-libwinpthread-git=5.0.0.4634.697f757=2
matplotlib=3.5.1=py39haa95532_1
matplotlib-base=3.5.1=py39hd77b12b_1
mkl=2021.4.0=haa95532_640
mkl-service=2.4.0=py39h2bbff1b_0
mkl_fft=1.3.1=py39h277e83a_0
mkl_random=1.2.2=py39hf11a4ad_0
msys2-conda-epoch=20160418=1
munkres=1.1.4=py_0
numexpr=2.8.1=py39hb80d3ca_0
numpy=1.21.5=py39ha4e8547_0
numpy-base=1.21.5=py39hc2deb75_0
openssl=1.1.1n=h2bbff1b_0
packaging=21.3=pyhd3eb1b0_0
pandas=1.4.1=py39hd77b12b_1
pango=1.42.4=hec34917_5
pcre=8.45=hd77b12b_0
pillow=9.0.1=py39hdc2b20a_0
pip=21.2.4=py39haa95532_0
pixman=0.40.0=h2bbff1b_1
pthread-stubs=0.3=h3c9f919_1
pyglet=1.5.16=py39hcbf5309_1
pyparsing=3.0.4=pyhd3eb1b0_0
pyqt=5.12.3=py39hcbf5309_8
pyqt-impl=5.12.3=py39h415ef7b_8
pyqt5-sip=4.19.18=py39h415ef7b_8
pyqtchart=5.12=py39h415ef7b_8
pyqtwebengine=5.12.1=py39h415ef7b_8
python=3.9.12=h6244533_0
python-dateutil=2.8.2=pyhd3eb1b0_0
python-graphviz=0.19.2=pypi_0
python_abi=3.9=2_cp39
pytorch=1.11.0=py3.9_cuda11.3_cudnn8_0
pytorch-mutex=1.0=cuda
pytz=2021.3=pyhd3eb1b0_0
qt=5.12.9=h5909a2a_4
setuptools=61.2.0=py39haa95532_0
six=1.16.0=pyhd3eb1b0_1
sqlite=3.38.2=h2bbff1b_0
tk=8.6.11=h2bbff1b_0
torchviz=0.0.2=pypi_0
tornado=6.1=py39h2bbff1b_0
typing_extensions=4.1.1=pyh06a4308_0
tzdata=2022a=hda174b7_0
vc=14.2=h21ff451_1
vs2015_runtime=14.27.29016=h5e58377_2
wheel=0.37.1=pyhd3eb1b0_0
wincertstore=0.2=py39haa95532_2
xorg-kbproto=1.0.7=hcd874cb_1002
xorg-libice=1.0.10=hcd874cb_0
xorg-libsm=1.2.3=hcd874cb_1000
xorg-libx11=1.7.2=hcd874cb_0
xorg-libxau=1.0.9=hcd874cb_0
xorg-libxdmcp=1.1.3=hcd874cb_0
xorg-libxext=1.3.4=hcd874cb_1
xorg-libxpm=3.5.13=hcd874cb_0
xorg-libxt=1.2.1=hcd874cb_2
xorg-xextproto=7.3.0=hcd874cb_1002
xorg-xproto=7.0.31=hcd874cb_1007
xz=5.2.5=h8cc25b3_1
zlib=1.2.12=h8cc25b3_2
zstd=1.4.9=h19a0ad4_0
160 changes: 160 additions & 0 deletions src/Agent.py
@@ -0,0 +1,160 @@
import numpy as np
import os
import pickle
import torch as T
import torch.nn as nn
from copy import deepcopy
from gym.core import Env
from Buffer import Buffer
from Network import Network

class Agent():
    def __init__(
        self, env: Env, learningRate: float, gamma: float, tau: float,
        shouldLoad: bool=True, saveFolder: str='saved'
    ):
        self.observationDim = env.observation_space.shape[0]
        self.actionDim = env.action_space.shape[0]
        self.gamma = gamma
        self.tau = tau
        # check if the saveFolder path exists
        if not os.path.isdir(saveFolder):
            os.mkdir(saveFolder)
        # NOTE: the environment is expected to carry a custom `name` attribute
        # (assigned by the caller); it is used as a prefix for checkpoint files.
        self.envName = os.path.join(saveFolder, env.name + '.')
        name = self.envName

        self.device = T.device("cuda" if T.cuda.is_available() else 'cpu')

        self.buffer = pickle.load(open(name + 'Replay', 'rb'))\
            if shouldLoad and os.path.exists(name + 'Replay') else Buffer(
                self.observationDim, self.actionDim
            )
        # initialize the actor and critics
        self.actor = pickle.load(open(name + 'Actor', 'rb'))\
            if shouldLoad and os.path.exists(name + 'Actor') else Network(
                [self.observationDim, 400, 300, self.actionDim],
                nn.Tanh,
                learningRate,
                self.device
            )

        self.critic1 = pickle.load(open(name + 'Critic1', 'rb'))\
            if shouldLoad and os.path.exists(name + 'Critic1') else Network(
                [self.observationDim + self.actionDim, 400, 300, 1],
                nn.Identity,
                learningRate,
                self.device
            )

        self.critic2 = pickle.load(open(name + 'Critic2', 'rb'))\
            if shouldLoad and os.path.exists(name + 'Critic2') else Network(
                [self.observationDim + self.actionDim, 400, 300, 1],
                nn.Identity,
                learningRate,
                self.device
            )

        # create target networks
        self.targetActor = pickle.load(open(name + 'TargetActor', 'rb'))\
            if shouldLoad and os.path.exists(name + 'TargetActor') else\
            deepcopy(self.actor)

        self.targetCritic1 = pickle.load(open(name + 'TargetCritic1', 'rb'))\
            if shouldLoad and os.path.exists(name + 'TargetCritic1') else\
            deepcopy(self.critic1)

        self.targetCritic2 = pickle.load(open(name + 'TargetCritic2', 'rb'))\
            if shouldLoad and os.path.exists(name + 'TargetCritic2') else\
            deepcopy(self.critic2)

    def getNoisyAction(self, state: np.ndarray, sigma: float) -> np.ndarray:
        # exploration: perturb the deterministic action with zero-mean Gaussian
        # noise and clip the result back into the valid action range [-1, 1]
        deterministicAction = self.getDeterministicAction(state)
        noise = np.random.normal(0, sigma, deterministicAction.shape)
        return np.clip(deterministicAction + noise, -1, +1)

    def getDeterministicAction(self, state: np.ndarray) -> np.ndarray:
        actions: T.Tensor = self.actor.forward(T.tensor(state, device=self.device))
        return actions.cpu().detach().numpy()

    def update(
        self, miniBatchSize: int, trainingSigma: float, trainingClip: float,
        updatePolicy: bool
    ):
        # randomly sample a mini-batch from the replay buffer
        miniBatch = self.buffer.getMiniBatch(miniBatchSize)
        # create tensors to start generating computational graph
        states = T.tensor(miniBatch["states"], requires_grad=True, device=self.device)
        actions = T.tensor(miniBatch["actions"], requires_grad=True, device=self.device)
        rewards = T.tensor(miniBatch["rewards"], requires_grad=True, device=self.device)
        nextStates = T.tensor(
            miniBatch["nextStates"], requires_grad=True, device=self.device
        )
        dones = T.tensor(miniBatch["doneFlags"], requires_grad=True, device=self.device)
        # compute the targets
        targets = self.computeTargets(
            rewards, nextStates, dones, trainingSigma, trainingClip
        )
        # do a single step on each critic network
        Q1Loss = self.computeQLoss(self.critic1, states, actions, targets)
        self.critic1.gradientDescentStep(Q1Loss, True)
        Q2Loss = self.computeQLoss(self.critic2, states, actions, targets)
        self.critic2.gradientDescentStep(Q2Loss)
        if updatePolicy:
            # do a single step on the actor network
            policyLoss = self.computePolicyLoss(states)
            self.actor.gradientDescentStep(policyLoss)
            # update target networks
            self.updateTargetNetwork(self.targetActor, self.actor)
            self.updateTargetNetwork(self.targetCritic1, self.critic1)
            self.updateTargetNetwork(self.targetCritic2, self.critic2)

    def computeTargets(
        self, rewards: T.Tensor, nextStates: T.Tensor, dones: T.Tensor,
        trainingSigma: float, trainingClip: float
    ) -> T.Tensor:
        # TD3 target with target-policy smoothing and clipped double-Q learning:
        #   y = r + gamma*(1 - done)*min(Q1'(s', a~), Q2'(s', a~)),
        #   where a~ = clip(pi'(s') + clip(noise, -c, +c), -1, +1)
        targetActions = self.targetActor.forward(nextStates.float())
        # create additive noise for target actions
        noise = np.random.normal(0, trainingSigma, targetActions.shape)
        clippedNoise = T.tensor(
            np.clip(noise, -trainingClip, +trainingClip), device=self.device
        )
        targetActions = T.clip(targetActions + clippedNoise, -1, +1)
        # compute targets
        targetQ1Values = T.squeeze(
            self.targetCritic1.forward(T.hstack([nextStates, targetActions]).float())
        )
        targetQ2Values = T.squeeze(
            self.targetCritic2.forward(T.hstack([nextStates, targetActions]).float())
        )
        targetQValues = T.minimum(targetQ1Values, targetQ2Values)
        return rewards + self.gamma*(1 - dones)*targetQValues

    def computeQLoss(
        self, network: Network, states: T.Tensor, actions: T.Tensor, targets: T.Tensor
    ) -> T.Tensor:
        # compute the MSE of the Q function with respect to the targets
        QValues = T.squeeze(network.forward(T.hstack([states, actions]).float()))
        return T.square(QValues - targets).mean()

    def computePolicyLoss(self, states: T.Tensor) -> T.Tensor:
        # deterministic policy gradient objective: maximize Q1(s, pi(s)),
        # i.e. minimize -mean(Q1(s, pi(s)))
        actions = self.actor.forward(states.float())
        QValues = T.squeeze(self.critic1.forward(T.hstack([states, actions]).float()))
        return -QValues.mean()

    def updateTargetNetwork(self, targetNetwork: Network, network: Network):
        # Polyak (soft) update: targetParam <- (1 - tau)*targetParam + tau*param
        with T.no_grad():
            for targetParameter, parameter in zip(
                targetNetwork.parameters(), network.parameters()
            ):
                targetParameter.mul_(1 - self.tau)
                targetParameter.add_(self.tau*parameter)

    def save(self, suffix = ""):
        name = self.envName
        pickle.dump(self.buffer, open(name + 'Replay' + suffix, 'wb'))
        pickle.dump(self.actor, open(name + 'Actor' + suffix, 'wb'))
        pickle.dump(self.critic1, open(name + 'Critic1' + suffix, 'wb'))
        pickle.dump(self.critic2, open(name + 'Critic2' + suffix, 'wb'))
        pickle.dump(self.targetActor, open(name + 'TargetActor' + suffix, 'wb'))
        pickle.dump(self.targetCritic1, open(name + 'TargetCritic1' + suffix, 'wb'))
        pickle.dump(self.targetCritic2, open(name + 'TargetCritic2' + suffix, 'wb'))
44 changes: 44 additions & 0 deletions src/Buffer.py
@@ -0,0 +1,44 @@
import numpy as np

class Buffer():
    def __init__(self, observationDim: int, actionDim: int, size: int=1_000_000):
        # use a fixed-size buffer to prevent constant list instantiations
        self.states = np.zeros((size, observationDim))
        self.actions = np.zeros((size, actionDim))
        self.rewards = np.zeros(size)
        self.nextStates = np.zeros((size, observationDim))
        self.doneFlags = np.zeros(size)
        # use a pointer to keep track of where in the buffer we are
        self.pointer = 0
        # use current size to ensure we don't train on any non-existent data points
        self.currentSize = 0
        self.size = size

    def store(
        self, state: np.ndarray, action: np.ndarray, reward: float, nextState: np.ndarray,
        doneFlag: bool
    ):
        # store all the data for this transition
        ptr = self.pointer
        self.states[ptr] = state
        self.actions[ptr] = action
        self.rewards[ptr] = reward
        self.nextStates[ptr] = nextState
        self.doneFlags[ptr] = doneFlag
        # update the pointer and current size
        self.pointer = (self.pointer + 1) % self.size
        self.currentSize = min(self.currentSize + 1, self.size)

    def getMiniBatch(self, size: int) -> dict:
        # ensure size is not bigger than the current size of the buffer
        size = min(size, self.currentSize)
        # generate random indices
        indices = np.random.choice(self.currentSize, size, replace=False)
        # return the mini-batch of transitions
        return {
            "states": self.states[indices],
            "actions": self.actions[indices],
            "rewards": self.rewards[indices],
            "nextStates": self.nextStates[indices],
            "doneFlags": self.doneFlags[indices],
        }
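
A minimal usage sketch of the replay buffer above; the dimensions match BipedalWalker-v3's 24-dimensional observations and 4-dimensional actions, and the stored values are made up for illustration:

```python
import numpy as np
from Buffer import Buffer

buffer = Buffer(observationDim=24, actionDim=4, size=1000)
# store a single (made-up) transition
buffer.store(np.zeros(24), np.zeros(4), reward=0.1, nextState=np.ones(24), doneFlag=False)
# sample a mini-batch; every value is a NumPy array indexed by the same rows
batch = buffer.getMiniBatch(64)
print(batch["states"].shape, batch["rewards"].shape)  # (1, 24) (1,) until more transitions arrive
```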
33 changes: 33 additions & 0 deletions src/Network.py
@@ -0,0 +1,33 @@
from typing import Callable
import torch as T
import torch.nn as nn
import torch.optim as optim

class Network(nn.Module):
    def __init__(
        self, shape: list, outputActivation: Callable, learningRate: float,
        device: T.device
    ):
        super().__init__()
        # initialize the network: fully connected layers with ReLU activations
        # in between and the given activation on the output layer
        layers = []
        for i in range(1, len(shape)):
            dim1 = shape[i - 1]
            dim2 = shape[i]
            layers.append(nn.Linear(dim1, dim2))
            if i < len(shape) - 1:
                layers.append(nn.ReLU())
        layers.append(outputActivation())
        self.network = nn.Sequential(*layers)

        self.optimizer = optim.Adam(self.parameters(), lr=learningRate)
        self.device = device
        self.to(self.device)

    def forward(self, state: T.Tensor) -> T.Tensor:
        return self.network(state)

    def gradientDescentStep(self, loss: T.Tensor, retainGraph: bool=False):
        self.optimizer.zero_grad()
        loss.backward(retain_graph=retainGraph)
        self.optimizer.step()
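
For reference, a short construction sketch mirroring how `Agent.py` wires this class into the actor and critic networks for BipedalWalker-v3 (24-dimensional observations, 4-dimensional actions); the learning rate here is illustrative, not a value taken from the project:

```python
import torch as T
import torch.nn as nn
from Network import Network

device = T.device("cuda" if T.cuda.is_available() else "cpu")
# actor: maps a state to an action in [-1, 1]^4 via a Tanh output layer
actor = Network([24, 400, 300, 4], nn.Tanh, learningRate=1e-3, device=device)
# critic: maps a concatenated (state, action) pair to a scalar Q-value, no output squashing
critic = Network([24 + 4, 400, 300, 1], nn.Identity, learningRate=1e-3, device=device)

state = T.zeros(24, device=device)
print(actor.forward(state).shape)  # torch.Size([4])
```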