Commit aeca317 (0 parents): 7 changed files with 499 additions and 0 deletions.
.gitignore
@@ -0,0 +1,5 @@
**/__pycache__
# Reward history
**/history
# Model weights, too big for a GitHub repository
**/saved
README.md
@@ -0,0 +1,11 @@
## Project description
Implementation of TD3, the Twin Delayed DDPG algorithm for reinforcement learning ([original publication link](https://arxiv.org/pdf/1802.09477.pdf)), particularly useful for problems with continuous action and state spaces.
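For reference, the bootstrapped target that `Agent.computeTargets` below implements is TD3's clipped double-Q target with target policy smoothing:

y = r + γ·(1 − d)·min(Q′₁(s′, ã), Q′₂(s′, ã)), where ã = clip(μ′(s′) + clip(ε, −c, c), −1, 1), ε ~ N(0, σ).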

The algorithm was tested on the [BipedalWalker-v3](https://gym.openai.com/envs/BipedalWalker-v2/) environment (even though the official documentation lists v2 as the latest version, it is deprecated!). We trained the agent on a high-performance GPU with CUDA, and after 550 episodes the following results were obtained:



Project dependencies can be found in the requirements.txt file, as usual.

## Todo
- To properly estimate how well TD3 performs on this environment, we planned on repeating the training process 10-60 times and, from those runs, estimating the uncertainty of the obtained reward for each agent-episode pair by constructing simple 95% confidence intervals for these quantities (a sketch of this computation follows this file).
- The paper and presentation for this project are still in progress.
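Since the confidence-interval analysis is only planned, here is a minimal sketch of how those per-episode intervals could be computed. The function name and the array layout (one reward curve per repeated run) are hypothetical, not taken from this repository:

import numpy as np

def episodeConfidenceIntervals(rewardHistories: np.ndarray):
    # rewardHistories: shape (numRuns, numEpisodes), one reward curve per run
    numRuns = rewardHistories.shape[0]
    means = rewardHistories.mean(axis=0)
    # standard error of the per-episode mean across runs
    standardErrors = rewardHistories.std(axis=0, ddof=1) / np.sqrt(numRuns)
    # 1.96 is the two-sided z-value for a 95% normal-approximation interval
    return means, means - 1.96*standardErrors, means + 1.96*standardErrors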
requirements.txt
@@ -0,0 +1,107 @@
# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: win-64
blas=1.0=mkl
bottleneck=1.3.4=py39h080aedc_0
box2d-py=2.3.8=py39h415ef7b_5
brotli=1.0.9=ha925a31_2
ca-certificates=2022.4.26=haa95532_0
cairo=1.16.0=hb19e0ff_1008
certifi=2021.10.8=py39haa95532_2
cloudpickle=2.0.0=pyhd3eb1b0_0
cudatoolkit=11.3.1=h59b6b97_2
cycler=0.11.0=pyhd3eb1b0_0
expat=2.4.8=h39d44d4_0
ffmpeg=4.2.2=he774522_0
fontconfig=2.14.0=hce3cb01_0
fonttools=4.25.0=pyhd3eb1b0_0
freetype=2.10.4=hd328e21_0
fribidi=1.0.10=h62dcd97_0
future=0.18.2=py39haa95532_1
getopt-win32=0.1=h8ffe710_0
gettext=0.19.8.1=ha2e2712_1008
graphviz=2.47.1=h8277ce1_0
gts=0.7.6=h7c369d9_2
gym=0.21.0=py39h832f523_2
icu=68.1=h6c2663c_0
intel-openmp=2021.4.0=haa95532_3556
jpeg=9e=h2bbff1b_0
kiwisolver=1.3.2=py39hd77b12b_0
libclang=11.1.0=default_h5c34c98_1
libffi=3.4.2=h604cdb4_1
libgd=2.3.2=h138e682_0
libglib=2.68.4=h3be07f2_1
libiconv=1.16=h2bbff1b_2
libpng=1.6.37=h2a8f88b_0
libtiff=4.2.0=hd0e1b90_0
libuv=1.40.0=he774522_0
libwebp=1.2.2=h2bbff1b_0
libwebp-base=1.2.2=h2bbff1b_0
libxcb=1.13=hcd874cb_1004
lz4-c=1.9.3=h2bbff1b_1
m2w64-gcc-libgfortran=5.3.0=6
m2w64-gcc-libs=5.3.0=7
m2w64-gcc-libs-core=5.3.0=7
m2w64-gmp=6.1.0=2
m2w64-libwinpthread-git=5.0.0.4634.697f757=2
matplotlib=3.5.1=py39haa95532_1
matplotlib-base=3.5.1=py39hd77b12b_1
mkl=2021.4.0=haa95532_640
mkl-service=2.4.0=py39h2bbff1b_0
mkl_fft=1.3.1=py39h277e83a_0
mkl_random=1.2.2=py39hf11a4ad_0
msys2-conda-epoch=20160418=1
munkres=1.1.4=py_0
numexpr=2.8.1=py39hb80d3ca_0
numpy=1.21.5=py39ha4e8547_0
numpy-base=1.21.5=py39hc2deb75_0
openssl=1.1.1n=h2bbff1b_0
packaging=21.3=pyhd3eb1b0_0
pandas=1.4.1=py39hd77b12b_1
pango=1.42.4=hec34917_5
pcre=8.45=hd77b12b_0
pillow=9.0.1=py39hdc2b20a_0
pip=21.2.4=py39haa95532_0
pixman=0.40.0=h2bbff1b_1
pthread-stubs=0.3=h3c9f919_1
pyglet=1.5.16=py39hcbf5309_1
pyparsing=3.0.4=pyhd3eb1b0_0
pyqt=5.12.3=py39hcbf5309_8
pyqt-impl=5.12.3=py39h415ef7b_8
pyqt5-sip=4.19.18=py39h415ef7b_8
pyqtchart=5.12=py39h415ef7b_8
pyqtwebengine=5.12.1=py39h415ef7b_8
python=3.9.12=h6244533_0
python-dateutil=2.8.2=pyhd3eb1b0_0
python-graphviz=0.19.2=pypi_0
python_abi=3.9=2_cp39
pytorch=1.11.0=py3.9_cuda11.3_cudnn8_0
pytorch-mutex=1.0=cuda
pytz=2021.3=pyhd3eb1b0_0
qt=5.12.9=h5909a2a_4
setuptools=61.2.0=py39haa95532_0
six=1.16.0=pyhd3eb1b0_1
sqlite=3.38.2=h2bbff1b_0
tk=8.6.11=h2bbff1b_0
torchviz=0.0.2=pypi_0
tornado=6.1=py39h2bbff1b_0
typing_extensions=4.1.1=pyh06a4308_0
tzdata=2022a=hda174b7_0
vc=14.2=h21ff451_1
vs2015_runtime=14.27.29016=h5e58377_2
wheel=0.37.1=pyhd3eb1b0_0
wincertstore=0.2=py39haa95532_2
xorg-kbproto=1.0.7=hcd874cb_1002
xorg-libice=1.0.10=hcd874cb_0
xorg-libsm=1.2.3=hcd874cb_1000
xorg-libx11=1.7.2=hcd874cb_0
xorg-libxau=1.0.9=hcd874cb_0
xorg-libxdmcp=1.1.3=hcd874cb_0
xorg-libxext=1.3.4=hcd874cb_1
xorg-libxpm=3.5.13=hcd874cb_0
xorg-libxt=1.2.1=hcd874cb_2
xorg-xextproto=7.3.0=hcd874cb_1002
xorg-xproto=7.0.31=hcd874cb_1007
xz=5.2.5=h8cc25b3_1
zlib=1.2.12=h8cc25b3_2
zstd=1.4.9=h19a0ad4_0
Agent.py
@@ -0,0 +1,160 @@
import numpy as np
import os
import pickle
import torch as T
import torch.nn as nn
from copy import deepcopy
from gym.core import Env
from Buffer import Buffer
from Network import Network


class Agent:
    def __init__(
        self, env: Env, learningRate: float, gamma: float, tau: float,
        shouldLoad: bool = True, saveFolder: str = 'saved'
    ):
        self.observationDim = env.observation_space.shape[0]
        self.actionDim = env.action_space.shape[0]
        self.gamma = gamma
        self.tau = tau
        # check if the saveFolder path exists
        if not os.path.isdir(saveFolder):
            os.mkdir(saveFolder)
        # note: gym environments have no `name` attribute by default;
        # the training script is expected to attach one
        self.envName = os.path.join(saveFolder, env.name + '.')
        name = self.envName

        self.device = T.device("cuda" if T.cuda.is_available() else "cpu")

        # load the replay buffer from disk if requested, otherwise start fresh
        self.buffer = pickle.load(open(name + 'Replay', 'rb'))\
            if shouldLoad and os.path.exists(name + 'Replay') else Buffer(
                self.observationDim, self.actionDim
            )
        # initialize the actor and critics
        self.actor = pickle.load(open(name + 'Actor', 'rb'))\
            if shouldLoad and os.path.exists(name + 'Actor') else Network(
                [self.observationDim, 400, 300, self.actionDim],
                nn.Tanh,
                learningRate,
                self.device
            )

        self.critic1 = pickle.load(open(name + 'Critic1', 'rb'))\
            if shouldLoad and os.path.exists(name + 'Critic1') else Network(
                [self.observationDim + self.actionDim, 400, 300, 1],
                nn.Identity,
                learningRate,
                self.device
            )

        self.critic2 = pickle.load(open(name + 'Critic2', 'rb'))\
            if shouldLoad and os.path.exists(name + 'Critic2') else Network(
                [self.observationDim + self.actionDim, 400, 300, 1],
                nn.Identity,
                learningRate,
                self.device
            )

        # create target networks
        self.targetActor = pickle.load(open(name + 'TargetActor', 'rb'))\
            if shouldLoad and os.path.exists(name + 'TargetActor') else\
            deepcopy(self.actor)

        self.targetCritic1 = pickle.load(open(name + 'TargetCritic1', 'rb'))\
            if shouldLoad and os.path.exists(name + 'TargetCritic1') else\
            deepcopy(self.critic1)

        self.targetCritic2 = pickle.load(open(name + 'TargetCritic2', 'rb'))\
            if shouldLoad and os.path.exists(name + 'TargetCrit2') else\
            deepcopy(self.critic2)

    def getNoisyAction(self, state: np.ndarray, sigma: float) -> np.ndarray:
        # add zero-mean Gaussian exploration noise to the deterministic policy
        deterministicAction = self.getDeterministicAction(state)
        noise = np.random.normal(0, sigma, deterministicAction.shape)
        return np.clip(deterministicAction + noise, -1, +1)

    def getDeterministicAction(self, state: np.ndarray) -> np.ndarray:
        actions: T.Tensor = self.actor.forward(T.tensor(state, device=self.device))
        return actions.cpu().detach().numpy()

    def update(
        self, miniBatchSize: int, trainingSigma: float, trainingClip: float,
        updatePolicy: bool
    ):
        # randomly sample a mini-batch from the replay buffer
        miniBatch = self.buffer.getMiniBatch(miniBatchSize)
        # create tensors to start generating computational graph
        states = T.tensor(miniBatch["states"], requires_grad=True, device=self.device)
        actions = T.tensor(miniBatch["actions"], requires_grad=True, device=self.device)
        rewards = T.tensor(miniBatch["rewards"], requires_grad=True, device=self.device)
        nextStates = T.tensor(
            miniBatch["nextStates"], requires_grad=True, device=self.device
        )
        dones = T.tensor(miniBatch["doneFlags"], requires_grad=True, device=self.device)
        # compute the targets
        targets = self.computeTargets(
            rewards, nextStates, dones, trainingSigma, trainingClip
        )
        # do a single step on each critic network (retain the graph on the first
        # backward pass so the second critic can reuse the shared target graph)
        Q1Loss = self.computeQLoss(self.critic1, states, actions, targets)
        self.critic1.gradientDescentStep(Q1Loss, True)
        Q2Loss = self.computeQLoss(self.critic2, states, actions, targets)
        self.critic2.gradientDescentStep(Q2Loss)
        if updatePolicy:
            # do a single step on the actor network
            policyLoss = self.computePolicyLoss(states)
            self.actor.gradientDescentStep(policyLoss)
            # update target networks
            self.updateTargetNetwork(self.targetActor, self.actor)
            self.updateTargetNetwork(self.targetCritic1, self.critic1)
            self.updateTargetNetwork(self.targetCritic2, self.critic2)

    def computeTargets(
        self, rewards: T.Tensor, nextStates: T.Tensor, dones: T.Tensor,
        trainingSigma: float, trainingClip: float
    ) -> T.Tensor:
        targetActions = self.targetActor.forward(nextStates.float())
        # create additive noise for target actions (target policy smoothing)
        noise = np.random.normal(0, trainingSigma, targetActions.shape)
        clippedNoise = T.tensor(
            np.clip(noise, -trainingClip, +trainingClip), device=self.device
        )
        targetActions = T.clip(targetActions + clippedNoise, -1, +1)
        # compute targets using the smaller of the two target critic estimates
        targetQ1Values = T.squeeze(
            self.targetCritic1.forward(T.hstack([nextStates, targetActions]).float())
        )
        targetQ2Values = T.squeeze(
            self.targetCritic2.forward(T.hstack([nextStates, targetActions]).float())
        )
        targetQValues = T.minimum(targetQ1Values, targetQ2Values)
        return rewards + self.gamma*(1 - dones)*targetQValues

    def computeQLoss(
        self, network: Network, states: T.Tensor, actions: T.Tensor, targets: T.Tensor
    ) -> T.Tensor:
        # compute the MSE of the Q function with respect to the targets
        QValues = T.squeeze(network.forward(T.hstack([states, actions]).float()))
        return T.square(QValues - targets).mean()

    def computePolicyLoss(self, states: T.Tensor):
        actions = self.actor.forward(states.float())
        QValues = T.squeeze(self.critic1.forward(T.hstack([states, actions]).float()))
        return -QValues.mean()

    def updateTargetNetwork(self, targetNetwork: Network, network: Network):
        # Polyak averaging: target <- (1 - tau)*target + tau*online
        with T.no_grad():
            for targetParameter, parameter in zip(
                targetNetwork.parameters(), network.parameters()
            ):
                targetParameter.mul_(1 - self.tau)
                targetParameter.add_(self.tau*parameter)

    def save(self, suffix=""):
        name = self.envName
        pickle.dump(self.buffer, open(name + 'Replay' + suffix, 'wb'))
        pickle.dump(self.actor, open(name + 'Actor' + suffix, 'wb'))
        pickle.dump(self.critic1, open(name + 'Critic1' + suffix, 'wb'))
        pickle.dump(self.critic2, open(name + 'Critic2' + suffix, 'wb'))
        pickle.dump(self.targetActor, open(name + 'TargetActor' + suffix, 'wb'))
        pickle.dump(self.targetCritic1, open(name + 'TargetCritic1' + suffix, 'wb'))
        pickle.dump(self.targetCritic2, open(name + 'TargetCritic2' + suffix, 'wb'))
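The training entry point is not among the files rendered here (the seventh changed file failed to load), so here is a rough sketch of how this Agent might be driven on BipedalWalker-v3 under the old gym 0.21 API. The hyperparameter values are illustrative, taken from the TD3 paper's defaults rather than from this repository:

import gym
from Agent import Agent

env = gym.make("BipedalWalker-v3")
env.name = "BipedalWalker-v3"  # Agent builds its save paths from this attribute
agent = Agent(env, learningRate=1e-3, gamma=0.99, tau=0.005, shouldLoad=False)

numEpisodes, miniBatchSize = 550, 100
explorationSigma, trainingSigma, trainingClip = 0.1, 0.2, 0.5
for episode in range(numEpisodes):
    state = env.reset()  # gym 0.21 API: reset returns only the observation
    done = False
    step = 0
    while not done:
        action = agent.getNoisyAction(state, explorationSigma)
        nextState, reward, done, info = env.step(action)
        agent.buffer.store(state, action, reward, nextState, done)
        # delayed policy updates: train the actor and targets every other step
        agent.update(miniBatchSize, trainingSigma, trainingClip,
                     updatePolicy=step % 2 == 0)
        state = nextState
        step += 1
    agent.save()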
Buffer.py
@@ -0,0 +1,44 @@
import numpy as np


class Buffer:
    def __init__(self, observationDim: int, actionDim: int, size: int = 1_000_000):
        # use a fixed-size buffer to prevent constant list instantiations
        self.states = np.zeros((size, observationDim))
        self.actions = np.zeros((size, actionDim))
        self.rewards = np.zeros(size)
        self.nextStates = np.zeros((size, observationDim))
        self.doneFlags = np.zeros(size)
        # use a pointer to keep track of where in the buffer we are
        self.pointer = 0
        # use current size to ensure we don't train on any non-existent data points
        self.currentSize = 0
        self.size = size

    def store(
        self, state: np.ndarray, action: np.ndarray, reward: float,
        nextState: np.ndarray, doneFlag: bool
    ):
        # store all the data for this transition
        ptr = self.pointer
        self.states[ptr] = state
        self.actions[ptr] = action
        self.rewards[ptr] = reward
        self.nextStates[ptr] = nextState
        self.doneFlags[ptr] = doneFlag
        # update the pointer and current size
        self.pointer = (self.pointer + 1) % self.size
        self.currentSize = min(self.currentSize + 1, self.size)

    def getMiniBatch(self, size: int) -> dict:
        # ensure size is not bigger than the current size of the buffer
        size = min(size, self.currentSize)
        # generate random indices
        indices = np.random.choice(self.currentSize, size, replace=False)
        # return the mini-batch of transitions
        return {
            "states": self.states[indices],
            "actions": self.actions[indices],
            "rewards": self.rewards[indices],
            "nextStates": self.nextStates[indices],
            "doneFlags": self.doneFlags[indices],
        }
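A quick usage sketch of the circular buffer. The dimensions 24 and 4 match BipedalWalker's observation and action spaces; the small capacity is only for illustration:

import numpy as np
from Buffer import Buffer

buffer = Buffer(observationDim=24, actionDim=4, size=1000)
for _ in range(10):
    buffer.store(np.zeros(24), np.zeros(4), 0.0, np.zeros(24), False)
batch = buffer.getMiniBatch(64)  # request is capped at the 10 stored transitions
assert batch["states"].shape == (10, 24)

Once the pointer wraps past `size`, the oldest transitions are silently overwritten, which keeps memory usage constant over arbitrarily long training runs.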
Network.py
@@ -0,0 +1,33 @@
from typing import Callable
import torch as T
import torch.nn as nn
import torch.optim as optim


class Network(nn.Module):
    def __init__(
        self, shape: list, outputActivation: Callable, learningRate: float,
        device: T.device
    ):
        super().__init__()
        # initialize the network: linear layers with ReLU activations between
        # them and the given activation on the output layer
        layers = []
        for i in range(1, len(shape)):
            dim1 = shape[i - 1]
            dim2 = shape[i]
            layers.append(nn.Linear(dim1, dim2))
            if i < len(shape) - 1:
                layers.append(nn.ReLU())
        layers.append(outputActivation())
        self.network = nn.Sequential(*layers)

        self.optimizer = optim.Adam(self.parameters(), lr=learningRate)
        self.device = device
        self.to(self.device)

    def forward(self, state: T.Tensor) -> T.Tensor:
        return self.network(state)

    def gradientDescentStep(self, loss: T.Tensor, retainGraph: bool = False):
        self.optimizer.zero_grad()
        loss.backward(retain_graph=retainGraph)
        self.optimizer.step()
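To illustrate the Network API as Agent uses it: a Tanh output squashes the actor's actions into [-1, 1], while an Identity output leaves the critic's Q-values unbounded. The learning rate here is illustrative:

import torch as T
import torch.nn as nn
from Network import Network

device = T.device("cuda" if T.cuda.is_available() else "cpu")
# actor maps a 24-dim observation to a 4-dim action (BipedalWalker shapes)
actor = Network([24, 400, 300, 4], nn.Tanh, learningRate=1e-3, device=device)
# critic maps a concatenated (observation, action) pair to a scalar Q-value
critic = Network([24 + 4, 400, 300, 1], nn.Identity, learningRate=1e-3, device=device)
action = actor.forward(T.zeros(24, device=device))
qValue = critic.forward(T.hstack([T.zeros(24, device=device), action]))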