Commit

initial commit
lukau2357 committed May 7, 2022
0 parents commit aeca317
Showing 7 changed files with 499 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
**/__pycache__
# Reward history
**/history
# Model weights, too big for a GitHub repository
**/saved
11 changes: 11 additions & 0 deletions README.md
@@ -0,0 +1,11 @@
## Project description
Implementation of TD3 (Twin Delayed DDPG), a reinforcement learning algorithm ([original publication](https://arxiv.org/pdf/1802.09477.pdf)) that is particularly well suited to problems with continuous state and action spaces.

The algorithm was tested on the [BipedalWalker-v3](https://gym.openai.com/envs/BipedalWalker-v2/) environment (the official documentation still lists v2 as the latest version, but v2 is deprecated). We trained the agent on a high-performance GPU with CUDA, and after 550 episodes we obtained the following results:
![walk_demo](https://drive.google.com/uc?id=1y0_Z9uhuqt7hOb3m1wWrZzy1cKKR6NfV)

Project dependencies are listed in the `requirements.txt` file.
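
As a rough illustration, the agent can be driven by a loop along the following lines. This is only a sketch: the hyperparameters are placeholders rather than the values used for the results above, the `env.name` assignment reflects an assumption about how the (not shown) entry-point script names the environment, and the real script may differ.

```python
import gym
from Agent import Agent

env = gym.make("BipedalWalker-v3")
env.name = "BipedalWalker-v3"  # Agent uses env.name to build checkpoint file names
agent = Agent(env, learningRate=1e-3, gamma=0.99, tau=0.005, shouldLoad=False)

for episode in range(550):
    state = env.reset()
    done = False
    step = 0
    while not done:
        action = agent.getNoisyAction(state, sigma=0.1)   # exploration noise
        nextState, reward, done, _ = env.step(action)
        agent.buffer.store(state, action, reward, nextState, done)
        if agent.buffer.currentSize >= 100:
            # delay the actor/target updates to every other critic update (TD3)
            agent.update(miniBatchSize=100, trainingSigma=0.2,
                         trainingClip=0.5, updatePolicy=step % 2 == 0)
        state = nextState
        step += 1
    agent.save()
```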

## Todo
- To properly estimate how well TD3 performs on this environment, we plan to repeat the training process 10-60 times and, from those runs, estimate the uncertainty of the reward obtained for each agent-episode pair by constructing simple 95% confidence intervals for these quantities (a sketch of this computation follows the list).
- The paper and presentation for this project are still in progress.
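
A minimal sketch of the planned confidence-interval computation, assuming the episode rewards from the repeated runs are collected into a NumPy array of shape `(numRuns, numEpisodes)` (the array name and shape are assumptions, not something produced by the current code):

```python
import numpy as np

def rewardConfidenceIntervals(rewards: np.ndarray, z: float = 1.96):
    """rewards: shape (numRuns, numEpisodes). Returns the per-episode mean and a
    simple normal-approximation 95% confidence interval (z = 1.96)."""
    numRuns = rewards.shape[0]
    means = rewards.mean(axis=0)
    # standard error of the mean across independent training runs
    sems = rewards.std(axis=0, ddof=1) / np.sqrt(numRuns)
    return means, means - z * sems, means + z * sems
```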
107 changes: 107 additions & 0 deletions requirements.txt
@@ -0,0 +1,107 @@
# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: win-64
blas=1.0=mkl
bottleneck=1.3.4=py39h080aedc_0
box2d-py=2.3.8=py39h415ef7b_5
brotli=1.0.9=ha925a31_2
ca-certificates=2022.4.26=haa95532_0
cairo=1.16.0=hb19e0ff_1008
certifi=2021.10.8=py39haa95532_2
cloudpickle=2.0.0=pyhd3eb1b0_0
cudatoolkit=11.3.1=h59b6b97_2
cycler=0.11.0=pyhd3eb1b0_0
expat=2.4.8=h39d44d4_0
ffmpeg=4.2.2=he774522_0
fontconfig=2.14.0=hce3cb01_0
fonttools=4.25.0=pyhd3eb1b0_0
freetype=2.10.4=hd328e21_0
fribidi=1.0.10=h62dcd97_0
future=0.18.2=py39haa95532_1
getopt-win32=0.1=h8ffe710_0
gettext=0.19.8.1=ha2e2712_1008
graphviz=2.47.1=h8277ce1_0
gts=0.7.6=h7c369d9_2
gym=0.21.0=py39h832f523_2
icu=68.1=h6c2663c_0
intel-openmp=2021.4.0=haa95532_3556
jpeg=9e=h2bbff1b_0
kiwisolver=1.3.2=py39hd77b12b_0
libclang=11.1.0=default_h5c34c98_1
libffi=3.4.2=h604cdb4_1
libgd=2.3.2=h138e682_0
libglib=2.68.4=h3be07f2_1
libiconv=1.16=h2bbff1b_2
libpng=1.6.37=h2a8f88b_0
libtiff=4.2.0=hd0e1b90_0
libuv=1.40.0=he774522_0
libwebp=1.2.2=h2bbff1b_0
libwebp-base=1.2.2=h2bbff1b_0
libxcb=1.13=hcd874cb_1004
lz4-c=1.9.3=h2bbff1b_1
m2w64-gcc-libgfortran=5.3.0=6
m2w64-gcc-libs=5.3.0=7
m2w64-gcc-libs-core=5.3.0=7
m2w64-gmp=6.1.0=2
m2w64-libwinpthread-git=5.0.0.4634.697f757=2
matplotlib=3.5.1=py39haa95532_1
matplotlib-base=3.5.1=py39hd77b12b_1
mkl=2021.4.0=haa95532_640
mkl-service=2.4.0=py39h2bbff1b_0
mkl_fft=1.3.1=py39h277e83a_0
mkl_random=1.2.2=py39hf11a4ad_0
msys2-conda-epoch=20160418=1
munkres=1.1.4=py_0
numexpr=2.8.1=py39hb80d3ca_0
numpy=1.21.5=py39ha4e8547_0
numpy-base=1.21.5=py39hc2deb75_0
openssl=1.1.1n=h2bbff1b_0
packaging=21.3=pyhd3eb1b0_0
pandas=1.4.1=py39hd77b12b_1
pango=1.42.4=hec34917_5
pcre=8.45=hd77b12b_0
pillow=9.0.1=py39hdc2b20a_0
pip=21.2.4=py39haa95532_0
pixman=0.40.0=h2bbff1b_1
pthread-stubs=0.3=h3c9f919_1
pyglet=1.5.16=py39hcbf5309_1
pyparsing=3.0.4=pyhd3eb1b0_0
pyqt=5.12.3=py39hcbf5309_8
pyqt-impl=5.12.3=py39h415ef7b_8
pyqt5-sip=4.19.18=py39h415ef7b_8
pyqtchart=5.12=py39h415ef7b_8
pyqtwebengine=5.12.1=py39h415ef7b_8
python=3.9.12=h6244533_0
python-dateutil=2.8.2=pyhd3eb1b0_0
python-graphviz=0.19.2=pypi_0
python_abi=3.9=2_cp39
pytorch=1.11.0=py3.9_cuda11.3_cudnn8_0
pytorch-mutex=1.0=cuda
pytz=2021.3=pyhd3eb1b0_0
qt=5.12.9=h5909a2a_4
setuptools=61.2.0=py39haa95532_0
six=1.16.0=pyhd3eb1b0_1
sqlite=3.38.2=h2bbff1b_0
tk=8.6.11=h2bbff1b_0
torchviz=0.0.2=pypi_0
tornado=6.1=py39h2bbff1b_0
typing_extensions=4.1.1=pyh06a4308_0
tzdata=2022a=hda174b7_0
vc=14.2=h21ff451_1
vs2015_runtime=14.27.29016=h5e58377_2
wheel=0.37.1=pyhd3eb1b0_0
wincertstore=0.2=py39haa95532_2
xorg-kbproto=1.0.7=hcd874cb_1002
xorg-libice=1.0.10=hcd874cb_0
xorg-libsm=1.2.3=hcd874cb_1000
xorg-libx11=1.7.2=hcd874cb_0
xorg-libxau=1.0.9=hcd874cb_0
xorg-libxdmcp=1.1.3=hcd874cb_0
xorg-libxext=1.3.4=hcd874cb_1
xorg-libxpm=3.5.13=hcd874cb_0
xorg-libxt=1.2.1=hcd874cb_2
xorg-xextproto=7.3.0=hcd874cb_1002
xorg-xproto=7.0.31=hcd874cb_1007
xz=5.2.5=h8cc25b3_1
zlib=1.2.12=h8cc25b3_2
zstd=1.4.9=h19a0ad4_0
160 changes: 160 additions & 0 deletions src/Agent.py
@@ -0,0 +1,160 @@
import numpy as np
import os
import pickle
import torch as T
import torch.nn as nn
from copy import deepcopy
from gym.core import Env
from Buffer import Buffer
from Network import Network

class Agent():
    def __init__(
        self, env: Env, learningRate: float, gamma: float, tau: float,
        shouldLoad: bool=True, saveFolder: str='saved'
    ):
        self.observationDim = env.observation_space.shape[0]
        self.actionDim = env.action_space.shape[0]
        self.gamma = gamma
        self.tau = tau
        # check if the saveFolder path exists
        if not os.path.isdir(saveFolder):
            os.mkdir(saveFolder)
        # NOTE: the environment is expected to carry a custom `name` attribute
        # (assigned by the caller); it is used as a prefix for checkpoint files.
        self.envName = os.path.join(saveFolder, env.name + '.')
        name = self.envName

        self.device = T.device("cuda" if T.cuda.is_available() else 'cpu')

        self.buffer = pickle.load(open(name + 'Replay', 'rb'))\
            if shouldLoad and os.path.exists(name + 'Replay') else Buffer(
                self.observationDim, self.actionDim
            )
        # initialize the actor and critics
        self.actor = pickle.load(open(name + 'Actor', 'rb'))\
            if shouldLoad and os.path.exists(name + 'Actor') else Network(
                [self.observationDim, 400, 300, self.actionDim],
                nn.Tanh,
                learningRate,
                self.device
            )

        self.critic1 = pickle.load(open(name + 'Critic1', 'rb'))\
            if shouldLoad and os.path.exists(name + 'Critic1') else Network(
                [self.observationDim + self.actionDim, 400, 300, 1],
                nn.Identity,
                learningRate,
                self.device
            )

        self.critic2 = pickle.load(open(name + 'Critic2', 'rb'))\
            if shouldLoad and os.path.exists(name + 'Critic2') else Network(
                [self.observationDim + self.actionDim, 400, 300, 1],
                nn.Identity,
                learningRate,
                self.device
            )

        # create target networks
        self.targetActor = pickle.load(open(name + 'TargetActor', 'rb'))\
            if shouldLoad and os.path.exists(name + 'TargetActor') else\
            deepcopy(self.actor)

        self.targetCritic1 = pickle.load(open(name + 'TargetCritic1', 'rb'))\
            if shouldLoad and os.path.exists(name + 'TargetCritic1') else\
            deepcopy(self.critic1)

        self.targetCritic2 = pickle.load(open(name + 'TargetCritic2', 'rb'))\
            if shouldLoad and os.path.exists(name + 'TargetCritic2') else\
            deepcopy(self.critic2)

    def getNoisyAction(self, state: np.ndarray, sigma: float) -> np.ndarray:
        # exploration: perturb the deterministic action with zero-mean Gaussian
        # noise and clip the result back into the valid action range [-1, 1]
        deterministicAction = self.getDeterministicAction(state)
        noise = np.random.normal(0, sigma, deterministicAction.shape)
        return np.clip(deterministicAction + noise, -1, +1)

    def getDeterministicAction(self, state: np.ndarray) -> np.ndarray:
        actions: T.Tensor = self.actor.forward(T.tensor(state, device=self.device))
        return actions.cpu().detach().numpy()

    def update(
        self, miniBatchSize: int, trainingSigma: float, trainingClip: float,
        updatePolicy: bool
    ):
        # randomly sample a mini-batch from the replay buffer
        miniBatch = self.buffer.getMiniBatch(miniBatchSize)
        # create tensors to start generating computational graph
        states = T.tensor(miniBatch["states"], requires_grad=True, device=self.device)
        actions = T.tensor(miniBatch["actions"], requires_grad=True, device=self.device)
        rewards = T.tensor(miniBatch["rewards"], requires_grad=True, device=self.device)
        nextStates = T.tensor(
            miniBatch["nextStates"], requires_grad=True, device=self.device
        )
        dones = T.tensor(miniBatch["doneFlags"], requires_grad=True, device=self.device)
        # compute the targets
        targets = self.computeTargets(
            rewards, nextStates, dones, trainingSigma, trainingClip
        )
        # do a single step on each critic network
        Q1Loss = self.computeQLoss(self.critic1, states, actions, targets)
        self.critic1.gradientDescentStep(Q1Loss, True)
        Q2Loss = self.computeQLoss(self.critic2, states, actions, targets)
        self.critic2.gradientDescentStep(Q2Loss)
        if updatePolicy:
            # do a single step on the actor network
            policyLoss = self.computePolicyLoss(states)
            self.actor.gradientDescentStep(policyLoss)
            # update target networks
            self.updateTargetNetwork(self.targetActor, self.actor)
            self.updateTargetNetwork(self.targetCritic1, self.critic1)
            self.updateTargetNetwork(self.targetCritic2, self.critic2)

    def computeTargets(
        self, rewards: T.Tensor, nextStates: T.Tensor, dones: T.Tensor,
        trainingSigma: float, trainingClip: float
    ) -> T.Tensor:
        # TD3 target with target-policy smoothing and clipped double-Q learning:
        #   y = r + gamma*(1 - done)*min(Q1'(s', a~), Q2'(s', a~)),
        #   where a~ = clip(pi'(s') + clip(noise, -c, +c), -1, +1)
        targetActions = self.targetActor.forward(nextStates.float())
        # create additive noise for target actions
        noise = np.random.normal(0, trainingSigma, targetActions.shape)
        clippedNoise = T.tensor(
            np.clip(noise, -trainingClip, +trainingClip), device=self.device
        )
        targetActions = T.clip(targetActions + clippedNoise, -1, +1)
        # compute targets
        targetQ1Values = T.squeeze(
            self.targetCritic1.forward(T.hstack([nextStates, targetActions]).float())
        )
        targetQ2Values = T.squeeze(
            self.targetCritic2.forward(T.hstack([nextStates, targetActions]).float())
        )
        targetQValues = T.minimum(targetQ1Values, targetQ2Values)
        return rewards + self.gamma*(1 - dones)*targetQValues

    def computeQLoss(
        self, network: Network, states: T.Tensor, actions: T.Tensor, targets: T.Tensor
    ) -> T.Tensor:
        # compute the MSE of the Q function with respect to the targets
        QValues = T.squeeze(network.forward(T.hstack([states, actions]).float()))
        return T.square(QValues - targets).mean()

    def computePolicyLoss(self, states: T.Tensor) -> T.Tensor:
        # deterministic policy gradient objective: maximize Q1(s, pi(s)),
        # i.e. minimize -mean(Q1(s, pi(s)))
        actions = self.actor.forward(states.float())
        QValues = T.squeeze(self.critic1.forward(T.hstack([states, actions]).float()))
        return -QValues.mean()

    def updateTargetNetwork(self, targetNetwork: Network, network: Network):
        # Polyak (soft) update: targetParam <- (1 - tau)*targetParam + tau*param
        with T.no_grad():
            for targetParameter, parameter in zip(
                targetNetwork.parameters(), network.parameters()
            ):
                targetParameter.mul_(1 - self.tau)
                targetParameter.add_(self.tau*parameter)

    def save(self, suffix = ""):
        name = self.envName
        pickle.dump(self.buffer, open(name + 'Replay' + suffix, 'wb'))
        pickle.dump(self.actor, open(name + 'Actor' + suffix, 'wb'))
        pickle.dump(self.critic1, open(name + 'Critic1' + suffix, 'wb'))
        pickle.dump(self.critic2, open(name + 'Critic2' + suffix, 'wb'))
        pickle.dump(self.targetActor, open(name + 'TargetActor' + suffix, 'wb'))
        pickle.dump(self.targetCritic1, open(name + 'TargetCritic1' + suffix, 'wb'))
        pickle.dump(self.targetCritic2, open(name + 'TargetCritic2' + suffix, 'wb'))
44 changes: 44 additions & 0 deletions src/Buffer.py
@@ -0,0 +1,44 @@
import numpy as np

class Buffer():
    def __init__(self, observationDim: int, actionDim: int, size: int=1_000_000):
        # use a fixed-size buffer to prevent constant list instantiations
        self.states = np.zeros((size, observationDim))
        self.actions = np.zeros((size, actionDim))
        self.rewards = np.zeros(size)
        self.nextStates = np.zeros((size, observationDim))
        self.doneFlags = np.zeros(size)
        # use a pointer to keep track of where in the buffer we are
        self.pointer = 0
        # use current size to ensure we don't train on any non-existent data points
        self.currentSize = 0
        self.size = size

    def store(
        self, state: np.ndarray, action: np.ndarray, reward: float, nextState: np.ndarray,
        doneFlag: bool
    ):
        # store all the data for this transition
        ptr = self.pointer
        self.states[ptr] = state
        self.actions[ptr] = action
        self.rewards[ptr] = reward
        self.nextStates[ptr] = nextState
        self.doneFlags[ptr] = doneFlag
        # update the pointer and current size
        self.pointer = (self.pointer + 1) % self.size
        self.currentSize = min(self.currentSize + 1, self.size)

    def getMiniBatch(self, size: int) -> dict:
        # ensure size is not bigger than the current size of the buffer
        size = min(size, self.currentSize)
        # generate random indices
        indices = np.random.choice(self.currentSize, size, replace=False)
        # return the mini-batch of transitions
        return {
            "states": self.states[indices],
            "actions": self.actions[indices],
            "rewards": self.rewards[indices],
            "nextStates": self.nextStates[indices],
            "doneFlags": self.doneFlags[indices],
        }
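
A minimal usage sketch of the replay buffer above; the dimensions match BipedalWalker-v3's 24-dimensional observations and 4-dimensional actions, and the stored values are made up for illustration:

```python
import numpy as np
from Buffer import Buffer

buffer = Buffer(observationDim=24, actionDim=4, size=1000)
# store a single (made-up) transition
buffer.store(np.zeros(24), np.zeros(4), reward=0.1, nextState=np.ones(24), doneFlag=False)
# sample a mini-batch; every value is a NumPy array indexed by the same rows
batch = buffer.getMiniBatch(64)
print(batch["states"].shape, batch["rewards"].shape)  # (1, 24) (1,) until more transitions arrive
```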
33 changes: 33 additions & 0 deletions src/Network.py
@@ -0,0 +1,33 @@
from typing import Callable
import torch as T
import torch.nn as nn
import torch.optim as optim

class Network(nn.Module):
    def __init__(
        self, shape: list, outputActivation: Callable, learningRate: float,
        device: T.device
    ):
        super().__init__()
        # initialize the network: fully connected layers with ReLU activations
        # in between and the given activation on the output layer
        layers = []
        for i in range(1, len(shape)):
            dim1 = shape[i - 1]
            dim2 = shape[i]
            layers.append(nn.Linear(dim1, dim2))
            if i < len(shape) - 1:
                layers.append(nn.ReLU())
        layers.append(outputActivation())
        self.network = nn.Sequential(*layers)

        self.optimizer = optim.Adam(self.parameters(), lr=learningRate)
        self.device = device
        self.to(self.device)

    def forward(self, state: T.Tensor) -> T.Tensor:
        return self.network(state)

    def gradientDescentStep(self, loss: T.Tensor, retainGraph: bool=False):
        self.optimizer.zero_grad()
        loss.backward(retain_graph=retainGraph)
        self.optimizer.step()
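
For reference, a short construction sketch mirroring how `Agent.py` wires this class into the actor and critic networks for BipedalWalker-v3 (24-dimensional observations, 4-dimensional actions); the learning rate here is illustrative, not a value taken from the project:

```python
import torch as T
import torch.nn as nn
from Network import Network

device = T.device("cuda" if T.cuda.is_available() else "cpu")
# actor: maps a state to an action in [-1, 1]^4 via a Tanh output layer
actor = Network([24, 400, 300, 4], nn.Tanh, learningRate=1e-3, device=device)
# critic: maps a concatenated (state, action) pair to a scalar Q-value, no output squashing
critic = Network([24 + 4, 400, 300, 1], nn.Identity, learningRate=1e-3, device=device)

state = T.zeros(24, device=device)
print(actor.forward(state).shape)  # torch.Size([4])
```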