Commit
report and presentation added
lukau2357 committed May 25, 2022
1 parent aeca317 commit f14ac3e
Showing 5 changed files with 86 additions and 10 deletions.
12 changes: 6 additions & 6 deletions README.md
@@ -1,11 +1,11 @@
## Project description
Implementation of the TD3 (Twin Delayed DDPG) algorithm for reinforcement learning ([original publication link](https://arxiv.org/pdf/1802.09477.pdf)), particularly useful for problems with continuous action and state spaces.
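For context, the heart of TD3 is the clipped double-Q target with target-policy smoothing. A minimal sketch of that update (PyTorch assumed; this is an illustration, not the repository's exact code):

```python
import torch

def td3_target(reward, next_state, done, actor_target, q1_target, q2_target,
               gamma = 0.99, noise_std = 0.2, noise_clip = 0.5, max_action = 1.0):
    # Target-policy smoothing: perturb the target action with clipped Gaussian noise.
    next_action = actor_target(next_state)
    noise = torch.clamp(torch.randn_like(next_action) * noise_std, -noise_clip, noise_clip)
    next_action = torch.clamp(next_action + noise, -max_action, max_action)
    # Clipped double Q-learning: bootstrap from the smaller of the two target critics.
    q_next = torch.min(q1_target(next_state, next_action), q2_target(next_state, next_action))
    return reward + gamma * (1.0 - done) * q_next
```

Together with delayed actor updates (cf. the `policyDelay` parameter in `src/main.py`), these are the three TD3 modifications over DDPG.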

- The algorithm was tested on the [BipedalWalker-v3](https://gym.openai.com/envs/BipedalWalker-v2/) environment (even though the official documentation lists v2 as the latest version, it is deprecated!). We trained the agent on a high-performance GPU with CUDA, and after 550 episodes the following results were obtained:
- ![walk_demo](https://drive.google.com/uc?id=1y0_Z9uhuqt7hOb3m1wWrZzy1cKKR6NfV)
+ The algorithm was tested on the [BipedalWalker-v3](https://gym.openai.com/envs/BipedalWalker-v2/) environment. To evaluate the variability of the algorithm, we trained 15 different agents on a high-performance GPU with CUDA for 550 episodes each. We recorded the reward obtained by each agent and obtained the following results:

- Project dependencies can be found in the requirements.txt file, as usual.
+ ![ci_plot](https://drive.google.com/uc?id=10C8y5Cd4TLgOPf2-ea22SUM9mXqdFB-q)
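The band in this plot is the usual large-sample 95% interval across agents. A minimal sketch of the computation (mirroring `plot_ci` in `src/Utils.py` below, with placeholder data standing in for the per-agent CSV logs):

```python
import numpy as np

# Placeholder; in the project each row holds one agent's per-episode rewards.
scores = np.zeros((15, 550))

mu = scores.mean(axis = 0)                            # average reward per episode
se = scores.std(axis = 0) / np.sqrt(scores.shape[0])  # standard error across agents
lower, upper = mu - 1.96 * se, mu + 1.96 * se         # 95% confidence band
```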

- ## Todo
- - To truly estimate how good TD3 is for this environment, we planned on repeating the training process 10-60 times, and from there we wanted to estimate the uncertainty of the reward obtained for each agent-episode pair. We planned on constructing simple 95% confidence intervals for these quantities.
- - Paper and presentation for this project are still being worked on.
+ The learning process can be observed in the following video:
+ ![run_simulation](https://drive.google.com/uc?id=1nPQ4f92XR9sbvwVg6HLMhGoBHQ5naSVn)

+ Technical details about the algorithm can be found in the accompanying report.
Binary file added presentation.pdf
Binary file not shown.
Binary file added report.pdf
Binary file not shown.
76 changes: 76 additions & 0 deletions src/Utils.py
@@ -0,0 +1,76 @@
import matplotlib.pyplot as plt
import csv
import os
import numpy as np

MAIN_ENV = "BipedalWalker-v3"

"""
Plotting utilities
"""

def plot_ep_history(model_suffix = "", dir = "", version = 0):
    # Plot the reward history of a single agent, read from its CSV log
    # (e.g. "BipedalWalker-v3_0-data.csv" for version 0 with no suffix).
    path = "{}{}_{}-data.csv".format(MAIN_ENV, model_suffix, str(version))
    path = os.path.join(dir, path)
    scores = []

    with open(path, "r") as f:
        reader = csv.reader(f)

        for row in reader:
            # Skip blank lines in the log; the score sits in the second column.
            if len(row) == 0:
                continue

            scores.append(float(row[1]))

    plt.style.use("ggplot")
    fig, ax = plt.subplots()
    ax.set_title("Model {} history".format(str(version)))
    ax.plot(scores)
    ax.set_ylabel("Score achieved")
    ax.set_xlabel("Episode number")
    plt.show()


def plot_ci(model_suffix = "", dir = "history"):
    # Aggregate the reward histories of all agents whose log file name
    # contains model_suffix, then plot the mean with a 95% confidence band.
    path = os.path.join(".", dir)
    data = []

    for file in os.listdir(path):
        if model_suffix not in file:
            continue

        current_scores = []

        with open(os.path.join(path, file), "r") as f:
            reader = csv.reader(f, delimiter = ",")

            for row in reader:
                if len(row) == 0:
                    continue

                current_scores.append(float(row[1]))

        data.append(current_scores)

    # Rows are agents, columns are episodes; std here is the standard error
    # of the mean across agents, so mu +/- 1.96 * std is a 95% interval.
    data = np.asarray(data)
    mu = data.mean(axis = 0)
    std = data.std(axis = 0) / np.sqrt(data.shape[0])

    plt.style.use("ggplot")
    fig, ax = plt.subplots()
    ax.plot([i for i in range(1, data.shape[1] + 1)], mu, label = "Average reward obtained")
    ax.fill_between(list(range(1, data.shape[1] + 1)),
                    mu - 1.96 * std, mu + 1.96 * std, alpha = 0.1, color = "b",
                    label = r"95% confidence region")
    ax.axhline(y = 0, linestyle = "--", color = "black")
    xticks = [i for i in range(1, data.shape[1] + 1) if i % 100 == 0]
    xticks.insert(0, 1)
    ax.set_xticks(xticks)
    ax.set_xlabel("Episode number")
    ax.set_ylabel("Score")
    ax.legend()
    plt.show()

if __name__ == "__main__":
    plot_ci()
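
A possible way to invoke these utilities (run from inside `src/`, assuming the per-agent logs live in `./history` and a single-run log such as `BipedalWalker-v3_0-data.csv` sits in the working directory):

```python
from Utils import plot_ci, plot_ep_history

plot_ci()                     # mean reward across agents with the 95% confidence band
plot_ep_history(version = 0)  # learning curve of a single agent
```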
8 changes: 4 additions & 4 deletions src/main.py
@@ -20,7 +20,7 @@
policyDelay = 2 # how many steps to wait before updating the policy
resume = True # resume from previous checkpoint if possible?
render = False # render out the environment?
- episode_limit = 550 # limiting the number of episodes, including pretrained episodes
+ episode_limit = 20 # limiting the number of episodes, including pretrained episodes

envName = "BipedalWalker-v3"

@@ -54,7 +54,7 @@ def train(trials = 1, suffix = "", periodicSaving = False, period = 100):
    if path.exists(csvName):
        fileData = list(csv.reader(open(csvName)))
        lastLine = fileData[-2]
-       numEpisode = int(lastLine[0])
+       numEpisode = int(lastLine[0]) + 1 # resume from the episode after the last logged one
        runningReward = float(lastLine[2])

    while numEpisode < episode_limit:
@@ -135,5 +135,5 @@ def evaluate_model(index, env_suffix = "", trials = 3):
        state = nextState
        done = env_done

- # train(64, periodicSaving = True, period = 100)
- evaluate_model(0)
+ train(3, periodicSaving = True, period = 100)
+ # evaluate_model(0)
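
Presumably, reproducing the README's 15-agent experiment would amount to changing the trial count in this final call (hypothetical invocation, based on the `train` signature above):

```python
train(15, periodicSaving = True, period = 100)
```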
