Commit
report and presentation added
lukau2357 committed May 25, 2022
1 parent aeca317 commit f14ac3e
Showing 5 changed files with 86 additions and 10 deletions.
12 changes: 6 additions & 6 deletions README.md
@@ -1,11 +1,11 @@
## Project description
Implementation of the TD3 (Twin Delayed DDPG) algorithm for reinforcement learning ([original publication link](https://arxiv.org/pdf/1802.09477.pdf)), particularly useful for problems with continuous action and state spaces.
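For context, the heart of TD3 is the clipped double-Q target with target-policy smoothing. A minimal sketch of that update (PyTorch assumed; this is an illustration, not the repository's exact code):

```python
import torch

def td3_target(reward, next_state, done, actor_target, q1_target, q2_target,
               gamma = 0.99, noise_std = 0.2, noise_clip = 0.5, max_action = 1.0):
    # Target-policy smoothing: perturb the target action with clipped Gaussian noise.
    next_action = actor_target(next_state)
    noise = torch.clamp(torch.randn_like(next_action) * noise_std, -noise_clip, noise_clip)
    next_action = torch.clamp(next_action + noise, -max_action, max_action)
    # Clipped double Q-learning: bootstrap from the smaller of the two target critics.
    q_next = torch.min(q1_target(next_state, next_action), q2_target(next_state, next_action))
    return reward + gamma * (1.0 - done) * q_next
```

Together with delayed actor updates (cf. the `policyDelay` parameter in `src/main.py`), these are the three TD3 modifications over DDPG.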

- The algorithm was tested on the [BipedalWalker-v3](https://gym.openai.com/envs/BipedalWalker-v2/) environment (even though the official documentation lists v2 as the latest version, it is deprecated!). We trained the agent on a high-performance GPU with CUDA, and after 550 episodes the following results were obtained:
- ![walk_demo](https://drive.google.com/uc?id=1y0_Z9uhuqt7hOb3m1wWrZzy1cKKR6NfV)
+ The algorithm was tested on the [BipedalWalker-v3](https://gym.openai.com/envs/BipedalWalker-v2/) environment. To evaluate the variability of the algorithm, we trained 15 different agents on a high-performance GPU with CUDA for 550 episodes each. We recorded the reward obtained by each agent and obtained the following results:

- Project dependencies can be found in the requirements.txt file, as usual.
+ ![ci_plot](https://drive.google.com/uc?id=10C8y5Cd4TLgOPf2-ea22SUM9mXqdFB-q)
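The band in this plot is the usual large-sample 95% interval across agents. A minimal sketch of the computation (mirroring `plot_ci` in `src/Utils.py` below, with placeholder data standing in for the per-agent CSV logs):

```python
import numpy as np

# Placeholder; in the project each row holds one agent's per-episode rewards.
scores = np.zeros((15, 550))

mu = scores.mean(axis = 0)                            # average reward per episode
se = scores.std(axis = 0) / np.sqrt(scores.shape[0])  # standard error across agents
lower, upper = mu - 1.96 * se, mu + 1.96 * se         # 95% confidence band
```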

- ## Todo
- - To truly estimate how good TD3 is for this environment, we planned on repeating the training process 10-60 times, and from there we wanted to estimate the uncertainty of the reward obtained for each agent-episode pair. We planned on constructing simple 95% confidence intervals for these quantities.
- - Paper and presentation for this project are still being worked on.
+ The learning process can be observed in the following video:
+ ![run_simulation](https://drive.google.com/uc?id=1nPQ4f92XR9sbvwVg6HLMhGoBHQ5naSVn)

+ Technical details about the algorithm can be found in the accompanying report.
Binary file added presentation.pdf
Binary file not shown.
Binary file added report.pdf
Binary file not shown.
76 changes: 76 additions & 0 deletions src/Utils.py
@@ -0,0 +1,76 @@
import matplotlib.pyplot as plt
import csv
import os
import numpy as np

MAIN_ENV = "BipedalWalker-v3"

"""
Plotting utilities
"""

def plot_ep_history(model_suffix = "", dir = "", version = 0):
    # Plot the reward history of a single agent, read from its CSV log
    # (e.g. "BipedalWalker-v3_0-data.csv" for version 0 with no suffix).
    path = "{}{}_{}-data.csv".format(MAIN_ENV, model_suffix, str(version))
    path = os.path.join(dir, path)
    scores = []

    with open(path, "r") as f:
        reader = csv.reader(f)

        for row in reader:
            # Skip blank lines in the log; the score sits in the second column.
            if len(row) == 0:
                continue

            scores.append(float(row[1]))

    plt.style.use("ggplot")
    fig, ax = plt.subplots()
    ax.set_title("Model {} history".format(str(version)))
    ax.plot(scores)
    ax.set_ylabel("Score achieved")
    ax.set_xlabel("Episode number")
    plt.show()


def plot_ci(model_suffix = "", dir = "history"):
    # Aggregate the reward histories of all agents whose log file name
    # contains model_suffix, then plot the mean with a 95% confidence band.
    path = os.path.join(".", dir)
    data = []

    for file in os.listdir(path):
        if model_suffix not in file:
            continue

        current_scores = []

        with open(os.path.join(path, file), "r") as f:
            reader = csv.reader(f, delimiter = ",")

            for row in reader:
                if len(row) == 0:
                    continue

                current_scores.append(float(row[1]))

        data.append(current_scores)

    # Rows are agents, columns are episodes; std here is the standard error
    # of the mean across agents, so mu +/- 1.96 * std is a 95% interval.
    data = np.asarray(data)
    mu = data.mean(axis = 0)
    std = data.std(axis = 0) / np.sqrt(data.shape[0])

    plt.style.use("ggplot")
    fig, ax = plt.subplots()
    ax.plot([i for i in range(1, data.shape[1] + 1)], mu, label = "Average reward obtained")
    ax.fill_between(list(range(1, data.shape[1] + 1)),
                    mu - 1.96 * std, mu + 1.96 * std, alpha = 0.1, color = "b",
                    label = r"95% confidence region")
    ax.axhline(y = 0, linestyle = "--", color = "black")
    xticks = [i for i in range(1, data.shape[1] + 1) if i % 100 == 0]
    xticks.insert(0, 1)
    ax.set_xticks(xticks)
    ax.set_xlabel("Episode number")
    ax.set_ylabel("Score")
    ax.legend()
    plt.show()

if __name__ == "__main__":
    plot_ci()
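
A possible way to invoke these utilities (run from inside `src/`, assuming the per-agent logs live in `./history` and a single-run log such as `BipedalWalker-v3_0-data.csv` sits in the working directory):

```python
from Utils import plot_ci, plot_ep_history

plot_ci()                     # mean reward across agents with the 95% confidence band
plot_ep_history(version = 0)  # learning curve of a single agent
```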
8 changes: 4 additions & 4 deletions src/main.py
@@ -20,7 +20,7 @@
policyDelay = 2 # how many steps to wait before updating the policy
resume = True # resume from previous checkpoint if possible?
render = False # render out the environment?
- episode_limit = 550 # limiting the number of episodes, including pretrained episodes
+ episode_limit = 20 # limiting the number of episodes, including pretrained episodes

envName = "BipedalWalker-v3"

@@ -54,7 +54,7 @@ def train(trials = 1, suffix = "", periodicSaving = False, period = 100):
    if path.exists(csvName):
        fileData = list(csv.reader(open(csvName)))
        lastLine = fileData[-2]
-       numEpisode = int(lastLine[0])
+       numEpisode = int(lastLine[0]) + 1 # resume from the episode after the last logged one
        runningReward = float(lastLine[2])

    while numEpisode < episode_limit:
@@ -135,5 +135,5 @@ def evaluate_model(index, env_suffix = "", trials = 3):
        state = nextState
        done = env_done

- # train(64, periodicSaving = True, period = 100)
- evaluate_model(0)
+ train(3, periodicSaving = True, period = 100)
+ # evaluate_model(0)
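
Presumably, reproducing the README's 15-agent experiment would amount to changing the trial count in this final call (hypothetical invocation, based on the `train` signature above):

```python
train(15, periodicSaving = True, period = 100)
```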
