ATARI environments - part1 #277

Merged: 88 commits, Apr 12, 2023

Commits
3e874be
add requirements to use atari game
JulienT01 Feb 15, 2023
74686f6
update atari_make, and the wrapper scalarize, to manage env from atar…
JulienT01 Feb 15, 2023
b6739d5
update training and models to manage cnn (mandatory for atari games)
JulienT01 Feb 15, 2023
32a5703
add tests on atari games (test the cnn part in dqn and ppo too)
JulienT01 Feb 15, 2023
54fc405
add example with video for the documentation
JulienT01 Feb 15, 2023
469a86c
black
JulienT01 Feb 15, 2023
92286b9
black
JulienT01 Feb 15, 2023
ae0ed61
update setup.py
JulienT01 Feb 15, 2023
b701b29
add pytest-xprocess to run test_server.py
JulienT01 Feb 16, 2023
ae587f7
add configfiles to .gitignore
JulienT01 Feb 16, 2023
dac452f
change to fixed version image azure
TimotheeMathieu Feb 28, 2023
c9c0248
Merge branch 'rlberry-py:main' into Atari_part1
JulienT01 Feb 28, 2023
9cc2efe
remove accelerate
TimotheeMathieu Feb 28, 2023
8206d53
Update README.md
JulienT01 Mar 15, 2023
daae996
Merge remote-tracking branch 'origin/main' into Atari_part1
JulienT01 Mar 15, 2023
a21f5cd
temporary correction until main branch update
JulienT01 Mar 15, 2023
3d242ae
Merge remote-tracking branch 'origin/main' into Atari_part1
JulienT01 Mar 15, 2023
91ea0d1
xfail on tests that failed on Mac and windows
JulienT01 Mar 17, 2023
40ddbab
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 17, 2023
314f532
Update README.md
JulienT01 Mar 17, 2023
7b3a692
optuna more graceful cleaning
TimotheeMathieu Mar 17, 2023
dbe3e33
shutils rmtree to os.rmdir
TimotheeMathieu Mar 20, 2023
4bf876b
fix optuna ?
TimotheeMathieu Mar 20, 2023
4e297da
trigger ci
TimotheeMathieu Mar 20, 2023
1d15711
add xfail for windows CI
JulienT01 Mar 30, 2023
238ca0b
Merge branch 'xfail_tests_mac_windows' of github.com:JulienT01/rlberr…
JulienT01 Mar 30, 2023
585739d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 30, 2023
cdd5e2e
Merge remote-tracking branch 'origin/xfail_tests_mac_windows' into At…
JulienT01 Mar 30, 2023
5014539
add xfail for windows CI
JulienT01 Mar 30, 2023
a4c3d53
Merge remote-tracking branch 'origin/xfail_tests_mac_windows' into At…
JulienT01 Mar 30, 2023
8a793cb
Merge branch 'rlberry-py:main' into Atari_part1
JulienT01 Mar 30, 2023
1f7c57a
empty commit trigger ci
JulienT01 Mar 30, 2023
79654ab
remove main from tests file
JulienT01 Mar 30, 2023
15ab700
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 30, 2023
7bd3a17
add xfail for windows CI
JulienT01 Mar 30, 2023
96bf506
Merge branch 'xfail_tests_mac_windows' of github.com:JulienT01/rlberr…
JulienT01 Mar 30, 2023
10cff9f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 30, 2023
e5dd32a
add xfail for windows CI
JulienT01 Mar 30, 2023
894e265
Merge branch 'xfail_tests_mac_windows' of github.com:JulienT01/rlberr…
JulienT01 Mar 30, 2023
bd366f1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 30, 2023
944efc3
Empty-Commit
JulienT01 Mar 31, 2023
f2eebe7
Merge branch 'xfail_tests_mac_windows' of github.com:JulienT01/rlberr…
JulienT01 Mar 31, 2023
e84b1c0
Merge remote-tracking branch 'origin/xfail_tests_mac_windows' into At…
JulienT01 Mar 31, 2023
afcc8c7
remove main from tests
JulienT01 Mar 31, 2023
9c025c4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 31, 2023
7990228
Empty-Commit
JulienT01 Mar 31, 2023
8119065
empty commit trigger ci
JulienT01 Mar 31, 2023
812f3d4
xfail on windows
JulienT01 Mar 31, 2023
6c7d5d4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 31, 2023
7e28010
allow to continue the buffer after a first 'fit()'
JulienT01 Mar 31, 2023
ae27dcc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 31, 2023
518abfa
Merge remote-tracking branch 'origin/xfail_tests_mac_windows' into PP…
JulienT01 Mar 31, 2023
587681e
Merge branch 'PPO_buffer' of github.com:JulienT01/rlberry into PPO_bu…
JulienT01 Mar 31, 2023
8f71b96
xfail for windows...
JulienT01 Mar 31, 2023
cb391a3
Empty-Commit
JulienT01 Mar 31, 2023
a36fadb
Empty-Commit
JulienT01 Mar 31, 2023
d0b9120
remove 'sleep' (added for debug)
JulienT01 Apr 3, 2023
1d6fd61
remove sleep (old debug)
JulienT01 Apr 3, 2023
7af5d3a
remove sleep (old debug)
JulienT01 Apr 3, 2023
98357b7
remove 'sleep' (added for debug)
JulienT01 Apr 3, 2023
2917e69
Merge remote-tracking branch 'origin/xfail_tests_mac_windows' into PP…
JulienT01 Apr 3, 2023
35aa58b
use temporary folder instead
JulienT01 Apr 3, 2023
02e69b1
generalize PPO tests to 'check_agent'
JulienT01 Apr 4, 2023
9073353
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 4, 2023
799ceda
flake
JulienT01 Apr 4, 2023
9af6369
Merge branch 'PPO_buffer' of github.com:JulienT01/rlberry into PPO_bu…
JulienT01 Apr 4, 2023
a49cbab
patch : stableBaselines don't have get_params()
JulienT01 Apr 4, 2023
4377431
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 4, 2023
ef0d917
Empty-Commit
JulienT01 Apr 4, 2023
e8f0b29
update doc
JulienT01 Apr 5, 2023
3789caf
Empty-Commit
JulienT01 Apr 5, 2023
d6a63b7
Merge branch 'fix_ci' of https://github.com/TimotheeMathieu/rlberry i…
JulienT01 Apr 5, 2023
d310dd2
don't remove PyOpenGL_accelerate
JulienT01 Apr 5, 2023
90faf20
Merge remote-tracking branch 'origin/xfail_tests_mac_windows' into PP…
JulienT01 Apr 5, 2023
3993823
Merge remote-tracking branch 'origin/PPO_buffer' into Atari_part1
JulienT01 Apr 5, 2023
9f5f217
Merge branch 'main' into Atari_part1
JulienT01 Apr 5, 2023
03afc5c
add tests for atari empty input dim
JulienT01 Apr 5, 2023
5ae295b
Merge branch 'Atari_part1' of github.com:JulienT01/rlberry into Atari…
JulienT01 Apr 5, 2023
d321a99
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 5, 2023
fad2801
remove test (already exist in "check_agent.py")
JulienT01 Apr 5, 2023
80f1220
Merge branch 'Atari_part1' of github.com:JulienT01/rlberry into Atari…
JulienT01 Apr 5, 2023
0c26794
updades following Matheus review
JulienT01 Apr 11, 2023
46ad222
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 11, 2023
0cbd974
add docstring for atari_make
JulienT01 Apr 12, 2023
6242b5a
Merge branch 'Atari_part1' of github.com:JulienT01/rlberry into Atari…
JulienT01 Apr 12, 2023
5439abf
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 12, 2023
47e7c09
Merge branch 'rlberry-py:main' into Atari_part1
JulienT01 Apr 12, 2023
ca981ac
update changelog
JulienT01 Apr 12, 2023
3 changes: 3 additions & 0 deletions .gitignore
@@ -162,3 +162,6 @@ dmypy.json

# PyCharm
.idea
.project
.pydevproject
profile.prof
Binary file added docs/_video/video_plot_atari_freeway.mp4
Binary file not shown.
4 changes: 4 additions & 0 deletions docs/changelog.rst
@@ -9,6 +9,10 @@ Dev version

* Move old scripts (jax agents, attention networks, old examples...) that we won't maintain from the main branch to an archive branch.

*PR #277*

* Add and update code to use "Atari games" environments


Version 0.4.0 (latest stable version)
--------------------------------------
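
For context on the changelog entry above, here is a minimal usage sketch of the new Atari support. `atari_make` and the "ALE/Freeway-v5" id are taken from this PR's example and tests; the expected observation shape is an assumption based on the CNN config used in the example further down.

from rlberry.envs.gym_make import atari_make

# Build a preprocessed Atari environment (preprocessing is handled inside atari_make)
env = atari_make("ALE/Freeway-v5")
state = env.reset()
print(env.observation_space.shape)  # assumed CHW after the Atari wrappers, e.g. (4, 84, 84)
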
Binary file added docs/thumbnails/video_plot_atari_freeway.jpg
120 changes: 120 additions & 0 deletions examples/demo_env/video_plot_atari_freeway.py
@@ -0,0 +1,120 @@
"""
===============================================
A demo of the ATARI Freeway environment with DQNAgent
===============================================
Illustration of training and video rendering of a DQN agent in the
ATARI Freeway environment.

The agent is only lightly tuned and not optimal; this is just for illustration purposes.

.. video:: ../../video_plot_atari_freeway.mp4
:width: 600

"""
# sphinx_gallery_thumbnail_path = 'thumbnails/video_plot_atari_freeway.jpg'


from rlberry.manager.agent_manager import AgentManager
from datetime import datetime
from rlberry.agents.torch.dqn.dqn import DQNAgent
from gym.wrappers.record_video import RecordVideo
import shutil
import os
from rlberry.envs.gym_make import atari_make


initial_time = datetime.now()
print("-------- init agent --------")

mlp_configs = {
"type": "MultiLayerPerceptron", # A network architecture
"layer_sizes": [512], # Network dimensions
"reshape": False,
"is_policy": False, # The network should output a distribution
# over actions
}

cnn_configs = {
"type": "ConvolutionalNetwork", # A network architecture
"activation": "RELU",
"in_channels": 4,
"in_height": 84,
"in_width": 84,
"head_mlp_kwargs": mlp_configs,
"transpose_obs": False,
"is_policy": False, # The network should output a distribution
}

tuned_agent = AgentManager(
DQNAgent, # The Agent class.
(
atari_make,
dict(
id="ALE/Freeway-v5",
),
), # The Environment to solve.
init_kwargs=dict( # Where to put the agent's hyperparameters
q_net_constructor="rlberry.agents.torch.utils.training.model_factory_from_env",
q_net_kwargs=cnn_configs,
max_replay_size=50000,
batch_size=32,
learning_starts=25000,
gradient_steps=1,
epsilon_final=0.01,
learning_rate=1e-4, # Size of the gradient descent steps.
chunk_size=1,
),
fit_budget=90000, # The number of interactions between the agent and the environment during training.
eval_kwargs=dict(
eval_horizon=500
), # The number of interactions between the agent and the environment during evaluations.
n_fit=1, # The number of agents to train. Usually, it is good to do more than 1 because the training is stochastic.
agent_name="DQN_tuned", # The agent's name.
output_dir="DQN_for_freeway",
)

print("-------- init agent : done!--------")
print("-------- train agent --------")

tuned_agent.fit()

print("-------- train agent : done!--------")

final_train_time = datetime.now()

print("-------- test agent with video--------")

env = atari_make(
"ALE/Freeway-v5",
)
env = RecordVideo(env, "docs/_video/temp")

if "render_modes" in env.metadata:
env.metadata["render.modes"] = env.metadata[
"render_modes"
] # bug with some 'gym' version

state = env.reset()
for tt in range(30000):
action = tuned_agent.get_agent_instances()[0].policy(state)
next_s, _, done, test = env.step(action)
if done:
break
state = next_s

env.close()

print("-------- test agent with video : done!--------")
final_test_time = datetime.now()
tuned_agent.save()

os.rename("docs/_video/temp/rl-video-episode-0.mp4", "docs/_video/video_plot_atari_freeway.mp4")
shutil.rmtree("docs/_video/temp/")


print("Done!!!")
print("-------------")
print("begin run at :" + str(initial_time))
print("end training at :" + str(final_train_time))
print("end run at :" + str(final_test_time))
print("-------------")
6 changes: 5 additions & 1 deletion requirements.txt
@@ -4,7 +4,7 @@ pygame
matplotlib
seaborn
pandas
gym==0.21
gym[accept-rom-license]==0.21.0
dill
docopt
pyyaml
@@ -18,3 +18,7 @@ torch>=1.6.0
stable-baselines3
protobuf==3.20.1
tensorboard
opencv-python
ale-py==0.7.4
pytest
pytest-xprocess
100 changes: 100 additions & 0 deletions rlberry/agents/torch/tests/test_torch_atari.py
@@ -0,0 +1,100 @@
from rlberry.manager.agent_manager import AgentManager
from rlberry.agents.torch.dqn.dqn import DQNAgent
from rlberry.envs.gym_make import atari_make


def test_forward_dqn():
mlp_configs = {
"type": "MultiLayerPerceptron", # A network architecture
"layer_sizes": [512], # Network dimensions
"reshape": False,
"is_policy": False, # The network should output a distribution
# over actions
}

cnn_configs = {
"type": "ConvolutionalNetwork", # A network architecture
"activation": "RELU",
"in_channels": 4,
"in_height": 84,
"in_width": 84,
"head_mlp_kwargs": mlp_configs,
"transpose_obs": False,
"is_policy": False, # The network should output a distribution
}

tuned_agent = AgentManager(
DQNAgent, # The Agent class.
(
atari_make,
# uncomment when rlberry manages vectorized envs
# dict(id="ALE/Breakout-v5", n_envs=3),
dict(id="ALE/Breakout-v5", n_envs=1),
), # The Environment to solve.
init_kwargs=dict( # Where to put the agent's hyperparameters
q_net_constructor="rlberry.agents.torch.utils.training.model_factory_from_env",
q_net_kwargs=cnn_configs,
max_replay_size=100,
batch_size=32,
learning_starts=100,
gradient_steps=1,
epsilon_final=0.01,
learning_rate=1e-4, # Size of the gradient descent steps.
chunk_size=5,
),
fit_budget=200, # The number of interactions between the agent and the environment during training.
eval_kwargs=dict(
eval_horizon=10
), # The number of interactions between the agent and the environment during evaluations.
n_fit=1, # The number of agents to train. Usually, it is good to do more than 1 because the training is stochastic.
agent_name="DQN_test", # The agent's name.
)

tuned_agent.fit()


def test_forward_empty_input_dim():
mlp_configs = {
"type": "MultiLayerPerceptron", # A network architecture
"layer_sizes": [512], # Network dimensions
"reshape": False,
"is_policy": False, # The network should output a distribution
# over actions
}

cnn_configs = {
"type": "ConvolutionalNetwork", # A network architecture
"activation": "RELU",
"head_mlp_kwargs": mlp_configs,
"transpose_obs": False,
"is_policy": False, # The network should output a distribution
}

tuned_agent = AgentManager(
DQNAgent, # The Agent class.
(
atari_make,
# uncomment when rlberry manages vectorized envs
# dict(id="ALE/Breakout-v5", n_envs=3),
dict(id="ALE/Breakout-v5", n_envs=1),
), # The Environment to solve.
init_kwargs=dict( # Where to put the agent's hyperparameters
q_net_constructor="rlberry.agents.torch.utils.training.model_factory_from_env",
q_net_kwargs=cnn_configs,
max_replay_size=100,
batch_size=32,
learning_starts=100,
gradient_steps=1,
epsilon_final=0.01,
learning_rate=1e-4, # Size of the gradient descent steps.
chunk_size=5,
),
fit_budget=10, # The number of interactions between the agent and the environment during training.
eval_kwargs=dict(
eval_horizon=10
), # The number of interactions between the agent and the environment during evaluations.
n_fit=1, # The number of agents to train. Usually, it is good to do more than 1 because the training is stochastic.
agent_name="DQN_test", # The agent's name.
)

tuned_agent.fit()
50 changes: 38 additions & 12 deletions rlberry/agents/torch/utils/models.py
@@ -61,7 +61,7 @@ def default_policy_net_fn(env):
)

if len(obs_shape) == 3:
if obs_shape[0] < obs_shape[1] and obs_shape[0] < obs_shape[1]:
if obs_shape[0] < obs_shape[1] and obs_shape[0] < obs_shape[2]:
# Assume CHW observation space
model_config = {
"type": "ConvolutionalNetwork",
@@ -397,6 +397,8 @@ class ConvolutionalNetwork(nn.Module):
H = height;
W = width.

For the forward pass, if the input tensor has more than 4 dimensions (i.e. not BCHW), the last 3 dimensions are kept as CHW and all leading dimensions are merged into a single batch dimension. The tensor then goes through the CNN + MLP head, and the leading dimensions are restored on the output.

Parameters
----------
activation: {"RELU", "TANH", "ELU"}
@@ -434,25 +436,30 @@ def __init__(
self.conv3 = nn.Conv2d(32, 64, kernel_size=2, stride=2)

# MLP Head
# Number of Linear input connections depends on output of conv2d layers
# and therefore the input image size, so compute it.
def conv2d_size_out(size, kernel_size=2, stride=2):
return (size - (kernel_size - 1) - 1) // stride + 1

convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(in_width)))
convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(in_height)))
assert convh > 0 and convw > 0
self.head_mlp_kwargs = head_mlp_kwargs or {}
self.head_mlp_kwargs["in_size"] = convw * convh * 64
self.head_mlp_kwargs["in_size"] = self._get_conv_out_size(
[in_channels, in_height, in_width]
) # Number of Linear input connections depends on output of conv layers
self.head_mlp_kwargs["out_size"] = out_size
self.head_mlp_kwargs["is_policy"] = is_policy
self.head = model_factory(**self.head_mlp_kwargs)

self.is_policy = is_policy
self.transpose_obs = transpose_obs

def _get_conv_out_size(self, shape):
"""
Compute the flattened output size of the convolutional layers.
shape: input dimensions (CHW) of the CNN.
"""
conv_result = self.activation((self.conv1(torch.zeros(1, *shape))))
conv_result = self.activation((self.conv2(conv_result)))
conv_result = self.activation((self.conv3(conv_result)))
return int(np.prod(conv_result.size()))

def convolutions(self, x):
x = x.float()
# if there is no batch (CHW), add one dimension to specify batch of 1 (and get format BCHW)
if len(x.shape) == 3:
x = x.unsqueeze(0)
if self.transpose_obs:
@@ -470,9 +477,28 @@ def forward(self, x):
Parameters
----------
x: torch.tensor
Tensor of shape BCHW
Tensor of shape BCHW (Batch, Channel, Height, Width); if it has more than 4 dimensions, all leading dimensions are merged into the batch dimension.
"""
return self.head(self.convolutions(x))
flag_view_to_change = False

if len(x.shape) > 4:
flag_view_to_change = True
dim_to_restore = x.shape[:-3]
inputview_size = tuple((-1,)) + tuple(x.shape[-3:])
outputview_size = tuple(dim_to_restore) + tuple(
(self.head_mlp_kwargs["out_size"],)
)
x = x.view(inputview_size)

conv_result = self.convolutions(x)
output_result = self.head(
conv_result.view(conv_result.size()[0], -1)
) # pass 'conv_result', flattened to 2 dimensions (batch, features), to the MLP head

if flag_view_to_change:
output_result = output_result.view(outputview_size)

return output_result

def action_scores(self, x):
return self.head.action_scores(self.convolutions(x))
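
To make the new multi-dimensional forward logic above concrete, here is a standalone sketch (shapes and out_size are assumed, and a zero tensor stands in for the convolutions + MLP head): leading dimensions are merged into one batch dimension before the CNN and restored on the output.

import torch

x = torch.zeros(3, 7, 4, 84, 84)  # assumed input with extra leading dims: (n_envs, time, C, H, W)
lead_dims = x.shape[:-3]  # (3, 7), kept to restore the output shape
x_flat = x.view((-1,) + tuple(x.shape[-3:]))  # (21, 4, 84, 84): plain BCHW batch
out_size = 18  # assumed action-space size (the head's "out_size")
y_flat = torch.zeros(x_flat.shape[0], out_size)  # stand-in for convolutions(x_flat) + MLP head
y = y_flat.view(tuple(lead_dims) + (out_size,))  # restored to (3, 7, 18)
print(y.shape)
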
21 changes: 17 additions & 4 deletions rlberry/agents/torch/utils/training.py
Original file line number Diff line number Diff line change
@@ -105,10 +105,23 @@ def size_model_config(env, **model_config):
return model_config

# Assume CHW observation space
if model_config["type"] == "ConvolutionalNetwork":
model_config["in_channels"] = int(obs_shape[0])
model_config["in_height"] = int(obs_shape[1])
model_config["in_width"] = int(obs_shape[2])
if "type" in model_config and model_config["type"] == "ConvolutionalNetwork":
if "transpose_obs" in model_config and not model_config["transpose_obs"]:
# Assume CHW observation space
if "in_channels" not in model_config:
model_config["in_channels"] = int(obs_shape[0])
if "in_height" not in model_config:
model_config["in_height"] = int(obs_shape[1])
if "in_width" not in model_config:
model_config["in_width"] = int(obs_shape[2])
else:
# Assume WHC observation space to transpose
if "in_channels" not in model_config:
model_config["in_channels"] = int(obs_shape[2])
if "in_height" not in model_config:
model_config["in_height"] = int(obs_shape[1])
if "in_width" not in model_config:
model_config["in_width"] = int(obs_shape[0])
else:
model_config["in_size"] = int(np.prod(obs_shape))
