# hyperparams/trpo.yml

# Note: OpenAI uses 16 mpi workers for training on atari
# in practice it is run with 4 workers
atari:
  # Policy class and training length
  policy: CnnPolicy
  n_timesteps: 1.0e+7
  # Rollout batch size and discounting (gamma / lam)
  timesteps_per_batch: 512
  gamma: 0.98
  lam: 1
  # KL-constrained policy step (conjugate gradient) and entropy weight
  max_kl: 0.001
  cg_iters: 10
  cg_damping: 0.001
  entcoeff: 0.01
  # Value-function optimisation
  vf_iters: 3
  vf_stepsize: 0.0001

# Tuned
Pendulum-v0:
  # Policy class and training length
  policy: MlpPolicy
  n_timesteps: 3.0e+5
  # Rollout batch size and discounting (gamma / lam)
  timesteps_per_batch: 1024
  gamma: 0.99
  lam: 0.9
  # KL-constrained policy step and entropy weight (cg_iters is not set here)
  max_kl: 0.000193
  cg_damping: 2.35e-5
  entcoeff: 0.01118
  # Value-function optimisation
  vf_iters: 10
  vf_stepsize: 0.00428


MountainCar-v0:
  # Only the policy class and training length are set for this environment
  policy: MlpPolicy
  n_timesteps: 1.0e+6

# Tuned
MountainCarContinuous-v0:
  # Policy class and training length
  policy: MlpPolicy
  n_timesteps: 1.2e+5
  # Rollout batch size and discounting (gamma / lam)
  timesteps_per_batch: 32
  gamma: 0.9999
  lam: 0.95
  # KL-constrained policy step (conjugate gradient) and entropy weight
  max_kl: 1.77e-6
  cg_iters: 20
  cg_damping: 0.0002
  entcoeff: 0.0
  # Value-function optimisation
  vf_iters: 10
  vf_stepsize: 0.0463

CartPole-v1:
  # Policy class and training length
  policy: MlpPolicy
  n_timesteps: 1.0e+6
  # Rollout batch size and discounting (gamma / lam)
  timesteps_per_batch: 512
  gamma: 0.99
  lam: 1
  # KL-constrained policy step (conjugate gradient) and entropy weight
  max_kl: 0.001
  cg_iters: 10
  cg_damping: 0.001
  entcoeff: 0.0
  # Value-function optimisation
  vf_iters: 3
  vf_stepsize: 0.0001

LunarLander-v2:
  # Policy class and training length
  policy: MlpPolicy
  n_timesteps: 2.0e+6
  # Rollout batch size and discounting (gamma / lam)
  timesteps_per_batch: 1024
  gamma: 0.99
  lam: 0.98
  # KL-constrained policy step (conjugate gradient) and entropy weight
  max_kl: 0.01
  cg_iters: 10
  cg_damping: 0.1
  entcoeff: 0.0
  # Value-function optimisation
  vf_iters: 5
  vf_stepsize: 0.001

LunarLanderContinuous-v2:
  # Policy class and training length
  policy: MlpPolicy
  n_timesteps: 1.0e+6
  # Rollout batch size and discounting (gamma / lam)
  timesteps_per_batch: 1024
  gamma: 0.99
  lam: 0.98
  # KL-constrained policy step (conjugate gradient) and entropy weight
  max_kl: 0.01
  cg_iters: 10
  cg_damping: 0.1
  entcoeff: 0.0
  # Value-function optimisation
  vf_iters: 5
  vf_stepsize: 0.001

# Tuned
HalfCheetahBulletEnv-v0:
  # Environment wrapper, policy class and training length
  env_wrapper: utils.wrappers.TimeFeatureWrapper
  policy: MlpPolicy
  n_timesteps: 2.0e+6
  # Rollout batch size and discounting (gamma / lam)
  timesteps_per_batch: 2048
  gamma: 0.99
  lam: 0.95
  # KL-constrained policy step (conjugate gradient) and entropy weight
  max_kl: 0.01
  cg_iters: 15
  cg_damping: 0.1
  entcoeff: 0.0
  # Value-function optimisation
  vf_iters: 5
  vf_stepsize: 0.001

# Tuned
AntBulletEnv-v0:
  # Environment wrapper, policy class and training length
  env_wrapper: utils.wrappers.TimeFeatureWrapper
  policy: MlpPolicy
  n_timesteps: 2.0e+6
  # Rollout batch size and discounting (gamma / lam)
  timesteps_per_batch: 2048
  gamma: 0.99
  lam: 0.95
  # KL-constrained policy step (conjugate gradient) and entropy weight
  max_kl: 0.01
  cg_iters: 15
  cg_damping: 0.1
  entcoeff: 0.0
  # Value-function optimisation
  vf_iters: 5
  vf_stepsize: 0.001

# Tuned
Walker2DBulletEnv-v0:
  # Environment wrapper, policy class and training length
  env_wrapper: utils.wrappers.TimeFeatureWrapper
  policy: MlpPolicy
  n_timesteps: 2.0e+6
  # Rollout batch size and discounting (gamma / lam)
  timesteps_per_batch: 2048
  gamma: 0.99
  lam: 0.95
  # KL-constrained policy step (conjugate gradient) and entropy weight
  max_kl: 0.01
  cg_iters: 20
  cg_damping: 0.1
  entcoeff: 0.001
  # Value-function optimisation
  vf_iters: 5
  vf_stepsize: 0.001

# Tuned
HopperBulletEnv-v0:
  # Environment wrapper, policy class and training length
  env_wrapper: utils.wrappers.TimeFeatureWrapper
  policy: MlpPolicy
  n_timesteps: 2.0e+6
  # Rollout batch size and discounting (gamma / lam)
  timesteps_per_batch: 2048
  gamma: 0.99
  lam: 0.95
  # KL-constrained policy step (conjugate gradient) and entropy weight
  max_kl: 0.005
  cg_iters: 15
  cg_damping: 0.1
  entcoeff: 0.01
  # Value-function optimisation
  vf_iters: 5
  vf_stepsize: 0.001

# Tuned
BipedalWalker-v3:
  # Environment wrapper, policy class and training length
  env_wrapper: utils.wrappers.TimeFeatureWrapper
  policy: MlpPolicy
  n_timesteps: 5.0e+6
  # Rollout batch size and discounting (gamma / lam)
  timesteps_per_batch: 2048
  gamma: 0.99
  lam: 0.95
  # KL-constrained policy step (conjugate gradient) and entropy weight
  max_kl: 0.01
  cg_iters: 15
  cg_damping: 0.1
  entcoeff: 0.001
  # Value-function optimisation
  vf_iters: 5
  vf_stepsize: 0.001

# To be tuned
BipedalWalkerHardcore-v3:
  # Policy class and training length (no env_wrapper is set for this entry)
  policy: MlpPolicy
  n_timesteps: 5.0e+7
  # Rollout batch size and discounting (gamma / lam)
  timesteps_per_batch: 4096
  gamma: 0.99
  lam: 0.95
  # KL-constrained policy step (conjugate gradient) and entropy weight
  max_kl: 0.02
  cg_iters: 20
  cg_damping: 0.1
  entcoeff: 0.001
  # Value-function optimisation
  vf_iters: 5
  vf_stepsize: 0.001