Commit

Add HER support + fix for loading VecNormalize (#47)
* Added her.

* Updated parameters for her.

* Update her.

* Bug fixes and reformatting

* Add mujoco envs

* Add neck env support for HER

* Added History Wrapper for dict observation. Updated success plotting.

* Add action smoothing and low pass filter

* Added comment for history wrapper for dict observations.

* Fix TimeFeatureWrapper

* Add low-pass filter experiment

* Fixed key for plotting training success.

* Update plot script

* Fix VecNormalize ignoring gamma

* Add TimeFeatureDict wrapper

* Update her hyperparams

* Added max episode length for parking env

* Fix VecNormalize loading

* Updated loading replay buffer for HER

* Fixes

* Install parking-env in CI

Co-authored-by: Megan Klaiber <megan.klaiber@outlook.com>
araffin and megan-klaiber authored Oct 22, 2020
1 parent 8969508 commit 26dfece
Showing 13 changed files with 578 additions and 63 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
@@ -36,8 +36,8 @@ jobs:
pip install -r requirements.txt
# Use headless version
pip install opencv-python-headless
# TODO: remove once SB3 is updated
pip install black isort
# install parking-env to test HER
pip install git+https://github.com/eleurent/highway-env
- name: Type check
run: |
make type
7 changes: 6 additions & 1 deletion CHANGELOG.md
@@ -1,17 +1,22 @@
## Pre-Release 0.9.0a2 (WIP)
## Pre-Release 0.10.0a0 (WIP)

### Breaking Changes

### New Features
- Added support for `HER`
- Added low-pass filter wrappers in `utils/wrappers.py`

### Bug fixes
- Fixed `TimeFeatureWrapper` inferring max timesteps
- Fixed `flatten_dict_observations` in `utils/utils.py` for recent Gym versions (@ManifoldFR)
- `VecNormalize` now takes `gamma` hyperparameter into account
- Fixed loading of `VecNormalize` when continuing training or using a trained agent

### Documentation

### Other
- Added tests for the wrappers
- Updated plotting script


## Release 0.8.0 (2020-08-04)
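To illustrate the two `VecNormalize` entries above, here is a minimal sketch (env id and paths are placeholders, not taken from this commit) of saving the normalization statistics with the agent's discount factor and restoring them later:

```python
import gym
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

venv = DummyVecEnv([lambda: gym.make("Pendulum-v0")])

# Return normalization should use the same discount factor as the agent,
# so `gamma` is forwarded instead of silently keeping the 0.99 default.
venv = VecNormalize(venv, norm_obs=True, norm_reward=True, gamma=0.98)
# ... train, then persist the running statistics next to the model
venv.save("logs/vecnormalize.pkl")

# When continuing training or enjoying a trained agent, restore the saved
# statistics instead of starting them from scratch.
eval_venv = DummyVecEnv([lambda: gym.make("Pendulum-v0")])
eval_venv = VecNormalize.load("logs/vecnormalize.pkl", eval_venv)
eval_venv.training = False     # do not update statistics at test time
eval_venv.norm_reward = False  # report unnormalized rewards when evaluating
```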
1 change: 1 addition & 0 deletions docker/Dockerfile
@@ -21,6 +21,7 @@ RUN \
mkdir -p ${CODE_DIR}/rl_zoo && \
pip uninstall -y stable-baselines3 && \
pip install -r /tmp/requirements.txt && \
pip install git+https://github.com/eleurent/highway-env && \
rm -rf $HOME/.cache/pip

ENV PATH=$VENV/bin:$PATH
218 changes: 218 additions & 0 deletions hyperparams/her.yml
@@ -0,0 +1,218 @@
# === Real Robot envs
NeckGoalEnvRelativeSparse-v2:
model_class: 'sac'
# env_wrapper:
# - utils.wrappers.HistoryWrapper:
# horizon: 2
# - utils.wrappers.TimeFeatureWrapper
n_timesteps: !!float 1e6
policy: 'MlpPolicy'
learning_rate: !!float 7.3e-4
buffer_size: 100000
batch_size: 256
ent_coef: 'auto'
gamma: 0.99
tau: 0.02
n_episodes_rollout: 1
gradient_steps: -1
train_freq: -1
# 10 episodes of warm-up
learning_starts: 1500
use_sde_at_warmup: True
use_sde: True
sde_sample_freq: 64
policy_kwargs: "dict(log_std_init=-2, net_arch=[256, 256])"
n_sampled_goal: 4
goal_selection_strategy: 'future'
online_sampling: False

NeckGoalEnvRelativeDense-v2:
model_class: 'sac'
env_wrapper:
- utils.wrappers.HistoryWrapperObsDict:
horizon: 2
# - utils.wrappers.TimeFeatureWrapper
n_timesteps: !!float 1e6
policy: 'MlpPolicy'
learning_rate: !!float 7.3e-4
buffer_size: 200000
batch_size: 256
ent_coef: 'auto'
gamma: 0.99
tau: 0.02
n_episodes_rollout: 1
gradient_steps: -1
train_freq: -1
# 10 episodes of warm-up
learning_starts: 1500
use_sde_at_warmup: True
use_sde: True
sde_sample_freq: 64
policy_kwargs: "dict(log_std_init=-2, net_arch=[256, 256])"
n_sampled_goal: 4
goal_selection_strategy: 'future'
online_sampling: False

# DDPG hyperparams
#parking-v0:
# n_timesteps: !!float 2e5
# policy: 'MlpPolicy'
# model_class: 'ddpg'
# n_sampled_goal: 4
# goal_selection_strategy: 'future'
# buffer_size: 1000000
# batch_size: 256
# gamma: 0.95
# learning_rate: !!float 1e-3
# noise_type: 'normal'
# noise_std: 0.2
# policy_kwargs: "dict(net_arch=[256, 256, 256])"
# online_sampling: True
# max_episode_length: 100


# SAC hyperparams, her paper
parking-v0:
n_timesteps: !!float 2e5
policy: 'MlpPolicy'
model_class: 'sac'
n_sampled_goal: 4
goal_selection_strategy: 'future'
buffer_size: 1000000
batch_size: 256
gamma: 0.95
learning_rate: !!float 1e-3
# noise_type: 'normal'
# noise_std: 0.2
policy_kwargs: "dict(net_arch=[256, 256, 256])"
online_sampling: False
# normalize: True
max_episode_length: 100

# TD3 hyperparams, her paper
#parking-v0:
# n_timesteps: !!float 2e5
# policy: 'MlpPolicy'
# model_class: 'td3'
# n_sampled_goal: 4
# goal_selection_strategy: 'future'
# buffer_size: 1000000
# batch_size: 256
# gamma: 0.95
# learning_rate: !!float 1e-3
# noise_type: 'normal'
# noise_std: 0.2
# policy_kwargs: "dict(net_arch=[256, 256, 256])"
# online_sampling: True
# max_episode_length: 100


# Mujoco Robotic Env
# DDPG hyperparams
# FetchReach-v1:
# n_timesteps: !!float 20000
# policy: 'MlpPolicy'
# model_class: 'ddpg'
# n_sampled_goal: 4
# goal_selection_strategy: 'future'
# buffer_size: 1000000
# batch_size: 256
# gamma: 0.95
# random_exploration: 0.3
# actor_lr: !!float 1e-3
# critic_lr: !!float 1e-3
# noise_type: 'normal'
# noise_std: 0.2
# normalize_observations: true
# normalize_returns: false
# policy_kwargs: "dict(layers=[256, 256, 256])"
# online_sampling: True

# NOTE: should be run with 8 workers: mpirun -n 8
# FetchPush-v1:
# n_timesteps: !!float 2e6
# policy: 'MlpPolicy'
# model_class: 'ddpg'
# n_sampled_goal: 4
# goal_selection_strategy: 'future'
# buffer_size: 200000
# batch_size: 256
# gamma: 0.95
# random_exploration: 0.3
# actor_lr: !!float 1e-3
# critic_lr: !!float 1e-3
# noise_type: 'normal'
# noise_std: 0.2
# normalize_observations: true
# normalize_returns: false
# policy_kwargs: "dict(layers=[16, 16, 16])"

FetchPush-v1:
env_wrapper:
- utils.wrappers.HistoryWrapperObsDict:
horizon: 2
# - utils.wrappers.TimeFeatureObsDictWrapper
n_timesteps: !!float 3e6
policy: 'MlpPolicy'
model_class: 'sac'
n_sampled_goal: 4
goal_selection_strategy: 'future'
buffer_size: 1000000
ent_coef: 'auto'
gamma: 0.95
learning_rate: !!float 7e-4
use_sde: True
gradient_steps: -1
train_freq: -1
n_episodes_rollout: 1
sde_sample_freq: 10
# noise_type: 'normal'
# noise_std: 0.2
learning_starts: 1000
online_sampling: True
normalize: True

FetchPickAndPlace-v1:
n_timesteps: !!float 4e6
policy: 'MlpPolicy'
model_class: 'sac'
n_sampled_goal: 4
goal_selection_strategy: 'future'
buffer_size: 1000000
ent_coef: 'auto'
# batch_size: 256
gamma: 0.95
# learning_rate: !!float 1e-3
learning_starts: 1000
train_freq: 1
online_sampling: True

# SAC hyperparams
FetchReach-v1:
n_timesteps: !!float 20000
policy: 'MlpPolicy'
model_class: 'sac'
n_sampled_goal: 4
goal_selection_strategy: 'future'
buffer_size: 1000000
ent_coef: 'auto'
batch_size: 256
gamma: 0.95
learning_rate: 0.001
learning_starts: 1000
online_sampling: True
normalize: True


# TD3 hyperparams
# FetchReach-v1:
# n_timesteps: !!float 25000
# policy: 'MlpPolicy'
# model_class: 'td3'
# n_sampled_goal: 4
# goal_selection_strategy: 'future'
# buffer_size: 1000000
# batch_size: 256
# gamma: 0.95
# learning_rate: 0.001
# learning_starts: 1000
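For reference, a rough sketch of how the `parking-v0` SAC entry above maps onto the SB3 0.10 `HER` model class. The zoo's actual model construction lives in `train.py`/`utils/utils.py`, so treat this as an illustration under those assumptions, not the zoo's code:

```python
import gym
import highway_env  # noqa: F401  (registers parking-v0)
from stable_baselines3 import HER, SAC

env = gym.make("parking-v0")

# Mirrors the `parking-v0` block in hyperparams/her.yml; the remaining
# keyword arguments are forwarded by HER to the wrapped SAC model.
model = HER(
    "MlpPolicy",
    env,
    SAC,
    n_sampled_goal=4,
    goal_selection_strategy="future",
    online_sampling=False,
    max_episode_length=100,
    buffer_size=1_000_000,
    batch_size=256,
    gamma=0.95,
    learning_rate=1e-3,
    policy_kwargs=dict(net_arch=[256, 256, 256]),
    verbose=1,
)
model.learn(total_timesteps=int(2e5))
```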
6 changes: 5 additions & 1 deletion hyperparams/sac.yml
@@ -3,7 +3,11 @@ NeckEnvRelative-v2:
env_wrapper:
- utils.wrappers.HistoryWrapper:
horizon: 2
- utils.wrappers.TimeFeatureWrapper
- utils.wrappers.TimeFeatureWrapper:
test_mode: False
# - utils.wrappers.LowPassFilterWrapper:
# freq: 2.0
# df: 25.0
n_timesteps: !!float 1e6
policy: 'MlpPolicy'
learning_rate: !!float 7.3e-4
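The commented-out `LowPassFilterWrapper` entry refers to the new low-pass filter wrappers added in `utils/wrappers.py`, whose implementation is not part of this diff. Purely as an illustration (class name, filter order, and semantics are assumptions here, not the zoo's actual code), a first-order action filter driven by the same `freq`/`df` parameters could look like:

```python
import gym
import numpy as np


class LowPassFilterActionWrapper(gym.ActionWrapper):
    """First-order low-pass filter on actions, to smooth the commanded motion.

    `freq` is the cutoff frequency (Hz) and `df` the control/sampling
    frequency (Hz), matching the parameters in the config above.
    """

    def __init__(self, env: gym.Env, freq: float = 2.0, df: float = 25.0):
        super().__init__(env)
        dt = 1.0 / df
        rc = 1.0 / (2.0 * np.pi * freq)
        self.alpha = dt / (rc + dt)  # smoothing factor in (0, 1]
        self.smoothed_action = None

    def reset(self, **kwargs):
        self.smoothed_action = None
        return self.env.reset(**kwargs)

    def action(self, action):
        # Exponential moving average of the raw actions
        if self.smoothed_action is None:
            self.smoothed_action = np.array(action, dtype=np.float64)
        else:
            self.smoothed_action = self.alpha * np.asarray(action) + (1.0 - self.alpha) * self.smoothed_action
        return self.smoothed_action
```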
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
stable-baselines3[extra,tests,docs]>=0.9.0a0
stable-baselines3[extra,tests,docs]>=0.10.0a0
box2d-py==2.3.5
pybullet
gym-minigrid
13 changes: 9 additions & 4 deletions scripts/plot_training_success.py
@@ -7,7 +7,7 @@
import numpy as np
import seaborn
from matplotlib import pyplot as plt
from stable_baselines3.common.monitor import load_results
from stable_baselines3.common.monitor import LoadMonitorResultsError, load_results
from stable_baselines3.common.results_plotter import X_EPISODES, X_TIMESTEPS, X_WALLTIME, ts2xy, window_func

# For tensorflow imported with tensorboard
@@ -39,7 +39,7 @@
x_axis = {"steps": X_TIMESTEPS, "episodes": X_EPISODES, "time": X_WALLTIME}[args.x_axis]
x_label = {"steps": "Timesteps", "episodes": "Episodes", "time": "Walltime (in hours)"}[args.x_axis]

y_axis = {"success": "success", "reward": "r"}[args.y_axis]
y_axis = {"success": "is_success", "reward": "r"}[args.y_axis]
y_label = {"success": "Training Success Rate", "reward": "Training Episodic Reward"}[args.y_axis]

dirs = [
@@ -53,7 +53,10 @@
plt.xlabel(f"{x_label}", fontsize=args.fontsize)
plt.ylabel(y_label, fontsize=args.fontsize)
for folder in dirs:
data_frame = load_results(folder)
try:
data_frame = load_results(folder)
except LoadMonitorResultsError:
continue
if args.max_timesteps is not None:
data_frame = data_frame[data_frame.l.cumsum() <= args.max_timesteps]
success = np.array(data_frame[y_axis])
@@ -63,6 +66,8 @@
if x.shape[0] >= args.episode_window:
# Compute and plot rolling mean with window of size args.episode_window
x, y_mean = window_func(x, success, args.episode_window, np.mean)
plt.plot(x, y_mean, linewidth=2)
plt.plot(x, y_mean, linewidth=2, label=folder.split("/")[-1])

plt.legend()
plt.tight_layout()
plt.show()
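Note that the `is_success` key is only present in the Monitor CSV files if it was logged during training. A minimal sketch of how that is typically wired up with SB3's `Monitor` wrapper (env id and log path are placeholders):

```python
import gym
import highway_env  # noqa: F401  (registers parking-v0)
from stable_baselines3.common.monitor import Monitor

# Record the per-episode `is_success` flag (reported in `info` by goal-based
# envs) next to reward and length, so the plotting script can read it back.
env = Monitor(gym.make("parking-v0"), filename="logs/parking-v0_1/0", info_keywords=("is_success",))
```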
9 changes: 8 additions & 1 deletion tests/test_hyperparams_opt.py
@@ -10,7 +10,7 @@ def _assert_eq(left, right):


N_STEPS = 100
N_TRIALS = 3
N_TRIALS = 2
N_JOBS = 1

ALGOS = ("ppo", "a2c")
@@ -27,6 +27,8 @@ def _assert_eq(left, right):
experiments["sac-Pendulum-v0"] = ("sac", "Pendulum-v0")
# Test for TD3
experiments["td3-Pendulum-v0"] = ("td3", "Pendulum-v0")
# Test for HER
experiments["her-parking-v0"] = ("her", "parking-v0")

# Clean up
if os.path.isdir(LOG_FOLDER):
@@ -38,6 +40,11 @@ def _assert_eq(left, right):
@pytest.mark.parametrize("experiment", experiments.keys())
def test_optimize(sampler, pruner, experiment):
algo, env_id = experiments[experiment]

# Skip slow tests
if algo not in {"a2c", "ppo"} and not (sampler == "random" and pruner == "median"):
pytest.skip("Skipping slow tests")

args = ["-n", str(N_STEPS), "--algo", algo, "--env", env_id, "-params", 'policy_kwargs:"dict(net_arch=[32])"', "n_envs:1"]
args += ["n_steps:10"] if algo == "ppo" else []
args += [