Skip to content

Commit

Permalink
[RLlib; Testing] Green all RLlib nightly tests. (#18073)
Browse files Browse the repository at this point in the history
  • Loading branch information
sven1977 authored Aug 26, 2021
1 parent 089dd9b commit 8acb469
Show file tree
Hide file tree
Showing 6 changed files with 177 additions and 73 deletions.
2 changes: 2 additions & 0 deletions release/rllib_tests/app_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ python:
- gym[atari]
- atari_py
- pybullet
# Pin this to 2.4.3 so it'll work with CUDA=11.0.
- tensorflow==2.4.3
conda_packages: []

post_build_cmds:
Expand Down
93 changes: 45 additions & 48 deletions release/rllib_tests/learning_tests/hard_learning_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,51 @@ apex-breakoutnoframeskip-v4:
target_network_update_freq: 50000
timesteps_per_iteration: 25000

ddpg-hopperbulletenv-v0:
env: HopperBulletEnv-v0
run: DDPG
# Minimum reward and total ts (in given time_total_s) to pass this test.
pass_criteria:
episode_reward_mean: 120.0
timesteps_total: 50000
stop:
time_total_s: 3600
config:
actor_hiddens: [256, 256]
critic_hiddens: [256, 256]
n_step: 3
model: {}
gamma: 0.99
env_config: {}
exploration_config:
initial_scale: 1.0
final_scale: 0.02
scale_timesteps: 10000
ou_base_scale: 0.1
ou_theta: 0.15
ou_sigma: 0.2
timesteps_per_iteration: 1000
target_network_update_freq: 0
tau: 0.001
buffer_size: 10000
prioritized_replay: True
prioritized_replay_alpha: 0.6
prioritized_replay_beta: 0.4
prioritized_replay_eps: 0.000001
clip_rewards: false
actor_lr: 0.001
critic_lr: 0.001
use_huber: true
huber_threshold: 1.0
l2_reg: 0.000001
learning_starts: 500
rollout_fragment_length: 1
train_batch_size: 48
num_gpus: 1
num_workers: 0
num_gpus_per_worker: 0
worker_side_prioritization: false

dqn-breakoutnoframeskip-v4:
env: BreakoutNoFrameskip-v4
run: DQN
Expand Down Expand Up @@ -173,51 +218,3 @@ sac-halfcheetahbulletenv-v0:
normalize_actions: true
evaluation_interval: 1
metrics_smoothing_episodes: 5

# Expect roughly 1000 reward after 1h on 1GPU
# TODO: (sven) this seems to be somewhat broken on tf AND torch (?)
# try to find older version that still works.
ddpg-halfcheetahbulletenv-v0:
env: HalfCheetahBulletEnv-v0
run: DDPG
# Minimum reward and total ts (in given time_total_s) to pass this test.
pass_criteria:
episode_reward_mean: -100.0
timesteps_total: 400000
stop:
time_total_s: 7200
config:
actor_hiddens: [64, 64]
critic_hiddens: [64, 64]
n_step: 1
model: {}
gamma: 0.99
env_config: {}
exploration_config:
initial_scale: 1.0
final_scale: 0.02
scale_timesteps: 10000
ou_base_scale: 0.1
ou_theta: 0.15
ou_sigma: 0.2
timesteps_per_iteration: 1000
target_network_update_freq: 0
tau: 0.001
buffer_size: 10000
prioritized_replay: True
prioritized_replay_alpha: 0.6
prioritized_replay_beta: 0.4
prioritized_replay_eps: 0.000001
clip_rewards: False
actor_lr: 0.001
critic_lr: 0.001
use_huber: False
huber_threshold: 1.0
l2_reg: 0.000001
learning_starts: 500
rollout_fragment_length: 1
train_batch_size: 64
num_workers: 0
num_gpus: 1
num_gpus_per_worker: 0
worker_side_prioritization: False
65 changes: 42 additions & 23 deletions rllib/tests/git_bisect/debug_learning_failure_git_bisect.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import argparse
import importlib
import json
import numpy as np
import os
import subprocess
import yaml
Expand All @@ -47,6 +48,11 @@
"--skip-install-ray",
action="store_true",
help="If set, do not attempt to re-build ray from source.")
parser.add_argument(
"--num-samples",
type=int,
default=1,
help="The number of samples to run for the given experiment.")
parser.add_argument(
"--stop-iters",
type=int,
Expand Down Expand Up @@ -122,8 +128,9 @@
if args.framework:
config["framework"] = args.framework

# Define stopping criteria.
stop = {}
# Define stopping criteria. From the yaml file ..
stop = experiment_config.get("stop", {})
# .. but override with command line provided ones.
if args.stop_iters:
stop["training_iteration"] = args.stop_iters
if args.stop_timesteps:
Expand All @@ -133,15 +140,24 @@
if args.stop_time:
stop["time_total_s"] = args.stop_time

# Invalid pass criteria.
if stop.get("episode_reward_mean") is None and \
(stop.get("timesteps_total") is None or
stop.get("time_total_s") is None):
raise ValueError("Invalid pass criterium! Must use either "
"(--stop-reward + optionally any other) OR "
"(--stop-timesteps + --stop-time).")

# - Stop ray.
# - Uninstall and re-install ray (from source) if required.
# - Start ray.
# Do this twice to make sure all processes are stopped (older versions of
# ray used to not kill everything the first time around).
try:
subprocess.run("ray stop".split(" "))
subprocess.run("ray stop".split(" "))
except Exception:
pass

# - Uninstall and re-install ray (from source) if required.
# Install ray from the checked out repo.
if not args.skip_install_ray:
subprocess.run("sudo apt-get update".split(" "))
Expand All @@ -158,10 +174,15 @@
subprocess.run("pip install -e . --verbose".split(" "))
os.chdir("../")

# - Start ray.
try:
subprocess.run("ray start --head".split(" "))
except Exception:
subprocess.run("ray stop".split(" "))
try:
subprocess.run("ray stop".split(" "))
subprocess.run("ray stop".split(" "))
except Exception:
pass
try:
subprocess.run("ray start --head".split(" "))
except Exception as e:
Expand All @@ -175,31 +196,29 @@
ray.init()

results = tune.run(run, stop=stop, config=config)

# Criterium is to have reached some min reward.
if args.stop_reward:
last_result = results.trials[0].last_result
avg_reward = last_result["episode_reward_mean"]
if avg_reward < args.stop_reward:
last_results = [t.last_result for t in results.trials]

# Criterion is to have reached some min reward within given
# wall time, iters, or timesteps.
if stop.get("episode_reward_mean") is not None:
max_avg_reward = np.max(
[r["episode_reward_mean"] for r in last_results])
if max_avg_reward < stop["episode_reward_mean"]:
raise ValueError("`stop-reward` of {} not reached!".format(
args.stop_reward))
# Criterium is to have run through n env timesteps in some wall time m.
elif args.stop_timesteps and args.stop_time:
last_result = results.trials[0].last_result
total_timesteps = last_result["timesteps_total"]
total_time = last_result["time_total_s"]
desired_speed = args.stop_timesteps / args.stop_time
stop["episode_reward_mean"]))
# Criterion is to have run through n env timesteps in some wall time m
# (minimum throughput).
else:
total_timesteps = np.sum([r["timesteps_total"] for r in last_results])
total_time = np.sum([r["time_total_s"] for r in last_results])
desired_speed = stop["timesteps_total"] / stop["time_total_s"]
actual_speed = total_timesteps / total_time
# We stopped because we reached the time limit ->
# Means throughput is too slow (time steps not reached).
if actual_speed < desired_speed:
raise ValueError(
"`stop-timesteps` of {} not reached in {}sec!".format(
args.stop_timesteps, args.stop_time))
else:
raise ValueError("Invalid pass criterium! Must use either "
"(--stop-reward + optionally any other) OR "
"(--stop-timesteps + --stop-time).")
stop["timesteps_total"], stop["time_total_s"]))

print("ok")
ray.shutdown()
4 changes: 2 additions & 2 deletions rllib/tuned_examples/ddpg/halfcheetah-ddpg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ halfcheetah-ddpg:
# === Optimization ===
actor_lr: 0.001
critic_lr: 0.001
use_huber: False
use_huber: false
huber_threshold: 1.0
l2_reg: 0.000001
learning_starts: 500
Expand All @@ -50,7 +50,7 @@ halfcheetah-ddpg:
# === Parallelism ===
num_workers: 0
num_gpus_per_worker: 0
worker_side_prioritization: False
worker_side_prioritization: false

# === Evaluation ===
evaluation_interval: 5
Expand Down
42 changes: 42 additions & 0 deletions rllib/tuned_examples/ddpg/halfcheetah-pybullet-ddpg.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Note: HalfCheetahBulletEnv-v0 is not the same as MuJoCo's HalfCheetah-v0.
ddpg-halfcheetahbulletenv-v0:
env: HalfCheetahBulletEnv-v0
run: DDPG
stop:
episode_reward_mean: -300.0
timesteps_total: 200000
config:
actor_hiddens: [256, 256]
critic_hiddens: [256, 256]
n_step: 3
model: {}
gamma: 0.99
env_config: {}
exploration_config:
initial_scale: 1.0
final_scale: 0.02
scale_timesteps: 10000
ou_base_scale: 0.1
ou_theta: 0.15
ou_sigma: 0.2
timesteps_per_iteration: 1000
target_network_update_freq: 0
tau: 0.001
buffer_size: 15000
prioritized_replay: true
prioritized_replay_alpha: 0.6
prioritized_replay_beta: 0.4
prioritized_replay_eps: 0.000001
clip_rewards: false
actor_lr: 0.001
critic_lr: 0.001
use_huber: true
huber_threshold: 1.0
l2_reg: 0.000001
learning_starts: 500
rollout_fragment_length: 1
train_batch_size: 48
num_workers: 0
num_gpus: 1
num_gpus_per_worker: 0
worker_side_prioritization: false
44 changes: 44 additions & 0 deletions rllib/tuned_examples/ddpg/hopper-pybullet-ddpg.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Note: HopperBulletEnv-v0 is not the same as MuJoCo's Hopper-v0.
ddpg-hopperbulletenv-v0:
env: HopperBulletEnv-v0
run: DDPG
# Minimum reward and total ts (in given time_total_s) to pass this test.
pass_criteria:
episode_reward_mean: 120.0
timesteps_total: 50000
stop:
time_total_s: 2000
config:
actor_hiddens: [256, 256]
critic_hiddens: [256, 256]
n_step: 3
model: {}
gamma: 0.99
env_config: {}
exploration_config:
initial_scale: 1.0
final_scale: 0.02
scale_timesteps: 10000
ou_base_scale: 0.1
ou_theta: 0.15
ou_sigma: 0.2
timesteps_per_iteration: 1000
target_network_update_freq: 0
tau: 0.001
buffer_size: 10000
prioritized_replay: True
prioritized_replay_alpha: 0.6
prioritized_replay_beta: 0.4
prioritized_replay_eps: 0.000001
clip_rewards: False
actor_lr: 0.001
critic_lr: 0.001
use_huber: False
huber_threshold: 1.0
l2_reg: 0.000001
learning_starts: 500
rollout_fragment_length: 1
train_batch_size: 48
num_workers: 0
num_gpus_per_worker: 0
worker_side_prioritization: False

0 comments on commit 8acb469

Please sign in to comment.