diff --git a/doc/source/rllib-config.svg b/doc/source/rllib-config.svg new file mode 100644 index 000000000000..6bc412a60c84 --- /dev/null +++ b/doc/source/rllib-config.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/doc/source/rllib-training.rst b/doc/source/rllib-training.rst index 937c368524a0..b2bfc2699b29 100644 --- a/doc/source/rllib-training.rst +++ b/doc/source/rllib-training.rst @@ -50,10 +50,12 @@ In an example below, we train A2C by specifying 8 workers through the config fla python ray/python/ray/rllib/train.py --env=PongDeterministic-v4 \ --run=A2C --config '{"num_workers": 8, "monitor": true}' +.. image:: rllib-config.svg + Specifying Resources ~~~~~~~~~~~~~~~~~~~~ You can control the degree of parallelism used by setting the ``num_workers`` hyperparameter for most agents. Many agents also provide a ``num_gpus`` or ``gpu`` option. In addition, you can allocate a fraction of a GPU by setting ``gpu_fraction: f``. For example, with DQN you can pack five agents onto one GPU by setting ``gpu_fraction: 0.2``. Note that fractional GPU support requires enabling the experimental Xray backend by setting the environment variable ``RAY_USE_XRAY=1``. Evaluating Trained Agents ~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/python/ray/rllib/agents/agent.py b/python/ray/rllib/agents/agent.py index dacda4818e0e..9deeaace67f0 100644 --- a/python/ray/rllib/agents/agent.py +++ b/python/ray/rllib/agents/agent.py @@ -26,6 +26,9 @@ "num_workers": 2, # Default sample batch size "sample_batch_size": 200, + # Training batch size, if applicable. Should be >= sample_batch_size. + # Sample batches will be concatenated together to this size for training. + "train_batch_size": 200, # Whether to rollout "complete_episodes" or "truncate_episodes" "batch_mode": "truncate_episodes", # Whether to use a background thread for sampling (slightly off-policy) diff --git a/python/ray/rllib/agents/es/es.py b/python/ray/rllib/agents/es/es.py index 8eac02f93839..de59137b6900 100644 --- a/python/ray/rllib/agents/es/es.py +++ b/python/ray/rllib/agents/es/es.py @@ -30,7 +30,7 @@ "l2_coeff": 0.005, "noise_stdev": 0.02, "episodes_per_batch": 1000, - "timesteps_per_batch": 10000, + "train_batch_size": 10000, "eval_prob": 0.003, "return_proc_mode": "centered_rank", "num_workers": 10, @@ -213,8 +213,7 @@ def _train(self): # Use the actors to do rollouts, note that we pass in the ID of the # policy weights. 
results, num_episodes, num_timesteps = self._collect_results( - theta_id, config["episodes_per_batch"], - config["timesteps_per_batch"]) + theta_id, config["episodes_per_batch"], config["train_batch_size"]) all_noise_indices = [] all_training_returns = [] diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index 3f5ce16ef20c..369d9db45c0e 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -20,18 +20,20 @@ "lambda": 1.0, # Initial coefficient for KL divergence "kl_coeff": 0.2, + # Size of batches collected from each worker + "sample_batch_size": 200, # Number of timesteps collected for each SGD round - "timesteps_per_batch": 4000, + "train_batch_size": 4000, + # Total SGD batch size across all devices (multi-gpu only) + "sgd_minibatch_size": 128, # Number of SGD iterations in each outer loop "num_sgd_iter": 30, # Stepsize of SGD - "sgd_stepsize": 5e-5, + "lr": 5e-5, # Learning rate schedule "lr_schedule": None, # Share layers for value function "vf_share_layers": False, - # Total SGD batch size across all devices for SGD (multi-gpu only) - "sgd_batchsize": 128, # Coefficient of the value function loss "vf_loss_coeff": 1.0, # Coefficient of the entropy regularizer @@ -79,6 +81,17 @@ def default_resource_request(cls, config): extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"]) def _init(self): + waste_ratio = ( + self.config["sample_batch_size"] * self.config["num_workers"] / + self.config["train_batch_size"]) + if waste_ratio > 1: + msg = ("sample_batch_size * num_workers >> train_batch_size. " + "This means that many steps will be discarded. Consider " + "reducing sample_batch_size, or increasing train_batch_size.") + if waste_ratio > 1.5: + raise ValueError(msg) + else: + print("Warning: " + msg) self.local_evaluator = self.make_local_evaluator( self.env_creator, self._policy_graph) self.remote_evaluators = self.make_remote_evaluators( @@ -90,15 +103,15 @@ def _init(self): self.optimizer = SyncSamplesOptimizer( self.local_evaluator, self.remote_evaluators, { "num_sgd_iter": self.config["num_sgd_iter"], - "timesteps_per_batch": self.config["timesteps_per_batch"] + "train_batch_size": self.config["train_batch_size"] }) else: self.optimizer = LocalMultiGPUOptimizer( self.local_evaluator, self.remote_evaluators, { - "sgd_batch_size": self.config["sgd_batchsize"], + "sgd_batch_size": self.config["sgd_minibatch_size"], "num_sgd_iter": self.config["num_sgd_iter"], "num_gpus": self.config["num_gpus"], - "timesteps_per_batch": self.config["timesteps_per_batch"], + "train_batch_size": self.config["train_batch_size"], "standardize_fields": ["advantages"], }) diff --git a/python/ray/rllib/agents/ppo/ppo_policy_graph.py b/python/ray/rllib/agents/ppo/ppo_policy_graph.py index 0a0cb628ac4b..e23f0a5b31ed 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy_graph.py +++ b/python/ray/rllib/agents/ppo/ppo_policy_graph.py @@ -191,7 +191,7 @@ def __init__(self, vf_loss_coeff=self.config["vf_loss_coeff"], use_gae=self.config["use_gae"]) - LearningRateSchedule.__init__(self, self.config["sgd_stepsize"], + LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, diff --git a/python/ray/rllib/agents/ppo/rollout.py b/python/ray/rllib/agents/ppo/rollout.py index 54a235680e23..4084e9ba063a 100644 --- a/python/ray/rllib/agents/ppo/rollout.py +++ b/python/ray/rllib/agents/ppo/rollout.py @@ -6,7 +6,7 @@ from ray.rllib.evaluation.sample_batch import SampleBatch -def 
collect_samples(agents, timesteps_per_batch): +def collect_samples(agents, train_batch_size): num_timesteps_so_far = 0 trajectories = [] # This variable maps the object IDs of trajectories that are currently @@ -19,7 +19,7 @@ def collect_samples(agents, timesteps_per_batch): fut_sample = agent.sample.remote() agent_dict[fut_sample] = agent - while num_timesteps_so_far < timesteps_per_batch: + while num_timesteps_so_far < train_batch_size: # TODO(pcm): Make wait support arbitrary iterators and remove the # conversion to list here. [fut_sample], _ = ray.wait(list(agent_dict)) diff --git a/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar.py b/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar.py index 4e01bbc778a0..9559648290da 100644 --- a/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar.py +++ b/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar.py @@ -41,7 +41,7 @@ def create_env(env_config): num_cpus = 4 ray.init(num_cpus=num_cpus, redirect_output=True) config["num_workers"] = num_cpus - config["timesteps_per_batch"] = 10 + config["train_batch_size"] = 1000 config["num_sgd_iter"] = 10 config["gamma"] = 0.999 config["horizon"] = horizon diff --git a/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum.py b/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum.py index 098ad69540d1..b183ff2c0b15 100644 --- a/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum.py +++ b/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum.py @@ -41,8 +41,8 @@ def create_env(env_config): num_cpus = 4 ray.init(num_cpus=num_cpus, redirect_output=True) config["num_workers"] = num_cpus - config["timesteps_per_batch"] = 10 - config["sgd_batchsize"] = 10 + config["train_batch_size"] = 1000 + config["sgd_minibatch_size"] = 10 config["num_sgd_iter"] = 10 config["gamma"] = 0.999 config["horizon"] = horizon diff --git a/python/ray/rllib/optimizers/multi_gpu_optimizer.py b/python/ray/rllib/optimizers/multi_gpu_optimizer.py index 56ed1a1c4924..234ffea8078a 100644 --- a/python/ray/rllib/optimizers/multi_gpu_optimizer.py +++ b/python/ray/rllib/optimizers/multi_gpu_optimizer.py @@ -33,12 +33,12 @@ class LocalMultiGPUOptimizer(PolicyOptimizer): def _init(self, sgd_batch_size=128, num_sgd_iter=10, - timesteps_per_batch=1024, + train_batch_size=1024, num_gpus=0, standardize_fields=[]): self.batch_size = sgd_batch_size self.num_sgd_iter = num_sgd_iter - self.timesteps_per_batch = timesteps_per_batch + self.train_batch_size = train_batch_size if not num_gpus: self.devices = ["/cpu:0"] else: @@ -99,7 +99,7 @@ def step(self): # TODO(rliaw): remove when refactoring from ray.rllib.agents.ppo.rollout import collect_samples samples = collect_samples(self.remote_evaluators, - self.timesteps_per_batch) + self.train_batch_size) else: samples = self.local_evaluator.sample() self._check_not_multiagent(samples) diff --git a/python/ray/rllib/optimizers/sync_samples_optimizer.py b/python/ray/rllib/optimizers/sync_samples_optimizer.py index 5ed6a1ca21c0..20922ff54036 100644 --- a/python/ray/rllib/optimizers/sync_samples_optimizer.py +++ b/python/ray/rllib/optimizers/sync_samples_optimizer.py @@ -17,13 +17,13 @@ class SyncSamplesOptimizer(PolicyOptimizer): model weights are then broadcast to all remote evaluators. 
""" - def _init(self, num_sgd_iter=1, timesteps_per_batch=1): + def _init(self, num_sgd_iter=1, train_batch_size=1): self.update_weights_timer = TimerStat() self.sample_timer = TimerStat() self.grad_timer = TimerStat() self.throughput = RunningStat() self.num_sgd_iter = num_sgd_iter - self.timesteps_per_batch = timesteps_per_batch + self.train_batch_size = train_batch_size self.learner_stats = {} def step(self): @@ -35,7 +35,7 @@ def step(self): with self.sample_timer: samples = [] - while sum(s.count for s in samples) < self.timesteps_per_batch: + while sum(s.count for s in samples) < self.train_batch_size: if self.remote_evaluators: samples.extend( ray.get([ diff --git a/python/ray/rllib/test/test_checkpoint_restore.py b/python/ray/rllib/test/test_checkpoint_restore.py index 6d2f277f978e..cb371c90c29f 100644 --- a/python/ray/rllib/test/test_checkpoint_restore.py +++ b/python/ray/rllib/test/test_checkpoint_restore.py @@ -22,7 +22,7 @@ def get_mean_action(alg, obs): CONFIGS = { "ES": { "episodes_per_batch": 10, - "timesteps_per_batch": 100, + "train_batch_size": 100, "num_workers": 2 }, "DQN": {}, @@ -40,7 +40,7 @@ def get_mean_action(alg, obs): }, "PPO": { "num_sgd_iter": 5, - "timesteps_per_batch": 1000, + "train_batch_size": 1000, "num_workers": 2 }, "A3C": { diff --git a/python/ray/rllib/test/test_supported_spaces.py b/python/ray/rllib/test/test_supported_spaces.py index cded0c165804..dd81c1853e5b 100644 --- a/python/ray/rllib/test/test_supported_spaces.py +++ b/python/ray/rllib/test/test_supported_spaces.py @@ -107,15 +107,16 @@ def testAll(self): "PPO", { "num_workers": 1, "num_sgd_iter": 1, - "timesteps_per_batch": 1, - "sgd_batchsize": 1 + "train_batch_size": 10, + "sample_batch_size": 10, + "sgd_minibatch_size": 1 }, stats) check_support( "ES", { "num_workers": 1, "noise_size": 10000000, "episodes_per_batch": 1, - "timesteps_per_batch": 1 + "train_batch_size": 1 }, stats) check_support( "ARS", { diff --git a/python/ray/rllib/tuned_examples/atari-ppo.yaml b/python/ray/rllib/tuned_examples/atari-ppo.yaml index 285dc8a95faa..159160e0b67e 100644 --- a/python/ray/rllib/tuned_examples/atari-ppo.yaml +++ b/python/ray/rllib/tuned_examples/atari-ppo.yaml @@ -13,9 +13,9 @@ atari-ppo: kl_coeff: 0.5 clip_param: 0.1 entropy_coeff: 0.01 - timesteps_per_batch: 5000 + train_batch_size: 5000 sample_batch_size: 500 - sgd_batchsize: 500 + sgd_minibatch_size: 500 num_sgd_iter: 10 num_workers: 10 num_envs_per_worker: 5 diff --git a/python/ray/rllib/tuned_examples/cartpole-grid-search-example.yaml b/python/ray/rllib/tuned_examples/cartpole-grid-search-example.yaml index be14fcab9a83..8f01a6284db2 100644 --- a/python/ray/rllib/tuned_examples/cartpole-grid-search-example.yaml +++ b/python/ray/rllib/tuned_examples/cartpole-grid-search-example.yaml @@ -8,5 +8,5 @@ cartpole-ppo: num_workers: 2 num_sgd_iter: grid_search: [1, 4] - sgd_batchsize: + sgd_minibatch_size: grid_search: [128, 256, 512] diff --git a/python/ray/rllib/tuned_examples/hopper-ppo.yaml b/python/ray/rllib/tuned_examples/hopper-ppo.yaml index 27441d394c1a..c1c75b166e7c 100644 --- a/python/ray/rllib/tuned_examples/hopper-ppo.yaml +++ b/python/ray/rllib/tuned_examples/hopper-ppo.yaml @@ -5,8 +5,8 @@ hopper-ppo: gamma: 0.995 kl_coeff: 1.0 num_sgd_iter: 20 - sgd_stepsize: .0001 - sgd_batchsize: 32768 - timesteps_per_batch: 160000 + lr: .0001 + sgd_minibatch_size: 32768 + train_batch_size: 160000 num_workers: 64 num_gpus: 4 diff --git a/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml b/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml 
index 5dfbf4315c85..e176dcae26c6 100644 --- a/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml +++ b/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml @@ -9,10 +9,10 @@ humanoid-ppo-gae: clip_param: 0.2 kl_coeff: 1.0 num_sgd_iter: 20 - sgd_stepsize: .0001 - sgd_batchsize: 32768 + lr: .0001 + sgd_minibatch_size: 32768 horizon: 5000 - timesteps_per_batch: 320000 + train_batch_size: 320000 model: free_log_std: true num_workers: 64 diff --git a/python/ray/rllib/tuned_examples/humanoid-ppo.yaml b/python/ray/rllib/tuned_examples/humanoid-ppo.yaml index c896f7d3b1cd..0608f8b60353 100644 --- a/python/ray/rllib/tuned_examples/humanoid-ppo.yaml +++ b/python/ray/rllib/tuned_examples/humanoid-ppo.yaml @@ -7,9 +7,9 @@ humanoid-ppo: gamma: 0.995 kl_coeff: 1.0 num_sgd_iter: 20 - sgd_stepsize: .0001 - sgd_batchsize: 32768 - timesteps_per_batch: 320000 + lr: .0001 + sgd_minibatch_size: 32768 + train_batch_size: 320000 model: free_log_std: true use_gae: false diff --git a/python/ray/rllib/tuned_examples/hyperband-cartpole.yaml b/python/ray/rllib/tuned_examples/hyperband-cartpole.yaml index 0d6ffb25cae6..64d5571dbf29 100644 --- a/python/ray/rllib/tuned_examples/hyperband-cartpole.yaml +++ b/python/ray/rllib/tuned_examples/hyperband-cartpole.yaml @@ -9,5 +9,5 @@ cartpole-ppo: num_workers: 1 num_sgd_iter: grid_search: [1, 4] - sgd_batchsize: + sgd_minibatch_size: grid_search: [128, 256, 512] diff --git a/python/ray/rllib/tuned_examples/pendulum-ppo.yaml b/python/ray/rllib/tuned_examples/pendulum-ppo.yaml index 841bbfd6f88f..dcb2775fa5f9 100644 --- a/python/ray/rllib/tuned_examples/pendulum-ppo.yaml +++ b/python/ray/rllib/tuned_examples/pendulum-ppo.yaml @@ -3,12 +3,12 @@ pendulum-ppo: env: Pendulum-v0 run: PPO config: - timesteps_per_batch: 2048 + train_batch_size: 2048 num_workers: 4 lambda: 0.1 gamma: 0.95 - sgd_stepsize: 0.0003 - sgd_batchsize: 64 + lr: 0.0003 + sgd_minibatch_size: 64 num_sgd_iter: 10 model: fcnet_hiddens: [64, 64] diff --git a/python/ray/rllib/tuned_examples/regression_tests/pendulum-ppo.yaml b/python/ray/rllib/tuned_examples/regression_tests/pendulum-ppo.yaml index 2e3814f7f4c6..36830dcd6ef7 100644 --- a/python/ray/rllib/tuned_examples/regression_tests/pendulum-ppo.yaml +++ b/python/ray/rllib/tuned_examples/regression_tests/pendulum-ppo.yaml @@ -6,12 +6,12 @@ pendulum-ppo: # expect -140 within 300-500k steps timesteps_total: 600000 config: - timesteps_per_batch: 2048 + train_batch_size: 2048 num_workers: 4 lambda: 0.1 gamma: 0.95 - sgd_stepsize: 0.0003 - sgd_batchsize: 64 + lr: 0.0003 + sgd_minibatch_size: 64 num_sgd_iter: 10 model: fcnet_hiddens: [64, 64] diff --git a/python/ray/rllib/tuned_examples/walker2d-ppo.yaml b/python/ray/rllib/tuned_examples/walker2d-ppo.yaml index 4591b4b58af9..deb5a0038dcb 100644 --- a/python/ray/rllib/tuned_examples/walker2d-ppo.yaml +++ b/python/ray/rllib/tuned_examples/walker2d-ppo.yaml @@ -4,8 +4,8 @@ walker2d-v1-ppo: config: kl_coeff: 1.0 num_sgd_iter: 20 - sgd_stepsize: .0001 - sgd_batchsize: 32768 - timesteps_per_batch: 320000 + lr: .0001 + sgd_minibatch_size: 32768 + train_batch_size: 320000 num_workers: 64 num_gpus: 4 diff --git a/python/ray/tune/examples/pbt_ppo_example.py b/python/ray/tune/examples/pbt_ppo_example.py index fb659b007a44..efd7ee4a8958 100755 --- a/python/ray/tune/examples/pbt_ppo_example.py +++ b/python/ray/tune/examples/pbt_ppo_example.py @@ -21,8 +21,8 @@ # Postprocess the perturbed config to ensure it's still valid def explore(config): # ensure we collect enough timesteps to do sgd - if config["timesteps_per_batch"] < 
config["sgd_batchsize"] * 2: - config["timesteps_per_batch"] = config["sgd_batchsize"] * 2 + if config["train_batch_size"] < config["sgd_minibatch_size"] * 2: + config["train_batch_size"] = config["sgd_minibatch_size"] * 2 # ensure we run at least one sgd iter if config["num_sgd_iter"] < 1: config["num_sgd_iter"] = 1 @@ -37,10 +37,10 @@ def explore(config): hyperparam_mutations={ "lambda": lambda: random.uniform(0.9, 1.0), "clip_param": lambda: random.uniform(0.01, 0.5), - "sgd_stepsize": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5], + "lr": [1e-3, 5e-4, 1e-4, 5e-5, 1e-5], "num_sgd_iter": lambda: random.randint(1, 30), - "sgd_batchsize": lambda: random.randint(128, 16384), - "timesteps_per_batch": lambda: random.randint(2000, 160000), + "sgd_minibatch_size": lambda: random.randint(128, 16384), + "train_batch_size": lambda: random.randint(2000, 160000), }, custom_explore_fn=explore) @@ -61,13 +61,13 @@ def explore(config): # These params are tuned from a fixed starting value. "lambda": 0.95, "clip_param": 0.2, - "sgd_stepsize": 1e-4, + "lr": 1e-4, # These params start off randomly drawn from a set. "num_sgd_iter": lambda spec: random.choice([10, 20, 30]), - "sgd_batchsize": + "sgd_minibatch_size": lambda spec: random.choice([128, 512, 2048]), - "timesteps_per_batch": + "train_batch_size": lambda spec: random.choice([10000, 20000, 40000]) }, }, diff --git a/test/jenkins_tests/run_multi_node_tests.sh b/test/jenkins_tests/run_multi_node_tests.sh index 15dbb540e347..5fd6ead78464 100755 --- a/test/jenkins_tests/run_multi_node_tests.sh +++ b/test/jenkins_tests/run_multi_node_tests.sh @@ -30,7 +30,7 @@ docker run -e "RAY_USE_XRAY=1" --rm --shm-size=10G --memory=10G $DOCKER_SHA \ --env CartPole-v1 \ --run PPO \ --stop '{"training_iteration": 2}' \ - --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "sgd_stepsize": 1e-4, "sgd_batchsize": 64, "timesteps_per_batch": 2000, "num_workers": 1, "model": {"free_log_std": true}}' + --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "lr": 1e-4, "sgd_minibatch_size": 64, "train_batch_size": 2000, "num_workers": 1, "model": {"free_log_std": true}}' docker run -e "RAY_USE_XRAY=1" --rm --shm-size=10G --memory=10G $DOCKER_SHA \ python /ray/python/ray/rllib/train.py \ @@ -51,21 +51,21 @@ docker run -e "RAY_USE_XRAY=1" --rm --shm-size=10G --memory=10G $DOCKER_SHA \ --env CartPole-v1 \ --run PPO \ --stop '{"training_iteration": 2}' \ - --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "sgd_stepsize": 1e-4, "sgd_batchsize": 64, "timesteps_per_batch": 2000, "num_workers": 1, "use_gae": false}' + --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "lr": 1e-4, "sgd_minibatch_size": 64, "train_batch_size": 2000, "num_workers": 1, "use_gae": false}' docker run -e "RAY_USE_XRAY=1" --rm --shm-size=10G --memory=10G $DOCKER_SHA \ python /ray/python/ray/rllib/train.py \ --env Pendulum-v0 \ --run ES \ --stop '{"training_iteration": 2}' \ - --config '{"stepsize": 0.01, "episodes_per_batch": 20, "timesteps_per_batch": 100, "num_workers": 2}' + --config '{"stepsize": 0.01, "episodes_per_batch": 20, "train_batch_size": 100, "num_workers": 2}' docker run -e "RAY_USE_XRAY=1" --rm --shm-size=10G --memory=10G $DOCKER_SHA \ python /ray/python/ray/rllib/train.py \ --env Pong-v0 \ --run ES \ --stop '{"training_iteration": 2}' \ - --config '{"stepsize": 0.01, "episodes_per_batch": 20, "timesteps_per_batch": 100, "num_workers": 2}' + --config '{"stepsize": 0.01, "episodes_per_batch": 20, "train_batch_size": 100, "num_workers": 2}' docker run -e "RAY_USE_XRAY=1" --rm --shm-size=10G --memory=10G $DOCKER_SHA \ python 
/ray/python/ray/rllib/train.py \ @@ -105,7 +105,7 @@ docker run -e "RAY_USE_XRAY=1" --rm --shm-size=10G --memory=10G $DOCKER_SHA \ --env FrozenLake-v0 \ --run PPO \ --stop '{"training_iteration": 2}' \ - --config '{"num_sgd_iter": 10, "sgd_batchsize": 64, "timesteps_per_batch": 1000, "num_workers": 1}' + --config '{"num_sgd_iter": 10, "sgd_minibatch_size": 64, "train_batch_size": 1000, "num_workers": 1}' docker run -e "RAY_USE_XRAY=1" --rm --shm-size=10G --memory=10G $DOCKER_SHA \ python /ray/python/ray/rllib/train.py \ @@ -119,7 +119,7 @@ docker run -e "RAY_USE_XRAY=1" --rm --shm-size=10G --memory=10G $DOCKER_SHA \ --env MontezumaRevenge-v0 \ --run PPO \ --stop '{"training_iteration": 2}' \ - --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "sgd_stepsize": 1e-4, "sgd_batchsize": 64, "timesteps_per_batch": 2000, "num_workers": 1, "model": {"dim": 40, "conv_filters": [[16, [8, 8], 4], [32, [4, 4], 2], [512, [5, 5], 1]]}}' + --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "lr": 1e-4, "sgd_minibatch_size": 64, "train_batch_size": 2000, "num_workers": 1, "model": {"dim": 40, "conv_filters": [[16, [8, 8], 4], [32, [4, 4], 2], [512, [5, 5], 1]]}}' docker run -e "RAY_USE_XRAY=1" --rm --shm-size=10G --memory=10G $DOCKER_SHA \ python /ray/python/ray/rllib/train.py \
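
For reference, a minimal sketch of how the renamed PPO keys (train_batch_size, sgd_minibatch_size, lr, sample_batch_size) would be used together after this change. It follows the PPOAgent / DEFAULT_CONFIG pattern from the RLlib docs of this era and assumes ray and gym are installed; treat it as an illustration, not part of the patch.

# Sketch only: exercises the renamed PPO config keys introduced by this diff.
import ray
from ray.rllib.agents import ppo

ray.init()

config = ppo.DEFAULT_CONFIG.copy()
config["num_workers"] = 2
config["sample_batch_size"] = 200   # rollout fragment collected per worker
config["train_batch_size"] = 4000   # formerly timesteps_per_batch
config["sgd_minibatch_size"] = 128  # formerly sgd_batchsize
config["lr"] = 5e-5                 # formerly sgd_stepsize

agent = ppo.PPOAgent(config=config, env="CartPole-v0")
for _ in range(3):
    result = agent.train()
    print(result["episode_reward_mean"])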