From 97ccd7595238b205669e2c73489307bd00938c2f Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Mon, 26 Aug 2019 01:37:28 -0700 Subject: [PATCH] [rllib] Enable object store memory limit by default (#5534) --- .travis.yml | 6 ------ ci/jenkins_tests/run_rllib_tests.sh | 12 ++++++++++++ python/ray/resource_spec.py | 2 +- rllib/agents/trainer.py | 4 ++-- rllib/tests/test_catalog.py | 10 +++++----- rllib/tests/test_evaluators.py | 2 +- rllib/tests/test_filters.py | 2 +- rllib/tests/test_optimizers.py | 6 +++--- 8 files changed, 25 insertions(+), 19 deletions(-) diff --git a/.travis.yml b/.travis.yml index e3286bd948950..5a6d4c0dd69f2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -172,12 +172,6 @@ script: # `cluster_tests.py` runs on Jenkins, not Travis. - if [ $RAY_CI_TUNE_AFFECTED == "1" ]; then python -m pytest --durations=10 --timeout=300 --ignore=python/ray/tune/tests/test_cluster.py --ignore=python/ray/tune/tests/test_tune_restore.py --ignore=python/ray/tune/tests/test_actor_reuse.py python/ray/tune/tests; fi - # ray rllib tests - - if [ $RAY_CI_RLLIB_AFFECTED == "1" ]; then ./ci/suppress_output python python/ray/rllib/tests/test_catalog.py; fi - - if [ $RAY_CI_RLLIB_AFFECTED == "1" ]; then ./ci/suppress_output python python/ray/rllib/tests/test_filters.py; fi - - if [ $RAY_CI_RLLIB_AFFECTED == "1" ]; then ./ci/suppress_output python python/ray/rllib/tests/test_optimizers.py; fi - - if [ $RAY_CI_RLLIB_AFFECTED == "1" ]; then ./ci/suppress_output python python/ray/rllib/tests/test_evaluators.py; fi - # ray tests # Python3.5+ only. Otherwise we will get `SyntaxError` regardless of how we set the tester. - if [ $RAY_CI_PYTHON_AFFECTED == "1" ]; then python -c 'import sys;exit(sys.version_info>=(3,5))' || python -m pytest -v --durations=5 --timeout=300 python/ray/experimental/test/async_test.py; fi diff --git a/ci/jenkins_tests/run_rllib_tests.sh b/ci/jenkins_tests/run_rllib_tests.sh index 107040351f11f..5f533a057192d 100755 --- a/ci/jenkins_tests/run_rllib_tests.sh +++ b/ci/jenkins_tests/run_rllib_tests.sh @@ -1,3 +1,15 @@ +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + /ray/ci/suppress_output python /ray/rllib/tests/test_catalog.py + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + /ray/ci/suppress_output python /ray/rllib/tests/test_optimizers.py + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + /ray/ci/suppress_output python /ray/rllib/tests/test_filters.py + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + /ray/ci/suppress_output python /ray/rllib/tests/test_evaluators.py + docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output python /ray/rllib/tests/test_eager_support.py diff --git a/python/ray/resource_spec.py b/python/ray/resource_spec.py index 12ba0289177a6..48d5b0364d7be 100644 --- a/python/ray/resource_spec.py +++ b/python/ray/resource_spec.py @@ -183,7 +183,7 @@ def resolve(self, is_head): if memory is None: memory = (avail_memory - object_store_memory - (redis_max_memory if is_head else 0)) - if memory < 500e6 and memory < 0.05 * system_memory: + if memory < 100e6 and memory < 0.05 * system_memory: raise ValueError( "After taking into account object store and redis memory " "usage, the amount of memory on this node available for " diff --git a/rllib/agents/trainer.py b/rllib/agents/trainer.py index fd4320e9195ba..7c224797e6d88 100644 --- a/rllib/agents/trainer.py +++ b/rllib/agents/trainer.py @@ -149,14 +149,14 @@ # Object store memory to reserve for the trainer process. Being large # enough to fit a few copies of the model weights should be sufficient. # This is enabled by default since models are typically quite small. - "object_store_memory": 0, + "object_store_memory": 200 * 1024 * 1024, # Heap memory to reserve for each worker. Should generally be small unless # your environment is very heavyweight. "memory_per_worker": 0, # Object store memory to reserve for each worker. This only needs to be # large enough to fit a few sample batches at a time. This is enabled # by default since it almost never needs to be larger than ~200MB. - "object_store_memory_per_worker": 0, + "object_store_memory_per_worker": 200 * 1024 * 1024, # === Execution === # Number of environments to evaluate vectorwise per worker. diff --git a/rllib/tests/test_catalog.py b/rllib/tests/test_catalog.py index 99fcd4c8e1ee3..6fc7505abccfd 100644 --- a/rllib/tests/test_catalog.py +++ b/rllib/tests/test_catalog.py @@ -63,7 +63,7 @@ def testGymPreprocessors(self): self.assertEqual(type(p2), OneHotPreprocessor) def testTuplePreprocessor(self): - ray.init() + ray.init(object_store_memory=1000 * 1024 * 1024) class TupleEnv(object): def __init__(self): @@ -78,7 +78,7 @@ def __init__(self): [float(x) for x in [1, 0, 0, 0, 0, 1, 2, 3]]) def testCustomPreprocessor(self): - ray.init() + ray.init(object_store_memory=1000 * 1024 * 1024) ModelCatalog.register_custom_preprocessor("foo", CustomPreprocessor) ModelCatalog.register_custom_preprocessor("bar", CustomPreprocessor2) env = gym.make("CartPole-v0") @@ -90,7 +90,7 @@ def testCustomPreprocessor(self): self.assertEqual(type(p3), NoPreprocessor) def testDefaultModels(self): - ray.init() + ray.init(object_store_memory=1000 * 1024 * 1024) with tf.variable_scope("test1"): p1 = ModelCatalog.get_model({ @@ -106,7 +106,7 @@ def testDefaultModels(self): self.assertEqual(type(p2), VisionNetwork) def testCustomModel(self): - ray.init() + ray.init(object_store_memory=1000 * 1024 * 1024) ModelCatalog.register_custom_model("foo", CustomModel) p1 = ModelCatalog.get_model({ "obs": tf.constant([1, 2, 3]) @@ -118,7 +118,7 @@ def testCustomActionDistribution(self): class Model(): pass - ray.init() + ray.init(object_store_memory=1000 * 1024 * 1024) # registration ModelCatalog.register_custom_action_dist("test", CustomActionDistribution) diff --git a/rllib/tests/test_evaluators.py b/rllib/tests/test_evaluators.py index 7f2ef740e4f55..4af4d02a46856 100644 --- a/rllib/tests/test_evaluators.py +++ b/rllib/tests/test_evaluators.py @@ -34,7 +34,7 @@ def env_creator(env_config): agent_classes = [DQNTrainer, A3CTrainer] for agent_cls in agent_classes: - ray.init() + ray.init(object_store_memory=1000 * 1024 * 1024) register_env("CartPoleWrapped-v0", env_creator) agent = agent_cls( env="CartPoleWrapped-v0", diff --git a/rllib/tests/test_filters.py b/rllib/tests/test_filters.py index 1446809eb9fcd..9c4110a908c74 100644 --- a/rllib/tests/test_filters.py +++ b/rllib/tests/test_filters.py @@ -75,7 +75,7 @@ def testBasic(self): class FilterManagerTest(unittest.TestCase): def setUp(self): - ray.init(num_cpus=1) + ray.init(num_cpus=1, object_store_memory=1000 * 1024 * 1024) def tearDown(self): ray.shutdown() diff --git a/rllib/tests/test_optimizers.py b/rllib/tests/test_optimizers.py index 526f421cff6f4..58e5fef3b4905 100644 --- a/rllib/tests/test_optimizers.py +++ b/rllib/tests/test_optimizers.py @@ -26,7 +26,7 @@ def tearDown(self): ray.shutdown() def testBasic(self): - ray.init(num_cpus=4) + ray.init(num_cpus=4, object_store_memory=1000 * 1024 * 1024) local = _MockWorker() remotes = ray.remote(_MockWorker) remote_workers = [remotes.remote() for i in range(5)] @@ -41,7 +41,7 @@ def tearDown(self): ray.shutdown() def testPPOSampleWaste(self): - ray.init(num_cpus=4) + ray.init(num_cpus=4, object_store_memory=1000 * 1024 * 1024) # Check we at least collect the initial wave of samples ppo = PPOTrainer( @@ -101,7 +101,7 @@ def tearDownClass(cls): @classmethod def setUpClass(cls): - ray.init(num_cpus=8) + ray.init(num_cpus=8, object_store_memory=1000 * 1024 * 1024) def testSimple(self): local, remotes = self._make_envs()