diff --git a/python/ray/tests/test_memory_pressure.py b/python/ray/tests/test_memory_pressure.py
index 6adc947691a99..684785cac0f80 100644
--- a/python/ray/tests/test_memory_pressure.py
+++ b/python/ray/tests/test_memory_pressure.py
@@ -158,7 +158,7 @@ def test_restartable_actor_killed_by_memory_monitor_with_actor_error(
         timeout=10,
         retry_interval_ms=100,
         tag="MemoryManager.ActorEviction.Total",
-        value=1.0,  # TODO(clarng): This should be 2. Look at why restart doesn't work
+        value=2.0,
     )
 
 
diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h
index e995ea601e08f..7dcf689321a74 100644
--- a/src/ray/common/ray_config_def.h
+++ b/src/ray/common/ray_config_def.h
@@ -96,7 +96,7 @@ RAY_CONFIG(uint64_t, task_failure_entry_ttl_ms, 15 * 60 * 1000)
 /// the retry counter of the task or actor is only used when it fails in other ways
 /// that is not related to running out of memory. Note infinite retry (-1) is not
 /// supported.
-RAY_CONFIG(uint64_t, task_oom_retries, 3)
+RAY_CONFIG(uint64_t, task_oom_retries, 15)
 
 /// If the raylet fails to get agent info, we will retry after this interval.
 RAY_CONFIG(uint64_t, raylet_get_agent_info_interval_ms, 1)
diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc
index 9a47e5e775c0e..55a61ce50825f 100644
--- a/src/ray/raylet/node_manager.cc
+++ b/src/ray/raylet/node_manager.cc
@@ -2985,7 +2985,7 @@ MemoryUsageRefreshCallback NodeManager::CreateMemoryUsageRefreshCallback() {
           /// since we print the process memory in the message. Destroy should be called
           /// as soon as possible to free up memory.
           DestroyWorker(high_memory_eviction_target_,
-                        rpc::WorkerExitType::USER_ERROR,
+                        rpc::WorkerExitType::SYSTEM_ERROR,
                         worker_exit_message,
                         true /* force */);