From d3551dd8df7ef01dc420df98c03a6228024ea5e8 Mon Sep 17 00:00:00 2001 From: Jimpachnet Date: Thu, 31 Jan 2019 11:21:16 +0100 Subject: [PATCH] [tune] Added possibility to execute infinite recovery retries for a trial (#3901) Allows a trial to retry recovery indefinitely by setting _max_failures_ to a negative number. --- python/ray/tune/experiment.py | 3 ++- python/ray/tune/trial.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/ray/tune/experiment.py b/python/ray/tune/experiment.py index 4471edd2b311..31a1ce7a86c4 100644 --- a/python/ray/tune/experiment.py +++ b/python/ray/tune/experiment.py @@ -74,7 +74,8 @@ class Experiment(object): experiment regardless of the checkpoint_freq. Default is False. max_failures (int): Try to recover a trial from its last checkpoint at least this many times. Only applies if - checkpointing is enabled. Defaults to 3. + checkpointing is enabled. Setting to -1 will lead to infinite + recovery retries. Defaults to 3. restore (str): Path to checkpoint. Only makes sense to set if running 1 trial. Defaults to None. repeat: Deprecated and will be removed in future versions of diff --git a/python/ray/tune/trial.py b/python/ray/tune/trial.py index 0dd571942310..12693a6e3db3 100644 --- a/python/ray/tune/trial.py +++ b/python/ray/tune/trial.py @@ -356,7 +356,8 @@ def should_recover(self): be a checkpoint. """ return (self.checkpoint_freq > 0 - and self.num_failures < self.max_failures) + and (self.num_failures < self.max_failures + or self.max_failures < 0)) def update_last_result(self, result, terminate=False): if terminate: