Skip to content

Commit

Permalink
Merge pull request #44 from flowersteam/gloo-timeout-in-config
Browse files (browse the repository at this point in the history)
Putting the GLOO timeout in the config instead of an environment variable
  • Loading branch information
ClementRomac authored Nov 5, 2024
2 parents 4318812 + 01f5302 commit d90e284
Show file tree
Hide file tree
Showing 10 changed files with 16 additions and 1 deletion.
1 change: 1 addition & 0 deletions examples/PPO_LoRA_finetuning/local_gpu_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
lamorel_args:
log_level: info
gloo_timeout: 1800
allow_subgraph_use_whith_gradient: false
distributed_setup_args:
n_rl_processes: 1
Expand Down
1 change: 1 addition & 0 deletions examples/PPO_finetuning/local_gpu_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
lamorel_args:
log_level: info
gloo_timeout: 1800
allow_subgraph_use_whith_gradient: false
distributed_setup_args:
n_rl_processes: 1
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
lamorel_args:
log_level: info
gloo_timeout: 1800
allow_subgraph_use_whith_gradient: false
distributed_setup_args:
n_rl_processes: 1
Expand Down
1 change: 1 addition & 0 deletions examples/SayCan/local_gpu_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
lamorel_args:
log_level: info
gloo_timeout: 1800
allow_subgraph_use_whith_gradient: false
distributed_setup_args:
n_rl_processes: 1
Expand Down
1 change: 1 addition & 0 deletions examples/configs/local_cpu_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
lamorel_args:
log_level: info
gloo_timeout: 1800
allow_subgraph_use_whith_gradient: false
distributed_setup_args:
n_rl_processes: 1
Expand Down
1 change: 1 addition & 0 deletions examples/configs/local_gpu_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
lamorel_args:
log_level: info
gloo_timeout: 1800
allow_subgraph_use_whith_gradient: false
distributed_setup_args:
n_rl_processes: 1
Expand Down
1 change: 1 addition & 0 deletions examples/configs/multi-node_slurm_cluster_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
lamorel_args:
log_level: info
gloo_timeout: 1800
allow_subgraph_use_whith_gradient: false
distributed_setup_args:
n_rl_processes: 1
Expand Down
8 changes: 7 additions & 1 deletion lamorel/src/lamorel/caller.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@

import logging
lamorel_logger = logging.getLogger('lamorel_logger')
gloo_timeout = datetime.timedelta(seconds=int(os.environ.get('GLOO_TIMEOUT', 1800)))

class Caller:
'''
Expand All @@ -30,6 +29,13 @@ def __init__(self, config, custom_updater=None, custom_module_functions={}, cust
lamorel_logger.setLevel(numeric_log_level)

# Initialize distributed groups
if "gloo_timeout" in config:
lamorel_logger.info(f"Setting the GLOO timeout to {int(config.gloo_timeout)} seconds.")
gloo_timeout = datetime.timedelta(seconds=int(config.gloo_timeout))
else:
lamorel_logger.info(f"No configuration found for the GLOO timeout, setting it to default: 1800 seconds.")
gloo_timeout = datetime.timedelta(seconds=1800)

# RL processes are considered as the first n processes
rl_processes = list(range(config.distributed_setup_args.n_rl_processes))
llm_processes = list(range(
Expand Down
1 change: 1 addition & 0 deletions tests/causal_lms/configs/local_gpu_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
lamorel_args:
log_level: info
gloo_timeout: 1800
allow_subgraph_use_whith_gradient: false
distributed_setup_args:
n_rl_processes: 1
Expand Down
1 change: 1 addition & 0 deletions tests/seq2seq_lms/configs/local_gpu_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
lamorel_args:
log_level: info
gloo_timeout: 1800
allow_subgraph_use_whith_gradient: false
distributed_setup_args:
n_rl_processes: 1
Expand Down

0 comments on commit d90e284

Please sign in to comment.