Changes from all commits
47 commits
8815089
Scheduler: update to new scheduler that uses Monte Carlo simulations,…
pradyumna-rfai Sep 10, 2025
445c08f
Scheduler: updated semantics to schedule runs by min chunks-visited,…
pradyumna-rfai Sep 11, 2025
572476d
Db: updated db to include req_workers, estimated_runtime in runs table
pradyumna-rfai Sep 11, 2025
68ebf26
AutoML: update linting
pradyumna-rfai Sep 11, 2025
073a4ac
AutoML: add num_gpus in model_config
pradyumna-rfai Sep 11, 2025
f7889a1
Db: add set_estimated_runtime func
pradyumna-rfai Sep 11, 2025
603076a
Scheduler: expose Monte Carlo simulations as a param in experiment ru…
pradyumna-rfai Sep 11, 2025
9f6989d
Db: add multi_worker_details as a field in Worker task table
pradyumna-rfai Sep 11, 2025
85539d1
Controller, Worker: update run_fit logic for multi-node training in C…
pradyumna-rfai Sep 11, 2025
1e1c948
Scheduler: modify scheduler to be fair round robin with Monte Carlo
pradyumna-rfai Sep 11, 2025
80feeed
fsdp initial changes
humaira-rf Sep 12, 2025
2bfb92c
full model fixes
humaira-rf Sep 15, 2025
05d198e
fixed gpu ids
humaira-rf Sep 16, 2025
2e704ce
notebook params
humaira-rf Sep 17, 2025
86fc486
notebooks for qlora
humaira-rf Sep 18, 2025
391be75
error handling
humaira-rf Sep 20, 2025
5ec02b3
Revert "error handling"
humaira-rf Sep 24, 2025
f14be2f
full model changes
humaira-rf Sep 26, 2025
86b76c0
Updated warm_started_from to warm_started (bool)
pradyumna-rfai Sep 27, 2025
74369ef
Controller: minor fixes from rebase
pradyumna-rfai Sep 27, 2025
ef97bf3
Scheduler: restored scheduler from before rebase
pradyumna-rfai Sep 27, 2025
389e2a1
Scheduler: removed start_chunk_id from scheduler
pradyumna-rfai Sep 27, 2025
ba3f756
Scheduler: updated minor comment
pradyumna-rfai Sep 27, 2025
8d806fa
Worker: fixed runtime code, minor updates
pradyumna-rfai Sep 27, 2025
78094e6
Misc: dist_utils formatting changes
pradyumna-rfai Sep 27, 2025
ba52417
Scheduler: minor changes to scheduler, added tests
pradyumna-rfai Sep 27, 2025
c63b307
fsdp changes: optimizer fixes, warm start bug fix
Oct 1, 2025
1d1d0fa
corrected eff batch size, added suppression of warnings
humaira-rf Oct 1, 2025
9b15503
notebooks updated
humaira-rf Oct 2, 2025
0281584
Organized tutorial notebooks into subdirs
pradyumna-rfai Oct 2, 2025
6bb1ef0
num_gpus correction, notebook update, vllm changes
humaira-rf Oct 3, 2025
809d5e2
Controller: Fixed clone modify race condition
pradyumna-rfai Oct 3, 2025
5f383d2
temp changes to multi-gpu
humaira-rf Jan 29, 2026
d50f614
Merge remote-tracking branch 'origin/feature/multi-gpu-scheduler' int…
humaira-rf Feb 2, 2026
048e62e
experiment, controller - merge fixes
humaira-rf Feb 2, 2026
d0eb447
more merge fixes
humaira-rf Feb 2, 2026
fda4e54
scheduler fixes for single gpu
humaira-rf Feb 3, 2026
189638c
sft notebook updates
humaira-rf Feb 4, 2026
6743fde
lite notebooks added
humaira-rf Feb 4, 2026
b8a2bec
working notebooks sft lite, normal
humaira-rf Feb 6, 2026
f72e235
fsdp notebook updated
humaira-rf Feb 6, 2026
ca6b2cd
evaluation changes, num_gpus fix, icops-warm clone and delete
humaira-rf Feb 7, 2026
8e1554e
final saving checkpoint to disk, llama 70b changes
humaira-rf Feb 7, 2026
2c22ee2
trl version reverted
humaira-rf Feb 7, 2026
34ad039
fixed linter errors
humaira-rf Feb 7, 2026
50da616
llama 70b num chunks increased
humaira-rf Feb 7, 2026
19d0135
notebooks updated
humaira-rf Feb 10, 2026
10 changes: 7 additions & 3 deletions rapidfireai/automl/automl_utils.py
@@ -7,7 +7,9 @@
 # TODO: add code to validate param_config
 
 
-def get_flattened_config_leaf(param_config: dict[str, Any], prefix: str = "") -> dict[str, Any]:
+def get_flattened_config_leaf(
+    param_config: dict[str, Any], prefix: str = ""
+) -> dict[str, Any]:
     """Flattens the param_config dictionary into a single hierarchy"""
     items = []
     for k, v in param_config.items():
@@ -43,7 +45,9 @@ def get_flattened_config_leaf(param_config: dict[str, Any], prefix: str = "") ->
     return dict(items)
 
 
-def get_runs(param_config: AutoMLAlgorithm | dict[str, Any] | list[Any], seed: int) -> list[dict[str, Any]]:
+def get_runs(
+    param_config: AutoMLAlgorithm | dict[str, Any] | list[Any], seed: int
+) -> list[dict[str, Any]]:
     """Get the runs for the given param_config."""
     # FIXME: how do we handle seed for dict and list?
     if isinstance(param_config, AutoMLAlgorithm):
@@ -56,4 +60,4 @@ def get_runs(param_config: AutoMLAlgorithm | dict[str, Any] | list[Any], seed: i
             config_leaves.extend(get_runs(config, seed))
         return config_leaves
     else:
-        raise ValueError(f"Invalid param_config type: {type(param_config)}")
\ No newline at end of file
+        raise ValueError(f"Invalid param_config type: {type(param_config)}")
26 changes: 18 additions & 8 deletions rapidfireai/automl/base.py
@@ -12,37 +12,47 @@ class AutoMLAlgorithm(ABC):
 
     VALID_TRAINER_TYPES = {"SFT", "DPO", "GRPO"}
 
-    def __init__(self, configs=None, create_model_fn=None, trainer_type: str | None = None, num_runs: int = 1):
+    def __init__(
+        self,
+        configs=None,
+        create_model_fn=None,
+        trainer_type: str | None = None,
+        num_runs: int = 1,
+    ):
         """
         Initialize AutoML algorithm with configurations and trainer type.
 
         Args:
             configs: List of configurations (RFModelConfig for fit mode, dict for evals mode)
             create_model_fn: Optional function to create models (legacy parameter)
             trainer_type: Trainer type ("SFT", "DPO", "GRPO") for fit mode, None for evals mode
             num_runs: Number of runs for random search
 
         Mode detection:
         - If trainer_type is provided: fit mode (requires RFModelConfig instances)
         - If trainer_type is None: evals mode (requires dict instances)
         """
         try:
             self.configs = self._normalize_configs(configs)
             self.num_runs = num_runs
 
             # Detect mode based on trainer_type
             if trainer_type is not None:
                 self.mode = "fit"
                 self.trainer_type = trainer_type.upper()
                 if self.trainer_type not in self.VALID_TRAINER_TYPES:
-                    raise AutoMLException(f"trainer_type must be one of {self.VALID_TRAINER_TYPES}")
+                    raise AutoMLException(
+                        f"trainer_type must be one of {self.VALID_TRAINER_TYPES}"
+                    )
             else:
                 self.mode = "evals"
                 self.trainer_type = None
 
             self._validate_configs()
         except Exception as e:
-            raise AutoMLException(f"Error initializing {self.__class__.__name__}: {e}") from e
+            raise AutoMLException(
+                f"Error initializing {self.__class__.__name__}: {e}"
+            ) from e
 
     def _normalize_configs(self, configs):
         """Normalize configs to list format."""
@@ -56,10 +66,10 @@ def _validate_configs(self):
         """Validate configs based on mode."""
         if not self.configs:
             return
 
         # Import here to avoid circular imports
         from rapidfireai.automl.model_config import RFModelConfig
 
         if self.mode == "fit":
             # Fit mode: must have RFModelConfig instances
             for config in self.configs:
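The reflowed `__init__` makes the mode contract easier to read: passing `trainer_type` selects fit mode and validates the upper-cased value against `VALID_TRAINER_TYPES`, while omitting it selects evals mode. A hedged usage sketch with the repo's `RFGridSearch` subclass; the dict keys shown are illustrative, not a documented schema:

```python
from rapidfireai.automl.grid_search import RFGridSearch

# Evals mode: no trainer_type, so configs must be plain dicts.
evals = RFGridSearch(configs=[{"pipeline": None, "temperature": 0.7}])
assert evals.mode == "evals" and evals.trainer_type is None

# Fit mode: trainer_type is case-insensitive and configs must be
# RFModelConfig instances (my_model_config is a hypothetical placeholder):
# fit = RFGridSearch(configs=[my_model_config], trainer_type="sft")
# assert fit.mode == "fit" and fit.trainer_type == "SFT"
```

Any validation failure surfaces as `AutoMLException("Error initializing RFGridSearch: ...")` via the outer try/except.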
6 changes: 4 additions & 2 deletions rapidfireai/automl/datatypes.py
@@ -12,12 +12,14 @@ class Range:
 
     def __init__(self, start, end, dtype: str | None = None):
         if dtype is None:
-            self.dtype = "int" if isinstance(start, int) and isinstance(end, int) else "float"
+            self.dtype = (
+                "int" if isinstance(start, int) and isinstance(end, int) else "float"
+            )
         else:
             if dtype not in ("int", "float"):
                 raise ValueError("dtype must be either 'int' or 'float'.")
             self.dtype = dtype
-        if not (isinstance(start, (int, float)) and isinstance(end, (int, float))):
+        if not (isinstance(start, int | float) and isinstance(end, int | float)):
             raise ValueError("start and end must be either int or float.")
         self.start = start
         self.end = end
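One behavioral note on this hunk: `isinstance(start, (int, float))` and `isinstance(start, int | float)` accept exactly the same values, but the PEP 604 union form is only a valid `isinstance()` target on Python 3.10+, which this codebase already presumes given its `str | None` annotations. A small sketch of `Range` as constructed above:

```python
from rapidfireai.automl.datatypes import Range

# PEP 604 unions work with isinstance() on Python >= 3.10.
assert isinstance(3, int | float) and not isinstance("3", int | float)

lr = Range(1e-5, 1e-3)   # dtype inferred as "float"
epochs = Range(1, 5)     # both ints, so dtype inferred as "int"
mixed = Range(1, 2.5)    # mixed types infer "float"
# Range("a", "b")        # raises ValueError: start and end must be int or float
```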
71 changes: 46 additions & 25 deletions rapidfireai/automl/grid_search.py
@@ -30,7 +30,7 @@ def recursive_expand_gridsearch(item: Any):
 class RFGridSearch(AutoMLAlgorithm):
     """Grid search algorithm that generates all hyperparameter combinations."""
 
-    def get_runs(self, seed: int=42) -> list[dict[str, Any]]:
+    def get_runs(self, seed: int = 42) -> list[dict[str, Any]]:
         """Generate all possible hyperparameter combinations for grid search."""
         if not isinstance(seed, int) or seed < 0:
             raise AutoMLException("seed must be a non-negative integer")
@@ -58,23 +58,31 @@ def _get_runs_fit(self) -> list[dict[str, Any]]:
 
         for peft_config in peft_configs:
             peft_instances = (
-                [{}] if peft_config is None else list(recursive_expand_gridsearch(peft_config._user_params))
+                [{}]
+                if peft_config is None
+                else list(recursive_expand_gridsearch(peft_config._user_params))
             )
             training_instances = (
                 [{}]
                 if config.training_args is None
-                else list(recursive_expand_gridsearch(config.training_args._user_params))
+                else list(
+                    recursive_expand_gridsearch(config.training_args._user_params)
+                )
             )
             model_kwargs_instances = (
-                [{}] if config.model_kwargs is None else list(recursive_expand_gridsearch(config.model_kwargs))
+                [{}]
+                if config.model_kwargs is None
+                else list(recursive_expand_gridsearch(config.model_kwargs))
             )
             ref_model_kwargs_instances = (
                 [{}]
                 if config.ref_model_kwargs is None
                 else list(recursive_expand_gridsearch(config.ref_model_kwargs))
             )
             reward_funcs_instances = (
-                [{}] if config.reward_funcs is None else list(recursive_expand_gridsearch(config.reward_funcs))
+                [{}]
+                if config.reward_funcs is None
+                else list(recursive_expand_gridsearch(config.reward_funcs))
             )
 
             # Get additional kwargs for Trainer
@@ -91,13 +99,18 @@ def _get_runs_fit(self) -> list[dict[str, Any]]:
                 "ref_model_type",
                 "ref_model_kwargs",
                 "reward_funcs",
+                "num_gpus",
             }
             # excluded_attrs = set(config.__dict__.keys()) - set(config.__annotations__.keys())
             additional_kwargs = {
-                k: v for k, v in config.__dict__.items() if k not in excluded_attrs and v is not None
+                k: v
+                for k, v in config.__dict__.items()
+                if k not in excluded_attrs and v is not None
             }
             additional_kwargs_instances = (
-                [{}] if not additional_kwargs else list(recursive_expand_gridsearch(additional_kwargs))
+                [{}]
+                if not additional_kwargs
+                else list(recursive_expand_gridsearch(additional_kwargs))
             )
 
             # Generate gridsearch combinations
@@ -116,21 +129,26 @@ def _get_runs_fit(self) -> list[dict[str, Any]]:
                         "model_kwargs": model_kwargs,
                         "additional_kwargs": additional_kwargs,
                     }
-
-                    if self.trainer_type == "DPO":
-                        leaf["ref_model_config"] = {
-                            "model_name": config.ref_model_name,
-                            "model_type": config.ref_model_type,
-                        }
-                        for ref_model_kwargs in ref_model_kwargs_instances:
-                            leaf["ref_model_config"]["model_kwargs"] = ref_model_kwargs
-                            runs.append(leaf)
-                    elif self.trainer_type == "GRPO":
-                        for reward_func in reward_funcs_instances:
-                            leaf["reward_funcs"] = reward_func
-                            runs.append(leaf)
-                    else:
+                    num_gpus = getattr(config, "num_gpus", None)
+                    if num_gpus is not None:
+                        leaf["num_gpus"] = num_gpus
+
+                    if self.trainer_type == "DPO":
+                        leaf["ref_model_config"] = {
+                            "model_name": config.ref_model_name,
+                            "model_type": config.ref_model_type,
+                        }
+                        for ref_model_kwargs in ref_model_kwargs_instances:
+                            leaf["ref_model_config"][
+                                "model_kwargs"
+                            ] = ref_model_kwargs
+                            runs.append(leaf)
+                    elif self.trainer_type == "GRPO":
+                        for reward_func in reward_funcs_instances:
+                            leaf["reward_funcs"] = reward_func
+                            runs.append(leaf)
+                    else:
                         runs.append(leaf)
 
         return runs
 
@@ -147,7 +165,7 @@ def _get_runs_evals(self) -> list[dict[str, Any]]:
                 pipeline = config["pipeline"]
             else:
                 pipeline = None
 
             if pipeline is None:
                 pipelines = [None]
             elif isinstance(pipeline, List):
@@ -156,7 +174,7 @@ def _get_runs_evals(self) -> list[dict[str, Any]]:
                 pipelines = pipeline
             else:
                 pipelines = [pipeline]
 
             for pipeline in pipelines:
                 pipeline_instances = (
                     [{}]
@@ -167,7 +185,10 @@ def _get_runs_evals(self) -> list[dict[str, Any]]:
                 additional_kwargs = {
                     k: v
                     for k, v in config.items()
-                    if k!= "pipeline" and k!= "vllm_config" and k != "openai_config" and v is not None
+                    if k != "pipeline"
+                    and k != "vllm_config"
+                    and k != "openai_config"
+                    and v is not None
                 }
                 additional_kwargs_instances = (
                     [{}]
@@ -181,7 +202,7 @@ def _get_runs_evals(self) -> list[dict[str, Any]]:
                     pipeline_instance = pipeline.__class__(**pipeline_params)
                 else:
                     pipeline_instance = pipeline_params
 
                 leaf = {
                     "pipeline": pipeline_instance,
                     **additional_kwargs_dict,
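Beyond the formatting churn, the substantive change in this file is that `_get_runs_fit` now copies a config's `num_gpus` onto every generated leaf. The expansion itself is a cartesian product over the per-config instance lists. A simplified sketch of that shape (the instance lists here stand in for `recursive_expand_gridsearch` output; real leaves also carry model_config, peft_params, and more):

```python
import itertools

# Stand-ins for the *_instances lists built in _get_runs_fit.
peft_instances = [{"r": 8}, {"r": 16}]
training_instances = [{"learning_rate": 1e-4}, {"learning_rate": 2e-4}]
num_gpus = 2  # mirrors getattr(config, "num_gpus", None) in the hunk above

runs = []
for peft_params, training_args in itertools.product(peft_instances, training_instances):
    leaf = {"peft_params": peft_params, "training_args": training_args}
    if num_gpus is not None:
        leaf["num_gpus"] = num_gpus  # propagated per leaf, as in the diff
    runs.append(leaf)

assert len(runs) == 4  # grid search emits one run per combination
```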