
[Prototype] Generalize dynamic config classes #245


Draft

jlamypoirier wants to merge 39 commits into main from generalize_dynamic_classes
Commits (39)
4b606b0
Generalize config classes
jlamypoirier Apr 30, 2025
4a67660
cli
jlamypoirier Apr 30, 2025
531f67d
Merge branch 'main' into generalize_dynamic_classes
jlamypoirier May 2, 2025
1823407
misc
jlamypoirier May 5, 2025
fe7acd9
stuff
jlamypoirier May 7, 2025
bee7a4b
Merge remote-tracking branch 'origin/main' into generalize_dynamic_cl…
jlamypoirier May 7, 2025
d41be60
stuff
jlamypoirier May 7, 2025
ec35a50
fixes
jlamypoirier May 8, 2025
3005c8c
stuff
jlamypoirier May 9, 2025
5735d21
fix
jlamypoirier May 9, 2025
a7e7362
Merge remote-tracking branch 'origin/main' into generalize_dynamic_cl…
jlamypoirier May 9, 2025
6357365
stuff
jlamypoirier May 9, 2025
207aef0
stuff
jlamypoirier May 12, 2025
31579bd
Bring back default_factory
jlamypoirier May 13, 2025
f79ed27
fix
jlamypoirier May 13, 2025
0a37209
Revert "fix"
jlamypoirier May 13, 2025
897cc0f
Revert "Bring back default_factory"
jlamypoirier May 13, 2025
8a49e0f
stuff
jlamypoirier May 14, 2025
843a621
fix
jlamypoirier May 14, 2025
aa3bc0b
stuff
jlamypoirier May 14, 2025
28d321e
stuff
jlamypoirier May 14, 2025
1bbd7fb
stuff
jlamypoirier May 14, 2025
a426450
Merge branch 'misc' into generalize_dynamic_classes
jlamypoirier May 14, 2025
87e11f0
stuff
jlamypoirier May 14, 2025
60a656e
stuff
jlamypoirier May 14, 2025
3595949
Minimalistic dynamic configs
jlamypoirier May 14, 2025
39b1a04
stuff
jlamypoirier May 15, 2025
f29b3fc
Merge branch 'minimalistic_dynamic_classes' into generalize_dynamic_c…
jlamypoirier May 15, 2025
743edaa
Simplify cli
jlamypoirier May 15, 2025
038106f
Simplify cli
jlamypoirier May 15, 2025
d0e86cb
Merge remote-tracking branch 'origin/main' into misc
jlamypoirier May 21, 2025
3fa314c
Merge branch 'misc' into minimalistic_dynamic_classes
jlamypoirier May 21, 2025
7bb6ee9
Merge remote-tracking branch 'origin/main' into minimalistic_dynamic_…
jlamypoirier May 21, 2025
a98a2ae
Merge branch 'minimalistic_dynamic_classes' into simplify_cli
jlamypoirier May 27, 2025
85e02b8
Merge branch 'main' into simplify_cli
jlamypoirier May 27, 2025
0fd49e4
Merge remote-tracking branch 'origin/main' into simplify_cli
jlamypoirier May 27, 2025
0a010f3
Merge branch 'main' into generalize_dynamic_classes
jlamypoirier May 27, 2025
c415e32
Merge branch 'simplify_cli' into generalize_dynamic_classes
jlamypoirier May 27, 2025
e199d0a
fix
jlamypoirier May 27, 2025
35 changes: 35 additions & 0 deletions fast_llm/cli.py
@@ -0,0 +1,35 @@
import logging
import sys
import traceback

from fast_llm.config import ValidationError
from fast_llm.engine.config_utils.logging import configure_logging
from fast_llm.engine.config_utils.run import log_main_rank
from fast_llm.engine.config_utils.runnable import RunnableConfig

# Import these submodules to ensure classes are added to the dynamic class registry.
import fast_llm.data.auto # isort: skip
import fast_llm.engine.checkpoint.convert # isort: skip
import fast_llm.models.auto # isort: skip

logger = logging.getLogger(__name__)


def fast_llm_main(args: list[str] | None = None):
    # TODO: Add hook to register model classes? (environment variable?)
    # (Pre-)configure logging
    configure_logging()
    try:
        RunnableConfig.parse_and_run(args)
    except Exception as e:
        if sys.gettrace():
            raise
        if isinstance(e, ValidationError):
            log_main_rank(traceback.format_exc(), log_fn=logger.error)
        else:
            logger.critical(traceback.format_exc())
        sys.exit(1)


if __name__ == "__main__":
    fast_llm_main()
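
For context, a minimal sketch of invoking this entry point programmatically; the "train gpt" subcommand and --config flag are assumptions, not shown in this diff:

from fast_llm.cli import fast_llm_main

# Same as running the module from a shell; parse_and_run resolves the runnable
# class from the dynamic class registry populated by the imports above.
fast_llm_main(["train", "gpt", "--config", "my_config.yaml"])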
47 changes: 25 additions & 22 deletions fast_llm/config.py
@@ -16,7 +16,6 @@

logger = logging.getLogger(__name__)


_AUTO_VALIDATE = True

MISSING = Tag("<MISSING>")
@@ -245,7 +244,7 @@ def _process_config_class(cls: type["Config"]):

def config_class[
T: Config
](registry: bool = False, dynamic_type: "dict[type[Config], str]|None" = None) -> typing.Callable[[type[T]], type[T]]:
](dynamic_type: "dict[type[Config], str]|None" = None) -> typing.Callable[[type[T]], type[T]]:
"""
Fast-LLM replacement for the default dataclass wrapper. Performs additional verifications.
"""
@@ -270,11 +269,8 @@ def __init__(self, **kwargs):

wrapped.__init__ = __init__

wrapped._registry = Registry[str, type[wrapped]](wrapped.__name__, {}) if registry else None

if dynamic_type is not None:
for cls_, name in dynamic_type.items():
print(cls_, name, wrapped)
cls_.register_subclass(name, wrapped)

return wrapped
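
For illustration, a sketch of how the reworked decorator is meant to be used, following the registration loop above; SamplingConfig, DefaultSamplingConfig, and the name "default" are hypothetical:

from fast_llm.config import Config, config_class


@config_class()
class SamplingConfig(Config):
    pass


# Equivalent to calling SamplingConfig.register_subclass("default", DefaultSamplingConfig)
# after class creation, so a {"type": "default"} config dict selects this class.
@config_class(dynamic_type={SamplingConfig: "default"})
class DefaultSamplingConfig(SamplingConfig):
    pass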
@@ -316,7 +312,7 @@ class Config(metaclass=ConfigMeta):
_setting_implicit_default: bool | None = Field(init=False)

# A registry for all the config classes.
_registry: typing.ClassVar[Registry[str, type[typing.Self]] | None] = None
_registry: typing.ClassVar[Registry[str, type[typing.Self]]] = Registry[str, "type[Config]"]("Config", {})

def __setattr__(self, key: str, value: typing.Any) -> None:
"""
@@ -371,17 +367,6 @@ def validate[T: Config](self: T, *, _is_validating: bool = False) -> T:
Validate a class and mark it as read-only
This should not be overridden in derived classes.
"""
# Should be handled in `from_dict`, but can fail if instantiating directly.
try:
expected_class = self.get_subclass(self.type)
except KeyError as e:
# Delayed instantiation error in `from_dict`.
raise ValidationError(*e.args)

if expected_class is not None:
# Should be handled in `from_dict`, but can fail if instantiating directly.
Assert.is_(self.__class__, expected_class)

if not self._validated:
try:
self._validate()
@@ -401,6 +386,17 @@ def _validate(self) -> None:
Can be extended to add custom post-processing (typically before the super() call)
and validation (typically after)
"""
# Should be handled in `from_dict`, but can fail if instantiating directly.
try:
expected_class = self.get_subclass(self.type)
except KeyError as e:
# Delayed instantiation error in `from_dict`.
raise ValidationError(*e.args)

if expected_class is not None:
# Should be handled in `from_dict`, but can fail if instantiating directly.
Assert.is_(self.__class__, expected_class)
A review comment on this hunk:

oleksost (Contributor), May 27, 2025:
@jlamypoirier shouldn't this be something like Assert.custom(issubclass, expected_class, self.__class__) instead?
I.e. the dynamically inferred class (expected_class) should be a subclass of self.__class__; it should not necessarily match the class exactly?

jlamypoirier (Collaborator, Author):
The value is overridden in from_dict. Anyway, this PR isn't too relevant anymore; I broke it into pieces.

if self._abstract:
raise ValidationError(f"{type(self).__name__} is abstract")
if not self.__class_validated__:
@@ -409,6 +405,8 @@ def _validate(self) -> None:
)
errors = []
with self._set_implicit_default(None):
# Set the type field, or override it to the provided type with the actual class for clarity and safety.
self.type = self.__class__.__name__
for name, field in self.fields():
if not field.init or field._field_type != dataclasses._FIELD: # noqa
continue
@@ -486,6 +484,7 @@ def _validate_element(cls, value, type_, name: str):
raise FieldTypeError(f"Not a type.")
elif issubclass(type_, Config):
cls._validate_element_type(value, type_, strict=False)

value.validate(_is_validating=True)
else:
value = cls._validate_simple(value, type_)
@@ -737,7 +736,7 @@ def from_dict(
for keys, value in update.items():
set_nested_dict_value(default, keys, value, update_type)

return cls._from_dict(default, strict)
return cls._from_dict(default, strict=strict)

@classmethod
def from_flat_dict(
@@ -899,8 +898,6 @@ def compare(self, other: "Config", log_fn: typing.Union[type[BaseException], typ
@classmethod
def register_subclass(cls, name: str, cls_: type[typing.Self]) -> None:
Assert.custom(issubclass, cls_, cls)
if cls._registry is None:
raise NotImplementedError(f"Subclass `{cls.__name__}` doesn't have a registry..")
if name in cls._registry:
old_cls = cls._registry[name]
if old_cls.__name__ == cls_.__name__ and cls._registry[name].__module__ == cls_.__module__:
@@ -916,7 +913,7 @@ def get_subclass(cls, name: str | None):
return None
cls_ = None
for base_class in cls.__mro__:
if issubclass(base_class, Config) and base_class._registry is not None and name in base_class._registry:
if issubclass(base_class, Config) and name in base_class._registry:
if cls_ is None:
cls_ = base_class._registry[name]
if not issubclass(cls_, cls):
@@ -937,6 +934,12 @@ def __init_subclass__(cls):
We need to postpone validation until the class has been processed by the dataclass wrapper.
"""
Assert.eq(cls.__name__, cls.__qualname__)
cls._registry = Registry[str, type[cls]](cls.__name__, {})
if not cls._abstract:
Config.register_subclass(cls.__name__, cls)
short_name = cls.__name__.strip("Config")
if short_name != cls.__name__:
Config.register_subclass(short_name, cls)
for base_class in cls.__mro__:
if issubclass(base_class, Config) and base_class is not cls:
assert cls.__class_validated__, (
@@ -982,7 +985,7 @@ def __init_subclass__(cls):
cls.__annotations__[name] = base_class_field.type

# Type for the field. At the end of class definition to avoid shadowing builtin.
type: str | None = Field(
type: str = Field(
default=None,
desc="The config class name.",
hint=FieldHint.feature,
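Taken together, a sketch of the round trip this diff's __init_subclass__ logic enables: concrete subclasses self-register under their class name (plus a shortened alias), and the type field resolves through get_subclass. OptimizerConfig, AdamConfig, and the field below are hypothetical:

from fast_llm.config import Config, Field, FieldHint, config_class


@config_class()
class OptimizerConfig(Config):
    pass


@config_class()
class AdamConfig(OptimizerConfig):
    learning_rate: float = Field(default=1e-3, desc="Hypothetical field.", hint=FieldHint.feature)


# Lookup by class name works through the shared registry; construction from a
# dict with a "type" key resolves to the registered subclass.
assert OptimizerConfig.get_subclass("AdamConfig") is AdamConfig
config = OptimizerConfig.from_dict({"type": "AdamConfig", "learning_rate": 3e-4})
assert type(config) is AdamConfig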
16 changes: 4 additions & 12 deletions fast_llm/data/auto.py
@@ -1,13 +1,5 @@
from fast_llm.data.preparator.config import DatasetPreparatorConfig
from fast_llm.data.preparator.gpt_memmap.config import GPTMemmapDatasetPreparatorConfig
from fast_llm.utils import Registry
"""
Import these submodules to ensure classes are added to the dynamic class registry.
"""

dataset_preparator_registry = Registry[str, DatasetPreparatorConfig](
"DatasetPreparator",
{
dataset_preparator.preparator_name: dataset_preparator
for dataset_preparator in [
GPTMemmapDatasetPreparatorConfig,
]
},
)
from fast_llm.data.preparator.gpt_memmap.config import GPTMemmapDatasetPreparatorConfig # isort: skip
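
A sketch of the lookup that replaces the removed dataset_preparator_registry, assuming the automatic class-name registration added in __init_subclass__ above:

# Importing the auto module registers the classes as a side effect; afterwards
# the preparator config can be resolved from the shared registry by class name.
import fast_llm.data.auto  # noqa: F401

from fast_llm.data.preparator.config import DatasetPreparatorConfig

preparator_cls = DatasetPreparatorConfig.get_subclass("GPTMemmapDatasetPreparatorConfig")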
14 changes: 1 addition & 13 deletions fast_llm/data/data/gpt/data.py
@@ -32,29 +32,18 @@ class GPTBatch:
token_ids: torch.Tensor
loss_masking_spans: list[torch.Tensor] | None = None
sequence_lengths: list[torch.Tensor] | None = None
chosen_spans: list[torch.Tensor] | None = None
rejected_spans: list[torch.Tensor] | None = None


def gpt_data_collate_fn(batch: list[GPTSample], sampling_parameters: GPTSamplingParameters) -> GPTBatch:
stacked_ids = np.stack([sample.token_ids for sample in batch])
stacked_spans = None
sequence_lengths = None
stacked_chosen_spans = None
stacked_rejected_spans = None
if sampling_parameters.use_loss_masking_spans:
stacked_spans = [torch.from_numpy(sample.loss_masking_spans) for sample in batch]
if sampling_parameters.use_preference_loss_spans:
stacked_chosen_spans = [torch.from_numpy(sample.chosen_span) for sample in batch]
stacked_rejected_spans = [torch.from_numpy(sample.rejected_span) for sample in batch]
if not sampling_parameters.cross_document_attention:
sequence_lengths = [torch.tensor(sample.sequence_lengths) for sample in batch]
return GPTBatch(
token_ids=torch.from_numpy(stacked_ids),
loss_masking_spans=stacked_spans,
sequence_lengths=sequence_lengths,
chosen_spans=stacked_chosen_spans,
rejected_spans=stacked_rejected_spans,
token_ids=torch.from_numpy(stacked_ids), loss_masking_spans=stacked_spans, sequence_lengths=sequence_lengths
)


@@ -160,7 +149,6 @@ def get_iterator(
sampling_parameters = self._sampling_parameters[dataset_name]
Assert.in_range_incl(batch_config.sequence_length, 1, sampling_parameters.sequence_length)
log_main_rank(f"Initializing {dataset_name} dataset iterator from sample {consumed_samples}...")

return iter(
torch.utils.data.DataLoader(
self._datasets[dataset_name], # noqa
3 changes: 1 addition & 2 deletions fast_llm/data/dataset/gpt/config.py
@@ -73,7 +73,6 @@ class GPTSamplingParameters(SamplingParameters):
sequence_length: int
vocab_size: int
use_loss_masking_spans: bool = False
use_preference_loss_spans: bool = False
cross_document_attention: bool = True
# How many extra tokens to add to the sequence length.
# This is used to provide labels even for the last tokens in the sequence.
@@ -93,7 +92,7 @@ class GPTSamplingData(SamplingData):
truncate_documents: bool = True


@config_class(registry=True)
@config_class()
class GPTSampledDatasetConfig(SampledDatasetConfig):
pass
