prepare full-foak benchmarks
achew010 committed Sep 6, 2024
1 parent c8459bc commit 1f3d3fb
Showing 6 changed files with 49 additions and 8 deletions.
@@ -14,3 +14,4 @@
 
 # Local
 from .framework_plugin_fast_quantized_peft import FastQuantizedPeftAccelerationPlugin
+from .framework_plugin_fast_kernels import FastKernelsAccelerationPlugin
@@ -85,7 +85,7 @@ def __init__(self, configurations: Dict[str, Dict]):
         )
 
         self._check_config_and_maybe_check_values(
-            key="base_layer",
+            key="base_layer",
             values=["auto_gptq", "bitsandbytes"],
             default="auto_gptq"
         )
@@ -113,11 +113,6 @@ def augmentation(
         modifiable_args: Tuple[LoraConfig],
     ):
 
-        # this seems to be only needed for fused lora?
-        assert (
-            model.dtype == torch.float16 and train_args.fp16
-        ), "need to run in fp16 mixed precision or load model in fp16 when using fused lora"
-
         terms = set()
         for k, v in self.configurations.items():
             if v:
@@ -138,7 +133,7 @@ def augmentation(
 AccelerationPlugin.register_plugin(
     FastKernelsAccelerationPlugin,
     configuration_or_paths=[
-        "training.fused_ops_and_kernels"
+        "training.fused_ops_and_kernels",
         "peft.quantization.fused_ops_and_kernels",
     ],
 )
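As context for the check shown above: judging by its name and arguments, `_check_config_and_maybe_check_values` reads a configuration key, falls back to a default, and validates the value against an allowed set. A hypothetical minimal equivalent (a sketch only, not the actual AccelerationPlugin implementation):

```python
def check_key(config: dict, key: str, values=None, default=None):
    # Hypothetical sketch: read `key` from the plugin configuration,
    # fall back to `default`, and validate against `values` if given.
    val = config.get(key, default)
    if values is not None and val not in values:
        raise ValueError(f"{key} must be one of {values}, got {val!r}")
    return val

# e.g. base_layer must be "auto_gptq" or "bitsandbytes", defaulting to "auto_gptq"
base_layer = check_key({}, "base_layer", values=["auto_gptq", "bitsandbytes"], default="auto_gptq")
```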
7 changes: 6 additions & 1 deletion sample-configurations/CONTENTS.yaml
@@ -67,4 +67,9 @@ framework_configs:
       - accelerated-peft
       - attention-and-distributed-packing
       - fused-ops-and-kernels
-    filename: accelerated-peft-autogptq-foak-padding-free-sample-configuration.yaml
+    filename: accelerated-peft-autogptq-foak-padding-free-sample-configuration.yaml
+
+  - shortname: foak-fast-kernels
+    plugins:
+      - fused-ops-and-kernels
+    filename: foak-fast-kernels-sample-configuration.yaml
31 changes: 31 additions & 0 deletions sample-configurations/foak-fast-kernels-sample-configuration.yaml
@@ -0,0 +1,31 @@
+# FMS Acceleration Plugin Configuration.
+#
+# Each stanza incorporates various configurations for
+# different fine-tuning / training tasks.
+plugins:
+  # Configurations to accelerate training with fused operations and kernels
+  training:
+
+    fused_ops_and_kernels:
+
+      # If specified under the training stanza, the
+      # base_layer and fused_lora settings would be a
+      # misnomer - they belong under peft.quantization.
+      # However, if they are specified here, they will
+      # still be read. This is useful in cases where
+      # the yaml is system-generated and not shown
+      # to a user.
+
+      # activate various unsloth optimizations
+      # there are two versions of the plugin:
+      # - the FastKernel version supports individual kernels
+      # - the FastQuantized version is all-or-nothing
+
+      # fast loss triton kernels
+      fast_loss: True
+
+      # fast rms norm triton kernels
+      fast_rms_layernorm: True
+
+      # fast RoPE embedding triton kernels
+      fast_rope_embeddings: True
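For comparison with the comments above (a hedged sketch, not a file added by this commit): when the same plugin is configured for quantized PEFT, the stanza would sit under `peft.quantization` instead, where `base_layer` and `fused_lora` are no longer misnomers. The keys below are inferred from the `base_layer` check and the `peft.quantization.fused_ops_and_kernels` registration path shown earlier:

```yaml
# Illustrative sketch only, inferred from this commit's code paths.
plugins:
  peft:
    quantization:
      fused_ops_and_kernels:
        # one of: auto_gptq, bitsandbytes (defaults to auto_gptq)
        base_layer: auto_gptq
        # fuse LoRA adapters into the quantized base layer kernels
        fused_lora: True
        fast_loss: True
        fast_rms_layernorm: True
        fast_rope_embeddings: True
```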
6 changes: 6 additions & 0 deletions scripts/benchmarks/scenarios.yaml
@@ -37,6 +37,9 @@
 
 scenarios:
   - name: full-finetuning
+    framework_config:
+      - null
+      - foak-fast-kernels
     arguments:
       learning_rate: 2e-5
       model_name_or_path:
@@ -46,6 +49,9 @@ scenarios:
       torch_dtype: float16
 
   - name: standard-peft
+    framework_config:
+      - null
+      - foak-fast-kernels
     arguments:
       learning_rate: 2e-4
       torch_dtype: float16
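A note on the `framework_config` lists added above: each scenario is presumably run once per listed entry, with `null` denoting the unaccelerated baseline, so full-finetuning and standard-peft are each benchmarked both with and without foak-fast-kernels. A hedged sketch of that expansion (the real harness in scripts/benchmarks may differ):

```python
import yaml

# Hypothetical illustration of the scenario x framework_config matrix.
with open("scripts/benchmarks/scenarios.yaml") as f:
    scenarios = yaml.safe_load(f)["scenarios"]

for sc in scenarios:
    # YAML `null` loads as None, i.e. run the scenario without acceleration
    for fc in sc.get("framework_config", [None]):
        print(sc["name"], "->", fc or "baseline (no acceleration config)")
```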
3 changes: 3 additions & 0 deletions scripts/generate_sample_configurations.py
@@ -147,6 +147,7 @@ def read_configuration(path: str) -> Dict:
 KEY_BNB_NF4_FOAK = "bnb-nf4-foak"
 KEY_AADP_PADDING_FREE = "aadp-padding-free"
 KEY_AADP_MULTIPACK = "aadp-multipack"
+KEY_FAST_KERNELS = "foak-fast-kernels"
 
 CONFIGURATIONS = {
     KEY_AUTO_GPTQ: "plugins/accelerated-peft/configs/autogptq.yaml",
@@ -171,6 +172,7 @@ def read_configuration(path: str) -> Dict:
     ),
     KEY_AADP_PADDING_FREE: "plugins/attention-and-distributed-packing/configs/padding_free.yaml",
     KEY_AADP_MULTIPACK: "plugins/attention-and-distributed-packing/configs/multipack.yaml",
+    KEY_FAST_KERNELS: "plugins/fused-ops-and-kernels/configs/fast_full.yaml",
 }
 
 # list of (tag, combi) tuples
@@ -190,6 +192,7 @@ def read_configuration(path: str) -> Dict:
     ("accelerated-peft-autogptq-foak-padding-free", (KEY_AADP_PADDING_FREE,KEY_AUTO_GPTQ, KEY_AUTO_GPTQ_FOAK)),
     ("accelerated-peft-bnb-nf4-foak-padding-free", (KEY_AADP_PADDING_FREE,KEY_BNB_NF4, KEY_BNB_NF4_FOAK)),
     ("aadp-padding-free-multipack", (KEY_AADP_PADDING_FREE, KEY_AADP_MULTIPACK)),
+    ("foak-fast-kernels", (KEY_FAST_KERNELS,)),
 ]
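One Python pitfall worth flagging in the new combination entry: parentheses alone do not make a tuple, so a single-key combination needs a trailing comma, written as `(KEY_FAST_KERNELS,)` above. Without it the "combi" is just a string, and iterating it yields characters rather than configuration keys:

```python
combi_wrong = ("foak-fast-kernels")   # no comma: just the str 'foak-fast-kernels'
combi_right = ("foak-fast-kernels",)  # trailing comma: a 1-element tuple

print(type(combi_wrong).__name__)  # str
print(type(combi_right).__name__)  # tuple
print(list(combi_wrong)[:3])       # ['f', 'o', 'a'] - characters, not keys
print(list(combi_right))           # ['foak-fast-kernels']
```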

