diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/__init__.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/__init__.py
index edf3f23d..be1c3e9c 100644
--- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/__init__.py
+++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/__init__.py
@@ -14,3 +14,4 @@
 
 # Local
 from .framework_plugin_fast_quantized_peft import FastQuantizedPeftAccelerationPlugin
+from .framework_plugin_fast_kernels import FastKernelsAccelerationPlugin
diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py
index b659567e..7fe5a898 100644
--- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py
+++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_kernels.py
@@ -85,7 +85,7 @@ def __init__(self, configurations: Dict[str, Dict]):
         )
 
         self._check_config_and_maybe_check_values(
-            key="base_layer",
+            key="base_layer",
             values=["auto_gptq", "bitsandbytes"],
             default="auto_gptq"
         )
@@ -113,11 +113,6 @@ def augmentation(
         modifiable_args: Tuple[LoraConfig],
     ):
 
-        # this seems to be only needed for fused lora?
-        assert (
-            model.dtype == torch.float16 and train_args.fp16
-        ), "need to run in fp16 mixed precision or load model in fp16 when using fused lora"
-
         terms = set()
         for k, v in self.configurations.items():
             if v:
@@ -138,7 +133,7 @@ def augmentation(
 AccelerationPlugin.register_plugin(
     FastKernelsAccelerationPlugin,
     configuration_or_paths=[
-        "training.fused_ops_and_kernels"
+        "training.fused_ops_and_kernels",
         "peft.quantization.fused_ops_and_kernels",
     ],
 )
diff --git a/sample-configurations/CONTENTS.yaml b/sample-configurations/CONTENTS.yaml
index 09301193..6781b3bd 100644
--- a/sample-configurations/CONTENTS.yaml
+++ b/sample-configurations/CONTENTS.yaml
@@ -67,4 +67,9 @@ framework_configs:
       - accelerated-peft
       - attention-and-distributed-packing
       - fused-ops-and-kernels
-    filename: accelerated-peft-autogptq-foak-padding-free-sample-configuration.yaml
\ No newline at end of file
+    filename: accelerated-peft-autogptq-foak-padding-free-sample-configuration.yaml
+
+  - shortname: foak-fast-kernels
+    plugins:
+      - fused-ops-and-kernels
+    filename: foak-fast-kernels-sample-configuration.yaml
diff --git a/sample-configurations/foak-fast-kernels-sample-configuration.yaml b/sample-configurations/foak-fast-kernels-sample-configuration.yaml
new file mode 100644
index 00000000..4f2e3692
--- /dev/null
+++ b/sample-configurations/foak-fast-kernels-sample-configuration.yaml
@@ -0,0 +1,31 @@
+# FMS Acceleration Plugin Configuration.
+#
+# Each stanza incorporates various configurations for
+# different fine-tuning / training tasks.
+plugins:
+  # Configurations to accelerate data packing/padding in training
+  training:
+
+    fused_ops_and_kernels:
+
+      # if under training stanza, then putting
+      # base_layer and fused_lora will be a misnomer
+      # - this should be in peft.quantized
+      # However, if it is specified, it will still
+      # be read. This is useful in use cases where
+      # the yaml is system generated and not shown
+      # to a user.
+
+      # activate various unsloth optimizations
+      # there are two versions of the plugin
+      # - the FastKernel version supports individual kernels
+      # - the FastQuantized version is all-or-nothing
+
+      # fast loss triton kernels
+      fast_loss: True
+
+      # fast rms norm triton kernels
+      fast_rms_layernorm: True
+
+      # fast RoPE embedding triton kernels
+      fast_rope_embeddings: True
diff --git a/scripts/benchmarks/scenarios.yaml b/scripts/benchmarks/scenarios.yaml
index 2eb22872..bd020400 100644
--- a/scripts/benchmarks/scenarios.yaml
+++ b/scripts/benchmarks/scenarios.yaml
@@ -37,6 +37,9 @@ scenarios:
 
   - name: full-finetuning
+    framework_config:
+      - null
+      - foak-fast-kernels
     arguments:
       learning_rate: 2e-5
       model_name_or_path:
@@ -46,6 +49,9 @@ scenarios:
       torch_dtype: float16
 
   - name: standard-peft
+    framework_config:
+      - null
+      - foak-fast-kernels
     arguments:
       learning_rate: 2e-4
       torch_dtype: float16
diff --git a/scripts/generate_sample_configurations.py b/scripts/generate_sample_configurations.py
index c72c62eb..27cd3df2 100644
--- a/scripts/generate_sample_configurations.py
+++ b/scripts/generate_sample_configurations.py
@@ -147,6 +147,7 @@ def read_configuration(path: str) -> Dict:
 KEY_BNB_NF4_FOAK = "bnb-nf4-foak"
 KEY_AADP_PADDING_FREE = "aadp-padding-free"
 KEY_AADP_MULTIPACK = "aadp-multipack"
+KEY_FAST_KERNELS = "foak-fast-kernels"
 
 CONFIGURATIONS = {
     KEY_AUTO_GPTQ: "plugins/accelerated-peft/configs/autogptq.yaml",
@@ -171,6 +172,7 @@ def read_configuration(path: str) -> Dict:
     ),
     KEY_AADP_PADDING_FREE: "plugins/attention-and-distributed-packing/configs/padding_free.yaml",
     KEY_AADP_MULTIPACK: "plugins/attention-and-distributed-packing/configs/multipack.yaml",
+    KEY_FAST_KERNELS: "plugins/fused-ops-and-kernels/configs/fast_full.yaml",
 }
 
 # list of (tag, combi) tuples
@@ -190,6 +192,7 @@ def read_configuration(path: str) -> Dict:
     ("accelerated-peft-autogptq-foak-padding-free", (KEY_AADP_PADDING_FREE,KEY_AUTO_GPTQ, KEY_AUTO_GPTQ_FOAK)),
     ("accelerated-peft-bnb-nf4-foak-padding-free", (KEY_AADP_PADDING_FREE,KEY_BNB_NF4, KEY_BNB_NF4_FOAK)),
     ("aadp-padding-free-multipack", (KEY_AADP_PADDING_FREE, KEY_AADP_MULTIPACK)),
+    ("foak-fast-kernels", (KEY_FAST_KERNELS,)),
 ]
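Note on the new `COMBINATIONS` entry above: a one-key combination still has to be a tuple, and in Python that requires a trailing comma, so `(KEY_FAST_KERNELS)` is just the string while `(KEY_FAST_KERNELS,)` is the intended one-element tuple. The snippet below is a minimal, self-contained sketch of that distinction; it only copies the `KEY_FAST_KERNELS` constant from the diff and does not import or call `generate_sample_configurations.py`.

```python
# Minimal sketch: why the "foak-fast-kernels" combination needs a trailing comma.
# KEY_FAST_KERNELS mirrors the constant added in the diff; nothing else is imported.
KEY_FAST_KERNELS = "foak-fast-kernels"

not_a_tuple = (KEY_FAST_KERNELS)   # parentheses only group: this is still a str
one_tuple = (KEY_FAST_KERNELS,)    # trailing comma makes a one-element tuple

assert isinstance(not_a_tuple, str)
assert isinstance(one_tuple, tuple) and len(one_tuple) == 1

# Code that iterates over a combination would loop over the characters of the
# string in the first case, instead of over configuration keys.
print(list(not_a_tuple)[:4])  # ['f', 'o', 'a', 'k']
print(list(one_tuple))        # ['foak-fast-kernels']
```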