enable fine tuning on HPU #552

Open · wants to merge 1 commit into main

61 changes: 61 additions & 0 deletions docs/hpu.md
@@ -0,0 +1,61 @@
# InstructLab Training on HPU

## HPU-specific changes
The following changes are required to enable training on HPU:

|GPU|HPU|
|---|---|
|`from accelerate import Accelerator` | `from optimum.habana.accelerate import GaudiAccelerator`|
|`from accelerate.utils import FullyShardedDataParallelPlugin` | `from optimum.habana.accelerate.utils import GaudiFullyShardedDataParallelPlugin` |
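
For example, the accelerator class can be selected at runtime with a device-conditional import, mirroring what this PR does in `accelerator.py`. A minimal sketch (the helper name `pick_accelerator_classes` is illustrative, not part of the PR):

```Python
# Sketch: pick the Accelerator / FSDP-plugin classes for the target device.
# Assumes optimum-habana is installed when training on HPU.
def pick_accelerator_classes(device: str):
    if device == "hpu":
        from optimum.habana.accelerate import GaudiAccelerator as Accelerator
        from optimum.habana.accelerate.utils import (
            GaudiFullyShardedDataParallelPlugin as FullyShardedDataParallelPlugin,
        )
    else:
        from accelerate import Accelerator
        from accelerate.utils import FullyShardedDataParallelPlugin
    return Accelerator, FullyShardedDataParallelPlugin
```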

It is also recommended to use the HPU-optimized transformers model implementations:

```Python
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
adapt_transformers_to_gaudi()
```

## Bucketing
The multipack sampler produces a wide range of batches with different sample lengths and numbers of samples. Each new combination triggers a graph recompilation, and recompilation takes time and slows down training. To reduce the number of recompilations, the HPU implementation uses a bucketing approach, in which the maximum sample length in a batch is aligned to a predefined value. This is similar to padding, except that the samples in a batch are padded not to the longest sample but to a slightly larger, bucket-aligned value.

![bucketing vs. padding](./hpu_pic/bucketing_vs_padding.png)


To compute the bucket size, we use the following algorithm:
- First, we find the most significant set bit (MSB) of the longest sample in the batch; call it S.
- Then we slice the range [2 ** S, 2 ** (S+1)] into 16 buckets of equal size.
- Then we use the upper boundary of the smallest suitable bucket as the padding value.

This approach limits the bucketing overhead to about 1/16th of the longest sample length and allows us to significantly reduce the number of recompilations.
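
A minimal sketch of this computation, mirroring the behaviour of the `simple_bucket` helper this PR adds in `src/instructlab/training/hpu_utils.py` (the function name below is illustrative):

```Python
def bucket_pad_length(length: int) -> int:
    # Number of bits needed to represent `length` (the MSB loop in simple_bucket computes the same value).
    msb = length.bit_length()
    # Bucket width: 2 ** (msb - 4), i.e. (2 ** msb) / 16.
    align = (1 << (msb - 4)) if msb >= 4 else 1
    # Round up to the top boundary of the smallest suitable bucket.
    return (length + align - 1) // align * align

print(bucket_pad_length(0b10001))  # 18: a longest sample of 17 tokens is padded to 18
```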

## How to run
To run training, build a Docker image using the following Dockerfile:
```Dockerfile
FROM vault.habana.ai/gaudi-docker/1.21.0/rhel9.4/habanalabs/pytorch-installer-2.6.0:1.21.0-555

ARG CMAKE_ARGS="-DGGML_NATIVE=off"

WORKDIR /app
RUN pip install git+https://github.com/instructlab/instructlab.git@v0.26.1

WORKDIR /app
RUN pip install git+https://github.com/huggingface/optimum-habana.git@v1.18.0
```

Then make the following changes to the config file:
```YAML
train:
device: hpu
distributed_backend: fsdp
fsdp_cpu_offload_optimizer: false
is_padding_free: true
pipeline: accelerated
disable_flash_attn: true
```

Finally, run the training command:
```BASH
ilab --config=./config.yaml model train --pipeline accelerated --data-path ./data.jsonl
```


Binary file added docs/hpu_pic/bucketing_vs_padding.png
13 changes: 12 additions & 1 deletion src/instructlab/training/accelerator.py
Expand Up @@ -3,7 +3,6 @@
from typing import Callable, Optional

# Third Party
from accelerate import Accelerator as TransformersAccel
from torch.utils.data import DataLoader
from transformers import get_scheduler
import torch
Expand Down Expand Up @@ -32,6 +31,7 @@ def __init__(
deepspeed_cpu_offload_optimizer_pin_memory: Optional[bool] = False,
deepspeed_cpu_offload_optimizer_ratio: Optional[float] = None,
fsdp_cpu_offload_params: Optional[bool] = False,
device: Optional[str] = None,
):
self.samples_per_gpu = samples_per_gpu
self.save_samples = save_samples
Expand All @@ -48,6 +48,7 @@ def __init__(
deepspeed_cpu_offload_optimizer_ratio
)
self.fsdp_cpu_offload_params = fsdp_cpu_offload_params
self.device_str = device

if self.distributed_framework == DistributedBackend.DEEPSPEED:
# Standard
Expand All @@ -69,6 +70,12 @@ def __init__(
"fsdp_plugin": self.get_fsdp_config(),
"mixed_precision": "bf16",
}

if device == "hpu":
from optimum.habana.accelerate import GaudiAccelerator as TransformersAccel
else:
from accelerate import Accelerator as TransformersAccel

self.accelerator = TransformersAccel(
**accel_args,
)
Expand Down Expand Up @@ -160,6 +167,10 @@ def get_fsdp_config(self):
cpu_offload=CPUOffload(self.fsdp_cpu_offload_params),
)

if self.device_str == "hpu":
fsdp_plugin.use_orig_params = True
fsdp_plugin.sync_module_states = True

# `use_orig_params` must be disabled when using LoRA and FSDP together
# Source: https://huggingface.co/docs/peft/en/accelerate/fsdp#the-important-parts
if self.model.lora_config is not None:
Expand Down
2 changes: 2 additions & 0 deletions src/instructlab/training/config.py
Expand Up @@ -246,3 +246,5 @@ class TrainingArgs(BaseModel):
log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = Field(
default="INFO"
)

device: Optional[str] = None
49 changes: 49 additions & 0 deletions src/instructlab/training/hpu_utils.py
@@ -0,0 +1,49 @@
import torch
from functools import lru_cache


@lru_cache(maxsize=None)
def is_torch_hpu_available() -> bool:
try:
import habana_frameworks.torch.core # noqa: F401
except ImportError:
return False
return True


def simple_bucket(length):
"""
Review comment (Collaborator):
Could you explain why we need the bucketing algorithm? I see that in main_ds.py you are setting lazy_mode=False, which would mean we are using eager compilation, and as far as I know, eager mode in torch supports dynamically shaped tensors (which I am assuming is the case for Habana torch too). I would really appreciate it if you could shed some light on this.

Reply (Author):
Good question. Even in eager mode we recompile the graph if the shapes have changed. I'll find out more details and post them here.

This bucket algorithm relies only on the given number, rather than on slicing
the known (min, max) range, for several reasons:
1) Due to the use of the first-fit-decreasing (FFD) algorithm, the
(min, max) sequence length of each rank will be much smaller than the
(min, max) sequence length of the dataset, so bucketing on the
(min, max) sequence length of the dataset is not practical.
2) The (min, max) sequence length of a given rank is unknown until one
epoch has finished, since the packing is done on the fly.
3) Due to the shuffling, the (min, max) sequence length of a given rank
may vary between ranks. Once the (min, max) sequence length of a given
rank changes, the bucketing also needs adjustment.

This bucket algorithm is based on the most significant set bit of the input
number. It first checks which bit is the most significant set bit, say bit
"S", and then slices the range [2 ** S, 2 ** (S+1)] into buckets of equal
size. By default the range is divided into 16 buckets, so the bucket size is
2 ** (S - 4). For example, 0b10001 will be padded to 0b10010.
This approach limits the overhead of bucketing (to at most 1/16 of the input
number) and also prevents recompilation due to a too-small bucket size.
"""
l = length
msb = 0
while l > 0:
msb += 1
l = l // 2

align = (1 << (msb - 4)) if msb >= 4 else 1

return (length + align - 1) // align * align


def bucket(length):
return simple_bucket(length)
75 changes: 62 additions & 13 deletions src/instructlab/training/main_ds.py
Expand Up @@ -33,6 +33,14 @@
UserWarning,
)

from instructlab.training.hpu_utils import is_torch_hpu_available

if is_torch_hpu_available():
import habana_frameworks.torch.core as htcore
import habana_frameworks.torch.distributed.hccl
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
adapt_transformers_to_gaudi()

# Third Party
from tqdm import tqdm
from transformers import AutoConfig
Expand Down Expand Up @@ -122,7 +130,7 @@ def train(
if local_rank == 0:
inner_pb = tqdm(range(num_epoch_steps), desc=f"Epoch {epoch}")

# blast through the batches in the train loader up to the last step within the epoch.
# blast through the batches in the train loader up to the last step within the epoch.
for batch in accelerator.train_loader:
if global_step <= args.last_step:
# in the case of resuming, last_step > 0
Expand All @@ -137,10 +145,19 @@ def train(
micro_batch_size = float(torch.tensor([batch.pop("num_samples")]))
total_length = float(torch.tensor([batch.pop("total_length")]))
for k in batch:
batch[k] = batch[k].to(local_rank)
batch[k] = batch[k].to("hpu" if args.device == "hpu" else local_rank)

hpu_args = {}
if args.device == "hpu":
hpu_args = {
"use_flash_attention":True,
"lazy_mode":False,
}

output = model(
**batch,
use_cache=False,
**hpu_args,
)
loss = output.loss
log_loss = loss.detach().item()
Expand Down Expand Up @@ -177,8 +194,14 @@ def train(
elapsed_time = time.time() - start
overall_throughput = args.samples_per_gpu * world_size / elapsed_time
current_lr = accelerator.lr_scheduler.get_last_lr()[0]
cuda_mem_allocated = torch.cuda.memory_allocated() / (1024**3)
cuda_malloc_retries = torch.cuda.memory_stats()["num_alloc_retries"]

if args.device == "hpu":
mem_allocated = torch.hpu.memory_allocated() / (1024**3)
malloc_retries = 0
else:
mem_allocated = torch.cuda.memory_allocated() / (1024**3)
malloc_retries = torch.cuda.memory_stats()["num_alloc_retries"]

global_grad_norm = (
model.get_global_grad_norm()
if hasattr(model, "get_global_grad_norm")
Expand All @@ -200,8 +223,8 @@ def train(
"rank": torch.distributed.get_rank(),
"overall_throughput": overall_throughput,
"lr": current_lr,
"cuda_mem_allocated": cuda_mem_allocated,
"cuda_malloc_retries": cuda_malloc_retries,
("hpu" if args.device == "hpu" else "cuda") + "_mem_allocated": mem_allocated,
("hpu" if args.device == "hpu" else "cuda") + "_malloc_retries": malloc_retries,
"num_loss_counted_tokens": int(num_loss_counted_tokens),
"num_tokens_rank0": int(total_length),
"batch_size": int(micro_batch_size),
Expand Down Expand Up @@ -234,7 +257,10 @@ def train(
global_step += 1
if local_rank == 0:
inner_pb.update(1)
torch.cuda.empty_cache()

if args.device != "hpu":
torch.cuda.empty_cache()

if args.checkpoint_at_epoch:
base_logger.debug(f"Saving checkpoint at epoch {epoch}")
save_checkpoint(
Expand Down Expand Up @@ -312,17 +338,24 @@ def main(args):
args.model_type = model_conf.model_type

#### distributed init #####
torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
if args.device == "hpu":
torch.hpu.set_device(int(os.environ["LOCAL_RANK"]))
else:
torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

args.local_rank = int(os.environ["LOCAL_RANK"])

timeout = _get_collective_timeout()
if timeout is not None:
torch.distributed.init_process_group(timeout=timeout)
else:
torch.distributed.init_process_group()
backend = "hccl" if args.device == "hpu" else None
torch.distributed.init_process_group(backend=backend, timeout=timeout)

args.global_rank = torch.distributed.get_rank()
tensor = torch.ByteTensor([False]).cuda()

if args.device == "hpu":
tensor = torch.ByteTensor([False]).to('hpu')
else:
tensor = torch.ByteTensor([False]).cuda()

torch.distributed.all_reduce(tensor)
torch.distributed.barrier()

Expand Down Expand Up @@ -369,6 +402,7 @@ def main(args):
flash_enabled=flash_enabled,
noise_alpha=args.NEFTune_alpha,
lora_quant_bits=args.lora_quant_bits,
device=args.device,
)

args.base_model_args = m.base_model_args
Expand Down Expand Up @@ -407,6 +441,7 @@ def main(args):
samples_per_gpu=args.samples_per_gpu,
sampler=args.sampler,
seed=args.seed,
device=args.device,
)
if len(train_loader) == 0:
# this happens sometimes when we have more GPUs than data to process. In this case
Expand All @@ -426,6 +461,7 @@ def main(args):
samples_per_gpu=args.samples_per_gpu,
sampler=args.sampler,
seed=args.seed,
device=args.device,
)

if args.local_rank == 0:
Expand Down Expand Up @@ -457,6 +493,7 @@ def main(args):
deepspeed_cpu_offload_optimizer_ratio=args.cpu_offload_optimizer_ratio,
fsdp_cpu_offload_params=args.cpu_offload_params_fsdp,
save_samples=args.save_samples,
device=args.device,
)
# optimizer needs model that has been prepared by accelerator
# and then accelerator needs to be prepared AGAIN once optimizer is initialized
Expand Down Expand Up @@ -636,6 +673,10 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
if train_args.keep_last_checkpoint_only:
command.append("--keep_last_checkpoint_only")

if train_args.device:
    command.append(f"--device={train_args.device}")

logger.info("Running training command as subprocess: %s", " ".join(command))
process = None
interrupt: KeyboardInterrupt | Exception | None = None
Expand Down Expand Up @@ -837,6 +878,14 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
action="store_true",
help="Use Liger kernels for training.",
)

parser.add_argument(
"--device",
type=str,
default=None,
help="PyTorch device to use.",
)

args = parser.parse_args()
set_random_seed(args.seed)
main(args)
Expand Down