Commit 87756f9

Update

[ghstack-poisoned]

2 parents 8691bd4 + 3766ed7 · commit 87756f9

File tree: 23 files changed (+1244, −706 lines)

.github/workflows/torchao_experimental_test.yml
Lines changed: 1 addition & 0 deletions

@@ -103,6 +103,7 @@ jobs:
         pip install parameterized
         pip install pyyaml
         pip install numpy
+        pip install importlib-metadata
     - name: Print pip freeze
       run: |
         pip freeze

benchmarks/float8/training/README.md
Lines changed: 1 addition & 0 deletions

@@ -14,5 +14,6 @@ Training parameters can be configured via environment variables.
 - `FLOAT8_RECIPE_WITH_BEST_SETTINGS`: "rowwise" or "tensorwise". Applies float8 training with the specified scaling recipe, as well as additional training configs which are optimal for that scaling recipe. See `float8_training_benchmark.sh` for more details.
 - `BATCH_SIZE`: Defaults to 1.
 - `STEPS`: Defaults to 100.
+- `EXTRA_ARGS`: Extra arguments to pass to torchtitan training script. See [torchtitan](https://github.com/pytorch/torchtitan) docs for the full list of options.

 **NOTE**: `torch.compile` and FSDP2 are always used. Other forms of parallelism supported in torchtitan are not yet supported in this script.

benchmarks/float8/training/float8_training_benchmark.sh
Lines changed: 2 additions & 1 deletion

@@ -22,6 +22,7 @@ if [ -z "${TORCHTITAN_ROOT}" ]; then
     echo " * FLOAT8_RECIPE_WITH_BEST_SETTINGS: "rowwise" or "tensorwise". if set, use float8 training in torchtitan with the specified recipe, including the additional settings which are optimal for that recipe. otherwise, use bf16 mixed precision training."
     echo " * BATCH_SIZE: defaults to 1."
     echo " * STEPS: defaults to 100."
+    echo " * EXTRA_ARGS: additional arguments to pass to the torchtitan training script."
     exit 1
 fi

@@ -44,7 +45,7 @@ cd ${TORCHTITAN_ROOT}
 echo "float8 args: ${FLOAT8_ARGS}"

 # run the command with the specified arguments
-CONFIG_FILE="./torchtitan/models/llama/train_configs/llama3_8b.toml" ${TORCHTITAN_ROOT}/run_train.sh --training.steps=${STEPS} --training.batch_size=${BATCH_SIZE} --training.compile ${FLOAT8_ARGS} 2>&1 | tee ${LOG_FILE}
+CONFIG_FILE="./torchtitan/models/llama/train_configs/llama3_8b.toml" ${TORCHTITAN_ROOT}/run_train.sh --training.steps=${STEPS} --training.batch_size=${BATCH_SIZE} --training.compile ${FLOAT8_ARGS} ${EXTRA_ARGS} 2>&1 | tee ${LOG_FILE}

 # return to original working directory
 cd $original_dir
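A minimal sketch of how these environment variables (including the new EXTRA_ARGS) can be fed to the benchmark script from Python; the torchtitan path and the extra flag shown are illustrative assumptions, not part of this commit:

import os
import subprocess

env = dict(
    os.environ,
    TORCHTITAN_ROOT="/path/to/torchtitan",  # required; the script exits early if unset
    FLOAT8_RECIPE_WITH_BEST_SETTINGS="rowwise",
    BATCH_SIZE="4",
    STEPS="50",
    EXTRA_ARGS="--metrics.log_freq=10",  # forwarded verbatim to run_train.sh (flag is illustrative)
)
subprocess.run(
    ["bash", "benchmarks/float8/training/float8_training_benchmark.sh"],
    env=env,
    check=True,
)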

benchmarks/microbenchmarks/benchmark_inference.py
Lines changed: 27 additions & 4 deletions

@@ -24,6 +24,7 @@
     string_to_config,
 )
 from torchao.quantization import quantize_
+from torchao.sparsity.sparse_api import sparsify_


 def run(config: BenchmarkConfig) -> BenchmarkResult:
@@ -44,11 +45,33 @@ def run(config: BenchmarkConfig) -> BenchmarkResult:

     # Use quantize_ to apply each quantization function to the model
     m_copy = deepcopy(base_model).eval().to(config.device)
-    quantization_config = string_to_config(
-        config.quantization, high_precision_dtype=config.high_precision_dtype
+    ao_base_config = string_to_config(
+        config.quantization,
+        config.sparsity,
+        high_precision_dtype=config.high_precision_dtype,
     )
-    if quantization_config is not None:
-        quantize_(m_copy, quantization_config)
+
+    # Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA)
+    is_cuda = config.device == "cuda" and torch.cuda.is_available()
+
+    if config.sparsity is not None and (
+        config.quantization is None or "baseline" in config.quantization
+    ):
+        if is_cuda:
+            print(f"Applying {config.sparsity} sparsity to model")
+            sparsify_(m_copy, ao_base_config)
+        else:
+            print(
+                f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}"
+            )
+    elif config.sparsity is None and (
+        config.quantization is None or "baseline" in config.quantization
+    ):
+        pass  # No quantization or sparsity specified, do nothing
+    else:
+        print("Quantizing model....")
+        quantize_(m_copy, ao_base_config)

     if config.use_torch_compile:
         print("Compiling model....")
         m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True)
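For readers skimming the hunk above, here is a condensed, standalone sketch of the new quantize-vs-sparsify dispatch; it assumes the torchao APIs used elsewhere in this commit, and the toy model and recipe strings are illustrative only:

import torch
from torchao.quantization import Int4WeightOnlyConfig, quantize_
from torchao.sparsity.sparse_api import SemiSparseWeightConfig, sparsify_

model = torch.nn.Sequential(torch.nn.Linear(64, 64)).eval()
quantization, sparsity = "baseline", "semi-sparse"  # one example combination
ao_config = SemiSparseWeightConfig()  # stand-in for what string_to_config returns here

if sparsity is not None and (quantization is None or "baseline" in quantization):
    # Sparsity-only path: sparsify_ swaps weights for sparse tensors (CUDA only).
    if torch.cuda.is_available():
        sparsify_(model.to("cuda"), ao_config)
elif sparsity is None and (quantization is None or "baseline" in quantization):
    pass  # pure baseline: leave the model in high precision
else:
    # Quantization path; a sparse layout can be baked into the config,
    # e.g. Int4WeightOnlyConfig(layout=MarlinSparseLayout()) for "marlin".
    quantize_(model, Int4WeightOnlyConfig())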

benchmarks/microbenchmarks/benchmark_runner.py
Lines changed: 63 additions & 8 deletions

@@ -21,7 +21,7 @@

 import argparse
 from itertools import product
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple

 import yaml

@@ -68,6 +68,53 @@ def get_param_combinations(model_param):
     return shapes, base_params


+def get_quantization_sparsity_recipes(
+    quantization_recipes: List[str], sparsity_recipes: List[str]
+) -> Set[Tuple[str, Optional[str]]]:
+    """Generate valid quantization and sparsity recipes.
+
+    Args:
+        quantization_recipes: List of quantization recipes
+        sparsity_recipes: List of sparsity recipes
+
+    Returns:
+        Set of tuples containing (quantization_recipe, sparsity_recipe)
+        For block sparsity, quantization is always "baseline"
+        All quantization techniques are also run without sparsity
+    """
+    config_recipes = set()
+
+    # Always include baseline without sparsity
+    config_recipes.add(("baseline", None))
+
+    # Add all quantization techniques without sparsity
+    for quant_config in quantization_recipes:
+        config_recipes.add((quant_config, None))
+
+    # Process combinations of quantization and sparsity
+    for sparse_config in sparsity_recipes:
+        if sparse_config is None:
+            # Skip None sparsity as we've already added all quantization techniques without sparsity
+            continue
+        elif "block" in sparse_config:
+            # For block sparsity, only pair with baseline quantization
+            config_recipes.add(("baseline", sparse_config))
+        elif "semi" in sparse_config or "2:4" in sparse_config:
+            # For semi-sparse, only pair with compatible quantization methods
+            for quant_config in quantization_recipes:
+                if (
+                    "marlin" in quant_config
+                    or "int8dq" in quant_config
+                    or "float8dq" in quant_config
+                    or quant_config == "baseline"
+                ):
+                    config_recipes.add((quant_config, sparse_config))
+        else:
+            raise ValueError(f"Invalid sparsity recipe: {sparse_config}")
+
+    return config_recipes
+
+
 def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig]:
     """Load benchmark configurations from CLI arguments and YAML file."""
     with open(cli_args.config, "r") as f:
@@ -78,24 +125,29 @@ def load_benchmark_configs(cli_args: argparse.Namespace) -> List[BenchmarkConfig

     # Create all possible combinations
     configs = []
+    quantization_sparsity_recipes = get_quantization_sparsity_recipes(
+        config.get("quantization_config_recipe_names", []),
+        config.get("sparsity_config_recipe_names", []),
+    )
     for model_param in config["model_params"]:
         shapes, params = get_param_combinations(model_param)

         # Create configs for all combinations
-        for quant_config, (shape_name, shape) in product(
-            config.get("quantization_config_recipe_names", ["baseline"]), shapes
+        for (quant_config, sparse_config), (shape_name, shape) in product(
+            quantization_sparsity_recipes,
+            shapes,
         ):
             configs.append(
                 BenchmarkConfig(
                     quantization=quant_config,
+                    sparsity=sparse_config,
                     params=params,
                     shape_name=shape_name,
                     shape=shape,
                     output_dir=output_dir,
                     benchmark_mode=benchmark_mode,
                 )
             )
-
     return configs


@@ -104,14 +156,17 @@ def run_inference_benchmarks_from_config(configs: List[BenchmarkConfig]) -> None
     from benchmarks.microbenchmarks.benchmark_inference import run as run_inference

     results = []
-    print("Benchmarking Inference ......")
+    print("----------------- RUNNING BENCHMARKS FOR INFERENCE -----------------------")
     for config in configs:
+        print("----------------------------------------")
         try:
-            print(f"Running: {config.name}")
+            print(
+                f"Running: {config.name} for Quantization: {config.quantization} and Sparsity: {config.sparsity}"
+            )
             result = run_inference(config)  # Pass the config object directly
             results.append(result)
-        except Exception as e:
-            print(f"Error running benchmark {config.name}: {e}")
+        except Exception:
+            print(f"Error running benchmark {config.name}")
             continue

     # Add results to csv
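To make the pairing rules concrete, here is a small worked example, hand-traced against get_quantization_sparsity_recipes above and using the same recipe names as the sample YAML config further down; the import path follows this repo's file layout:

from benchmarks.microbenchmarks.benchmark_runner import (
    get_quantization_sparsity_recipes,
)

pairs = get_quantization_sparsity_recipes(
    ["int4wo-32", "marlin"], ["semi-sparse", "block"]
)
assert pairs == {
    ("baseline", None),         # baseline with no sparsity is always included
    ("int4wo-32", None),        # every quantization recipe also runs dense
    ("marlin", None),
    ("marlin", "semi-sparse"),  # semi-sparse pairs only with marlin/int8dq/float8dq/baseline
    ("baseline", "block"),      # block sparsity pairs only with baseline quantization
}

Note that ("baseline", "semi-sparse") does not appear here because "baseline" is not in the input quantization list; only the implicit dense baseline is always added.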

benchmarks/microbenchmarks/test/benchmark_config.yml
Lines changed: 6 additions & 2 deletions

@@ -1,9 +1,13 @@
 # Sample configuration for inference benchmarks
 benchmark_mode: "inference"
 quantization_config_recipe_names:
-  - "baseline"
+  # Will run a baseline inference for model by default, without quantization for comparison
   - "int4wo-32"
-  - "int4wo-128"
+  - "marlin"
+sparsity_config_recipe_names:
+  # Will run a baseline inference for model by default, without sparsity for comparison
+  - "semi-sparse"
+  - "block"
 output_dir: "benchmarks/microbenchmarks/results"
 model_params:
   - name: "small_bf16_linear"
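A short sketch of how the runner expands the sample YAML above into benchmark configs; argparse.Namespace stands in for real CLI parsing, and the config path is the file shown in this diff:

import argparse

from benchmarks.microbenchmarks.benchmark_runner import load_benchmark_configs

args = argparse.Namespace(
    config="benchmarks/microbenchmarks/test/benchmark_config.yml"
)
for cfg in load_benchmark_configs(args):
    # each config carries one (quantization, sparsity) pair plus one shape
    print(cfg.name, cfg.quantization, cfg.sparsity)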

benchmarks/microbenchmarks/test/test_benchmark_inference.py
Lines changed: 66 additions & 1 deletion

@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 import tempfile
 import unittest
+from unittest.mock import patch

 from benchmarks.microbenchmarks.benchmark_inference import run
 from benchmarks.microbenchmarks.utils import BenchmarkConfig, BenchmarkResult
@@ -17,6 +18,7 @@ def setUp(self):

         self.config = BenchmarkConfig(
             quantization="baseline",
+            sparsity="semi-sparse",
             params={
                 "high_precision_dtype": "torch.float32",
                 "use_torch_compile": False,
@@ -35,11 +37,74 @@ def tearDown(self):

         shutil.rmtree(self.temp_dir)

-    def test_run_inference(self):
+    @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config")
+    def test_run_inference(self, mock_string_to_config):
+        # Mock string_to_config to return a valid config
+        from torchao.sparsity.sparse_api import SemiSparseWeightConfig
+
+        mock_string_to_config.return_value = SemiSparseWeightConfig()
+
         result = run(self.config)
         self.assertIsInstance(result, BenchmarkResult)
         self.assertTrue(hasattr(result, "model_inference_time_in_ms"))

+    @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config")
+    def test_run_inference_with_semi_sparse_marlin(self, mock_string_to_config):
+        """Test running inference with sparsity configurations"""
+        # Mock string_to_config to return valid configs
+        from torchao.dtypes import MarlinSparseLayout
+        from torchao.quantization import Int4WeightOnlyConfig
+
+        # Test with semi-sparse config
+        mock_string_to_config.return_value = Int4WeightOnlyConfig(
+            layout=MarlinSparseLayout()
+        )
+        config = BenchmarkConfig(
+            quantization="marlin",
+            sparsity="semi-sparse",
+            params={
+                "high_precision_dtype": "torch.float32",
+                "use_torch_compile": False,
+                "device": "cpu",
+                "model_type": "linear",
+            },
+            shape_name="custom",
+            shape=[64, 64, 64],  # Use dimensions divisible by 64
+            output_dir=self.temp_dir,
+            benchmark_mode="inference",
+        )
+        result = run(config)
+        self.assertIsInstance(result, BenchmarkResult)
+        self.assertTrue(hasattr(result, "model_inference_time_in_ms"))
+
+    @patch("benchmarks.microbenchmarks.benchmark_inference.string_to_config")
+    def test_run_inference_with_block_sparsity(self, mock_string_to_config):
+        """Test running inference with sparsity configurations"""
+        # Mock string_to_config to return valid configs
+        from torchao.sparsity.sparse_api import (
+            BlockSparseWeightConfig,
+        )
+
+        # Test with block sparsity
+        mock_string_to_config.return_value = BlockSparseWeightConfig()
+        config = BenchmarkConfig(
+            quantization="baseline",
+            sparsity="block",
+            params={
+                "high_precision_dtype": "torch.float32",
+                "use_torch_compile": False,
+                "device": "cpu",
+                "model_type": "linear",
+            },
+            shape_name="custom",
+            shape=[64, 64, 64],  # Use dimensions divisible by 64
+            output_dir=self.temp_dir,
+            benchmark_mode="inference",
+        )
+        result = run(config)
+        self.assertIsInstance(result, BenchmarkResult)
+        self.assertTrue(hasattr(result, "model_inference_time_in_ms"))
+

 if __name__ == "__main__":
     unittest.main()
