
Commit fc6b3dc

Merge pull request pytorch#22 from pytorch-labs/jcaip/sparsity
[sparse] add sparsity, add wanda sparsifier to ao
2 parents a55e2d2 + 9c84256 commit fc6b3dc

File tree

4 files changed: +285 -0 lines

- test/sparsity/test_wanda.py
- torchao/sparsity/__init__.py
- torchao/sparsity/utils.py
- torchao/sparsity/wanda.py

test/sparsity/test_wanda.py (new file: 114 additions, 0 deletions)

```python
import logging
import unittest

import torch
from torch import nn
from torchao.sparsity import WandaSparsifier
from torch.ao.pruning import FakeSparsity
from torch.nn.utils.parametrize import is_parametrized
from torch.testing._internal.common_pruning import SimpleLinear
from torch.testing._internal.common_utils import TestCase

logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO
)


class TestWandaSparsifier(TestCase):
    """
    Test Wanda Sparsifier
    """

    def test_prepare(self):
        model = SimpleLinear()
        sparsifier = WandaSparsifier()
        sparsifier.prepare(model, config=None)
        for g in sparsifier.groups:
            module = g["module"]
            # Check mask exists
            assert hasattr(module.parametrizations["weight"][0], "mask")
            # Check parametrization exists and is correct
            assert is_parametrized(module, "weight")
            assert type(module.parametrizations.weight[0]) == FakeSparsity
            # check activation observer is present
            assert hasattr(module, "activation_post_process")

    def test_squash_mask(self):
        # check observers and parametrizations removed
        model = SimpleLinear()
        sparsifier = WandaSparsifier()
        sparsifier.prepare(model, config=None)
        sparsifier.squash_mask()
        for g in sparsifier.groups:
            module = g["module"]
            assert not is_parametrized(module, "weight")
            assert not hasattr(module, "mask")
            assert not hasattr(module, "activation_post_process")

    def test_one_layer_mlp_2x4(self):
        model = nn.Sequential(nn.Linear(8, 1))
        weights = torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8]])
        model[0].weight.data.copy_(weights.data)
        X = torch.ones(1, 8)

        sparsifier = WandaSparsifier(semi_structured_block_size=4)
        sparsifier.prepare(model, config=None)

        model(X)

        sparsifier.step()
        sparsifier.squash_mask()

        sparsity = (model[0].weight == 0).float().mean()
        assert sparsity == 0.5

        expected_fc = torch.tensor([[0, 0, 3, 4, 0, 0, 7, 8]], dtype=torch.float32)
        assert torch.allclose(model[0].weight.data, expected_fc, rtol=1e-05, atol=1e-07)

    def test_one_layer_mlp_unstructured(self):
        model = nn.Sequential(nn.Linear(4, 1))
        weights = torch.tensor([[1, 2, 3, 4]], dtype=torch.float32)
        model[0].weight.data.copy_(weights.data)
        X = torch.tensor([[100, 10, 1, 0.1]], dtype=torch.float32)

        sparsifier = WandaSparsifier(sparsity_level=0.5)
        sparsifier.prepare(model, config=None)

        model(X)

        sparsifier.step()
        sparsifier.squash_mask()

        sparsity = (model[0].weight == 0).float().mean()
        assert sparsity == 0.5

        expected_fc = torch.tensor([[1, 2, 0, 0]], dtype=torch.float32)
        assert torch.allclose(model[0].weight.data, expected_fc, rtol=1e-05, atol=1e-07)

    def test_two_layer_mlp_unstructured(self):
        model = nn.Sequential(
            nn.Linear(128, 200), nn.ReLU(), nn.Linear(200, 10)
        )  # C_in by C_out
        X1 = torch.randn(100, 128)  # B1 by C_in
        X2 = torch.randn(50, 128)  # B2 by C_in

        sparsifier = WandaSparsifier(sparsity_level=0.5)
        sparsifier.prepare(model, config=None)

        model(X1)
        model(X2)
        sparsifier.step()

        cnt = 0
        for m in model.modules():
            if isinstance(m, nn.Linear):
                cnt += 1
                sparsity_level = (m.weight == 0).float().mean()
                assert (
                    sparsity_level == 0.5
                ), f"sparsity for linear layer {cnt} should be 0.5"

        sparsifier.squash_mask()


if __name__ == "__main__":
    unittest.main()
```
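
As a sanity check on the expected result in `test_one_layer_mlp_2x4`, here is a small standalone sketch (not part of the commit) that reproduces the expected 2:4 mask by hand, using the same pruning metric the sparsifier computes (weight magnitude times the squared per-channel activation norm stored by the observer):

```python
import torch

W = torch.tensor([[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]])
X = torch.ones(1, 8)

# Squared per-channel activation norm, matching what PerChannelNormObserver accumulates.
act_sq_norm = (X * X).sum(dim=0)   # all ones here
metric = W.abs() * act_sq_norm     # Wanda metric, here simply [1..8]

block = 4                          # semi_structured_block_size=4 -> 2:4 sparsity
prune_inds = metric.view(-1, block).argsort(dim=1)[:, : block // 2]  # 2 smallest per block
mask = torch.ones_like(W).view(-1, block).scatter(1, prune_inds, 0.0).view_as(W)

print(W * mask)  # tensor([[0., 0., 3., 4., 0., 0., 7., 8.]])
```

The two smallest metric entries in each block of four are zeroed, which is exactly the `expected_fc` tensor asserted in the test above.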

torchao/sparsity/__init__.py (new file: 13 additions, 0 deletions)

```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from .wanda import WandaSparsifier  # noqa: F403
from .utils import PerChannelNormObserver  # noqa: F403

__all__ = [
    "WandaSparsifier",
    "PerChannelNormObserver"
]
```

torchao/sparsity/utils.py (new file: 48 additions, 0 deletions)

```python
import torch
from torch.ao.quantization.observer import UniformQuantizationObserverBase

__all__ = ["PerChannelNormObserver"]


# Observers
class PerChannelNormObserver(UniformQuantizationObserverBase):
    """
    A custom observer that computes the squared L2 norm of each channel
    and accumulates it in a buffer across forward passes.
    """

    def __init__(self, **kwargs) -> None:
        # init with fixed qparams for quantization flow
        super().__init__(
            dtype=torch.quint8,
            qscheme=torch.per_channel_affine,
            reduce_range=False,
            quant_min=None,
            quant_max=None,
            eps=torch.finfo(torch.float32).eps,
            **kwargs
        )
        # set averaging constant so quantization flow knows observer is memoryless.
        self.averaging_constant = 1.0
        self.register_buffer("norm", torch.tensor([]))

    def forward(self, x_orig):
        if x_orig.numel() == 0:
            return x_orig
        x = x_orig.detach()  # avoid keeping autograd tape

        # the channel axis is always the last dimension; move it to the front
        new_axis_list = [i for i in range(x.dim())]  # noqa: C416
        new_axis_list[0], new_axis_list[-1] = new_axis_list[-1], new_axis_list[0]
        y = x.permute(new_axis_list)
        y = torch.flatten(y, start_dim=1)
        norm = torch.norm(y, dim=1) ** 2

        if self.norm.numel() == 0:
            self.norm.resize_(norm.shape)
            self.norm.copy_(norm)
        else:
            self.norm += norm

        return x_orig

    def calculate_qparams(self):
        raise NotImplementedError("PerChannelNormObserver is designed to store activations only.")
```
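
A minimal sketch of what the observer does, assuming it is constructed directly rather than through a `QConfig` as in the commit (the commit only uses it via `torch.ao.quantization.prepare`): feeding two batches should leave the `norm` buffer equal to the sum of squared per-channel L2 norms across both batches.

```python
import torch
from torchao.sparsity import PerChannelNormObserver

obs = PerChannelNormObserver()
x1 = torch.randn(16, 8)  # batch of 16 activations, 8 channels (channels last)
x2 = torch.randn(4, 8)

obs(x1)  # first call initializes the "norm" buffer
obs(x2)  # subsequent calls accumulate into it

# Sum of squared per-channel norms over both batches.
expected = (x1 * x1).sum(dim=0) + (x2 * x2).sum(dim=0)
print(torch.allclose(obs.norm, expected, rtol=1e-4, atol=1e-4))  # True
```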

torchao/sparsity/wanda.py (new file: 110 additions, 0 deletions)

```python
import warnings

from typing import Dict, List, Optional, Tuple

import torch
from torch import nn
from torch.ao.pruning import BaseSparsifier
from torch.ao.quantization import default_placeholder_observer, QConfig
from torch.ao.quantization.quantize import _remove_qconfig
from .utils import PerChannelNormObserver

__all__ = ["WandaSparsifier"]


class WandaSparsifier(BaseSparsifier):
    r"""Wanda sparsifier

    Wanda (Pruning by Weights and activations), proposed in https://arxiv.org/abs/2306.11695,
    is an activation-aware pruning method. The sparsifier removes weights based on the product
    of the input activation norm and the weight magnitude.

    This sparsifier is controlled by two variables:
    1. `sparsity_level` defines the fraction of weights that are zeroed out;
    2. `semi_structured_block_size`, if set, enables semi-structured (m//2:m) sparsity
       and overrides `sparsity_level` (fixed at 50% within each block).

    Args:
        sparsity_level: The target level of sparsity;
        semi_structured_block_size: The block size m for semi-structured (m//2:m) sparsity;
    """

    def __init__(
        self,
        sparsity_level: float = 0.5,
        semi_structured_block_size: Optional[int] = None,
    ):
        defaults = {
            "sparsity_level": sparsity_level,
            "semi_structured_block_size": semi_structured_block_size,
        }
        if semi_structured_block_size is not None:
            m = semi_structured_block_size
            warnings.warn(
                f"WandaSparsifier got semi_structured_block_size={m}, sparsity_level fixed to 50% ({m // 2}:{m}) sparsity"
            )
        super().__init__(defaults=defaults)

    def prepare(self, model: nn.Module, config: List[Dict]) -> None:
        # activation: use PerChannelNormObserver
        # use no-op placeholder weight observer
        model.qconfig = QConfig(
            activation=PerChannelNormObserver, weight=default_placeholder_observer
        )  # type: ignore[assignment]
        torch.ao.quantization.prepare(model, inplace=True)

        # call superclass prepare
        super().prepare(model, config)

    def update_mask(  # type: ignore[override]
        self, module: nn.Module, tensor_name: str, sparsity_level: float, **kwargs
    ) -> None:
        r"""Pruning function for WandaSparsifier

        The activation statistics are retrieved first into `activation_norm_per_channel`.
        Then the Wanda pruning metric is computed. The weight matrix is then pruned
        by comparing this metric across the whole current layer.
        """

        # Step 1: get the tensor and the mask from the parametrizations
        mask = getattr(module.parametrizations, tensor_name)[0].mask
        tensor = getattr(module.parametrizations, tensor_name).original
        activation_norm_per_channel = module.activation_post_process.norm

        # Step 2: Calculate Wx
        pruning_metric = torch.abs(tensor) * activation_norm_per_channel

        # defaults for unstructured sparsity
        block_size = pruning_metric.numel()
        num_specified = int(block_size * sparsity_level)
        # if set to use semi-structured, ignore sparsity_level
        if kwargs.get("semi_structured_block_size", None) is not None:
            block_size = kwargs["semi_structured_block_size"]
            num_specified = block_size // 2

        # get indices to prune
        pruning_inds = pruning_metric.view(-1, block_size).argsort(dim=1)[
            :, :num_specified
        ]
        # update mask
        mask.data.view(-1, block_size).scatter_(
            1, pruning_inds, torch.zeros_like(pruning_inds, dtype=mask.dtype)
        )

    def squash_mask(
        self,
        params_to_keep: Optional[Tuple[str, ...]] = None,
        params_to_keep_per_layer: Optional[Dict[str, Tuple[str, ...]]] = None,
        *args,
        **kwargs,
    ):
        # remove quantization config
        for config in self.groups:
            module = config["module"]
            tensor_name = config["tensor_name"]
            _remove_qconfig(module)

        # remove parametrizations
        super().squash_mask(
            params_to_keep=params_to_keep,
            params_to_keep_per_layer=params_to_keep_per_layer,
        )
```
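
A minimal end-to-end sketch of how the sparsifier added in this commit is driven, mirroring the flow exercised by the tests above; the model shape and the calibration batches are made-up placeholders:

```python
import torch
from torch import nn
from torchao.sparsity import WandaSparsifier

model = nn.Sequential(nn.Linear(128, 256), nn.ReLU(), nn.Linear(256, 10))
calibration_data = [torch.randn(32, 128) for _ in range(8)]  # stand-in for real data

sparsifier = WandaSparsifier(sparsity_level=0.5)
sparsifier.prepare(model, config=None)   # attaches observers and FakeSparsity masks

with torch.no_grad():
    for batch in calibration_data:       # observers accumulate per-channel activation norms
        model(batch)

sparsifier.step()                        # compute the Wanda metric and update the masks
sparsifier.squash_mask()                 # fold masks into weights, remove observers/qconfig

for m in model.modules():
    if isinstance(m, nn.Linear):
        print((m.weight == 0).float().mean())  # ~0.5 sparsity per linear layer
```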
