@@ -1,5 +1,3 @@
-import math
-
 import numpy
 import torch
 import torch.nn
@@ -24,12 +22,15 @@ def __init__(self, shape, block_mask, data, block_shape=(32, 32)):
 
         self.data = torch.nn.Parameter(data)
 
-        self.rebuild(block_mask)
+        self.rebuild(block_mask, callback=False)
+
+    def updated_data(self):
+        pass
 
     def get_differentiable_data(self):
         return self.data
 
-    def rebuild(self, block_mask, block_ptr=None):
+    def rebuild(self, block_mask, block_ptr=None, callback=True):
         data = self.data
         block_shape = self.block_shape
 
@@ -71,6 +72,8 @@ def rebuild(self, block_mask, block_ptr=None):
             (self.block_shape[1], self.block_shape[0]),
         )
         self.check_ = False
+        if callback:
+            self.updated_data()
 
     @staticmethod
     def blocks_count_(shape, block_shape):
@@ -219,25 +222,26 @@ def block_replace(self, block_replacements):
     def zeros(cls, shape, n_blocks=None, blocks=None, block_shape=(32, 32), device="cuda"):
         for i in range(2):
             if shape[i] % block_shape[i] != 0:
-                raise Exception(
-                    f"Invalid shape: shape[{i}]({shape[i]}) %% block_shape[{i}]({block_shape[i]}) is not 0."
-                )
+                raise Exception(f"Invalid shape: shape[{i}]={shape[i]} %% block_shape[{i}]={block_shape[i]} is not 0.")
+
+        X, Y = cls.blocks_count_(shape, block_shape)
+
         if n_blocks is None:
-            assert blocks is not None
-            for b in blocks:
-                for i in range(2):
-                    if b[i] * block_shape[i] >= shape[i]:
-                        raise Exception(
-                            f"Invalid block definition: block[{i}] = {b[i]} : should be < {shape[i] // block_shape[i]}"
-                        )
-            n_blocks = len(blocks)
+            if blocks is not None:
+                for b in blocks:
+                    for i in range(2):
+                        if b[i] * block_shape[i] >= shape[i]:
+                            raise Exception(
+                                f"Invalid block definition: block[{i}] = {b[i]} : should be < {shape[i] // block_shape[i]}"
+                            )
+                n_blocks = len(blocks)
+            else:
+                n_blocks = X * Y
         else:
             assert blocks is None
         if len(shape) != 2 or shape[0] % block_shape[0] != 0 or shape[1] % block_shape[1] != 0:
             raise Exception("shape should be a tuple of 2 multiples of block_shape")
 
-        X, Y = cls.blocks_count_(shape, block_shape)
-
         if n_blocks > X * Y:
             raise Exception("Too many blocks : %d > %d * %d = %d" % (n_blocks, X, Y, X * Y))
         if blocks is not None:
@@ -257,11 +261,27 @@ def zeros(cls, shape, n_blocks=None, blocks=None, block_shape=(32, 32), device="
 
         return cls(shape, block_mask, data, block_shape)
 
+    @classmethod
+    def ones(
+        cls,
+        shape,
+        n_blocks=None,
+        blocks=None,
+        block_shape=(32, 32),
+        device="cuda",
+        positive=False,
+    ):
+        ret = cls.zeros(shape, n_blocks, blocks, block_shape, device)
+        with torch.no_grad():
+            ret.data += 1
+        ret.updated_data()
+        return ret
+
     @classmethod
     def randn(
         cls,
         shape,
-        n_blocks,
+        n_blocks=None,
         blocks=None,
         block_shape=(32, 32),
         device="cuda",
@@ -273,35 +293,63 @@ def randn(
             ret.data.normal_().abs_()
         else:
             ret.data.normal_()
+        ret.updated_data()
         return ret
 
     @classmethod
-    def from_dense(cls, dense, block_shape=(32, 32), block_count=None):
-        dense_block_count = (dense.shape[0] * dense.shape[1]) // (block_shape[0] * block_shape[1])
-        if block_count is None:
-            block_count = dense_block_count
-
-        ret = cls.zeros(
-            dense.shape,
-            n_blocks=block_count,
-            block_shape=block_shape,
-            device=dense.device,
-        )
+    def from_dense(cls, dense, block_shape=(32, 32), block_count=None, blocks=None, slow=False, out=None):
+        if out is None:
+            if blocks is None:
+                dense_block_count = (dense.shape[0] * dense.shape[1]) // (block_shape[0] * block_shape[1])
+                if block_count is None:
+                    block_count = dense_block_count
+            else:
+                block_count = None
+
+            ret = cls.zeros(
+                dense.shape,
+                n_blocks=block_count,
+                block_shape=block_shape,
+                blocks=blocks,
+                device=dense.device,
+            )
+        else:
+            ret = out
 
-        if block_count == dense_block_count:
-            # TODO : use some pytorch dimensions transposition to speed up this block by block copy
+        if out is not None or blocks is not None or block_count == dense_block_count:
+            # In case we keep the full matrix (block_count == dense_block_count), we make sure the
+            # order is the right one, mostly for testing purposes.
             coo = ret.build_coo_block_index().long()
-
-            for i in range(coo.shape[1]):
-                r, c = coo[0][i], coo[1][i]
-                bs = ret.block_shape
-                ret.data[i * bs[0] : (i + 1) * bs[0], :] = dense[
-                    r * bs[0] : (r + 1) * bs[0], c * bs[1] : (c + 1) * bs[1]
-                ].t()
+            if slow:
+                # Legacy version, used for testing only
+                for i in range(coo.shape[1]):
+                    r, c = coo[0][i], coo[1][i]
+                    bs = ret.block_shape
+                    part = dense[r * bs[0] : (r + 1) * bs[0], c * bs[1] : (c + 1) * bs[1]]
+                    part = part.t().reshape(block_shape[0], block_shape[1])
+                    with torch.no_grad():
+                        ret.data[i * bs[0] : (i + 1) * bs[0]] = part
+            else:
+                dense2 = dense.reshape(
+                    dense.shape[0] // block_shape[0], block_shape[0], dense.shape[1] // block_shape[1], block_shape[1]
+                )
+                dense2 = dense2.transpose(1, 2)
+                dense2 = dense2.transpose(2, 3)
+                dense2 = dense2.reshape(-1, block_shape[0], block_shape[1])
+                indices = coo[0] * (dense.shape[1] // block_shape[1]) + coo[1]
+                indices = indices.unsqueeze(-1).unsqueeze(-1).expand(-1, block_shape[0], block_shape[1])
+                new_data = torch.gather(dense2, 0, indices)
+                new_data = new_data.reshape(-1, block_shape[1])
+                with torch.no_grad():
+                    ret.data.copy_(new_data)
         else:
+            # We just keep the first elements in the dense matrix
+            # Of course this only captures the statistical distribution in the dense matrix
             param_count = ret.data.numel()
-            density = block_count / dense_block_count
-            ret.data.copy_(dense.flatten()[:param_count].reshape(ret.data.shape) / math.sqrt(density))
+            with torch.no_grad():
+                ret.data.copy_(dense.flatten()[:param_count].reshape(ret.data.shape))
+
+        ret.updated_data()
 
         return ret
 
@@ -315,6 +363,11 @@ def __repr__(self):
             self.block_shape,
         )
 
+    def multiply_(self, factor):
+        with torch.no_grad():
+            self.data.multiply_(factor)
+        self.updated_data()
+
     def build_coo_block_index(self):
         device = self.cols_a.device
         # Build a tensor to store the row indices.
@@ -356,11 +409,13 @@ def to_sparse(self, data_replace=None):
             data = data_replace
         else:
             data = self.data
-        data = data.reshape(-1, *self.block_shape).transpose(1, 2)
+        data = data.reshape(-1, self.block_shape[1], self.block_shape[0])
+        data = data.transpose(1, 2)
         out = torch.sparse.FloatTensor(
             coo,
             data,
-            (self.shape[0] // self.block_shape[0], self.shape[1] // self.block_shape[1]) + self.block_shape,
+            (self.shape[0] // self.block_shape[0], self.shape[1] // self.block_shape[1])
+            + (self.block_shape[0], self.block_shape[1]),
         )
 
         return out
@@ -489,7 +544,8 @@ def reverse_matmul_(self, dense_a, transpose=True):
         data_b = data.reshape(-1, block_shape[1]).contiguous()
 
         if not dense_a.is_contiguous():
-            # warnings.warn(f"pytorch_block_sparse.BlockSparseMatrix.reverse_matmul: DEGRADED performance, dense_a is not contiguous {dense_a.stride()}")
+            # warnings.warn(f"pytorch_block_sparse.BlockSparseMatrix.reverse_matmul:"
+            #               f" DEGRADED performance, dense_a is not contiguous {dense_a.stride()}")
             dense_a = dense_a.contiguous()
 
         verbose = False
@@ -572,12 +628,15 @@ def matmul_with_output_sparse_support_(self, dense_a, dense_b, overwrite_data=Fa
         else:
             data = torch.zeros_like(self.data)
 
-        message = "pytorch_block_sparse.BlockSparseMatrix.matmul_with_output_sparse_support: DEGRADED performance, dense_%s is not contiguous"
+        message = (
+            "pytorch_block_sparse.BlockSparseMatrix.matmul_with_output_sparse_support:"
+            " DEGRADED performance, dense_%s is not contiguous"
+        )
         prepared_a, transpose_a = self.tensor_prepare(dense_a, message % "a", True)
         prepared_b, transpose_b = self.tensor_prepare(dense_b, message % "b", False)
 
-        # We interpret a as transposed, so we pass shape_a[1], shape_a[0] as a shape,
-        # and transpose_a will be set correctly too (for a "normal" contiguous pytorch matrix a, transpose_a will be true)
+        # We interpret a as transposed, so we pass shape_a[1], shape_a[0] as a shape, and transpose_a
+        # will be set correctly too (for a "normal" contiguous pytorch matrix a, transpose_a will be true)
         block_sparse_native.blocksparse_matmul_back_cutlass(
             prepared_a,
             transpose_a,
@@ -626,18 +685,29 @@ class BlockSparseMatrixEmulator(BlockSparseMatrixBase):
     # Data is (len(cols), block_shape, block_shape)
     def __init__(self, shape, block_mask, data, block_shape):
         super(BlockSparseMatrixEmulator, self).__init__(shape, block_mask, data, block_shape)
+        self.register_parameter("_dense", None)
+        self.updated_data()
 
     def get_differentiable_data(self):
-        return self.dense_
+        return self._dense
 
-    def rebuild(self, block_mask, block_ptr=None):
-        super().rebuild(block_mask, block_ptr)
-        self._dense = self.to_dense()
-        self._mask = self.to_dense(data_replace=torch.ones_like(self.data)) == 1
+    def to_dense(self, data_replace=None):
+        if data_replace is None:
+            return self._dense
+        return data_replace * self._mask
 
-    def reverse_matmul(self, dense_a, transpose):
+    def _update_data_from_dense(self):
+        _ = self.from_dense(self._dense, out=self)
+
+    def updated_data(self):
+        with torch.no_grad():
+            self._dense = torch.nn.Parameter(super().to_dense())
+            self._mask = super().to_dense(data_replace=torch.ones_like(self.data)) == 1
+
+    def reverse_matmul(self, dense_a, transpose=True):
         m = self._dense.t() if transpose else self._dense
-        return dense_a.matmul(m * self._mask)  # The self._mask multiplication is not really needed, but ...
+        mask = self._mask.t() if transpose else self._mask
+        return dense_a.matmul(m * mask)  # The self._mask multiplication is not really needed, but ...
 
     def matmul_with_output_sparse_support(self, dense_a, dense_b, overwrite_data=False):
         """Compute c = a.t().mm(b) where c is sparse (we just keep the results where c is non_zero)."""