@@ -1,8 +1,9 @@
 import torch
 import torch.autograd
 import torch.nn as nn
+
 from .block_sparse import BlockSparseMatrix
-import typing
+
 
 class BlockSparseLinearFunction(torch.autograd.Function):
     @staticmethod
@@ -16,14 +17,20 @@ def forward(ctx, input, weight_data, weight):
         if verbose:
             stride = 8
             print("BlockSparseLinearFunction.forward input\n", input[::stride, ::stride])
-            print("BlockSparseLinearFunction.forward dense_weight\n", dense_weight[::stride, ::stride])
-            print("BlockSparseLinearFunction.forward weight\n", weight.data[::stride, ::stride])
+            print(
+                "BlockSparseLinearFunction.forward dense_weight\n",
+                dense_weight[::stride, ::stride],
+            )
+            print(
+                "BlockSparseLinearFunction.forward weight\n",
+                weight.data[::stride, ::stride],
+            )
 
-        assert(isinstance(weight, BlockSparseMatrix))
+        assert isinstance(weight, BlockSparseMatrix)
 
         ctx.save_for_backward(input, weight_data)
         ctx.weight = weight
-        output = weight.reverse_matmul(input, transpose=True)
+        output = weight.reverse_matmul(input, transpose=True)
         if check:
             dense = weight.to_dense()
             output1 = input.matmul(dense.t())
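
# A standalone sketch (plain PyTorch, no CUDA kernels) of what the `check` branch
# in forward() verifies: for a linear map y = x @ W.T, the block-sparse product
# weight.reverse_matmul(input, transpose=True) is expected to match the dense
# reference input.matmul(weight.to_dense().t()). Only the dense side is shown here;
# shapes are arbitrary examples.
import torch

x = torch.randn(4, 64)            # (batch, in_features)
W = torch.randn(32, 64)           # (out_features, in_features), stands in for weight.to_dense()
y_ref = x.matmul(W.t())           # dense reference output, shape (batch, out_features)
assert y_ref.shape == (4, 32)
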
@@ -42,17 +49,23 @@ def forward(ctx, input, weight_data, weight):
     def backward(ctx, grad_output):
         check = False
         verbose = False
-        input, weight_data = ctx.saved_tensors
+        input, weight_data = ctx.saved_tensors
         weight = ctx.weight
-        assert(isinstance(weight, BlockSparseMatrix))
+        assert isinstance(weight, BlockSparseMatrix)
 
         if verbose or check:
             dense_weight = weight.to_dense()
 
         if verbose:
             stride = 8
             print("input\n", input[::stride, ::stride])
-            print("grad_output\n", grad_output.stride(), grad_output.storage, grad_output.layout, grad_output[::stride, ::stride])
+            print(
+                "grad_output\n",
+                grad_output.stride(),
+                grad_output.storage,
+                grad_output.layout,
+                grad_output[::stride, ::stride],
+            )
             print("dense_weight\n", dense_weight[::stride, ::stride])
             print("weight\n", weight.data[::stride, ::stride])
 
@@ -61,15 +74,27 @@ def backward(ctx, grad_output):
 
             if verbose or check:
                 grad_input0 = grad_output.matmul(dense_weight)
+                atol = 1e-4
 
             if check:
                 if not grad_input0.isclose(grad_input1).all():
                     print(f"grad_output.shape={grad_output.shape}, grad_output.stride={grad_output.stride()}")
-                    print("grad_input0/1 comparison\n", (grad_input0 - grad_input1)[1::32,1::32,1::32])
-                    print("grad_input0/1 comparison\n", (grad_input0 - grad_input1).abs().max())
-                    print("grad_input0/1 comparison: count of differences\n", ((grad_input0 - grad_input1).abs() > atol).sum())
-                    print("grad_input0/1 comparison: position of differences\n",
-                          ((grad_input0 - grad_input1).abs() > atol).nonzero())
+                    print(
+                        "grad_input0/1 comparison\n",
+                        (grad_input0 - grad_input1)[1::32, 1::32, 1::32],
+                    )
+                    print(
+                        "grad_input0/1 comparison\n",
+                        (grad_input0 - grad_input1).abs().max(),
+                    )
+                    print(
+                        "grad_input0/1 comparison: count of differences\n",
+                        ((grad_input0 - grad_input1).abs() > atol).sum(),
+                    )
+                    print(
+                        "grad_input0/1 comparison: position of differences\n",
+                        ((grad_input0 - grad_input1).abs() > atol).nonzero(),
+                    )
 
                     print("grad_input0 max\n", grad_input0.abs().max())
                     print("grad_input1 max\n", grad_input1.abs().max())
@@ -81,7 +106,7 @@ def backward(ctx, grad_output):
 
             if verbose:
                 grad_input2 = weight.reverse_matmul(torch.ones_like(grad_output), transpose=False)
-                print("grad_input0\n", grad_input0[::stride,::stride])
+                print("grad_input0\n", grad_input0[::stride, ::stride])
                 print("grad_input1\n", grad_input1[::stride, ::stride])
                 print("grad_input2\n", grad_input2[::stride, ::stride])
         else:
@@ -90,7 +115,11 @@ def backward(ctx, grad_output):
         if ctx.needs_input_grad[1]:
             grad_weight1 = weight.matmul_with_output_sparse_support(grad_output, input)
             if verbose or check:
-                grad_weight0 = grad_output.reshape(-1, grad_output.shape[-1]).transpose(-1,-2).matmul(input.reshape(-1, input.shape[-1]))
+                grad_weight0 = (
+                    grad_output.reshape(-1, grad_output.shape[-1])
+                    .transpose(-1, -2)
+                    .matmul(input.reshape(-1, input.shape[-1]))
+                )
             if check:
                 grad_weight1b = weight.to_dense(data_replace=grad_weight1)
                 grad_weight1mask = weight.to_dense(data_replace=torch.ones_like(grad_weight1))
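
# A standalone sketch of the dense reference gradients that the check/verbose
# branches above compare against: for y = x @ W.T, dL/dx = dL/dy @ W (grad_input0)
# and dL/dW = dL/dy.T @ x with leading dims flattened (grad_weight0). Verified here
# on a tiny dense case with autograd; shapes and names are illustrative only.
import torch

x = torch.randn(5, 8, requires_grad=True)   # (batch, in_features)
W = torch.randn(6, 8, requires_grad=True)   # (out_features, in_features)
y = x.matmul(W.t())
grad_y = torch.randn_like(y)
y.backward(grad_y)

assert torch.allclose(x.grad, grad_y.matmul(W), atol=1e-6)
assert torch.allclose(W.grad, grad_y.t().matmul(x), atol=1e-6)
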
@@ -110,35 +139,43 @@ def backward(ctx, grad_output):
         else:
             grad_weight1 = None
 
-        if grad_weight1 != None:
-            assert(not (grad_weight1 == 0).all())
-        if grad_input1 != None:
-            assert(grad_input1.shape == input.shape)
+        if grad_weight1 is not None:
+            assert not (grad_weight1 == 0).all()
+        if grad_input1 is not None:
+            assert grad_input1.shape == input.shape
 
         return grad_input1, grad_weight1, None
 
+
 class BlockSparseLinear(nn.Module):
-    BLOCK_SIZE = 32
-    def __init__(self,
-                 in_features: int,
-                 out_features: int,
-                 bias: bool = True,
-                 density:float = 0.5,
-                 torch_nn_linear=None,
-                 verbose=False):
+    BLOCK_SIZE = 32
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        density: float = 0.5,
+        torch_nn_linear=None,
+        verbose=False,
+    ):
         super(BlockSparseLinear, self).__init__()
         self.fn = BlockSparseLinearFunction.apply
         self.verbose = verbose
 
-        if torch_nn_linear != None:
+        if torch_nn_linear is not None:
             in_features = torch_nn_linear.in_features
             out_features = torch_nn_linear.out_features
             bias = torch_nn_linear.bias is not None
 
         if in_features % self.BLOCK_SIZE != 0:
-            raise Exception(f"BlockSparseLinear invalid in_features={in_features}, should be multiple of {self.BLOCK_SIZE}")
+            raise Exception(
+                f"BlockSparseLinear invalid in_features={in_features}, should be multiple of {self.BLOCK_SIZE}"
+            )
         if out_features % self.BLOCK_SIZE != 0:
-            raise Exception(f"BlockSparseLinear invalid in_features={in_features}, should be multiple of {self.BLOCK_SIZE}")
+            raise Exception(
+                f"BlockSparseLinear invalid out_features={out_features}, should be multiple of {self.BLOCK_SIZE}"
+            )
 
         if density < 0 or density > 1:
             raise Exception(f"BlockSparseLinear invalid density={density}")
@@ -153,20 +190,22 @@ def __init__(self,
             with torch.no_grad():
                 weight = BlockSparseMatrix.from_dense(torch_nn_linear.weight, block_shape, self.block_count)
         else:
-            weight = BlockSparseMatrix.randn((out_features, in_features),
-                                             self.block_count,
-                                             blocks=None,
-                                             block_shape=block_shape,
-                                             device="cuda")
+            weight = BlockSparseMatrix.randn(
+                (out_features, in_features),
+                self.block_count,
+                blocks=None,
+                block_shape=block_shape,
+                device="cuda",
+            )
         self.weight = weight
 
         if bias:
-            self.bias = nn.Parameter(torch.zeros(out_features, device="cuda"))
+            self.bias = nn.Parameter(torch.zeros(out_features, device="cuda"))
             if torch_nn_linear is not None:
                 with torch.no_grad():
                     self.bias.copy_(torch_nn_linear.bias)
         else:
-            self.register_parameter('bias', None)
+            self.register_parameter("bias", None)
 
     def forward(self, x):
         x = self.fn(x, self.weight.get_differentiable_data(), self.weight)
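
# A hypothetical usage sketch for the module defined above. It assumes a CUDA
# device and that the package exposes the class as pytorch_block_sparse.BlockSparseLinear
# (adjust the import to your installation); sizes and density are arbitrary examples.
import torch
import torch.nn as nn
from pytorch_block_sparse import BlockSparseLinear

dense = nn.Linear(256, 512).cuda()
# Shapes are taken from `dense` when torch_nn_linear is given; density keeps ~25% of the blocks.
sparse = BlockSparseLinear(256, 512, density=0.25, torch_nn_linear=dense)
x = torch.randn(16, 256, device="cuda")
print(sparse(x).shape)  # expected: torch.Size([16, 512])
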
@@ -177,6 +216,7 @@ def forward(self, x):
 
 class PseudoBlockSparseLinear(torch.nn.Module):
     """For debugging purposes mostly: emulate a BlockSparseLinear with only PyTorch primitives."""
+
     def __init__(self, block_sparse_linear):
         super(PseudoBlockSparseLinear, self).__init__()
 
@@ -186,9 +226,9 @@ def __init__(self, block_sparse_linear):
         if block_sparse_linear.bias is not None:
             self.bias = torch.nn.Parameter(block_sparse_linear.bias)
         else:
-            self.register_parameter('bias', None)
+            self.register_parameter("bias", None)
 
-        self.register_buffer('mask', mask)
+        self.register_buffer("mask", mask)
         self.in_features = block_sparse_linear.in_features
         self.out_features = block_sparse_linear.out_features
         self.density = mask.sum().item() / (mask.shape[0] * mask.shape[1])
@@ -198,7 +238,6 @@ def forward(self, input):
         return torch.nn.functional.linear(input, weight, self.bias)
 
     def extra_repr(self):
-        return 'in_features={}, out_features={}, bias={}, fill_ratio={}'.format(
+        return "in_features={}, out_features={}, bias={}, fill_ratio={}".format(
             self.in_features, self.out_features, self.bias is not None, self.density
         )
-
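
# A hypothetical debugging sketch following the docstring of PseudoBlockSparseLinear:
# wrap a BlockSparseLinear in its pure-PyTorch emulation and check that both paths
# agree. Assumes a CUDA device and that this module is importable as
# pytorch_block_sparse.block_sparse_linear; the tolerance is an arbitrary choice.
import torch
from pytorch_block_sparse import BlockSparseLinear
from pytorch_block_sparse.block_sparse_linear import PseudoBlockSparseLinear

sparse = BlockSparseLinear(256, 512, density=0.25)
pseudo = PseudoBlockSparseLinear(sparse)
x = torch.randn(8, 256, device="cuda")
print(torch.allclose(sparse(x), pseudo(x), atol=1e-5))  # should print True if the two paths agree
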