
Commit a19ee39

Preliminary work for block sparse emulation code.

1 parent 0985083 · commit a19ee39

5 files changed: +93, -21 lines changed

MANIFEST.in

Lines changed: 3 additions & 3 deletions
@@ -1,6 +1,6 @@
 include README.md
-graft pytorch_block_sparse/cutlass/*.h
-graft pytorch_block_sparse/native/*.h
-graft pytorch_block_sparse/tests/.py
+graft pytorch_block_sparse/cutlass/
+graft pytorch_block_sparse/native/
+graft pytorch_block_sparse/tests/
 global-exclude *.py[cod]
 global-exclude *~
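A note on this fix (context, not part of the diff): in setuptools' MANIFEST.in syntax, graft takes a directory pattern and includes everything beneath it, so the old graft .../cutlass/*.h entries were treating it as a file glob and likely matched nothing. Grafting the directories is the straightforward fix; if per-extension filtering were wanted instead, the usual command is recursive-include, roughly as follows (the extension list is inferred from the sources named in setup.py below):

    recursive-include pytorch_block_sparse/cutlass *.h
    recursive-include pytorch_block_sparse/native *.h *.cpp *.cu
    recursive-include pytorch_block_sparse/tests *.py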

pytorch_block_sparse/block_sparse.py

Lines changed: 26 additions & 10 deletions
@@ -4,19 +4,21 @@
 import warnings
 import math
 
-class BlockSparseMatrix(torch.nn.Module):
+class BlockSparseMatrixBase(torch.nn.Module):
     # cols is a list of nonzero block column indexes (int32)
     # row_start is a index into cols (int32)
     # Data is (len(cols), block_shape, block_shape)
-    def __init__(self, shape, block_mask, data, block_shape=(16, 16)):
-        super(BlockSparseMatrix, self).__init__()
+    def __init__(self, shape, block_mask, data, block_shape=(32, 32)):
+        super(BlockSparseMatrixBase, self).__init__()
         self.int_type = torch.int32
 
-        if len(shape) != 2 or shape[0] % 16 != 0 or shape[1] % 16 != 0:
-            raise Exception("shape should be a tuple of 2 multiples of 16")
+        if len(shape) != 2:
+            raise Exception("shape should be a tuple of 2 ints")
+
         self.shape = torch.Size(shape)
-        if len(block_shape) != 2 or block_shape[0] % 16 != 0 or block_shape[1] % 16 != 0:
-            raise Exception("block_shape should be a tuple of 2 multiples of 16")
+        if len(block_shape) != 2:
+            raise Exception("block_shape should be a tuple of 2 ints")
+
         self.block_shape = tuple(block_shape)
 
         self.data = torch.nn.Parameter(data)
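The relaxed checks here are not lost: the multiple-of-32 requirements move down into the new BlockSparseMatrix subclass added at the end of this file (last hunk below), so the base class can also back a CPU emulator that accepts arbitrary shapes.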
@@ -74,15 +76,15 @@ def build_indices_(self, block_mask, block_ptr, nnzt, transpose_indices):
         # Reorganize the indexes with transposed ordering
         block_indices = block_indices.reshape(X, Y).t().reshape(X * Y)
         # Only keeps the non zero, and substract 1 to find back the right block index
-        block_ptr = block_indices[block_indices.nonzero()] - 1
+        block_ptr = block_indices[torch.nonzero(block_indices, as_tuple=False)] - 1
         # Remove spurious dimension
         block_ptr = block_ptr.squeeze(-1)
 
         X, Y = Y, X
 
         rows = cols
 
-        nnztt = block_mask.t().nonzero()
+        nnztt = torch.nonzero(block_mask.t(), as_tuple=False)
         cols = nnztt[:,1]
 
         row_start_ends = torch.zeros((X + 1,), dtype=torch.long, device = device)
@@ -100,7 +102,7 @@ def build_indices(self, block_mask, block_ptr = None):
         # assume that the content of block_ptr is just from 0..n_blocks
         # Used to recycle blocks
 
-        nnz = block_mask.nonzero()
+        nnz = torch.nonzero(block_mask, as_tuple=False)
 
         if block_ptr == None:
             block_ptr = torch.arange(0, nnz.shape[0], device=block_mask.device)
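The three nonzero() edits in this file are the same mechanical change: recent PyTorch releases (around 1.5) warn when nonzero() is called without the as_tuple argument, and torch.nonzero(t, as_tuple=False) keeps the old behavior, an (nnz, ndim) index matrix, without the deprecation warning. A minimal illustration:

    import torch

    mask = torch.tensor([[True, False],
                         [False, True]])

    # idx = mask.nonzero()  # old call form, warns on recent PyTorch
    idx = torch.nonzero(mask, as_tuple=False)
    print(idx)  # tensor([[0, 0], [1, 1]]), shape (nnz, ndim) = (2, 2)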
@@ -510,3 +512,17 @@ def matmul_with_output_sparse_support(self, dense_a, dense_b, overwrite_data = F
         ret = self.matmul_with_output_sparse_support_(rewritten_a, rewritten_b, overwrite_data)
 
         return ret
+
+
+class BlockSparseMatrix(BlockSparseMatrixBase):
+    # cols is a list of nonzero block column indexes (int32)
+    # row_start is a index into cols (int32)
+    # Data is (len(cols), block_shape, block_shape)
+    def __init__(self, shape, block_mask, data, block_shape=(32, 32)):
+        if len(shape) != 2 or shape[0] % 32 != 0 or shape[1] % 32 != 0:
+            raise Exception("shape should be a tuple of 2 multiples of 32")
+
+        if len(block_shape) != 2 or block_shape[0] % 32 != 0 or block_shape[1] % 32 != 0:
+            raise Exception("block_shape should be a tuple of 2 multiples of 32")
+
+        super(BlockSparseMatrix, self).__init__(shape, block_mask, data, block_shape)
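With the split in place, the hard sizing contract lives only in the CUDA-backed subclass. A hypothetical sketch of the resulting behavior (keyword names mirror the help_randn test helper below, so treat them as inferred, not verified):

    import torch
    from pytorch_block_sparse.block_sparse import BlockSparseMatrix

    # The CUDA-backed class keeps the constraint: both dims multiples of 32.
    bsm = BlockSparseMatrix.randn((64, 96), 3, block_shape=(32, 32), device="cuda")

    # This now raises inside BlockSparseMatrix.__init__, while the relaxed
    # BlockSparseMatrixBase would accept it (which is what the emulator needs):
    # BlockSparseMatrix.randn((3, 5), 2, block_shape=(1, 1), device="cpu")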
pytorch_block_sparse/block_sparse_emulate.py

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+import torch
+import torch.nn
+from . import block_sparse
+
+class BlockSparseMatrixEmulator(block_sparse.BlockSparseMatrixBase):
+    # cols is a list of nonzero block column indexes (int32)
+    # row_start is a index into cols (int32)
+    # Data is (len(cols), block_shape, block_shape)
+    def __init__(self, shape, block_mask, data, block_shape):
+        super(BlockSparseMatrixEmulator, self).__init__(shape, block_mask, data, block_shape)
+
+    def rebuild(self, block_mask, block_ptr=None):
+        super().rebuild(block_mask, block_ptr)
+        self._dense = self.to_dense()
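The emulator's only override so far keeps a dense mirror (self._dense) in sync each time the sparsity pattern is rebuilt, using the base class's to_dense(). A hedged usage sketch, assuming randn is inherited from BlockSparseMatrixBase and rebuild runs during construction (as the new test below suggests):

    from pytorch_block_sparse.block_sparse_emulate import BlockSparseMatrixEmulator

    # Works on CPU, and with shapes the CUDA-backed class would reject:
    emul = BlockSparseMatrixEmulator.randn((3, 5), 2, block_shape=(1, 1), device="cpu")
    dense = emul.to_dense()  # full (3, 5) tensor, zero outside the 2 nonzero blocks
    assert dense.shape == (3, 5)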
New test file

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+from unittest import TestCase
+import torch
+import unittest
+import torch.optim as optim
+from pytorch_block_sparse.block_sparse import BlockSparseMatrix
+from pytorch_block_sparse.block_sparse_emulate import BlockSparseMatrixEmulator
+from pytorch_block_sparse.block_sparse_linear import PseudoBlockSparseLinear
+
+class TestFun(TestCase):
+    def help_contruct(self, shape, block_mask, data, block_shape=(16, 16)):
+        try:
+            real = BlockSparseMatrix(shape, block_mask, data, block_shape)
+        except:
+            real = None
+        emul = BlockSparseMatrixEmulator(shape, block_mask, data, block_shape)
+
+        return real, emul
+
+    def help_randn(cls, shape, n_blocks, blocks=None, block_shape=(32, 32), device="cuda", positive=False):
+        try:
+            real = BlockSparseMatrix.randn(shape, n_blocks, blocks, block_shape, device=device, positive=positive)
+        except:
+            real = None
+        emul = BlockSparseMatrixEmulator.randn(shape, n_blocks, blocks, block_shape, device=device, positive=positive)
+
+        return real, emul
+
+    def test0(self):
+        d = dict
+        test_sizes = [d(nb=2, s=(3,5), bs=(1,1))]
+        map = d(nb= "n_blocks", s="shape", bs="block_shape")
+
+        for ts in test_sizes:
+            ts = {map[k]:v for k,v in ts.items()}
+            self.help_randn(**ts, device="cpu")
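The try/except wrappers let each helper return a (real, emul) pair in which real degrades to None whenever CUDA-backed construction is impossible: no GPU, or sizes like test0's shape (3, 5) with block_shape (1, 1) that fail the multiple-of-32 checks. Nothing compares the two yet; a hypothetical comparison helper (not in this commit) shows where this presumably heads:

    def assertSameDense(self, real, emul, atol=1e-6):
        # Hypothetical: compare the two backends only when both could be built.
        if real is not None:
            self.assertTrue(torch.allclose(real.to_dense(), emul.to_dense(), atol=atol))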

setup.py

Lines changed: 15 additions & 8 deletions
@@ -6,6 +6,20 @@
 
 version = "0.1.2"
 
+ext_modules = []
+
+import torch
+if torch.cuda.is_available():
+    ext = CUDAExtension('block_sparse_native',
+                        ['pytorch_block_sparse/native/block_sparse_native.cpp',
+                         'pytorch_block_sparse/native/block_sparse_cutlass_kernel_back.cu',
+                         'pytorch_block_sparse/native/block_sparse_cutlass_kernel.cu'],
+                        extra_compile_args=['-I', '%s/pytorch_block_sparse' % rootdir]
+                        )
+    ext_modules = [ext]
+else:
+    print("WARNING: torch cuda seems unavailable, emulated features only will be available.")
+
 setup(name='pytorch_block_sparse',
       version=version,
       description='PyTorch extension for fast block sparse matrices computation, drop in replacement for torch.nn.Linear.',
@@ -25,14 +39,7 @@
       install_requires=[],
       include_package_data=True,
       zip_safe=False,
-      ext_modules=[
-          CUDAExtension('block_sparse_native',
-                        ['pytorch_block_sparse/native/block_sparse_native.cpp',
-                         'pytorch_block_sparse/native/block_sparse_cutlass_kernel_back.cu',
-                         'pytorch_block_sparse/native/block_sparse_cutlass_kernel.cu'],
-                        extra_compile_args=['-I', '%s/pytorch_block_sparse' % rootdir]
-                        ),
-      ],
+      ext_modules=ext_modules,
       cmdclass={
           'build_ext': BuildExtension
       }
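One build-time caveat worth noting (an observation, not part of the commit): torch.cuda.is_available() needs a working driver at build time, so building on a machine that has the CUDA toolkit but no visible GPU (a common docker scenario) would also fall back to the emulator, and the new top-level import torch makes torch a hard build-time dependency. A common alternative guard keys off the toolkit instead; a hypothetical sketch:

    import os
    from torch.utils.cpp_extension import CUDA_HOME

    # Hypothetical alternative: build the native extension whenever the CUDA
    # toolkit is present, even if no GPU is visible at build time.
    build_native = CUDA_HOME is not None and os.path.exists(CUDA_HOME)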
