make a module

ai-compiler-study · Sep 17, 2024 · ae6fb66 · ae6fb66
1 parent 5707da8
commit ae6fb66
Show file tree

Hide file tree

Showing 14 changed files with 145 additions and 57 deletions.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Sinjin Jeong
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
@@ -1,9 +1,15 @@
 # Triton Kernels
 Triton kernels for [Stable Diffusion 3](https://arxiv.org/abs/2403.03206) and [Flux](https://github.com/black-forest-labs/flux)
 
+### Installation
+```bash
+pip install -e .
+```
+
+### Test
 - [LayerNorm + Modulation Kernel](./normalization.py)
-  - `python normalization_test.py`
+  - `python ./benchmarks/normalization_test.py`
 - [RMSNorm Kernel](./normalization.py)
-  - `python rms_norm_test.py`
+  - `python ./benchmarks/rms_norm_test.py`
 - [RoPE Kernel](./positional_embedding.py)
-  - `python rope_test.py`
+  - `python ./benchmarks/rope_test.py`
diff --git a/normalization_test.py → benchmarks/normalization_test.py b/normalization_test.py → benchmarks/normalization_test.py
@@ -4,22 +4,7 @@
 import torch.nn.functional as F
 import triton
 
-from normalization import layer_norm_modulation
-
-
-def modulate(x: torch.Tensor, scale: torch.Tensor, shift: torch.Tensor) -> torch.Tensor:
-    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
-
-
-def layer_norm_modulation_torch(x: torch.Tensor, scale: torch.Tensor, shift: torch.Tensor) -> torch.Tensor:
-    x = F.layer_norm(x, normalized_shape=(x.shape[-1],))
-    return modulate(x, scale=scale, shift=shift)
-
-
-@torch.compile
-def layer_norm_modulation_torch_compile(x: torch.Tensor, scale: torch.Tensor, shift: torch.Tensor) -> torch.Tensor:
-    x = F.layer_norm(x, normalized_shape=(x.shape[-1],))
-    return modulate(x, scale=scale, shift=shift)
+from triton_kernels import layer_norm_modulation, layer_norm_modulation_torch, layer_norm_modulation_torch_compile
 
 
 def test_layer_norm_modulation(batch_size, seq_len, embed_dim, dtype, device="cuda"):

diff --git a/rms_norm_test.py → benchmarks/rms_norm_test.py b/rms_norm_test.py → benchmarks/rms_norm_test.py
@@ -2,24 +2,8 @@
 
 import torch
 import triton
-from torch import Tensor
 
-from normalization import rms_norm
-
-
-def rms_norm_torch(x: Tensor, scale: Tensor) -> Tensor:
-    x_dtype = x.dtype
-    x = x.float()
-    rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
-    return (x * rrms).to(dtype=x_dtype) * scale
-
-
-@torch.compile
-def rms_norm_torch_compile(x: Tensor, scale: Tensor) -> Tensor:
-    x_dtype = x.dtype
-    x = x.float()
-    rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
-    return (x * rrms).to(dtype=x_dtype) * scale
+from triton_kernels import rms_norm, rms_norm_torch, rms_norm_torch_compile
 
 
 def test_rms_norm(batch_size, num_heads, seq_len, head_dim, dtype, device="cuda"):

diff --git a/rope_test.py → benchmarks/rope_test.py b/rope_test.py → benchmarks/rope_test.py
@@ -2,26 +2,8 @@
 
 import torch
 import triton
-from torch import Tensor
 
-from positional_embedding import apply_rope
-
-
-def apply_rope_torch(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
-    xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
-    xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
-    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
-    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
-    return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
-
-
-@torch.compile
-def apply_rope_torch_compile(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
-    xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
-    xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
-    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
-    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
-    return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
+from triton_kernels import apply_rope, apply_rope_torch, apply_rope_torch_compile
 
 
 def test_apply_rope(batch_size, num_heads, seq_len, head_dim, dtype, device="cuda"):

diff --git a/setup.py b/setup.py
@@ -0,0 +1,40 @@
+from pathlib import Path
+
+from setuptools import setup
+
+directory = Path(__file__).resolve().parent
+with open(directory / "README.md", encoding="utf-8") as f:
+    long_description = f.read()
+
+setup(
+    name="triton-kernels",
+    version="0.1.0",
+    author="Sinjin Jeong",
+    description="Triton kernels for SD3 and Flux",
+    license="MIT",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/ai-compiler-study/triton-kernels",
+    packages=[
+        "triton_kernels",
+    ],
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+    ],
+    install_requires=[
+        "numpy",
+        "torch",
+        "triton>=2.2.0",
+    ],
+    extras_require={
+        "linting": [
+            "pre-commit>=3.5.0",
+        ],
+        "testing": [
+            "pytest>=7.4.0",
+            "pytest-xdist>=3.5.0",
+        ],
+    },
+    python_requires=">=3.8",
+)
diff --git a/triton_kernels/__init__.py b/triton_kernels/__init__.py
@@ -0,0 +1,9 @@
+from triton_kernels.functional import (
+    apply_rope_torch,
+    apply_rope_torch_compile,
+    layer_norm_modulation_torch,
+    layer_norm_modulation_torch_compile,
+    rms_norm_torch,
+    rms_norm_torch_compile,
+)
+from triton_kernels.kernels import apply_rope, layer_norm_modulation, rms_norm
diff --git a/triton_kernels/functional/__init__.py b/triton_kernels/functional/__init__.py
@@ -0,0 +1,7 @@
+from triton_kernels.functional.normalization import (
+    layer_norm_modulation_torch,
+    layer_norm_modulation_torch_compile,
+    rms_norm_torch,
+    rms_norm_torch_compile,
+)
+from triton_kernels.functional.positional_embedding import apply_rope_torch, apply_rope_torch_compile
diff --git a/triton_kernels/functional/normalization.py b/triton_kernels/functional/normalization.py
@@ -0,0 +1,33 @@
+import torch
+import torch.nn.functional as F
+from torch import Tensor
+
+
+def modulate(x: Tensor, scale: Tensor, shift: Tensor) -> Tensor:
+    return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
+
+
+def layer_norm_modulation_torch(x: Tensor, scale: Tensor, shift: Tensor) -> Tensor:
+    x = F.layer_norm(x, normalized_shape=(x.shape[-1],))
+    return modulate(x, scale=scale, shift=shift)
+
+
+@torch.compile
+def layer_norm_modulation_torch_compile(x: Tensor, scale: Tensor, shift: Tensor) -> Tensor:
+    x = F.layer_norm(x, normalized_shape=(x.shape[-1],))
+    return modulate(x, scale=scale, shift=shift)
+
+
+def rms_norm_torch(x: Tensor, scale: Tensor) -> Tensor:
+    x_dtype = x.dtype
+    x = x.float()
+    rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
+    return (x * rrms).to(dtype=x_dtype) * scale
+
+
+@torch.compile
+def rms_norm_torch_compile(x: Tensor, scale: Tensor) -> Tensor:
+    x_dtype = x.dtype
+    x = x.float()
+    rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
+    return (x * rrms).to(dtype=x_dtype) * scale
diff --git a/triton_kernels/functional/positional_embedding.py b/triton_kernels/functional/positional_embedding.py
@@ -0,0 +1,19 @@
+import torch
+from torch import Tensor
+
+
+def apply_rope_torch(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
+    xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
+    xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
+    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
+    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
+    return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
+
+
+@torch.compile
+def apply_rope_torch_compile(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
+    xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
+    xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
+    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
+    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
+    return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
diff --git a/triton_kernels/kernels/__init__.py b/triton_kernels/kernels/__init__.py
@@ -0,0 +1,2 @@
+from triton_kernels.kernels.normalization import layer_norm_modulation, rms_norm
+from triton_kernels.kernels.positional_embedding import apply_rope
diff --git a/normalization.py → triton_kernels/kernels/normalization.py b/normalization.py → triton_kernels/kernels/normalization.py
@@ -2,7 +2,7 @@
 import triton
 import triton.language as tl
 
-from utils import calculate_settings
+from triton_kernels.kernels.utils import calculate_settings
 
 
 @triton.jit

diff --git a/positional_embedding.py → ...n_kernels/kernels/positional_embedding.py b/positional_embedding.py → ...n_kernels/kernels/positional_embedding.py
@@ -3,7 +3,7 @@
 import triton.language as tl
 from torch import Tensor
 
-from utils import calculate_settings
+from triton_kernels.kernels.utils import calculate_settings
 
 
 @triton.jit

diff --git a/utils.py → triton_kernels/kernels/utils.py b/utils.py → triton_kernels/kernels/utils.py
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from triton_kernels.kernels.normalization import layer_norm_modulation, rms_norm
		from triton_kernels.kernels.positional_embedding import apply_rope