Commit 1719847
Update linting configurations and improve code formatting in deepseek_v32 example scripts
- Added per-file ignores for the inference directory in `pyproject.toml`.
- Refactored code in `topk_selector.py`, `convert.py`, `generate.py`, `kernel.py`, and `model.py` to enhance readability by adjusting spacing and line breaks.
- Ensured consistent formatting across function definitions and assertions for better clarity.
1 parent 453d442 commit 1719847

6 files changed: 193 additions, 122 deletions

examples/deepseek_v32/inference/convert.py

Lines changed: 2 additions & 2 deletions
@@ -7,7 +7,6 @@
 import torch
 from safetensors.torch import safe_open, save_file

-
 mapping = {
     "embed_tokens": ("embed", 0),
     "input_layernorm": ("attn_norm", None),
@@ -74,7 +73,8 @@ def main(hf_ckpt_path, save_path, n_experts, mp):
                     if idx < i * n_local_experts or idx >= (i + 1) * n_local_experts:
                         continue
                 elif dim is not None:
-                    assert param.size(dim) % mp == 0, f"Dimension {dim} must be divisible by {mp}"
+                    assert param.size(
+                        dim) % mp == 0, f"Dimension {dim} must be divisible by {mp}"
                     shard_size = param.size(dim) // mp
                     new_param = param.narrow(dim, i * shard_size, shard_size).contiguous()
                 state_dicts[i][name] = new_param
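For context, the reflowed assertion guards the tensor-parallel split performed right after it. A minimal, self-contained sketch of that sharding step, with illustrative shapes that are not taken from the repository:

import torch

# Hypothetical values for illustration only.
mp = 4                         # number of model-parallel shards
dim = 1                        # dimension being split across shards
param = torch.randn(8, 1024)   # a weight tensor to shard

assert param.size(dim) % mp == 0, f"Dimension {dim} must be divisible by {mp}"
shard_size = param.size(dim) // mp   # 1024 / 4 = 256 columns per shard
shards = [param.narrow(dim, i * shard_size, shard_size).contiguous() for i in range(mp)]
assert all(s.shape == (8, 256) for s in shards)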

examples/deepseek_v32/inference/generate.py

Lines changed: 25 additions & 14 deletions
@@ -28,13 +28,11 @@ def sample(logits, temperature: float = 1.0):


 @torch.inference_mode()
-def generate(
-    model: Transformer,
-    prompt_tokens: List[List[int]],
-    max_new_tokens: int,
-    eos_id: int,
-    temperature: float = 1.0
-) -> List[List[int]]:
+def generate(model: Transformer,
+             prompt_tokens: List[List[int]],
+             max_new_tokens: int,
+             eos_id: int,
+             temperature: float = 1.0) -> List[List[int]]:
     """
     Generates new tokens based on the given prompt tokens using the specified model.

@@ -49,7 +47,9 @@ def generate(
         List[List[int]]: A list of lists containing the generated tokens for each sequence.
     """
     prompt_lens = [len(t) for t in prompt_tokens]
-    assert max(prompt_lens) <= model.max_seq_len, f"Prompt length exceeds model maximum sequence length (max_seq_len={model.max_seq_len})"
+    assert max(
+        prompt_lens
+    ) <= model.max_seq_len, f"Prompt length exceeds model maximum sequence length (max_seq_len={model.max_seq_len})"
     total_len = min(model.max_seq_len, max_new_tokens + max(prompt_lens))
     tokens = torch.full((len(prompt_tokens), total_len), -1, dtype=torch.long, device="cuda")
     for i, t in enumerate(prompt_tokens):
@@ -71,7 +71,7 @@ def generate(
             break
     completion_tokens = []
     for i, toks in enumerate(tokens.tolist()):
-        toks = toks[prompt_lens[i]:prompt_lens[i]+max_new_tokens]
+        toks = toks[prompt_lens[i]:prompt_lens[i] + max_new_tokens]
         if eos_id in toks:
             toks = toks[:toks.index(eos_id)]
         completion_tokens.append(toks)
@@ -139,16 +139,26 @@ def main(
                 continue
             messages.append({"role": "user", "content": prompt})
             prompt_tokens = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
-            completion_tokens = generate(model, [prompt_tokens], max_new_tokens, tokenizer.eos_token_id, temperature)
+            completion_tokens = generate(model, [prompt_tokens], max_new_tokens,
+                                         tokenizer.eos_token_id, temperature)
             completion = tokenizer.decode(completion_tokens[0], skip_special_tokens=True)
             print(completion)
             messages.append({"role": "assistant", "content": completion})
     else:
         with open(input_file) as f:
             prompts = f.read().split("\n\n")
-        assert len(prompts) <= args.max_batch_size, f"Number of prompts exceeds maximum batch size ({args.max_batch_size})"
-        prompt_tokens = [tokenizer.apply_chat_template([{"role": "user", "content": prompt}], add_generation_prompt=True) for prompt in prompts]
-        completion_tokens = generate(model, prompt_tokens, max_new_tokens, tokenizer.eos_token_id, temperature)
+        assert len(
+            prompts
+        ) <= args.max_batch_size, f"Number of prompts exceeds maximum batch size ({args.max_batch_size})"
+        prompt_tokens = [
+            tokenizer.apply_chat_template([{
+                "role": "user",
+                "content": prompt
+            }],
+                                          add_generation_prompt=True) for prompt in prompts
+        ]
+        completion_tokens = generate(model, prompt_tokens, max_new_tokens, tokenizer.eos_token_id,
+                                     temperature)
         completions = tokenizer.batch_decode(completion_tokens, skip_special_tokens=True)
         for prompt, completion in zip(prompts, completions):
             print("Prompt:", prompt)
@@ -183,4 +193,5 @@ def main(
     parser.add_argument("--temperature", type=float, default=0.6)
     args = parser.parse_args()
     assert args.input_file or args.interactive, "Either input-file or interactive mode must be specified"
-    main(args.ckpt_path, args.config, args.input_file, args.interactive, args.max_new_tokens, args.temperature)
+    main(args.ckpt_path, args.config, args.input_file, args.interactive, args.max_new_tokens,
+         args.temperature)
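The spacing tweak in the third hunk touches the completion-trimming loop, which is easiest to check in isolation. A self-contained sketch of that loop, using made-up token values rather than real model output:

# Illustrative values only; in generate() the rows come from the padded token matrix.
eos_id = 2
max_new_tokens = 4
prompt_lens = [3]
tokens = [[5, 6, 7, 11, 12, 2, -1]]  # prompt tokens, then generated tokens, then padding

completion_tokens = []
for i, toks in enumerate(tokens):
    toks = toks[prompt_lens[i]:prompt_lens[i] + max_new_tokens]  # drop the prompt prefix
    if eos_id in toks:
        toks = toks[:toks.index(eos_id)]  # cut at the first end-of-sequence token
    completion_tokens.append(toks)

assert completion_tokens == [[11, 12]]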

examples/deepseek_v32/inference/kernel.py

Lines changed: 19 additions & 25 deletions
@@ -3,7 +3,6 @@
 import tilelang.language as T
 from typing import Tuple, Optional

-
 tilelang.set_log_level("WARNING")

 pass_configs = {
@@ -34,9 +33,7 @@ def fast_round_scale(amax, fp8_max_inv):


 @tilelang.jit(pass_configs=pass_configs)
-def act_quant_kernel(
-    N, in_dtype=BF16, out_dtype=FP8, scale_dtype=FP32, round_scale=False
-):
+def act_quant_kernel(N, in_dtype=BF16, out_dtype=FP8, scale_dtype=FP32, round_scale=False):
     M = T.symbolic("M")
     fp8_min = -448.0
     fp8_max = 448.0
@@ -51,10 +48,11 @@ def act_quant_kernel_(
         Y: T.Tensor[(M, N), out_dtype],
         S: T.Tensor[(M, T.ceildiv(N, group_size)), scale_dtype],
     ):
-        with T.Kernel(T.ceildiv(M, blk_m), T.ceildiv(N, group_size), threads=128) as (
-            pid_m,
-            pid_n,
-        ):
+        with T.Kernel(
+                T.ceildiv(M, blk_m), T.ceildiv(N, group_size), threads=128) as (
+                    pid_m,
+                    pid_n,
+                ):
             x_shared = T.alloc_shared((blk_m, group_size), in_dtype)
             x_local = T.alloc_fragment((blk_m, group_size), in_dtype)
             amax_local = T.alloc_fragment((blk_m,), scale_dtype)
@@ -73,9 +71,7 @@ def act_quant_kernel_(
                 else:
                     s_local[i] = amax_local[i] * fp8_max_inv
             for i, j in T.Parallel(blk_m, group_size):
-                y_local[i, j] = T.clamp(
-                    x_local[i, j] / s_local[i], fp8_min, fp8_max
-                )
+                y_local[i, j] = T.clamp(x_local[i, j] / s_local[i], fp8_min, fp8_max)
             for i in T.Parallel(blk_m):
                 S[pid_m * blk_m + i, pid_n] = s_local[i]
             T.copy(y_local, y_shared)
@@ -84,9 +80,9 @@ def act_quant_kernel_(
     return act_quant_kernel_


-def act_quant(
-    x: torch.Tensor, block_size: int = 128, scale_fmt: Optional[str] = None
-) -> Tuple[torch.Tensor, torch.Tensor]:
+def act_quant(x: torch.Tensor,
+              block_size: int = 128,
+              scale_fmt: Optional[str] = None) -> Tuple[torch.Tensor, torch.Tensor]:
     """
     Quantizes the input tensor `x` using block-wise quantization.

@@ -101,8 +97,7 @@ def act_quant(
     """
     assert x.is_contiguous(), "Input tensor must be contiguous"
     assert x.size(-1) % block_size == 0, (
-        f"Last dimension size must be divisible by block_size (block_size={block_size})"
-    )
+        f"Last dimension size must be divisible by block_size (block_size={block_size})")
     N = x.size(-1)
     y = torch.empty_like(x, dtype=torch.float8_e4m3fn)
     s = x.new_empty(*x.size()[:-1], N // block_size, dtype=torch.float32)
@@ -129,10 +124,11 @@ def fp8_gemm_kernel_(
         scales_a: T.Tensor[(M, T.ceildiv(K, group_size)), FP32],
         scales_b: T.Tensor[(T.ceildiv(N, group_size), T.ceildiv(K, group_size)), FP32],
     ):
-        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (
-            bx,
-            by,
-        ):
+        with T.Kernel(
+                T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (
+                    bx,
+                    by,
+                ):
             A_shared = T.alloc_shared((block_M, block_K), FP8)
             B_shared = T.alloc_shared((block_N, block_K), FP8)
             C_shared = T.alloc_shared((block_M, block_N), out_dtype)
@@ -168,9 +164,8 @@ def fp8_gemm_kernel_(
     return fp8_gemm_kernel_


-def fp8_gemm(
-    a: torch.Tensor, a_s: torch.Tensor, b: torch.Tensor, b_s: torch.Tensor
-) -> torch.Tensor:
+def fp8_gemm(a: torch.Tensor, a_s: torch.Tensor, b: torch.Tensor,
+             b_s: torch.Tensor) -> torch.Tensor:
     """
     Perform a matrix multiplication using FP8 precision.

@@ -185,8 +180,7 @@ def fp8_gemm(
     """
     assert a.is_contiguous() and b.is_contiguous(), "Input tensors must be contiguous"
     assert a_s.is_contiguous() and b_s.is_contiguous(), (
-        "Scaling factor tensors must be contiguous"
-    )
+        "Scaling factor tensors must be contiguous")
     K = a.size(-1)
     M = a.numel() // K
     N = b.size(0)
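The reflowed signatures leave act_quant and fp8_gemm functionally unchanged. A hedged usage sketch, assuming a CUDA device, the tilelang dependency, the default 128-wide quantization groups, and that the script is run from the inference directory; shapes follow the tensor signatures visible in the hunks above:

import torch
from kernel import act_quant, fp8_gemm  # examples/deepseek_v32/inference/kernel.py

M, K, N = 128, 1024, 512
x = torch.randn(M, K, dtype=torch.bfloat16, device="cuda")

# Block-wise activation quantization: FP8 values plus one FP32 scale per 128-wide group.
x_fp8, x_scale = act_quant(x, block_size=128)
print(x_fp8.dtype, x_fp8.shape)      # torch.float8_e4m3fn, (128, 1024)
print(x_scale.dtype, x_scale.shape)  # torch.float32, (128, 8)

# Weight side (illustrative): FP8 data with (N/128, K/128) block scales, matching scales_b.
w_fp8 = torch.randn(N, K, device="cuda").to(torch.float8_e4m3fn)
w_scale = torch.ones(N // 128, K // 128, device="cuda", dtype=torch.float32)

out = fp8_gemm(x_fp8, x_scale, w_fp8, w_scale)  # expected shape (M, N)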
