Fix ragged paged attention v2 to resolve the recompilation issue #8797


Merged 3 commits on Mar 6, 2025
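Background on the fix: `num_seqs` used to be a plain Python int, so both the torch op schema (`int num_seqs`) and the Pallas grid size (`ceil_div(cu_q_lens[num_seqs], num_queries_per_block)`) specialized the compiled program on the per-batch value, and every batch with a different sequence count triggered a recompile. This PR passes `num_seqs` as an `i32[1]` device tensor and sizes the grid from the static `max_num_batched_tokens`, skipping surplus blocks inside the kernel. A minimal JAX sketch of the underlying difference (illustrative functions, not code from this PR):

```python
import functools
import jax
import jax.numpy as jnp

# Static argument: every distinct n is a new compilation, because the
# shape of x[:n] must be known at trace time.
@functools.partial(jax.jit, static_argnums=1)
def sum_prefix_static(x, n):
  return x[:n].sum()

# Traced argument: n stays a device scalar, masking keeps shapes static,
# and a single compilation serves every value of n.
@jax.jit
def sum_prefix_dynamic(x, n):
  return jnp.where(jnp.arange(x.shape[0]) < n, x, 0).sum()

x = jnp.arange(8.0)
for n in (2, 3, 5):
  sum_prefix_static(x, n)              # three compiles
  sum_prefix_dynamic(x, jnp.int32(n))  # one compile
```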
18 changes: 11 additions & 7 deletions test/test_pallas.py
@@ -676,6 +676,7 @@ def test_ragged_paged_attention_wrapper_without_dynamo(self):
kv_lens_xla = kv_lens.to("xla")
page_indices_xla = page_indices.to("xla")
cu_q_lens_xla = cu_q_lens.to("xla")
+ num_seqs_xla = torch.tensor([num_seqs], dtype=torch.int32).to("xla")

output = ragged_paged_attention(
q_xla,
@@ -684,7 +685,7 @@ def test_ragged_paged_attention_wrapper_without_dynamo(self):
kv_lens_xla,
page_indices_xla,
cu_q_lens_xla,
- num_seqs=num_seqs,
+ num_seqs=num_seqs_xla,
num_kv_pages_per_block=num_kv_pages_per_block,
num_queries_per_block=num_queries_per_block,
use_kernel=True)[:cu_q_lens[num_seqs]]
@@ -696,7 +697,7 @@ def test_ragged_paged_attention_wrapper_without_dynamo(self):
kv_lens_xla,
page_indices_xla,
cu_q_lens_xla,
- num_seqs=num_seqs,
+ num_seqs=num_seqs_xla,
num_kv_pages_per_block=num_kv_pages_per_block,
num_queries_per_block=num_queries_per_block,
use_kernel=False)
@@ -707,6 +708,7 @@ def test_ragged_paged_attention_wrapper_without_dynamo(self):
kv_lens_jax = jnp.array(kv_lens.numpy(), dtype=jnp.int32)
page_indices_jax = jnp.array(page_indices.numpy(), dtype=jnp.int32)
cu_q_lens_jax = jnp.array(cu_q_lens.numpy(), dtype=jnp.int32)
+ num_seqs_jax = jnp.array([num_seqs], dtype=jnp.int32)

expected_output = torch.from_numpy(
np.array(
@@ -717,7 +719,7 @@ def test_ragged_paged_attention_wrapper_without_dynamo(self):
kv_lens_jax,
page_indices_jax,
cu_q_lens_jax,
- num_seqs=num_seqs,
+ num_seqs=num_seqs_jax,
num_kv_pages_per_block=num_kv_pages_per_block,
num_queries_per_block=num_queries_per_block,
)[:cu_q_lens[num_seqs]]))
@@ -765,6 +767,7 @@ def _verify_ragged_paged_attention_with_dynamo(
kv_lens_xla = kv_lens.to("xla")
page_indices_xla = page_indices.to("xla")
cu_q_lens_xla = cu_q_lens.to("xla")
+ num_seqs_xla = torch.tensor([num_seqs], dtype=torch.int32).to("xla")

kernel_output = torch.ops.xla.ragged_paged_attention(
q_xla,
@@ -773,7 +776,7 @@ def _verify_ragged_paged_attention_with_dynamo(
kv_lens_xla,
page_indices_xla,
cu_q_lens_xla,
- num_seqs=num_seqs,
+ num_seqs=num_seqs_xla,
num_kv_pages_per_block=num_kv_pages_per_block,
num_queries_per_block=num_queries_per_block,
use_kernel=True,
@@ -787,7 +790,7 @@ def _verify_ragged_paged_attention_with_dynamo(
kv_lens_xla,
page_indices_xla,
cu_q_lens_xla,
- num_seqs=num_seqs,
+ num_seqs=num_seqs_xla,
num_kv_pages_per_block=num_kv_pages_per_block,
num_queries_per_block=num_queries_per_block,
use_kernel=False,
@@ -805,6 +808,7 @@ def _verify_ragged_paged_attention_with_dynamo(
kv_lens_jax = jnp.array(kv_lens.numpy(), dtype=jnp.int32)
page_indices_jax = jnp.array(page_indices.numpy(), dtype=jnp.int32)
cu_q_lens_jax = jnp.array(cu_q_lens.numpy(), dtype=jnp.int32)
+ num_seqs_jax = jnp.array([num_seqs], dtype=jnp.int32)

from torch_xla.experimental.pallas_kernels.ragged_paged_attention_v2 import ragged_paged_attention as jax_ragged_paged_attention
jax_kernel_output = torch.from_numpy(
@@ -816,7 +820,7 @@ def _verify_ragged_paged_attention_with_dynamo(
kv_lens_jax,
page_indices_jax,
cu_q_lens_jax,
- num_seqs=num_seqs,
+ num_seqs=num_seqs_jax,
num_kv_pages_per_block=num_kv_pages_per_block,
num_queries_per_block=num_queries_per_block,
sm_scale=sm_scale,
@@ -867,7 +871,7 @@ def test_ragged_paged_attention_wrapper_no_padding_with_dynamo(self):

@parameterized.product(
seq_lens=[[(1, 1328), (5, 18), (500, 563)]],
- num_queries_per_block=[16, 64, 128],
+ num_queries_per_block=[16, 32],
)
@unittest.skipIf(xr.device_type() != 'TPU' or tpu.version() < 4,
"This test only works on TPUv4+.")
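Net effect on callers: `num_seqs` must now arrive as a one-element `i32` tensor on the device, as the test updates above show. A minimal sketch of the updated convention (assumes a working torch_xla install; values are illustrative):

```python
import torch
import torch_xla  # registers the "xla" device

num_seqs = 3  # host-side count of real sequences in this batch
# One-element i32 tensor instead of a Python int, so the value is a graph
# input rather than a constant baked into the compiled program.
num_seqs_xla = torch.tensor([num_seqs], dtype=torch.int32).to("xla")
```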
62 changes: 28 additions & 34 deletions torch_xla/experimental/custom_kernel.py
@@ -900,11 +900,11 @@ def validate_ragged_paged_attention_inputs(
kv_lens, # i32[max_num_seqs]
page_indices, # i32[max_num_seqs, pages_per_seq]
cu_q_lens, # i32[max_num_seqs + 1]
- num_seqs,  # i32
+ num_seqs,  # i32[1]
):
- max_num_batched_tokens, num_q_heads, head_dim = q.shape
- _, page_size, num_kv_heads, head_dim_k = k_pages.shape
- max_num_seqs, pages_per_seq = page_indices.shape
+ _, num_q_heads, head_dim = q.shape
+ _, _, num_kv_heads, head_dim_k = k_pages.shape
+ max_num_seqs, _ = page_indices.shape
if k_pages.shape != v_pages.shape:
raise ValueError(
f"{k_pages.shape=} and {v_pages.shape=} must have the same shape.")
@@ -918,9 +918,6 @@ def validate_ragged_paged_attention_inputs(
raise ValueError(
f"Expected {cu_q_lens.shape=} to be ({max_num_seqs + 1},) where"
" `max_num_seqs` is `page_indices.shape[0]`.")
- if max_num_seqs > max_num_batched_tokens:
-   raise ValueError(
-       f"{max_num_seqs=} must be less or equal to {max_num_batched_tokens=}.")
if (kv_lens.dtype != torch.int32 or page_indices.dtype != torch.int32 or
cu_q_lens.dtype != torch.int32):
raise ValueError(
@@ -931,24 +928,24 @@ def validate_ragged_paged_attention_inputs(
raise ValueError(f"{num_q_heads=} must be divisible by {num_kv_heads=}")

# Must check below on runtime!
- if num_seqs > max_num_seqs:
-   raise ValueError(f"{num_seqs=} must be less or equal to {max_num_seqs=}")
- max_kv_len = torch.max(kv_lens)
- min_pages_per_seq = ceil_div(max_kv_len, page_size)
- if pages_per_seq < min_pages_per_seq:
-   raise ValueError(
-       f"{pages_per_seq=} must be greater or equal to"
-       f" {min_pages_per_seq=} given {max_kv_len=} and {page_size=}.")
- if cu_q_lens[num_seqs] > max_num_batched_tokens:
-   raise ValueError(
-       f"Total q tokens {cu_q_lens[num_seqs]} must be less or equal to"
-       f" {max_num_batched_tokens=}.")
- for i in range(num_seqs):
-   q_len = cu_q_lens[i + 1] - cu_q_lens[i]
-   kv_len = kv_lens[i]
-   if q_len > kv_len:
-     raise ValueError(
-         f"{q_len=} must be less or equal to {kv_len=} at sequence {i}.")
+ # if num_seqs > max_num_seqs:
+ #   raise ValueError(f"{num_seqs=} must be less or equal to {max_num_seqs=}")
+ # max_kv_len = torch.max(kv_lens)
+ # min_pages_per_seq = ceil_div(max_kv_len, page_size)
+ # if pages_per_seq < min_pages_per_seq:
+ #   raise ValueError(
+ #       f"{pages_per_seq=} must be greater or equal to"
+ #       f" {min_pages_per_seq=} given {max_kv_len=} and {page_size=}.")
+ # if cu_q_lens[num_seqs] > max_num_batched_tokens:
+ #   raise ValueError(
+ #       f"Total q tokens {cu_q_lens[num_seqs]} must be less or equal to"
+ #       f" {max_num_batched_tokens=}.")
+ # for i in range(num_seqs):
+ #   q_len = cu_q_lens[i + 1] - cu_q_lens[i]
+ #   kv_len = kv_lens[i]
+ #   if q_len > kv_len:
+ #     raise ValueError(
+ #         f"{q_len=} must be less or equal to {kv_len=} at sequence {i}.")


def _ragged_paged_attention_nonkernel(
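The value checks above are kept as comments rather than deleted: with `num_seqs` and the length tensors living on the XLA device, each comparison would force a blocking device-to-host transfer on every call, which is exactly the synchronization the kernel path needs to avoid. A rough sketch of that cost (hypothetical helper, assumes an XLA device):

```python
import torch

def check_total_tokens(cu_q_lens: torch.Tensor, num_seqs: torch.Tensor,
                       max_num_batched_tokens: int) -> None:
  # bool() on a lazy XLA tensor cuts the traced graph and copies the scalar
  # back to the host, stalling the pipeline once per invocation.
  if bool(cu_q_lens[num_seqs] > max_num_batched_tokens):
    raise ValueError("total q tokens exceed max_num_batched_tokens")
```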
@@ -1001,7 +998,7 @@ def ragged_paged_attention(
kv_lens, # i32[max_num_seqs]
page_indices, # i32[max_num_seqs, pages_per_seq]
cu_q_lens, # i32[max_num_seqs + 1]
- num_seqs,  # i32
+ num_seqs,  # i32[1]
*,
sm_scale=1.0,
mask_value=None,
@@ -1022,7 +1019,7 @@ def ragged_paged_attention(
kv_lens,
page_indices,
cu_q_lens,
- num_seqs,
+ num_seqs.item(),
sm_scale=sm_scale,
mask_value=mask_value,
)
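The eager, non-kernel reference path genuinely needs a host integer, so it pays a single synchronization via `.item()`; this stays off the compiled kernel path. A one-line sketch of that boundary (hypothetical helper):

```python
import torch

def host_num_seqs(num_seqs: torch.Tensor) -> int:
  # Blocking device-to-host copy; acceptable on the debug/reference path only.
  return int(num_seqs.item())
```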
@@ -1054,17 +1051,14 @@ def ragged_paged_attention(
],
)

- num_q_blks = ceil_div(cu_q_lens[num_seqs], num_queries_per_block)
seq_buf_idx = torch.tensor([0, 0], dtype=torch.int32).to("xla")
- num_seqs_ref = torch.tensor([num_seqs], dtype=torch.int32).to("xla")
output = torch_xla._XLAC._xla_tpu_custom_call(
[
- num_q_blks,
kv_lens,
page_indices,
cu_q_lens,
seq_buf_idx,
- num_seqs_ref,
+ num_seqs,
q,
k_pages,
v_pages,
@@ -1733,7 +1727,7 @@ def multi_queries_paged_attention_non_xla(q: torch.Tensor,

XLA_LIB.define(
"ragged_paged_attention(Tensor q, Tensor k_pages, Tensor v_pages, Tensor kv_lens, Tensor page_indices, "
"Tensor cu_q_lens, int num_seqs, int num_kv_pages_per_block, int num_queries_per_block, bool use_kernel, "
"Tensor cu_q_lens, Tensor num_seqs, int num_kv_pages_per_block, int num_queries_per_block, bool use_kernel, "
"float sm_scale=1.0, float? mask_value=None, int? vmem_limit_bytes=None) -> Tensor",
)
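Switching the schema from `int num_seqs` to `Tensor num_seqs` is what lets Dynamo treat the value as a traced graph input instead of a specialization constant: an `int` argument yields one compiled graph per distinct value. A minimal sketch of the pattern with a hypothetical op namespace:

```python
import torch

lib = torch.library.Library("demo", "DEF")  # hypothetical namespace
# Tensor arguments are traced symbolically; int arguments are burned into
# the graph, so each new value would mean a new graph.
lib.define("attn(Tensor q, Tensor num_seqs) -> Tensor")
```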

Expand All @@ -1746,7 +1740,7 @@ def ragged_paged_attention_xla(
kv_lens: torch.Tensor,
page_indices: torch.Tensor,
cu_q_lens: torch.Tensor,
- num_seqs: int,
+ num_seqs: torch.Tensor,
num_kv_pages_per_block: int,
num_queries_per_block: int,
use_kernel: bool,
@@ -1777,7 +1771,7 @@ def ragged_paged_attention_non_xla(q: torch.Tensor,
kv_lens: torch.Tensor,
page_indices: torch.Tensor,
cu_q_lens: torch.Tensor,
- num_seqs: int,
+ num_seqs: torch.Tensor,
num_kv_pages_per_block: int,
num_queries_per_block: int,
use_kernel: bool,
17 changes: 7 additions & 10 deletions torch_xla/experimental/pallas_kernels/ragged_paged_attention_v2.py
@@ -151,9 +151,6 @@ def check_inputs_shapes(
raise ValueError(
f"Expected {cu_q_lens.shape=} to be ({max_num_seqs + 1},) where"
" `max_num_seqs` is `page_indices.shape[0]`.")
- if max_num_seqs > max_num_batched_tokens:
-   raise ValueError(
-       f"{max_num_seqs=} must be less or equal to {max_num_batched_tokens=}.")
if (kv_lens.dtype != jnp.int32 or page_indices.dtype != jnp.int32 or
cu_q_lens.dtype != jnp.int32):
raise ValueError(
@@ -253,7 +250,9 @@ def prefetch_first_kv_blk():

def is_cur_q_blk_needed(q_states):
done, cur_seq_idx, _ = q_states
- return jnp.logical_and(done == 0, cur_seq_idx < num_seqs)
+ should_run = jnp.logical_and(q_len_start < cu_q_lens_ref[num_seqs],
+                              cur_seq_idx < num_seqs)
+ return jnp.logical_and(done == 0, should_run)

def compute_with_cur_q_blk(q_states):
done, cur_seq_idx, cur_buf_idx = q_states
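With the grid now sized for the worst case, the guard above makes surplus blocks exit immediately: a block runs only while its query rows fall inside the valid range ending at `cu_q_lens[num_seqs]`. A standalone sketch of this fixed-grid, early-exit pattern (hypothetical shapes, not the kernel itself):

```python
import jax
import jax.numpy as jnp

def count_active_blocks(total_tokens, blk_size: int, num_blks: int):
  """Run the body only for blocks that overlap the valid token range."""

  def cond(state):
    blk_idx, _ = state
    return jnp.logical_and(blk_idx < num_blks,
                           blk_idx * blk_size < total_tokens)

  def body(state):
    blk_idx, n_run = state
    return blk_idx + 1, n_run + 1  # stand-in for one block of attention work

  _, n_run = jax.lax.while_loop(cond, body, (0, 0))
  return n_run

# One compilation covers every token count up to the static maximum:
fn = jax.jit(count_active_blocks, static_argnums=(1, 2))
print(fn(jnp.int32(37), 16, 8))  # -> 3 active blocks out of 8
```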
@@ -551,7 +550,7 @@ def ragged_paged_attention(
kv_lens: jax.Array, # i32[max_num_seqs]
page_indices: jax.Array, # i32[max_num_seqs, pages_per_seq]
cu_q_lens: jax.Array, # i32[max_num_seqs + 1]
- num_seqs,  # i32
+ num_seqs,  # i32[1]
*,
sm_scale: float = 1.0,
mask_value: float = DEFAULT_MASK_VALUE,
@@ -583,12 +582,12 @@ def ragged_paged_attention(
The output of the attention.
"""
check_inputs_shapes(q, k_pages, v_pages, kv_lens, page_indices, cu_q_lens)
- _, num_q_heads, head_dim = q.shape
+ num_q, num_q_heads, head_dim = q.shape
_, page_size, num_kv_heads, _ = k_pages.shape
num_q_per_blk = num_queries_per_block
num_kv_pages_per_blk = num_kv_pages_per_block
num_q_heads_per_kv_head = num_q_heads // num_kv_heads
- num_q_blks = ceil_div(cu_q_lens[num_seqs], num_q_per_blk)
+ num_q_blks = ceil_div(num_q, num_q_per_blk)
num_q_heads_per_blk, num_kv_heads_per_blk = get_min_heads_per_blk(
num_q_heads, num_kv_heads, q.dtype, k_pages.dtype)
assert num_q_heads_per_blk % num_q_heads_per_kv_head == 0
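`num_q` here is `q.shape[0]`, i.e. the static `max_num_batched_tokens`, so `num_q_blks` is a plain Python int at trace time and the `pallas_call` grid no longer depends on batch contents. For reference, `ceil_div` is ordinary rounding-up integer division, roughly (a sketch matching its use here, not the file's actual helper):

```python
def ceil_div(a: int, b: int) -> int:
  # ceil(a / b) without floating point, e.g. ceil_div(37, 16) == 3
  return (a + b - 1) // b

assert ceil_div(37, 16) == 3
```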
@@ -636,9 +635,7 @@ def q_index_map(heads_blk_idx, q_blk_idx, *_):
page_indices,
cu_q_lens,
jnp.array((0, 0), jnp.int32), # seq_idx, buf_idx
- # Mosaic only takes dynamic scalar as ref, so we wrap it.
- jnp.array([num_seqs], jnp.int32),  # num_seqs
- )
+ num_seqs)
kernel = pl.pallas_call(
functools.partial(
ragged_paged_attention_kernel,