Skip to content

Commit 7857a75

Browse files
committed
Merge remote-tracking branch 'upstream/develop' into develop+simplify_pd_config_1206
2 parents 3e6fe8b + c9b47f9 commit 7857a75

31 files changed

Lines changed: 1951 additions & 966 deletions
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
num_gpu_blocks_override: 1024
2+
max_model_len: 8192
3+
max_num_seqs: 64
4+
data_parallel_size: 4
5+
tensor_parallel_size: 1
6+
enable_expert_parallel: True
7+
quantization: wint4
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
max_model_len: 32768
2+
max_num_seqs: 256
3+
gpu_memory_utilization: 0.9
4+
kv_cache_ratio: 0.8
5+
tensor_parallel_size: 4
6+
cache_queue_port: 55663
7+
enable_chunked_prefill: True
8+
splitwise_role: decode
9+
engine_worker_queue_port: 6678
10+
cache_transfer_protocol: "rdma,ipc"
11+
rdma_comm_ports: "7671,7672,7673,7674"
12+
pd_comm_port: "2334"
13+
max_num_batched_tokens: 384
14+
max_num_partial_prefills: 3
15+
max_long_partial_prefills: 3
16+
quantization: wint4
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
max_model_len: 32768
2+
max_num_seqs: 16
3+
gpu_memory_utilization: 0.9
4+
kv_cache_ratio: 0.9
5+
tensor_parallel_size: 4
6+
splitwise_role: prefill
7+
enable_prefix_caching: True
8+
cache_queue_port: 55664
9+
engine_worker_queue_port: 6677
10+
cache_transfer_protocol: "rdma,ipc"
11+
rdma_comm_ports: "7675,7676,7677,7678"
12+
pd_comm_port: "2333"
13+
quantization: wint4

custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ __global__ void append_speculate_cache_T_rope_qk_norm_kernel(
3131
const int* __restrict__ batch_id_per_token, // [num_tokens]
3232
const int* __restrict__ cu_seqlens_q,
3333
const int* __restrict__ seq_lens_decoder, // [bsz]
34+
const int* __restrict__ seq_lens_encoder, // [bsz]
3435
const float* __restrict__ cos_emb,
3536
const float* __restrict__ sin_emb,
3637
const float*
@@ -75,7 +76,7 @@ __global__ void append_speculate_cache_T_rope_qk_norm_kernel(
7576

7677
const int ori_bi = batch_id_per_token[token_id];
7778
if (ori_bi == -1) continue; // NOTE(gongshaotian): For CUDAGraph padding
78-
if (seq_lens_decoder[ori_bi] == 0) continue;
79+
if (seq_lens_encoder[ori_bi] > 0) continue;
7980
const int bias = linear_index % hidden_size;
8081
const int hi = bias / head_size; // q + k + v
8182
const int h_bias = bias % head_size;
@@ -87,7 +88,7 @@ __global__ void append_speculate_cache_T_rope_qk_norm_kernel(
8788
const int* block_table_now = block_tables + ori_bi * max_blocks_per_seq;
8889
const int block_idx = block_table_now[write_seq_id / block_size];
8990
if (block_idx < 0) {
90-
return; // NOTE(gongshaotian): For CUDAGraph padding
91+
continue; // NOTE(gongshaotian): For CUDAGraph padding
9192
}
9293
const int block_offset = write_seq_id % block_size;
9394

@@ -343,6 +344,7 @@ __global__ void append_speculate_cache_rope_kernel(
343344
const int* __restrict__ batch_id_per_token, // [num_tokens]
344345
const int* __restrict__ cu_seqlens_q,
345346
const int* __restrict__ seq_lens_decoder, // [bsz]
347+
const int* __restrict__ seq_lens_encoder, // [bsz]
346348
const float* __restrict__ cos_emb,
347349
const float* __restrict__ sin_emb,
348350
const float*
@@ -380,7 +382,7 @@ __global__ void append_speculate_cache_rope_kernel(
380382
const int ori_bi = batch_id_per_token[token_id];
381383
if (ori_bi == -1) continue; // NOTE(gongshaotian): For CUDAGraph padding
382384

383-
if (seq_lens_decoder[ori_bi] == 0) continue;
385+
if (seq_lens_encoder[ori_bi] > 0) continue;
384386
const int bias = linear_index % hidden_size;
385387
const int hi = bias / head_size; // q + k + v
386388
const int h_bias = bias % head_size;
@@ -392,7 +394,7 @@ __global__ void append_speculate_cache_rope_kernel(
392394
const int* block_table_now = block_tables + ori_bi * max_blocks_per_seq;
393395
const int block_idx = block_table_now[write_seq_id / block_size];
394396
if (block_idx < 0) {
395-
return; // NOTE(gongshaotian): For CUDAGraph padding
397+
continue; // NOTE(gongshaotian): For CUDAGraph padding
396398
}
397399
const int block_offset = write_seq_id % block_size;
398400

@@ -473,6 +475,7 @@ __global__ void append_speculate_cache_neox_rope_kernel(
473475
const int* __restrict__ batch_id_per_token, // [num_tokens]
474476
const int* __restrict__ cu_seqlens_q,
475477
const int* __restrict__ seq_lens_decoder, // [bsz]
478+
const int* __restrict__ seq_lens_encoder, // [bsz]
476479
const float* __restrict__ cos_emb,
477480
const float* __restrict__ sin_emb,
478481
const float*
@@ -509,7 +512,7 @@ __global__ void append_speculate_cache_neox_rope_kernel(
509512
const int token_id = linear_index / half_hidden_size;
510513
const int ori_bi = batch_id_per_token[token_id];
511514
if (ori_bi == -1) continue; // NOTE(gongshaotian): For CUDAGraph padding
512-
if (seq_lens_decoder[ori_bi] == 0) continue;
515+
if (seq_lens_encoder[ori_bi] > 0) continue;
513516
const int bias = linear_index % half_hidden_size;
514517
const int hi = bias / half_head_size; // q + k + v
515518
const int h_bias = bias % half_head_size;
@@ -521,7 +524,7 @@ __global__ void append_speculate_cache_neox_rope_kernel(
521524
const int* block_table_now = block_tables + ori_bi * max_blocks_per_seq;
522525
const int block_idx = block_table_now[write_seq_id / block_size];
523526
if (block_idx < 0) {
524-
return; // NOTE(gongshaotian): For CUDAGraph padding
527+
continue; // NOTE(gongshaotian): For CUDAGraph padding
525528
}
526529
const int block_offset = write_seq_id % block_size;
527530

custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ void append_speculate_cache_rope_qk_norm(const QKV_TYPE* qkv,
6767
batch_id_per_token,
6868
cu_seqlens_q,
6969
seq_lens,
70+
seq_lens_encoder,
7071
cos_emb,
7172
sin_emb,
7273
qkv_out_scales,
@@ -134,6 +135,7 @@ void append_speculate_cache_rope(const QKV_TYPE* qkv,
134135
batch_id_per_token,
135136
cu_seqlens_q,
136137
seq_lens,
138+
seq_lens_encoder,
137139
cos_emb,
138140
sin_emb,
139141
qkv_out_scales,
@@ -158,6 +160,7 @@ void append_speculate_cache_rope(const QKV_TYPE* qkv,
158160
batch_id_per_token,
159161
cu_seqlens_q,
160162
seq_lens,
163+
seq_lens_encoder,
161164
cos_emb,
162165
sin_emb,
163166
qkv_out_scales,

0 commit comments

Comments
 (0)