@@ -523,14 +523,16 @@ def build(self,
         head_dim = self.kv_cache_spec.head_size

         # currently prefill trtllm attention does not support fp8 kv cache
-        prefill_use_trtllm = not cache_dtype.startswith(
-            "fp8") and use_trtllm_attention(num_prefill_tokens, max_seq_len,
-                                            cache_dtype, num_qo_heads,
-                                            num_kv_heads, head_dim)
-        decode_use_trtllm = use_trtllm_attention(num_decode_tokens,
-                                                 max_seq_len, cache_dtype,
-                                                 num_qo_heads, num_kv_heads,
-                                                 head_dim)
+        # trtllm attention does not support sliding window
+        prefill_use_trtllm = (self.global_hyperparameters.window_left == -1
+                              and not cache_dtype.startswith("fp8")
+                              and use_trtllm_attention(
+                                  num_prefill_tokens, max_seq_len, cache_dtype,
+                                  num_qo_heads, num_kv_heads, head_dim))
+        decode_use_trtllm = (self.global_hyperparameters.window_left == -1
+                             and use_trtllm_attention(
+                                 num_decode_tokens, max_seq_len, cache_dtype,
+                                 num_qo_heads, num_kv_heads, head_dim))

         attn_metadata = FlashInferMetadata(
             num_actual_tokens=num_actual_tokens,
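For review context, the new conditions factor into a pure predicate: trtllm kernels are eligible only when sliding window is disabled (`window_left == -1`), and the prefill path additionally rejects fp8 KV caches before falling through to the existing `use_trtllm_attention` heuristic. A minimal standalone sketch of that gating follows; `can_use_trtllm` is a hypothetical helper, and `use_trtllm_attention` is stubbed out here (only its signature is taken from the diff), so this is an illustration, not vLLM's actual API:

```python
def use_trtllm_attention(num_tokens: int, max_seq_len: int, cache_dtype: str,
                         num_qo_heads: int, num_kv_heads: int,
                         head_dim: int) -> bool:
    # Stand-in for vLLM's real heuristic (kernel availability, head layout,
    # sequence-length limits, ...), stubbed so this sketch runs on its own.
    return True


def can_use_trtllm(window_left: int, cache_dtype: str, is_prefill: bool,
                   num_tokens: int, max_seq_len: int, num_qo_heads: int,
                   num_kv_heads: int, head_dim: int) -> bool:
    """Hypothetical helper mirroring the diff's boolean gating."""
    # trtllm attention lacks sliding-window support; window_left == -1
    # means sliding window is disabled.
    if window_left != -1:
        return False
    # Prefill additionally cannot use an fp8 KV cache ("fp8", "fp8_e4m3", ...).
    if is_prefill and cache_dtype.startswith("fp8"):
        return False
    return use_trtllm_attention(num_tokens, max_seq_len, cache_dtype,
                                num_qo_heads, num_kv_heads, head_dim)


# Sliding window enabled (window_left=1024) disables trtllm for both phases:
assert not can_use_trtllm(1024, "auto", True, 512, 8192, 32, 8, 128)
# An fp8 KV cache blocks only the prefill path:
assert not can_use_trtllm(-1, "fp8_e4m3", True, 512, 8192, 32, 8, 128)
assert can_use_trtllm(-1, "fp8_e4m3", False, 512, 8192, 32, 8, 128)
```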