Commit 30d55b9

Merge pull request #185 from SmallDoges/fix-183: Implement variable-length attention with mask and bias support

2 parents: b6d2ea7 + e3ff84c
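
For orientation, the headline change is variable-length attention with mask and bias support. The sketch below shows the packed (varlen) layout such kernels typically consume: tokens from all sequences concatenated into one flat tensor plus a cu_seqlens prefix-sum tensor. This is illustrative background only, not code from this commit; the actual flash_dmattn varlen entry point and its signature live in the sources changed below.

import torch

batch_size, max_seq_len, num_heads, head_dim = 2, 8, 2, 64
device, dtype = torch.device("cuda"), torch.bfloat16

# Per-sequence valid lengths; the second sequence is full length
seqlens = torch.tensor([5, 8], device=device, dtype=torch.int32)

# Boolean padding mask: True where a token is real, False where it is padding
padding_mask = torch.arange(max_seq_len, device=device)[None, :] < seqlens[:, None]

# A padded batch of queries (keys/values are packed the same way)
q = torch.randn(batch_size, max_seq_len, num_heads, head_dim, device=device, dtype=dtype)

# cu_seqlens: cumulative sequence lengths, shape (batch_size + 1,)
cu_seqlens = torch.zeros(batch_size + 1, device=device, dtype=torch.int32)
cu_seqlens[1:] = torch.cumsum(seqlens, dim=0)

# Pack: drop padded positions so every real token sits in one flat tensor
q_packed = q[padding_mask]  # (total_tokens, num_heads, head_dim)
assert q_packed.shape[0] == int(cu_seqlens[-1])

# q_packed and cu_seqlens are the varlen-layout inputs such kernels consume;
# how this commit threads the mask and bias through that layout is defined in
# csrc/flash_dmattn/flash_api.cpp and the Python wrappers changed here.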

File tree

11 files changed: +1251 / -489 lines


README.md

Lines changed: 3 additions & 3 deletions

@@ -157,7 +157,7 @@ import math
 
 # Setup
 batch_size, seq_len, num_heads, num_kv_heads, head_dim = 1, 256, 2, 1, 64
-keep_window_size = 128
+window_size = 128
 device = torch.device('cuda')
 dtype = torch.bfloat16
 min_dtype = torch.finfo(dtype).min  # dtype minimum value
@@ -172,10 +172,10 @@ attention_mask = torch.ones(batch_size, num_kv_heads, seq_len, seq_len, device=d
 attention_bias = torch.randn(batch_size, num_kv_heads, seq_len, seq_len, device=device, dtype=dtype)
 
 # Generate sparse mask based on bias
-if seq_len > keep_window_size:
+if seq_len > window_size:
     # Select top-k most important keys for each query
     topk_values, topk_indices = torch.topk(
-        attention_bias, keep_window_size, dim=-1,
+        attention_bias, window_size, dim=-1,
         largest=True, sorted=False
     )
     # Generate valid top-k mask
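
The diffed snippet stops at the comment that builds the valid top-k mask. For readers following along, here is a self-contained sketch of the full pattern with the renamed window_size variable; everything after the torch.topk call is an assumption about how the README completes the mask (scattering ones at the top-k positions), not text from this diff.

import torch

# Setup, mirroring the README snippet above
batch_size, seq_len, num_heads, num_kv_heads, head_dim = 1, 256, 2, 1, 64
window_size = 128
device = torch.device('cuda')
dtype = torch.bfloat16

attention_bias = torch.randn(batch_size, num_kv_heads, seq_len, seq_len, device=device, dtype=dtype)

# Generate sparse mask based on bias
if seq_len > window_size:
    # Select top-k most important keys for each query
    topk_values, topk_indices = torch.topk(
        attention_bias, window_size, dim=-1,
        largest=True, sorted=False
    )
    # Assumed completion: scatter ones at the top-k positions so each query
    # can attend to at most window_size keys; all other positions stay 0.
    attention_mask = torch.zeros_like(attention_bias)
    attention_mask.scatter_(-1, topk_indices, 1.0)
else:
    # Short sequences keep dense attention
    attention_mask = torch.ones(batch_size, num_kv_heads, seq_len, seq_len, device=device, dtype=dtype)

With seq_len = 256 and window_size = 128, each query position ends up with exactly 128 keys marked 1 in attention_mask, which is the kind of bias-driven sparsity pattern the kernel is fed.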

README_zh.md

Lines changed: 3 additions & 3 deletions

The same rename applied to the Chinese README; the code is identical and only the comments are in Chinese.

@@ -157,7 +157,7 @@ import math
 
 # 设置
 batch_size, seq_len, num_heads, num_kv_heads, head_dim = 1, 256, 2, 1, 64
-keep_window_size = 128
+window_size = 128
 device = torch.device('cuda')
 dtype = torch.bfloat16
 min_dtype = torch.finfo(dtype).min  # dtype 的最小值
@@ -172,10 +172,10 @@ attention_mask = torch.ones(batch_size, num_kv_heads, seq_len, seq_len, device=d
 attention_bias = torch.randn(batch_size, num_kv_heads, seq_len, seq_len, device=device, dtype=dtype)
 
 # 基于 bias 生成稀疏 mask
-if seq_len > keep_window_size:
+if seq_len > window_size:
     # 为每个查询选择 top-k 最重要的键
     topk_values, topk_indices = torch.topk(
-        attention_bias, keep_window_size, dim=-1,
+        attention_bias, window_size, dim=-1,
         largest=True, sorted=False
     )
     # 生成有效的 top-k mask

csrc/flash_dmattn/flash_api.cpp

Lines changed: 200 additions & 204 deletions
Large diffs are not rendered by default.
