
Commit b50dbe2

feat: support ascend moe_gating_topk_softmax
1 parent eec97b1 commit b50dbe2

7 files changed (+39, −19 lines)


lmdeploy/pytorch/kernels/ascend/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -3,6 +3,7 @@
 from .apply_rotary_pos_emb import apply_rotary_pos_emb
 from .fill_kv_cache import fill_kv_cache
 from .fused_rotary_emb import fused_rotary_emb
+from .moe_gating_topk_softmax import moe_gating_topk_softmax
 from .paged_attention_fwd import paged_attention_fwd
 from .rms_norm import rms_norm

@@ -12,5 +13,6 @@
     'fused_rotary_emb',
     'fill_kv_cache',
     'paged_attention_fwd',
+    'moe_gating_topk_softmax',
     'multinomial_sampling',
 ]

lmdeploy/pytorch/kernels/ascend/apply_rotary_pos_emb.py

Lines changed: 2 additions & 4 deletions
@@ -26,10 +26,8 @@ def apply_rotary_pos_emb(
         setattr(context, 'sin', sin)
     cached_cos = context.cos if context else cos
     cached_sin = context.sin if context else sin
-    ext_ops.apply_rotary_pos_emb(
-        query_states_reshaped, key_states_reshaped, cached_cos, cached_sin,
-        None, None, None
-    )
+    ext_ops.apply_rotary_pos_emb(query_states_reshaped, key_states_reshaped,
+                                 cached_cos, cached_sin, None, None, None)
     if q_embed is None:
         q_embed = query_states
     else:

lmdeploy/pytorch/kernels/ascend/fill_kv_cache.py

Lines changed: 2 additions & 2 deletions
@@ -16,5 +16,5 @@ def fill_kv_cache(
     context: None,
 ):
     """fill key/value state to cache for paged attention."""
-    ext_ops.fill_kv_cache(key_states, value_states, key_caches,
-                          value_caches, context.kv_start_indices)
+    ext_ops.fill_kv_cache(key_states, value_states, key_caches, value_caches,
+                          context.kv_start_indices)

lmdeploy/pytorch/kernels/ascend/fused_rotary_emb.py

Lines changed: 6 additions & 4 deletions
@@ -21,10 +21,12 @@ def fused_rotary_emb(
     position_ids = position_ids.squeeze(0).unsqueeze(-1)
     pos_freq = position_ids / scaling_factor * inv_freq
     if not (hasattr(context, 'cos') or hasattr(context, 'sin')):
-        cos = (torch.cos(pos_freq).view(batch, seqlen, 1, -1)
-               .repeat(1, 1, 1, 2).to(query_states.dtype))
-        sin = (torch.sin(pos_freq).view(batch, seqlen, 1, -1)
-               .repeat(1, 1, 1, 2).to(query_states.dtype))
+        cos = (torch.cos(pos_freq).view(batch, seqlen, 1,
+                                        -1).repeat(1, 1, 1,
+                                                   2).to(query_states.dtype))
+        sin = (torch.sin(pos_freq).view(batch, seqlen, 1,
+                                        -1).repeat(1, 1, 1,
+                                                   2).to(query_states.dtype))
     if context:
         setattr(context, 'cos', cos)
         setattr(context, 'sin', sin)

lmdeploy/pytorch/kernels/ascend/moe_gating_topk_softmax.py

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import infer_ext.ops as ext_ops
+import torch
+from torch import Tensor
+
+
+def moe_gating_topk_softmax(router_logits: Tensor, topk: int):
+    routing_weights, selected_experts = ext_ops.moe_gating_topk_softmax(
+        router_logits, topk)
+    return routing_weights.to(torch.float32), selected_experts.to(torch.int64)
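
For context, the new ascend wrapper above just forwards to infer_ext.ops.moe_gating_topk_softmax and casts its outputs to float32 weights and int64 expert indices. A minimal pure-PyTorch sketch of what a MoE gating top-k softmax is commonly assumed to compute follows; the softmax-before-top-k ordering and the absence of weight re-normalization are illustrative assumptions, not something this commit specifies, and moe_gating_topk_softmax_ref is a hypothetical name.

# Hypothetical pure-PyTorch reference (not part of this commit): softmax over
# the expert dimension followed by top-k selection, matching the output dtypes
# of the ascend wrapper above (float32 weights, int64 expert ids).
import torch


def moe_gating_topk_softmax_ref(router_logits: torch.Tensor, topk: int):
    # [num_tokens, num_experts] logits -> per-token expert probabilities
    routing_weights = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
    # keep the k largest probabilities and the ids of the chosen experts
    routing_weights, selected_experts = torch.topk(routing_weights, topk, dim=-1)
    return routing_weights, selected_experts.to(torch.int64)

If the ascend op additionally re-normalizes the selected weights to sum to one, as some MoE routers do, that step would need to be added to this sketch.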

lmdeploy/pytorch/kernels/ascend/paged_attention_fwd.py

Lines changed: 12 additions & 9 deletions
@@ -21,7 +21,7 @@ def flash_context_attention(
 ):
     num_q_heads, dim = query_states.shape[1:3]
     num_kv_heads = value_states.shape[1]
-    batch = q_start_loc.shape[0]
+    batch = q_start_loc.shape[0]

     for i in range(batch):
         if torch.equal(q_seq_len[i], kv_seq_len[i]):
@@ -30,30 +30,32 @@ def flash_context_attention(
                 query_states,
                 key_states,
                 value_states,
-                q_start_loc[i:i+1],
-                q_seq_len[i:i+1],
+                q_start_loc[i:i + 1],
+                q_seq_len[i:i + 1],
                 num_q_heads,
                 num_kv_heads,
-                context.attention_mask[i:i+1],
+                context.attention_mask[i:i + 1],
             )
         else:
             key_cache = key_cache.reshape(1, kv_cache_len, num_kv_heads * dim)
-            value_cache = value_cache.reshape(1, kv_cache_len, num_kv_heads * dim)
+            value_cache = value_cache.reshape(1, kv_cache_len,
+                                              num_kv_heads * dim)
             ext_ops.paged_prefill_attention(
                 attn_output,
                 query_states,
                 key_cache,
                 value_cache,
                 block_offsets,
                 block_size,
-                q_start_loc[i:i+1],
-                q_seq_len[i:i+1],
-                kv_seq_len[i:i+1],
+                q_start_loc[i:i + 1],
+                q_seq_len[i:i + 1],
+                kv_seq_len[i:i + 1],
                 num_q_heads,
                 num_kv_heads,
-                context.attention_mask[i:i+1],
+                context.attention_mask[i:i + 1],
             )

+
 def paged_token_attention(q, k_cache, v_cache, attn_output, kv_seq_len,
                           block_offsets, block_size):
     num_kv_heads, num_q_heads = k_cache.shape[1], q.shape[1]
@@ -69,6 +71,7 @@ def paged_token_attention(q, k_cache, v_cache, attn_output, kv_seq_len,
         num_kv_heads,
     )

+
 def paged_attention_fwd(
     query_states: Tensor,
     key_states: torch.Tensor,
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .dispatcher import FunctionDispatcher
+
+moe_gating_topk_softmax = FunctionDispatcher(
+    'moe_gating_topk_softmax').make_caller()
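
This second new file wires the kernel into the dispatcher layer: FunctionDispatcher('moe_gating_topk_softmax').make_caller() returns a caller that resolves to a backend-specific implementation (such as the ascend kernel added above) when invoked. Below is a deliberately simplified, hypothetical sketch of that dispatch-by-name pattern, not lmdeploy's actual FunctionDispatcher; the _REGISTRY, register, and make_caller names are illustrative only.

# Hypothetical illustration of dispatch-by-name (not lmdeploy's FunctionDispatcher).
# A registry maps (device, op name) to an implementation; the caller returned by
# make_caller looks the implementation up and forwards all arguments to it.
from typing import Callable, Dict, Tuple

_REGISTRY: Dict[Tuple[str, str], Callable] = {}


def register(device: str, name: str):
    def wrap(fn: Callable) -> Callable:
        _REGISTRY[(device, name)] = fn
        return fn

    return wrap


def make_caller(name: str, device: str) -> Callable:
    def caller(*args, **kwargs):
        # resolve lazily so registration order does not matter
        return _REGISTRY[(device, name)](*args, **kwargs)

    return caller

In the real code the active device is determined by the runtime backend rather than being passed explicitly; that detail is omitted here.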
