
Commit 249644c

[Inference] Replace Attention layer and MLP layer with shardformer to optimize the weight transpose operation; add fused_qkv and fused linear_add (#5340)
* add fused qkv
* replace attn and mlp by shardformer
* fix bugs in mlp
* add docstrings
* fix test_inference_engine.py
* add optimize unbind
* add fused_addmm
* rm squeeze(1)
* refactor codes
* fix ci bugs
* rename ShardFormerLlamaMLP and ShardFormerLlamaAttention
* Removed the dependency on LlamaFlashAttention2
* rollback test_inference_engine.py
1 parent f8e456d commit 249644c
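The headline optimization is that the q/k/v projection weights are fused and pre-transposed once at load time, so the decode path avoids repeated per-step weight transposes. Below is a minimal sketch of the fused-QKV idea, assuming standard multi-head attention with equal q/k/v projection sizes; the helper names are illustrative, not the commit's actual code.

import torch


def fuse_qkv(q_proj: torch.nn.Linear, k_proj: torch.nn.Linear, v_proj: torch.nn.Linear) -> torch.Tensor:
    # Stack the three [out_features, hidden_size] weights and transpose once up front,
    # yielding a [3, hidden_size, out_features] buffer that is matmul-ready at decode time.
    return torch.stack([q_proj.weight, k_proj.weight, v_proj.weight], dim=0).transpose(1, 2).contiguous()


def fused_qkv_forward(hidden_states: torch.Tensor, qkv_weight: torch.Tensor):
    # hidden_states: [num_tokens, hidden_size]; one broadcasted matmul produces Q, K and V together.
    qkv = torch.matmul(hidden_states, qkv_weight)  # [3, num_tokens, out_features]
    query, key, value = torch.unbind(qkv, dim=0)   # cf. the "add optimize unbind" item above
    return query, key, value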

File tree

8 files changed: +510 −341 lines changed

colossalai/inference/modeling/models/nopadding_llama.py

Lines changed: 232 additions & 72 deletions
Large diffs are not rendered by default.

colossalai/inference/modeling/models/padding_llama.py

Lines changed: 223 additions & 100 deletions
Large diffs are not rendered by default.

colossalai/inference/modeling/policy/nopadding_llama.py

Lines changed: 26 additions & 34 deletions
@@ -1,25 +1,18 @@
 from functools import partial
 
 import torch
-from transformers.models.llama.modeling_llama import (
-    LlamaAttention,
-    LlamaDecoderLayer,
-    LlamaFlashAttention2,
-    LlamaForCausalLM,
-    LlamaMLP,
-    LlamaModel,
-    LlamaRMSNorm,
-    LlamaSdpaAttention,
-)
+from torch.nn import Parameter
+from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaForCausalLM, LlamaModel, LlamaRMSNorm
 
 from colossalai.inference.modeling.models.nopadding_llama import (
-    llama_attn_forward,
+    NopadLlamaAttention,
+    NopadLlamaMLP,
     llama_causal_lm_forward,
     llama_decoder_layer_forward,
     llama_model_forward,
-    nopad_mlp,
 )
 from colossalai.inference.utils import init_to_get_rotary
+from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, SubModuleReplacementDescription
 
 # import colossalai
 from colossalai.shardformer.policies.llama import LlamaForCausalLMPolicy

@@ -50,6 +43,27 @@ def __init__(self) -> None:
 
     def module_policy(self):
         policy = super().module_policy()
+
+        decoder_attribute_replacement = {
+            "lm_head.weight": Parameter(self.model.lm_head.weight.transpose(0, 1), requires_grad=False),
+        }
+        policy[LlamaForCausalLM] = ModulePolicyDescription(
+            attribute_replacement=decoder_attribute_replacement,
+        )
+
+        policy[LlamaDecoderLayer] = ModulePolicyDescription(
+            sub_module_replacement=[
+                SubModuleReplacementDescription(
+                    suffix="mlp",
+                    target_module=NopadLlamaMLP,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="self_attn",
+                    target_module=NopadLlamaAttention,
+                ),
+            ]
+        )
+
         self.shard_config._infer()
 
         infer_forward = llama_causal_lm_forward

@@ -68,28 +82,6 @@ def module_policy(self):
             description=method_replacement, policy=policy, target_key=LlamaDecoderLayer
         )
 
-        infer_forward = nopad_mlp
-        method_replacement = {"forward": partial(infer_forward)}
-        self.append_or_create_method_replacement(description=method_replacement, policy=policy, target_key=LlamaMLP)
-
-        infer_forward = llama_attn_forward
-        method_replacement = {"forward": partial(infer_forward)}
-        self.append_or_create_method_replacement(
-            description=method_replacement, policy=policy, target_key=LlamaAttention
-        )
-
-        infer_forward = llama_attn_forward
-        method_replacement = {"forward": partial(infer_forward)}
-        self.append_or_create_method_replacement(
-            description=method_replacement, policy=policy, target_key=LlamaFlashAttention2
-        )
-
-        infer_forward = llama_attn_forward
-        method_replacement = {"forward": partial(infer_forward)}
-        self.append_or_create_method_replacement(
-            description=method_replacement, policy=policy, target_key=LlamaSdpaAttention
-        )
-
         infer_forward = None
         if HAS_TRITON_RMSNORM:
             infer_forward = get_triton_rmsnorm_forward()
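The attribute_replacement above stores lm_head.weight already transposed to [hidden_size, vocab_size], so the forward pass can run a plain torch.mm instead of transposing the weight on every decoding step; the "fused_addmm" item in the commit message follows the same pattern for projection plus residual add. A rough sketch of what that buys, with made-up shapes and random tensors rather than the commit's actual module code:

import torch

num_tokens, hidden_size, vocab_size = 8, 4096, 32000
hidden_states = torch.randn(num_tokens, hidden_size)
residual = torch.randn(num_tokens, hidden_size)

# Weight kept pre-transposed as [hidden_size, vocab_size]: no transpose in the hot loop.
lm_head_weight_t = torch.randn(hidden_size, vocab_size)
logits = torch.mm(hidden_states, lm_head_weight_t)

# Fused linear + residual add: torch.addmm computes residual + hidden_states @ W in one call.
o_proj_weight_t = torch.randn(hidden_size, hidden_size)
attn_output = torch.addmm(residual, hidden_states, o_proj_weight_t)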

colossalai/inference/modeling/policy/padding_llama.py

Lines changed: 10 additions & 125 deletions
@@ -1,18 +1,10 @@
 from functools import partial
 
 import torch
-from transformers.models.llama.modeling_llama import (
-    LlamaAttention,
-    LlamaDecoderLayer,
-    LlamaFlashAttention2,
-    LlamaForCausalLM,
-    LlamaModel,
-    LlamaRMSNorm,
-    LlamaSdpaAttention,
-)
+from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaForCausalLM, LlamaModel, LlamaRMSNorm
 
 from colossalai.inference.modeling.models.padding_llama import (
-    llama_attn_forward,
+    PadLlamaAttention,
     llama_causal_lm_forward,
     llama_decoder_layer_forward,
     llama_model_forward,

@@ -49,105 +41,16 @@ def __init__(self) -> None:
 
     def module_policy(self):
         policy = super().module_policy()
-        decoder_attribute_replacement = {
-            "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
-            "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
-            "self_attn.num_key_value_heads": self.model.config.num_key_value_heads
-            // self.shard_config.tensor_parallel_size,
-        }
-        if self.shard_config.extra_kwargs.get("quant", None) == "gptq":
-            from colossalai.inference.quant.gptq.cai_gptq import ColCaiQuantLinear, RowCaiQuantLinear
-
-            policy[LlamaDecoderLayer] = ModulePolicyDescription(
-                attribute_replacement=decoder_attribute_replacement,
-                sub_module_replacement=[
-                    SubModuleReplacementDescription(
-                        suffix="self_attn.q_proj",
-                        target_module=ColCaiQuantLinear,
-                        kwargs={"split_num": 1},
-                    ),
-                    SubModuleReplacementDescription(
-                        suffix="self_attn.k_proj",
-                        target_module=ColCaiQuantLinear,
-                        kwargs={"split_num": 1},
-                    ),
-                    SubModuleReplacementDescription(
-                        suffix="self_attn.v_proj",
-                        target_module=ColCaiQuantLinear,
-                        kwargs={"split_num": 1},
-                    ),
-                    SubModuleReplacementDescription(
-                        suffix="self_attn.o_proj",
-                        target_module=RowCaiQuantLinear,
-                        kwargs={"split_num": 1},
-                    ),
-                    SubModuleReplacementDescription(
-                        suffix="mlp.gate_proj",
-                        target_module=ColCaiQuantLinear,
-                        kwargs={"split_num": 1},
-                    ),
-                    SubModuleReplacementDescription(
-                        suffix="mlp.up_proj",
-                        target_module=ColCaiQuantLinear,
-                        kwargs={"split_num": 1},
-                    ),
-                    SubModuleReplacementDescription(
-                        suffix="mlp.down_proj",
-                        target_module=RowCaiQuantLinear,
-                        kwargs={"split_num": 1},
-                    ),
-                ],
-            )
 
-        elif self.shard_config.extra_kwargs.get("quant", None) == "smoothquant":
-            from colossalai.inference.quant.smoothquant.models.llama import LlamaSmoothquantDecoderLayer
-            from colossalai.inference.quant.smoothquant.models.parallel_linear import (
-                ColW8A8BFP32OFP32Linear,
-                RowW8A8B8O8Linear,
-                RowW8A8BFP32O32LinearSiLU,
-                RowW8A8BFP32OFP32Linear,
-            )
+        policy[LlamaDecoderLayer] = ModulePolicyDescription(
+            sub_module_replacement=[
+                SubModuleReplacementDescription(
+                    suffix="self_attn",
+                    target_module=PadLlamaAttention,
+                ),
+            ]
+        )
 
-            policy[LlamaSmoothquantDecoderLayer] = ModulePolicyDescription(
-                attribute_replacement=decoder_attribute_replacement,
-                sub_module_replacement=[
-                    SubModuleReplacementDescription(
-                        suffix="self_attn.q_proj",
-                        target_module=RowW8A8B8O8Linear,
-                        kwargs={"split_num": 1},
-                    ),
-                    SubModuleReplacementDescription(
-                        suffix="self_attn.k_proj",
-                        target_module=RowW8A8B8O8Linear,
-                        kwargs={"split_num": 1},
-                    ),
-                    SubModuleReplacementDescription(
-                        suffix="self_attn.v_proj",
-                        target_module=RowW8A8B8O8Linear,
-                        kwargs={"split_num": 1},
-                    ),
-                    SubModuleReplacementDescription(
-                        suffix="self_attn.o_proj",
-                        target_module=ColW8A8BFP32OFP32Linear,
-                        kwargs={"split_num": 1},
-                    ),
-                    SubModuleReplacementDescription(
-                        suffix="mlp.gate_proj",
-                        target_module=RowW8A8BFP32O32LinearSiLU,
-                        kwargs={"split_num": 1},
-                    ),
-                    SubModuleReplacementDescription(
-                        suffix="mlp.up_proj",
-                        target_module=RowW8A8BFP32OFP32Linear,
-                        kwargs={"split_num": 1},
-                    ),
-                    SubModuleReplacementDescription(
-                        suffix="mlp.down_proj",
-                        target_module=ColW8A8BFP32OFP32Linear,
-                        kwargs={"split_num": 1},
-                    ),
-                ],
-            )
         self.shard_config._infer()
 
         infer_forward = llama_causal_lm_forward

@@ -166,24 +69,6 @@ def module_policy(self):
             description=method_replacement, policy=policy, target_key=LlamaDecoderLayer
         )
 
-        infer_forward = llama_attn_forward
-        method_replacement = {"forward": partial(infer_forward)}
-        self.append_or_create_method_replacement(
-            description=method_replacement, policy=policy, target_key=LlamaAttention
-        )
-
-        infer_forward = llama_attn_forward
-        method_replacement = {"forward": partial(infer_forward)}
-        self.append_or_create_method_replacement(
-            description=method_replacement, policy=policy, target_key=LlamaFlashAttention2
-        )
-
-        infer_forward = llama_attn_forward
-        method_replacement = {"forward": partial(infer_forward)}
-        self.append_or_create_method_replacement(
-            description=method_replacement, policy=policy, target_key=LlamaSdpaAttention
-        )
-
         infer_forward = None
         if HAS_TRITON_RMSNORM:
             infer_forward = get_triton_rmsnorm_forward()

colossalai/kernel/triton/flash_decoding.py

Lines changed: 4 additions & 6 deletions
@@ -143,8 +143,7 @@ def _flash_decoding_fwd_reduce_kernel(
     stride_o_lset,
     stride_o_lseh,
     stride_o_lseb,
-    stride_ob,
-    stride_ol,
+    stride_ot,
     stride_oh,
     stride_od,
     BLOCK_KV: tl.constexpr,

@@ -180,7 +179,7 @@ def _flash_decoding_fwd_reduce_kernel(
     m_i = m_ij
 
     acc = acc / l
-    offsets_O = cur_seq_idx * stride_ob + cur_head_idx * stride_oh + offsets_dmodel
+    offsets_O = cur_seq_idx * stride_ot + cur_head_idx * stride_oh + offsets_dmodel
     tl.store(O + offsets_O, acc.to(O.type.element_ty))
     return
 

@@ -212,7 +211,7 @@ def flash_decoding_attention(
            records the (kv) sequence lengths incorporating past kv sequence lengths.
        block_tables (torch.Tensor): [batch_size, max_blocks_per_sequence]
        max_seq_len_in_batch (int): Maximum sequence length in the batch.
-       output (torch.Tensor): [bsz, 1, num_heads, head_dim]
+       output (torch.Tensor): [bsz, num_heads, head_dim]
        mid_output (torch.Tensor): [ max_bsz , num_heads, kv_max_split_num, head_dim]
            Intermediate output tensor. `max_bsz` should be greater than or equal to `bsz`.
        mid_output_lse (torch.Tensor): [ max_bsz , num_heads, kv_max_split_num]

@@ -294,7 +293,7 @@ def flash_decoding_attention(
         HEAD_DIM=head_dim,
     )
 
-    output = torch.empty((bsz, 1, num_heads, head_dim), dtype=q.dtype, device=q.device) if output is None else output
+    output = torch.empty((bsz, num_heads, head_dim), dtype=q.dtype, device=q.device) if output is None else output
 
     grid = (triton.next_power_of_2(bsz), num_heads)
 

@@ -314,7 +313,6 @@ def flash_decoding_attention(
         output.stride(0),
         output.stride(1),
         output.stride(2),
-        output.stride(3),
         BLOCK_KV=block_size,
         HEAD_DIM=head_dim,
     )
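With the trailing singleton dimension dropped, flash_decoding_attention now writes a 3-D output: decoding produces one token per sequence, so the output is [bsz, num_heads, head_dim] and the reduce kernel needs only three output strides (stride_ot, stride_oh, stride_od). A caller-side sketch of the new pre-allocation contract, with illustrative shapes and assuming a CUDA device is available:

import torch

bsz, num_heads, head_dim = 16, 32, 128

# Previously: torch.empty((bsz, 1, num_heads, head_dim), ...) followed by a squeeze(1).
# Now the buffer is allocated flat and passed straight to flash_decoding_attention.
output = torch.empty((bsz, num_heads, head_dim), dtype=torch.float16, device="cuda")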

examples/inference/run_benchmark.sh

Lines changed: 12 additions & 2 deletions
@@ -25,10 +25,20 @@ CUDA_VISIBLE_DEVICES_set_n_least_memory_usage 1
 # benchmark llama2-7b one single GPU
 
 for bsz in 16 32 64; do
-    python3 ${PY_SCRIPT} -m llama2-7b --tp_size 1 --pp_size 1 -b $bsz -s 512 --output_len 256 --mode ${mode} | tee logs/${mode}_${GPU}_${bsz}_256.txt
+    python3 ${PY_SCRIPT} -m llama2-7b --tp_size 1 --pp_size 1 -b $bsz -s 512 --output_len 256 --mode ${mode} | tee logs/${mode}_${GPU}_${bsz}_512_256.txt
 done
 
 
 for bsz in 16 32 64; do
-    python3 ${PY_SCRIPT} -m llama2-7b --tp_size 1 --pp_size 1 -b $bsz -s 1024 --output_len 256 --mode ${mode} | tee logs/${mode}_${GPU}_${bsz}_1024.txt
+    python3 ${PY_SCRIPT} -m llama2-7b --tp_size 1 --pp_size 1 -b $bsz -s 1024 --output_len 256 --mode ${mode} | tee logs/${mode}_${GPU}_${bsz}_1024_256.txt
+done
+
+
+for bsz in 16 32 64; do
+    python3 ${PY_SCRIPT} -m llama2-7b --tp_size 1 --pp_size 1 -b $bsz -s 256 --output_len 128 --mode ${mode} | tee logs/${mode}_${GPU}_${bsz}_256_128.txt
+done
+
+
+for bsz in 16 32 64; do
+    python3 ${PY_SCRIPT} -m llama2-7b --tp_size 1 --pp_size 1 -b $bsz -s 1024 --output_len 128 --mode ${mode} | tee logs/${mode}_${GPU}_${bsz}_1024_128.txt
 done

tests/test_infer_ops/triton/kernel_utils.py

Lines changed: 1 addition & 0 deletions
@@ -69,6 +69,7 @@ def torch_attn_ref(
         f"`attn_output` should be of size {(bsz, num_heads, seq_len, head_dim)}, but is" f" {out.size()}"
     )
     out = out.transpose(1, 2).contiguous()
+    out = out.squeeze(1)
     return out
 
 

tests/test_infer_ops/triton/test_decoding_attn.py

Lines changed: 2 additions & 2 deletions
@@ -94,7 +94,7 @@ def test_flash_decoding(
     max_seq_len_in_b = kv_seq_lengths.max().item()
     # The maximum block length splitted on kv should be the kv cache block size
     kv_max_split_num = (max_seq_len_in_b + block_size - 1) // block_size
-    output = torch.empty((bsz, 1, num_attn_heads, HEAD_DIM), dtype=q.dtype, device=q.device)
+    output = torch.empty((bsz, num_attn_heads, HEAD_DIM), dtype=q.dtype, device=q.device)
     mid_output = torch.empty(
         size=(bsz, num_attn_heads, kv_max_split_num, HEAD_DIM), dtype=torch.float32, device=q.device
     )

@@ -189,7 +189,7 @@ def bench_kernel(
     block_tables = block_tables.to(device=device)
     # the maximum block length splitted on kv should be the kv cache block size
     kv_max_split_num = (max_seq_len_in_b + block_size - 1) // block_size
-    output = torch.empty((bsz, 1, num_attn_heads, HEAD_DIM), dtype=dtype, device=device)
+    output = torch.empty((bsz, num_attn_heads, HEAD_DIM), dtype=dtype, device=device)
     mid_output = torch.empty(
         size=(bsz, num_attn_heads, kv_max_split_num, HEAD_DIM), dtype=torch.float32, device=q.device
     )
