
Commit d4c1684

add npu optimize and apply wan vae dp

1 parent dd498c5
File tree

11 files changed: +312 -4 lines changed

examples/parallelism/run.sh

Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
export HCCL_OP_EXPANSION_MODE="AIV"
export TASK_QUEUE_ENABLE=2
export CPU_AFFINITY_CONF=2

export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libjemalloc.so.2:$LD_PRELOAD


FLUX_DIR=/home/weights/FLUX.1-dev/ torchrun --nproc_per_node=1 run_flux_cp_npu.py --attn "_native_npu" --height 1024 --width 1024

# WAN_2_2_DIR=/home/weights/Wan2.1-T2V-14B-Diffusers/ torchrun --nproc_per_node=8 run_wan_cp_npu.py --attn "_native_npu" --height 1024 --width 1024 --steps 10 --parallel ulysses --vae-dp
examples/parallelism/run_flux_cp_npu.py

Lines changed: 86 additions & 0 deletions

@@ -0,0 +1,86 @@
import os
import sys

sys.path.append("..")

import time
import torch
import torch_npu
from torch_npu.contrib import transfer_to_npu

from diffusers import (
    FluxPipeline,
    FluxTransformer2DModel,
    PipelineQuantizationConfig,
)
from utils import (
    get_args,
    strify,
    cachify,
    maybe_init_distributed,
    maybe_destroy_distributed,
)
import cache_dit
from cache_dit.npu_optim import npu_optimize


npu_optimize([
    "npu_fast_gelu",
    "npu_rms_norm",
    "npu_layer_norm_eval",
    "npu_rotary_mul",
])

args = get_args()
print(args)

rank, device = maybe_init_distributed(args)

pipe: FluxPipeline = FluxPipeline.from_pretrained(
    os.environ.get(
        "FLUX_DIR",
        "black-forest-labs/FLUX.1-dev",
    ),
    torch_dtype=torch.bfloat16,
).to("cuda")  # transfer_to_npu redirects CUDA calls to the NPU

if args.cache or args.parallel_type is not None:
    cachify(args, pipe)

assert isinstance(pipe.transformer, FluxTransformer2DModel)

pipe.set_progress_bar_config(disable=rank != 0)


def run_pipe(pipe: FluxPipeline):
    image = pipe(
        "A cat holding a sign that says hello world",
        height=1024 if args.height is None else args.height,
        width=1024 if args.width is None else args.width,
        num_inference_steps=28 if args.steps is None else args.steps,
        generator=torch.Generator("cpu").manual_seed(0),
    ).images[0]
    return image


if args.compile:
    cache_dit.set_compile_configs()
    pipe.transformer = torch.compile(pipe.transformer)

# warmup
_ = run_pipe(pipe)

start = time.time()
image = run_pipe(pipe)
end = time.time()

if rank == 0:
    cache_dit.summary(pipe)

    time_cost = end - start
    save_path = f"flux.{strify(args, pipe)}.png"
    print(f"Time cost: {time_cost:.2f}s")
    print(f"Saving image to {save_path}")
    image.save(save_path)

maybe_destroy_distributed()

examples/parallelism/run_wan_cp_npu.py

Lines changed: 14 additions & 4 deletions

@@ -20,6 +20,7 @@
 )

 import cache_dit
+from cache_dit.npu_optim import npu_optimize


 def run_pipe(args, pipe, warmup: bool = False):
@@ -73,12 +74,15 @@ def main():
     else:
         pipe.to(device)

+    if args.vae_dp:
+        pipe.vae.enable_dp(world_size=8, hw_splits=(2, 4))  # , overlap_ratio=0.01, overlap_pixels=64)
+
     if args.vae_tiling:
         pipe.vae.enable_tiling(
-            tile_sample_min_height=int(args.height / 2 * 3),
-            tile_sample_min_width=int(args.width / 2 * 3),
-            tile_sample_stride_height=int(args.height / 2),
-            tile_sample_stride_width=int(args.width / 2),
+            # tile_sample_min_height=int(args.height / 2 * 3),
+            # tile_sample_min_width=int(args.width / 2 * 3),
+            # tile_sample_stride_height=int(args.height / 2),
+            # tile_sample_stride_width=int(args.width / 2),
         )

     assert isinstance(pipe.transformer, WanTransformer3DModel)
@@ -105,4 +109,10 @@ def main():


 if __name__ == "__main__":
+    npu_optimize([
+        "npu_fast_gelu",
+        "npu_rms_norm",
+        "npu_layer_norm_eval",
+        "npu_rotary_mul",
+    ])
     main()
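The new --vae-dp flag splits VAE decoding across all eight ranks: hw_splits=(2, 4) suggests the latent's spatial plane is divided into a 2x4 grid so that each rank decodes one tile, with the commented-out overlap_ratio / overlap_pixels arguments hinting at optional halo regions to hide tile seams. enable_dp itself is not part of this diff; the snippet below is only a hypothetical illustration of that tiling idea (helper name and logic invented for the example), not cache-dit's implementation.

import torch

def split_for_rank(latents: torch.Tensor, rank: int, hw_splits=(2, 4)) -> torch.Tensor:
    # Hypothetical helper, NOT the cache-dit API: picks the spatial tile that
    # rank `rank` would decode when the H/W plane is cut into a rows x cols grid.
    rows, cols = hw_splits
    r, c = divmod(rank, cols)                 # rank 0..rows*cols-1 -> (row, col)
    h = latents.shape[-2] // rows
    w = latents.shape[-1] // cols
    return latents[..., r * h : (r + 1) * h, c * w : (c + 1) * w]

# Each of the 8 ranks would decode its own tile; the decoded tiles are then gathered
# and stitched back into full frames (overlap_* would add halo pixels to hide seams).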

examples/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ def get_args(
8585
)
8686
parser.add_argument("--perf", action="store_true", default=False)
8787
parser.add_argument("--vae-tiling", action="store_true", default=False)
88+
parser.add_argument("--vae-dp", action="store_true", default=False)
8889
parser.add_argument("--cpu-offload", action="store_true", default=False)
8990
return parser.parse_args() if parse else parser
9091

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
from .utils import npu_optimize
Lines changed: 12 additions & 0 deletions

@@ -0,0 +1,12 @@
from .npu_fast_gelu import replace_npu_fast_gelu
from .npu_rms_norm import replace_npu_rms_norm
from .npu_layer_norm_eval import replace_npu_layer_norm_eval
from .npu_rotary_mul import replace_npu_rotary_mul


NPU_OPTIM_MAP = {
    "npu_fast_gelu": replace_npu_fast_gelu,
    "npu_rms_norm": replace_npu_rms_norm,
    "npu_layer_norm_eval": replace_npu_layer_norm_eval,
    "npu_rotary_mul": replace_npu_rotary_mul,
}
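The npu_optimize entry point re-exported by the package __init__ lives in a utils module that is not shown in this commit view. Given the map above and the npu_optimize([...]) calls in the example scripts, a plausible reading is a simple lookup-and-dispatch loop; the body below is a sketch under that assumption, not the actual utils code.

from typing import List

def npu_optimize(optimizations: List[str]) -> None:
    # Sketch only: assumes NPU_OPTIM_MAP (defined above) is importable here.
    for name in optimizations:
        if name not in NPU_OPTIM_MAP:
            raise ValueError(
                f"Unknown NPU optimization '{name}', expected one of {list(NPU_OPTIM_MAP)}"
            )
        # Each entry monkey-patches one op with its fused torch_npu kernel and logs the swap.
        NPU_OPTIM_MAP[name]()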
Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
import torch
import torch_npu
import torch.nn as nn

from diffusers.models.activations import GELU as GeluDiffuser

from ..utils import log_replace_info


class NpuFastGelu(nn.GELU):
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return torch_npu.npu_fast_gelu(input)


class NpuFastGeluDiffuser(GeluDiffuser):
    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
        return torch_npu.npu_fast_gelu(gate)


def replace_func():
    from diffusers.models import activations
    activations.GELU = NpuFastGeluDiffuser

    from torch import nn
    nn.GELU = NpuFastGelu


def replace_npu_fast_gelu():
    replace_func()
    log_replace_info("nn.GELU and GELU of Diffusers", "npu_fast_gelu")
Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@

import torch
import torch_npu

import torch.nn as nn

from ..utils import log_replace_info


class NpuLayerNorm(nn.LayerNorm):
    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        return torch_npu.npu_layer_norm_eval(
            inputs,
            normalized_shape=self.normalized_shape,
            weight=self.weight,
            bias=self.bias,
            eps=self.eps,
        )


def replace_func():
    # from torch import nn
    # nn.LayerNorm = NpuLayerNorm

    from diffusers.models import normalization
    normalization.FP32LayerNorm = NpuLayerNorm


def replace_npu_layer_norm_eval():
    replace_func()
    log_replace_info("FP32LayerNorm of Diffusers", "npu_layer_norm_eval")
Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
import torch
import torch_npu
import torch.nn as nn

from ..utils import log_replace_info


class NpuRMSNorm(nn.RMSNorm):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch_npu.npu_rms_norm(x, self.weight, self.eps)[0]


def replace_func():
    from torch import nn
    nn.RMSNorm = NpuRMSNorm


def replace_npu_rms_norm():
    replace_func()
    log_replace_info("nn.RMSNorm", "npu_rms_norm")
Lines changed: 73 additions & 0 deletions

@@ -0,0 +1,73 @@
from typing import Tuple, Union

import torch
import torch_npu

from ..utils import log_replace_info


def npu_apply_rotary_emb(
    x: torch.Tensor,
    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
    use_real: bool = True,
    use_real_unbind_dim: int = -1,
    sequence_dim: int = 2,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
    to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
    reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
    tensors contain rotary embeddings and are returned as real tensors.

    Args:
        x (`torch.Tensor`):
            Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
        freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
    """
    if use_real:
        cos, sin = freqs_cis  # [S, D]
        if sequence_dim == 2:
            cos = cos[None, None, :, :]
            sin = sin[None, None, :, :]
        elif sequence_dim == 1:
            cos = cos[None, :, None, :]
            sin = sin[None, :, None, :]
        else:
            raise ValueError(f"`sequence_dim={sequence_dim}` but should be 1 or 2.")

        cos, sin = cos.to(x.device), sin.to(x.device)

        if use_real_unbind_dim == -1:
            # Used for flux, cogvideox, hunyuan-dit
            rotary_mode = "interleave"
        elif use_real_unbind_dim == -2:
            # Used for Stable Audio, OmniGen, CogView4 and Cosmos
            rotary_mode = "half"
        else:
            raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
        out = torch_npu.npu_rotary_mul(x, cos, sin, rotary_mode=rotary_mode).to(x.dtype)

        return out
    else:
        # used for lumina
        x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
        freqs_cis = freqs_cis.unsqueeze(2)
        x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)

        return x_out.type_as(x)


def replace_func():
    from diffusers.models import embeddings
    from diffusers.models.transformers import transformer_flux

    embeddings.apply_rotary_emb = npu_apply_rotary_emb
    transformer_flux.apply_rotary_emb = npu_apply_rotary_emb


def replace_npu_rotary_mul():
    replace_func()
    log_replace_info("apply_rotary_emb", "npu_rotary_mul")
