
Commit 3e65b06

sdxl sd vae support attn impl (#52)
* sdxl sd vae support attn impl
* ruff format
1 parent 1280c9f commit 3e65b06

9 files changed: +126 -55 lines changed


diffsynth_engine/models/basic/transformer_helper.py

Lines changed: 21 additions & 9 deletions
@@ -65,17 +65,29 @@ def forward(self, ids):
 
 
 class RMSNorm(nn.Module):
-    def __init__(self, dim, eps, device: str, dtype: torch.dtype):
+    def __init__(
+        self,
+        dim,
+        eps=1e-5,
+        elementwise_affine=True,
+        device: str = "cuda:0",
+        dtype: torch.dtype = torch.bfloat16,
+    ):
         super().__init__()
-        self.weight = nn.Parameter(torch.ones((dim,), device=device, dtype=dtype))
         self.eps = eps
-
-    def forward(self, hidden_states):
-        input_dtype = hidden_states.dtype
-        variance = hidden_states.to(torch.float32).square().mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.eps)
-        hidden_states = hidden_states.to(input_dtype) * self.weight
-        return hidden_states
+        self.dim = dim
+        self.elementwise_affine = elementwise_affine
+        if elementwise_affine:
+            self.weight = nn.Parameter(torch.ones(dim, device=device, dtype=dtype))
+
+    def norm(self, x):
+        return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
+
+    def forward(self, x):
+        norm_result = self.norm(x.float()).to(x.dtype)
+        if self.elementwise_affine:
+            return norm_result * self.weight
+        return norm_result
 
 
 class NewGELUActivation(nn.Module):

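For context, a minimal usage sketch of the updated RMSNorm (not part of the commit): the new elementwise_affine flag is the behavioral change; CPU/float32 are chosen here purely for illustration, while the diff's own defaults are cuda:0/bfloat16.

import torch
from diffsynth_engine.models.basic.transformer_helper import RMSNorm

# With elementwise_affine=True (the default) a learnable weight is registered;
# with False only the RMS rescaling is applied and no parameter is created.
x = torch.randn(2, 16, 64)
affine = RMSNorm(64, eps=1e-5, elementwise_affine=True, device="cpu", dtype=torch.float32)
plain = RMSNorm(64, eps=1e-5, elementwise_affine=False, device="cpu", dtype=torch.float32)
print(affine(x).shape, plain(x).shape)  # torch.Size([2, 16, 64]) for both
print(hasattr(plain, "weight"))         # False: no weight when affine is off
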
diffsynth_engine/models/components/vae.py

Lines changed: 15 additions & 2 deletions
@@ -67,6 +67,7 @@ def __init__(
         num_layers=1,
         norm_num_groups=32,
         eps=1e-5,
+        attn_impl: str = "auto",
         device: str = "cuda:0",
         dtype: torch.dtype = torch.float32,
     ):
@@ -86,6 +87,7 @@ def __init__(
             bias_q=True,
             bias_kv=True,
             bias_out=True,
+            attn_impl=attn_impl,
             device=device,
             dtype=dtype,
         )
@@ -119,6 +121,7 @@ def __init__(
         scaling_factor: float = 0.18215,
         shift_factor: float = 0,
         use_post_quant_conv: bool = True,
+        attn_impl: str = "auto",
         device: str = "cuda:0",
         dtype: torch.dtype = torch.float32,
     ):
@@ -137,7 +140,7 @@ def __init__(
             [
                 # UNetMidBlock2D
                 ResnetBlock(512, 512, eps=1e-6, device=device, dtype=dtype),
-                VAEAttentionBlock(1, 512, 512, 1, eps=1e-6, device=device, dtype=dtype),
+                VAEAttentionBlock(1, 512, 512, 1, eps=1e-6, device=device, dtype=dtype, attn_impl=attn_impl),
                 ResnetBlock(512, 512, eps=1e-6, device=device, dtype=dtype),
                 # UpDecoderBlock2D
                 ResnetBlock(512, 512, eps=1e-6, device=device, dtype=dtype),
@@ -202,6 +205,7 @@ def from_state_dict(
         scaling_factor: float = 0.18215,
         shift_factor: float = 0,
         use_post_quant_conv: bool = True,
+        attn_impl: str = "auto",
     ):
         with no_init_weights():
             model = torch.nn.utils.skip_init(
@@ -210,6 +214,7 @@ def from_state_dict(
                 scaling_factor=scaling_factor,
                 shift_factor=shift_factor,
                 use_post_quant_conv=use_post_quant_conv,
+                attn_impl=attn_impl,
                 device=device,
                 dtype=dtype,
             )
@@ -230,6 +235,7 @@ def __init__(
         scaling_factor: float = 0.18215,
         shift_factor: float = 0,
         use_quant_conv: bool = True,
+        attn_impl: str = "auto",
         device: str = "cuda:0",
         dtype: torch.dtype = torch.float32,
     ):
@@ -263,7 +269,7 @@ def __init__(
                 ResnetBlock(512, 512, eps=1e-6, device=device, dtype=dtype),
                 # UNetMidBlock2D
                 ResnetBlock(512, 512, eps=1e-6, device=device, dtype=dtype),
-                VAEAttentionBlock(1, 512, 512, 1, eps=1e-6, device=device, dtype=dtype),
+                VAEAttentionBlock(1, 512, 512, 1, eps=1e-6, device=device, dtype=dtype, attn_impl=attn_impl),
                 ResnetBlock(512, 512, eps=1e-6, device=device, dtype=dtype),
             ]
         )
@@ -309,6 +315,7 @@ def from_state_dict(
         scaling_factor: float = 0.18215,
         shift_factor: float = 0,
         use_quant_conv: bool = True,
+        attn_impl: str = "auto",
     ):
         with no_init_weights():
             model = torch.nn.utils.skip_init(
@@ -317,6 +324,7 @@ def from_state_dict(
                 scaling_factor=scaling_factor,
                 shift_factor=shift_factor,
                 use_quant_conv=use_quant_conv,
+                attn_impl=attn_impl,
                 device=device,
                 dtype=dtype,
             )
@@ -338,6 +346,7 @@ def __init__(
         shift_factor: float = 0,
         use_quant_conv: bool = True,
         use_post_quant_conv: bool = True,
+        attn_impl: str = "auto",
         device: str = "cuda:0",
         dtype: torch.dtype = torch.float32,
     ):
@@ -347,6 +356,7 @@ def __init__(
             scaling_factor=scaling_factor,
             shift_factor=shift_factor,
             use_quant_conv=use_quant_conv,
+            attn_impl=attn_impl,
             device=device,
             dtype=dtype,
         )
@@ -355,6 +365,7 @@ def __init__(
             scaling_factor=scaling_factor,
             shift_factor=shift_factor,
             use_post_quant_conv=use_post_quant_conv,
+            attn_impl=attn_impl,
             device=device,
             dtype=dtype,
         )
@@ -376,6 +387,7 @@ def from_state_dict(
         shift_factor: float = 0,
         use_quant_conv: bool = True,
         use_post_quant_conv: bool = True,
+        attn_impl: str = "auto",
     ):
         with no_init_weights():
             model = torch.nn.utils.skip_init(
@@ -385,6 +397,7 @@ def from_state_dict(
                 shift_factor=shift_factor,
                 use_quant_conv=use_quant_conv,
                 use_post_quant_conv=use_post_quant_conv,
+                attn_impl=attn_impl,
                 device=device,
                 dtype=dtype,
             )

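As a rough sketch of what the plumbing above enables (not part of the commit), the mid-block attention layer can now be built with an explicit attention implementation. The positional arguments mirror the constructor call in the diff, "auto" is the only value this diff confirms, and CPU is used here only for illustration.

import torch
from diffsynth_engine.models.components.vae import VAEAttentionBlock

# Same call as in the encoder/decoder mid-block above, with attn_impl appended.
block = VAEAttentionBlock(1, 512, 512, 1, eps=1e-6, device="cpu", dtype=torch.float32, attn_impl="auto")
print(sum(p.numel() for p in block.parameters()))  # parameter count of the attention block
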
diffsynth_engine/models/flux/flux_dit.py

Lines changed: 11 additions & 4 deletions
@@ -227,7 +227,7 @@ def __init__(
             nn.Linear(dim * 4, dim, device=device, dtype=dtype),
         )
 
-    def forward(self, hidden_states_a, hidden_states_b, temb, image_rotary_emb):
+    def forward(self, hidden_states_a, hidden_states_b, temb, image_rotary_emb, image_emb):
         norm_hidden_states_a, gate_msa_a, shift_mlp_a, scale_mlp_a, gate_mlp_a = self.norm1_a(hidden_states_a, emb=temb)
         norm_hidden_states_b, gate_msa_b, shift_mlp_b, scale_mlp_b, gate_mlp_b = self.norm1_b(hidden_states_b, emb=temb)
 
@@ -293,7 +293,7 @@ def process_attention(self, hidden_states, image_rotary_emb):
         hidden_states = hidden_states.to(q.dtype)
         return hidden_states
 
-    def forward(self, hidden_states_a, hidden_states_b, temb, image_rotary_emb):
+    def forward(self, hidden_states_a, hidden_states_b, temb, image_rotary_emb, image_emb):
         residual = hidden_states_a
         norm_hidden_states, gate = self.norm(hidden_states_a, emb=temb)
         hidden_states_a = self.to_qkv_mlp(norm_hidden_states)
@@ -386,6 +386,7 @@ def forward(
         timestep,
         prompt_emb,
         pooled_prompt_emb,
+        image_emb,
         guidance,
         text_ids,
         image_ids=None,
@@ -421,10 +422,13 @@ def forward(
                     prompt_emb,
                     conditioning,
                     image_rotary_emb,
+                    image_emb,
                     use_reentrant=False,
                 )
             else:
-                hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning, image_rotary_emb)
+                hidden_states, prompt_emb = block(
+                    hidden_states, prompt_emb, conditioning, image_rotary_emb, image_emb
+                )
             if controlnet_double_block_output is not None:
                 interval_control = len(self.blocks) / len(controlnet_double_block_output)
                 interval_control = int(np.ceil(interval_control))
@@ -439,10 +443,13 @@ def forward(
                     prompt_emb,
                     conditioning,
                     image_rotary_emb,
+                    image_emb,
                     use_reentrant=False,
                 )
             else:
-                hidden_states, prompt_emb = block(hidden_states, prompt_emb, conditioning, image_rotary_emb)
+                hidden_states, prompt_emb = block(
+                    hidden_states, prompt_emb, conditioning, image_rotary_emb, image_emb
+                )
             if controlnet_single_block_output is not None:
                 interval_control = len(self.single_blocks) / len(controlnet_double_block_output)
                 interval_control = int(np.ceil(interval_control))

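The diff only threads image_emb through the block calls; how the blocks consume it is not shown here. A hedged sketch of the call pattern itself (run_block and use_gradient_checkpointing are illustrative names, not part of the repo): the same extra argument must be passed on both the checkpointed and the plain path, otherwise the two branches diverge.

import torch
import torch.utils.checkpoint

def run_block(block, hidden_states, prompt_emb, conditioning, image_rotary_emb, image_emb,
              use_gradient_checkpointing=False):
    # Checkpointed path: positional args are forwarded to the block, image_emb included.
    if use_gradient_checkpointing and torch.is_grad_enabled():
        return torch.utils.checkpoint.checkpoint(
            block,
            hidden_states,
            prompt_emb,
            conditioning,
            image_rotary_emb,
            image_emb,
            use_reentrant=False,
        )
    # Plain path: identical argument list, matching the updated forward signatures.
    return block(hidden_states, prompt_emb, conditioning, image_rotary_emb, image_emb)
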
diffsynth_engine/models/sd/sd_vae.py

Lines changed: 18 additions & 7 deletions
@@ -6,33 +6,44 @@
 
 
 class SDVAEEncoder(VAEEncoder):
-    def __init__(self, device: str = "cuda:0", dtype: torch.dtype = torch.float32):
+    def __init__(self, attn_impl: str = "auto", device: str = "cuda:0", dtype: torch.dtype = torch.float32):
         super().__init__(
-            latent_channels=4, scaling_factor=0.18215, shift_factor=0, use_quant_conv=True, device=device, dtype=dtype
+            latent_channels=4,
+            scaling_factor=0.18215,
+            shift_factor=0,
+            use_quant_conv=True,
+            attn_impl=attn_impl,
+            device=device,
+            dtype=dtype,
         )
 
     @classmethod
-    def from_state_dict(cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype):
+    def from_state_dict(
+        cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype, attn_impl: str = "auto"
+    ):
         with no_init_weights():
-            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype)
+            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype, attn_impl=attn_impl)
         model.load_state_dict(state_dict)
         return model
 
 
 class SDVAEDecoder(VAEDecoder):
-    def __init__(self, device: str = "cuda:0", dtype: torch.dtype = torch.float32):
+    def __init__(self, attn_impl: str = "auto", device: str = "cuda:0", dtype: torch.dtype = torch.float32):
         super().__init__(
             latent_channels=4,
             scaling_factor=0.18215,
             shift_factor=0,
             use_post_quant_conv=True,
+            attn_impl=attn_impl,
             device=device,
             dtype=dtype,
         )
 
     @classmethod
-    def from_state_dict(cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype):
+    def from_state_dict(
+        cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype, attn_impl: str = "auto"
+    ):
         with no_init_weights():
-            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype)
+            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype, attn_impl=attn_impl)
         model.load_state_dict(state_dict)
         return model

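A hedged construction sketch for the SD classes (not part of the commit): attn_impl is now the first keyword and defaults to "auto"; the repo's own defaults are cuda:0/float32, and device="cpu" here is just to keep the example hardware-neutral.

import torch
from diffsynth_engine.models.sd.sd_vae import SDVAEDecoder, SDVAEEncoder

# Both classes forward attn_impl to VAEEncoder/VAEDecoder, which in turn pass it
# to every VAEAttentionBlock in the mid-block.
encoder = SDVAEEncoder(attn_impl="auto", device="cpu", dtype=torch.float32)
decoder = SDVAEDecoder(attn_impl="auto", device="cpu", dtype=torch.float32)
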
diffsynth_engine/models/sdxl/sdxl_vae.py

Lines changed: 18 additions & 7 deletions
@@ -6,33 +6,44 @@
 
 
 class SDXLVAEEncoder(VAEEncoder):
-    def __init__(self, device: str = "cuda:0", dtype: torch.dtype = torch.float32):
+    def __init__(self, attn_impl: str = "auto", device: str = "cuda:0", dtype: torch.dtype = torch.float32):
         super().__init__(
-            latent_channels=4, scaling_factor=0.13025, shift_factor=0, use_quant_conv=True, device=device, dtype=dtype
+            latent_channels=4,
+            scaling_factor=0.13025,
+            shift_factor=0,
+            use_quant_conv=True,
+            attn_impl=attn_impl,
+            device=device,
+            dtype=dtype,
         )
 
     @classmethod
-    def from_state_dict(cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype):
+    def from_state_dict(
+        cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype, attn_impl: str = "auto"
+    ):
         with no_init_weights():
-            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype)
+            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype, attn_impl=attn_impl)
         model.load_state_dict(state_dict)
         return model
 
 
 class SDXLVAEDecoder(VAEDecoder):
-    def __init__(self, device: str = "cuda:0", dtype: torch.dtype = torch.float32):
+    def __init__(self, attn_impl: str = "auto", device: str = "cuda:0", dtype: torch.dtype = torch.float32):
         super().__init__(
             latent_channels=4,
             scaling_factor=0.13025,
             shift_factor=0,
             use_post_quant_conv=True,
+            attn_impl=attn_impl,
             device=device,
             dtype=dtype,
         )
 
     @classmethod
-    def from_state_dict(cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype):
+    def from_state_dict(
+        cls, state_dict: Dict[str, torch.Tensor], device: str, dtype: torch.dtype, attn_impl: str = "auto"
+    ):
         with no_init_weights():
-            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype)
+            model = torch.nn.utils.skip_init(cls, device=device, dtype=dtype, attn_impl=attn_impl)
         model.load_state_dict(state_dict)
         return model

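For loading, a hedged sketch of the updated from_state_dict (not part of the commit): load_file comes from the safetensors package, the checkpoint path is a placeholder, and whether a raw checkpoint's keys load directly or first need the repo's state-dict conversion is not shown by this diff.

import torch
from safetensors.torch import load_file
from diffsynth_engine.models.sdxl.sdxl_vae import SDXLVAEDecoder

state_dict = load_file("/path/to/sdxl_vae.safetensors")  # placeholder path
decoder = SDXLVAEDecoder.from_state_dict(
    state_dict, device="cuda:0", dtype=torch.float32, attn_impl="auto"
)
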
diffsynth_engine/models/wan/wan_dit.py

Lines changed: 1 addition & 20 deletions
@@ -8,6 +8,7 @@
 
 from diffsynth_engine.models.base import StateDictConverter, PreTrainedModel
 from diffsynth_engine.models.basic.attention import attention, long_context_attention
+from diffsynth_engine.models.basic.transformer_helper import RMSNorm
 from diffsynth_engine.models.utils import no_init_weights
 from diffsynth_engine.utils.constants import (
     WAN_DIT_1_3B_T2V_CONFIG_FILE,
@@ -57,26 +58,6 @@ def rope_apply(x, freqs):
     return x_out.to(x.dtype).flatten(3)
 
 
-class RMSNorm(nn.Module):
-    def __init__(
-        self,
-        dim,
-        eps=1e-5,
-        device: str = "cuda:0",
-        dtype: torch.dtype = torch.bfloat16,
-    ):
-        super().__init__()
-        self.eps = eps
-        self.dim = dim
-        self.weight = nn.Parameter(torch.ones(dim, device=device, dtype=dtype))
-
-    def norm(self, x):
-        return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
-
-    def forward(self, x):
-        return self.norm(x.float()).to(x.dtype) * self.weight
-
-
 class SelfAttention(nn.Module):
     def __init__(
         self,

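Since wan_dit.py now reuses the shared class, a quick hedged equivalence check (not part of the commit; CPU/float32 for simplicity): the shared RMSNorm with its default elementwise_affine=True computes the same thing as the class deleted above.

import torch
from diffsynth_engine.models.basic.transformer_helper import RMSNorm

norm = RMSNorm(8, eps=1e-5, device="cpu", dtype=torch.float32)
x = torch.randn(2, 3, 8)
# Formula of the deleted wan_dit RMSNorm: x * rsqrt(mean(x^2) + eps) * weight
old_style = x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + 1e-5) * norm.weight
assert torch.allclose(norm(x), old_style)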