
Commit f2ae8a1

Support passing kwargs to the SD3 custom attention processor
1 parent 9a92b81 commit f2ae8a1

File tree: 2 files changed, +95 -25 lines
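
In short, the commit threads an optional `joint_attention_kwargs` dict from `SD3Transformer2DModel.forward` through each `JointTransformerBlock` and into the attention call, so a custom attention processor registered on the SD3 transformer can receive extra keyword arguments per call. Below is a minimal sketch of such a processor; it is not part of the commit, the `attn_gain` keyword is a hypothetical example argument, and the class simply wraps the stock `JointAttnProcessor2_0`.

# Hedged sketch (not part of the commit): a custom joint attention processor
# that accepts an extra keyword argument delivered via `joint_attention_kwargs`.
# `attn_gain` is a hypothetical name used only to illustrate the new path.
import torch

from diffusers.models.attention_processor import Attention, JointAttnProcessor2_0


class ScaledJointAttnProcessor(JointAttnProcessor2_0):
    """Runs the stock SD3 joint attention, then scales the image-stream output."""

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor = None,
        attention_mask: torch.Tensor = None,
        attn_gain: float = 1.0,  # hypothetical kwarg forwarded through joint_attention_kwargs
        **kwargs,
    ):
        out = super().__call__(
            attn,
            hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            attention_mask=attention_mask,
        )
        # Joint attention returns (hidden_states, encoder_hidden_states); the
        # dual-attention branch (`attn2`) returns a single tensor.
        if isinstance(out, tuple):
            hidden_states, encoder_hidden_states = out
            return hidden_states * attn_gain, encoder_hidden_states
        return out * attn_gain

Declaring the extra argument explicitly in `__call__` keeps the example robust if `Attention.forward` filters out keyword arguments it does not recognize.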

src/diffusers/models/attention.py

Lines changed: 57 additions & 18 deletions

@@ -22,7 +22,13 @@
 from .activations import GEGLU, GELU, ApproximateGELU, FP32SiLU, SwiGLU
 from .attention_processor import Attention, JointAttnProcessor2_0
 from .embeddings import SinusoidalPositionalEmbedding
-from .normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm, SD35AdaLayerNormZeroX
+from .normalization import (
+    AdaLayerNorm,
+    AdaLayerNormContinuous,
+    AdaLayerNormZero,
+    RMSNorm,
+    SD35AdaLayerNormZeroX,
+)
 
 
 logger = logging.get_logger(__name__)
@@ -122,7 +128,12 @@ def __init__(
 
         if context_norm_type == "ada_norm_continous":
             self.norm1_context = AdaLayerNormContinuous(
-                dim, dim, elementwise_affine=False, eps=1e-6, bias=True, norm_type="layer_norm"
+                dim,
+                dim,
+                elementwise_affine=False,
+                eps=1e-6,
+                bias=True,
+                norm_type="layer_norm",
             )
         elif context_norm_type == "ada_norm_zero":
             self.norm1_context = AdaLayerNormZero(dim)
@@ -188,33 +199,51 @@ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
         self._chunk_dim = dim
 
     def forward(
-        self, hidden_states: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor, temb: torch.FloatTensor
+        self,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor,
+        temb: torch.FloatTensor,
+        joint_attention_kwargs: Dict[str, Any] = None,
     ):
         if self.use_dual_attention:
-            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp, norm_hidden_states2, gate_msa2 = self.norm1(
-                hidden_states, emb=temb
-            )
+            (
+                norm_hidden_states,
+                gate_msa,
+                shift_mlp,
+                scale_mlp,
+                gate_mlp,
+                norm_hidden_states2,
+                gate_msa2,
+            ) = self.norm1(hidden_states, emb=temb)
         else:
             norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
 
         if self.context_pre_only:
             norm_encoder_hidden_states = self.norm1_context(encoder_hidden_states, temb)
         else:
-            norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
-                encoder_hidden_states, emb=temb
-            )
+            (
+                norm_encoder_hidden_states,
+                c_gate_msa,
+                c_shift_mlp,
+                c_scale_mlp,
+                c_gate_mlp,
+            ) = self.norm1_context(encoder_hidden_states, emb=temb)
+
+        joint_attention_kwargs = joint_attention_kwargs.copy() if joint_attention_kwargs is not None else {}
 
         # Attention.
         attn_output, context_attn_output = self.attn(
-            hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states
+            hidden_states=norm_hidden_states,
+            encoder_hidden_states=norm_encoder_hidden_states,
+            **joint_attention_kwargs,
        )
 
         # Process attention outputs for the `hidden_states`.
         attn_output = gate_msa.unsqueeze(1) * attn_output
         hidden_states = hidden_states + attn_output
 
         if self.use_dual_attention:
-            attn_output2 = self.attn2(hidden_states=norm_hidden_states2)
+            attn_output2 = self.attn2(hidden_states=norm_hidden_states2, **joint_attention_kwargs)
             attn_output2 = gate_msa2.unsqueeze(1) * attn_output2
             hidden_states = hidden_states + attn_output2
 
@@ -241,7 +270,10 @@ def forward(
         if self._chunk_size is not None:
             # "feed_forward_chunk_size" can be used to save memory
             context_ff_output = _chunked_feed_forward(
-                self.ff_context, norm_encoder_hidden_states, self._chunk_dim, self._chunk_size
+                self.ff_context,
+                norm_encoder_hidden_states,
+                self._chunk_dim,
+                self._chunk_size,
             )
         else:
             context_ff_output = self.ff_context(norm_encoder_hidden_states)
@@ -402,7 +434,7 @@ def __init__(
 
         self.attn2 = Attention(
             query_dim=dim,
-            cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+            cross_attention_dim=(cross_attention_dim if not double_self_attention else None),
             heads=num_attention_heads,
             dim_head=attention_head_dim,
             dropout=dropout,
@@ -506,7 +538,7 @@ def forward(
 
         attn_output = self.attn1(
             norm_hidden_states,
-            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+            encoder_hidden_states=(encoder_hidden_states if self.only_cross_attention else None),
             attention_mask=attention_mask,
             **cross_attention_kwargs,
         )
@@ -979,7 +1011,7 @@ def __init__(
 
         self.attn2 = Attention(
             query_dim=dim,
-            cross_attention_dim=cross_attention_dim if not double_self_attention else None,
+            cross_attention_dim=(cross_attention_dim if not double_self_attention else None),
             heads=num_attention_heads,
             dim_head=attention_head_dim,
             dropout=dropout,
@@ -1045,7 +1077,10 @@ def _get_frame_weights(self, num_frames: int, weighting_scheme: str = "pyramid")
         return weights
 
     def set_free_noise_properties(
-        self, context_length: int, context_stride: int, weighting_scheme: str = "pyramid"
+        self,
+        context_length: int,
+        context_stride: int,
+        weighting_scheme: str = "pyramid",
     ) -> None:
         self.context_length = context_length
         self.context_stride = context_stride
@@ -1112,7 +1147,7 @@ def forward(
 
         attn_output = self.attn1(
             norm_hidden_states,
-            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
+            encoder_hidden_states=(encoder_hidden_states if self.only_cross_attention else None),
             attention_mask=attention_mask,
             **cross_attention_kwargs,
         )
@@ -1158,7 +1193,11 @@ def forward(
         # looked into this deeply because other memory optimizations led to more pronounced reductions.
         hidden_states = torch.cat(
             [
-                torch.where(num_times_split > 0, accumulated_split / num_times_split, accumulated_split)
+                torch.where(
+                    num_times_split > 0,
+                    accumulated_split / num_times_split,
+                    accumulated_split,
+                )
                 for accumulated_split, num_times_split in zip(
                     accumulated_values.split(self.context_length, dim=1),
                     num_times_accumulated.split(self.context_length, dim=1),
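
With the block-level change above, `JointTransformerBlock.forward` copies the incoming dict and expands it into `self.attn(...)` and, in the dual-attention path, `self.attn2(...)`. A hedged usage sketch follows, assuming the `ScaledJointAttnProcessor` from the earlier sketch, the public `stabilityai/stable-diffusion-3-medium-diffusers` checkpoint, and the pipeline's `joint_attention_kwargs` argument to forward the dict; `attn_gain` remains a hypothetical name.

# Hedged usage sketch: register the custom processor on the SD3 transformer and
# pass the extra kwarg per call via `joint_attention_kwargs`.
import torch
from diffusers import StableDiffusion3Pipeline

pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers", torch_dtype=torch.float16
).to("cuda")

# Install the processor on every attention module of the transformer.
pipe.transformer.set_attn_processor(ScaledJointAttnProcessor())

# The dict travels: pipeline -> SD3Transformer2DModel.forward ->
# each JointTransformerBlock.forward -> the processor's __call__.
image = pipe(
    "a photo of an astronaut riding a horse on the moon",
    joint_attention_kwargs={"attn_gain": 1.05},
).images[0]
image.save("astronaut.png")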

src/diffusers/models/transformers/transformer_sd3.py

Lines changed: 38 additions & 7 deletions

@@ -21,10 +21,20 @@
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
 from ...models.attention import JointTransformerBlock
-from ...models.attention_processor import Attention, AttentionProcessor, FusedJointAttnProcessor2_0
+from ...models.attention_processor import (
+    Attention,
+    AttentionProcessor,
+    FusedJointAttnProcessor2_0,
+)
 from ...models.modeling_utils import ModelMixin
 from ...models.normalization import AdaLayerNormContinuous
-from ...utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
+from ...utils import (
+    USE_PEFT_BACKEND,
+    is_torch_version,
+    logging,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
 from ..embeddings import CombinedTimestepTextProjEmbeddings, PatchEmbed
 from ..modeling_outputs import Transformer2DModelOutput
 
@@ -88,7 +98,8 @@ def __init__(
             pos_embed_max_size=pos_embed_max_size,  # hard-code for now.
         )
         self.time_text_embed = CombinedTimestepTextProjEmbeddings(
-            embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim
+            embedding_dim=self.inner_dim,
+            pooled_projection_dim=self.config.pooled_projection_dim,
         )
         self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.config.caption_projection_dim)
 
@@ -166,7 +177,11 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]:
         # set recursively
         processors = {}
 
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
+        def fn_recursive_add_processors(
+            name: str,
+            module: torch.nn.Module,
+            processors: Dict[str, AttentionProcessor],
+        ):
             if hasattr(module, "get_processor"):
                 processors[f"{name}.processor"] = module.get_processor()
 
@@ -334,12 +349,16 @@ def custom_forward(*inputs):
                     hidden_states,
                     encoder_hidden_states,
                     temb,
+                    joint_attention_kwargs,
                     **ckpt_kwargs,
                 )
 
             else:
                 encoder_hidden_states, hidden_states = block(
-                    hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states, temb=temb
+                    hidden_states=hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    temb=temb,
+                    joint_attention_kwargs=joint_attention_kwargs,
                 )
 
             # controlnet residual
@@ -356,11 +375,23 @@ def custom_forward(*inputs):
         width = width // patch_size
 
         hidden_states = hidden_states.reshape(
-            shape=(hidden_states.shape[0], height, width, patch_size, patch_size, self.out_channels)
+            shape=(
+                hidden_states.shape[0],
+                height,
+                width,
+                patch_size,
+                patch_size,
+                self.out_channels,
+            )
         )
         hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
         output = hidden_states.reshape(
-            shape=(hidden_states.shape[0], self.out_channels, height * patch_size, width * patch_size)
+            shape=(
+                hidden_states.shape[0],
+                self.out_channels,
+                height * patch_size,
+                width * patch_size,
+            )
        )
 
         if USE_PEFT_BACKEND:
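
On the transformer side, the dict reaches each block by keyword in the eager path and positionally in the gradient-checkpointing path, because the `custom_forward(*inputs)` wrapper shown in the hunk headers only relays positional inputs; adding `joint_attention_kwargs` as the fourth positional parameter of `JointTransformerBlock.forward` is what makes that line up. A minimal toy sketch of the pattern, assuming nothing beyond PyTorch itself (`ToyBlock` and `attn_gain` are illustrative names, not diffusers APIs):

# Toy illustration of why the checkpointed branch appends joint_attention_kwargs
# as a positional argument: torch.utils.checkpoint re-invokes custom_forward with
# positional inputs only, so the dict must occupy the block's fourth parameter.
import torch


class ToyBlock(torch.nn.Module):
    # Mirrors the parameter order and return order of JointTransformerBlock.forward.
    def forward(self, hidden_states, encoder_hidden_states, temb, joint_attention_kwargs=None):
        gain = (joint_attention_kwargs or {}).get("attn_gain", 1.0)
        return encoder_hidden_states, hidden_states * gain


def create_custom_forward(module):
    def custom_forward(*inputs):
        return module(*inputs)

    return custom_forward


block = ToyBlock()
hidden = torch.randn(1, 4, 8, requires_grad=True)
context = torch.randn(1, 2, 8, requires_grad=True)
temb = torch.randn(1, 8)

context_out, hidden_out = torch.utils.checkpoint.checkpoint(
    create_custom_forward(block),
    hidden,
    context,
    temb,
    {"attn_gain": 2.0},  # rides along as the fourth positional input
    use_reentrant=False,
)
hidden_out.sum().backward()
print(hidden.grad.abs().mean())  # gradients still flow through the checkpointed call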
