@@ -299,9 +299,8 @@ class PhotonBlock(nn.Module):
             Produces scale/shift/gating parameters for modulated layers.
 
     Methods:
-        attn_forward(img, txt, pe, modulation, spatial_conditioning=None, attention_mask=None):
-            Compute cross-attention between image and text tokens, with optional spatial conditioning and attention
-            masking.
+        attn_forward(img, txt, pe, modulation, attention_mask=None):
+            Compute cross-attention between image and text tokens, with optional attention masking.
 
             Parameters:
                 img (`torch.Tensor`):
@@ -312,8 +311,6 @@ class PhotonBlock(nn.Module):
                     Rotary positional embeddings to apply to queries and keys.
                 modulation (`ModulationOut`):
                     Scale and shift parameters for modulating image tokens.
-                spatial_conditioning (`torch.Tensor`, *optional*):
-                    Extra conditioning tokens of shape `(B, L_cond, hidden_size)`.
                 attention_mask (`torch.Tensor`, *optional*):
                     Boolean mask of shape `(B, L_txt)` where 0 marks padding.
 
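For context on the `attention_mask` parameter above: a boolean `(B, L_txt)` padding mask is usually broadcast over heads and query positions and converted into an additive bias before being handed to scaled-dot-product attention. A minimal sketch; the helper name and exact masking strategy are assumptions, not this module's code:

    import torch

    def make_attn_bias(attention_mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
        # attention_mask: (B, L_txt) bool, where 0/False marks padded text tokens.
        # Turn padding into a -inf additive bias broadcastable to (B, H, L_q, L_txt),
        # the form accepted by F.scaled_dot_product_attention's attn_mask argument.
        bias = torch.zeros(attention_mask.shape, dtype=dtype, device=attention_mask.device)
        bias = bias.masked_fill(~attention_mask.bool(), float("-inf"))
        return bias[:, None, None, :]  # (B, 1, 1, L_txt)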
@@ -372,7 +369,6 @@ def _attn_forward(
         txt: Tensor,
         pe: Tensor,
         modulation: ModulationOut,
-        spatial_conditioning: None | Tensor = None,
         attention_mask: None | Tensor = None,
     ) -> Tensor:
         # image tokens proj and norm
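The `pe` argument kept in this signature carries rotary positional embeddings applied to queries and keys inside `_attn_forward`. For readers unfamiliar with the format, this is a sketch of the Flux-style rotation such a tensor usually encodes; the `(..., L, D/2, 2, 2)` layout is an assumption about the positional embedder's output, not verified against Photon's helpers:

    import torch

    def apply_rope(q: torch.Tensor, k: torch.Tensor, pe: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        # q, k: (B, H, L, D) with even D; pe: (..., L, D/2, 2, 2) rotation matrices.
        def rotate(x: torch.Tensor) -> torch.Tensor:
            x_ = x.float().reshape(*x.shape[:-1], -1, 1, 2)          # pair up feature dims
            out = pe[..., 0] * x_[..., 0] + pe[..., 1] * x_[..., 1]  # 2x2 rotation per pair
            return out.reshape(*x.shape).type_as(x)

        return rotate(q), rotate(k)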
@@ -444,7 +440,6 @@ def forward(
         txt: Tensor,
         vec: Tensor,
         pe: Tensor,
-        spatial_conditioning: Tensor | None = None,
         attention_mask: Tensor | None = None,
         **_: dict[str, Any],
     ) -> Tensor:
@@ -461,9 +456,6 @@ def forward(
                 broadcastable).
             pe (`torch.Tensor`):
                 Rotary positional embeddings applied inside attention.
-            spatial_conditioning (`torch.Tensor`, *optional*):
-                Extra conditioning tokens of shape `(B, L_cond, hidden_size)`. Used only if spatial conditioning is
-                enabled in the block.
             attention_mask (`torch.Tensor`, *optional*):
                 Boolean mask for text tokens of shape `(B, L_txt)`, where `0` marks padding.
             **_:
@@ -481,7 +473,6 @@ def forward(
             txt,
             pe,
             mod_attn,
-            spatial_conditioning=spatial_conditioning,
             attention_mask=attention_mask,
         )
         img = img + mod_mlp.gate * self._ffn_forward(img, mod_mlp)
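The gated residual on the last context line (`img + mod_mlp.gate * ...`) is the usual adaLN-style modulation, where `ModulationOut` carries shift/scale/gate vectors derived from the timestep embedding. A rough sketch of the pattern, with names and shapes assumed rather than taken from this file:

    import torch

    def modulate(x: torch.Tensor, shift: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
        # shift/scale: (B, 1, hidden_size), broadcast over the token dimension.
        return x * (1 + scale) + shift

    # gated residual update, mirroring
    # img = img + mod.gate * sublayer(modulate(norm(img), mod.shift, mod.scale))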
@@ -698,14 +689,6 @@ def __init__(
 
         self.gradient_checkpointing = False
 
-    def _process_inputs(self, image_latent: Tensor, txt: Tensor, **_: Any) -> tuple[Tensor, Tensor, Tensor]:
-        txt = self.txt_in(txt)
-        img = img2seq(image_latent, self.patch_size)
-        bs, _, h, w = image_latent.shape
-        img_ids = get_image_ids(bs, h, w, patch_size=self.patch_size, device=image_latent.device)
-        pe = self.pe_embedder(img_ids)
-        return img, txt, pe
-
     def _compute_timestep_embedding(self, timestep: Tensor, dtype: torch.dtype) -> Tensor:
         return self.time_in(
             get_timestep_embedding(
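`_compute_timestep_embedding` (kept unchanged here) wraps diffusers' sinusoidal helper before projecting it through `time_in`. A standalone example of the underlying call; the embedding dimension and flag values below are illustrative, not necessarily what this model's config passes:

    import torch
    from diffusers.models.embeddings import get_timestep_embedding

    timesteps = torch.tensor([0.0, 250.0, 999.0])  # one scalar timestep per batch element
    emb = get_timestep_embedding(timesteps, embedding_dim=256, flip_sin_to_cos=True, downscale_freq_shift=0)
    print(emb.shape)  # torch.Size([3, 256])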
@@ -717,43 +700,6 @@ def _compute_timestep_embedding(self, timestep: Tensor, dtype: torch.dtype) -> Tensor:
             ).to(dtype)
         )
 
-    def _forward_transformers(
-        self,
-        image_latent: Tensor,
-        cross_attn_conditioning: Tensor,
-        timestep: Optional[Tensor] = None,
-        time_embedding: Optional[Tensor] = None,
-        attention_mask: Optional[Tensor] = None,
-        **block_kwargs: Any,
-    ) -> Tensor:
-        img = self.img_in(image_latent)
-
-        if time_embedding is not None:
-            vec = time_embedding
-        else:
-            if timestep is None:
-                raise ValueError("Please provide either a timestep or a timestep_embedding")
-            vec = self._compute_timestep_embedding(timestep, dtype=img.dtype)
-
-        for block in self.blocks:
-            if torch.is_grad_enabled() and self.gradient_checkpointing:
-                img = self._gradient_checkpointing_func(
-                    block.__call__,
-                    img,
-                    cross_attn_conditioning,
-                    vec,
-                    block_kwargs.get("pe"),
-                    block_kwargs.get("spatial_conditioning"),
-                    attention_mask,
-                )
-            else:
-                img = block(
-                    img=img, txt=cross_attn_conditioning, vec=vec, attention_mask=attention_mask, **block_kwargs
-                )
-
-        img = self.final_layer(img, vec)
-        return img
-
     def forward(
         self,
         image_latent: Tensor,
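The removed `_forward_transformers` (and the inlined loop that replaces it further down) pass block arguments positionally to `self._gradient_checkpointing_func`, which is why dropping `spatial_conditioning` also changes that call site. Roughly, the checkpointed path behaves like the sketch below; the exact wrapper diffusers installs may differ:

    import torch
    from torch.utils.checkpoint import checkpoint

    def run_block_checkpointed(block, img, txt, vec, pe, attention_mask):
        # Arguments must line up with PhotonBlock.forward's positional order:
        # (img, txt, vec, pe, attention_mask).
        return checkpoint(block.__call__, img, txt, vec, pe, attention_mask, use_reentrant=False)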
@@ -797,6 +743,7 @@ def forward(
             lora_scale = attention_kwargs.pop("scale", 1.0)
         else:
             lora_scale = 1.0
+
         if USE_PEFT_BACKEND:
             # weight the lora layers by setting `lora_scale` for each PEFT layer
             scale_lora_layers(self, lora_scale)
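For completeness, the `scale` entry popped from `attention_kwargs` above is what feeds `scale_lora_layers`/`unscale_lora_layers` when the PEFT backend is active. A hypothetical call shape; the variable names `transformer`, `latents`, `text_embeds`, `t`, and `mask` are placeholders, only the keyword names come from this forward signature:

    out = transformer(
        image_latent=latents,                 # (B, C, H, W) latent image
        cross_attn_conditioning=text_embeds,  # (B, L_txt, dim) text conditioning tokens
        timestep=t,
        cross_attn_mask=mask,                 # (B, L_txt) boolean padding mask
        attention_kwargs={"scale": 0.8},      # LoRA scale applied to PEFT layers
    )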
@@ -805,12 +752,50 @@ def forward(
                 logger.warning(
                     "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
                 )
-        img_seq, txt, pe = self._process_inputs(image_latent, cross_attn_conditioning)
-        img_seq = self._forward_transformers(img_seq, txt, timestep, pe=pe, attention_mask=cross_attn_mask)
-        output = seq2img(img_seq, self.patch_size, image_latent.shape)
+
+        # Process text conditioning
+        txt = self.txt_in(cross_attn_conditioning)
+
+        # Convert image to sequence and embed
+        img = img2seq(image_latent, self.patch_size)
+        img = self.img_in(img)
+
+        # Generate positional embeddings
+        bs, _, h, w = image_latent.shape
+        img_ids = get_image_ids(bs, h, w, patch_size=self.patch_size, device=image_latent.device)
+        pe = self.pe_embedder(img_ids)
+
+        # Compute time embedding
+        vec = self._compute_timestep_embedding(timestep, dtype=img.dtype)
+
+        # Apply transformer blocks
+        for block in self.blocks:
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
+                img = self._gradient_checkpointing_func(
+                    block.__call__,
+                    img,
+                    txt,
+                    vec,
+                    pe,
+                    cross_attn_mask,
+                )
+            else:
+                img = block(
+                    img=img,
+                    txt=txt,
+                    vec=vec,
+                    pe=pe,
+                    attention_mask=cross_attn_mask,
+                )
+
+        # Final layer and convert back to image
+        img = self.final_layer(img, vec)
+        output = seq2img(img, self.patch_size, image_latent.shape)
+
         if USE_PEFT_BACKEND:
             # remove `lora_scale` from each PEFT layer
             unscale_lora_layers(self, lora_scale)
+
         if not return_dict:
             return (output,)
         return Transformer2DModelOutput(sample=output)
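The inlined forward now calls `img2seq`/`seq2img` directly instead of going through `_process_inputs`. As a reference for the shape contract these helpers typically implement (the exact channel ordering inside Photon's implementation may differ), a patchify/unpatchify sketch:

    import torch

    def patchify(x: torch.Tensor, p: int) -> torch.Tensor:
        # (B, C, H, W) -> (B, (H//p)*(W//p), C*p*p); assumed contract of `img2seq`.
        b, c, h, w = x.shape
        x = x.reshape(b, c, h // p, p, w // p, p)
        return x.permute(0, 2, 4, 1, 3, 5).reshape(b, (h // p) * (w // p), c * p * p)

    def unpatchify(x: torch.Tensor, p: int, shape: torch.Size) -> torch.Tensor:
        # Inverse mapping, mirroring how `seq2img` restores (B, C, H, W) from the token sequence.
        b, c, h, w = shape
        x = x.reshape(b, h // p, w // p, c, p, p).permute(0, 3, 1, 4, 2, 5)
        return x.reshape(b, c, h, w)

    # round-trip check
    x = torch.randn(1, 4, 8, 8)
    assert torch.equal(unpatchify(patchify(x, 2), 2, x.shape), x)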