Skip to content

Commit dc05c68

Browse files
committed
up
2 parents f04c012 + 7f3e9b8 commit dc05c68

File tree

18 files changed

+491
-29
lines changed

18 files changed

+491
-29
lines changed

examples/text_to_image/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@ datasets>=2.19.1
55
ftfy
66
tensorboard
77
Jinja2
8-
peft==0.7.0
8+
peft>=0.17.0

examples/text_to_image/requirements_sdxl.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@ ftfy
55
tensorboard
66
Jinja2
77
datasets
8-
peft==0.7.0
8+
peft>=0.17.0

src/diffusers/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,8 @@
390390
"QwenImageAutoBlocks",
391391
"QwenImageEditAutoBlocks",
392392
"QwenImageEditModularPipeline",
393+
"QwenImageEditPlusAutoBlocks",
394+
"QwenImageEditPlusModularPipeline",
393395
"QwenImageModularPipeline",
394396
"StableDiffusionXLAutoBlocks",
395397
"StableDiffusionXLModularPipeline",
@@ -1052,6 +1054,8 @@
10521054
QwenImageAutoBlocks,
10531055
QwenImageEditAutoBlocks,
10541056
QwenImageEditModularPipeline,
1057+
QwenImageEditPlusAutoBlocks,
1058+
QwenImageEditPlusModularPipeline,
10551059
QwenImageModularPipeline,
10561060
StableDiffusionXLAutoBlocks,
10571061
StableDiffusionXLModularPipeline,

src/diffusers/models/transformers/transformer_ltx.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,9 @@ def forward(
353353
norm_hidden_states = self.norm1(hidden_states)
354354

355355
num_ada_params = self.scale_shift_table.shape[0]
356-
ada_values = self.scale_shift_table[None, None] + temb.reshape(batch_size, temb.size(1), num_ada_params, -1)
356+
ada_values = self.scale_shift_table[None, None].to(temb.device) + temb.reshape(
357+
batch_size, temb.size(1), num_ada_params, -1
358+
)
357359
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ada_values.unbind(dim=2)
358360
norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
359361

src/diffusers/models/transformers/transformer_wan.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -682,12 +682,12 @@ def forward(
682682
# 5. Output norm, projection & unpatchify
683683
if temb.ndim == 3:
684684
# batch_size, seq_len, inner_dim (wan 2.2 ti2v)
685-
shift, scale = (self.scale_shift_table.unsqueeze(0) + temb.unsqueeze(2)).chunk(2, dim=2)
685+
shift, scale = (self.scale_shift_table.unsqueeze(0).to(temb.device) + temb.unsqueeze(2)).chunk(2, dim=2)
686686
shift = shift.squeeze(2)
687687
scale = scale.squeeze(2)
688688
else:
689689
# batch_size, inner_dim
690-
shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1)
690+
shift, scale = (self.scale_shift_table.to(temb.device) + temb.unsqueeze(1)).chunk(2, dim=1)
691691

692692
# Move the shift and scale tensors to the same device as hidden_states.
693693
# When using multi-GPU inference via accelerate these will be on the

src/diffusers/models/transformers/transformer_wan_vace.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def forward(
103103
control_hidden_states = control_hidden_states + hidden_states
104104

105105
shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
106-
self.scale_shift_table + temb.float()
106+
self.scale_shift_table.to(temb.device) + temb.float()
107107
).chunk(6, dim=1)
108108

109109
# 1. Self-attention
@@ -361,7 +361,7 @@ def forward(
361361
hidden_states = hidden_states + control_hint * scale
362362

363363
# 6. Output norm, projection & unpatchify
364-
shift, scale = (self.scale_shift_table + temb.unsqueeze(1)).chunk(2, dim=1)
364+
shift, scale = (self.scale_shift_table.to(temb.device) + temb.unsqueeze(1)).chunk(2, dim=1)
365365

366366
# Move the shift and scale tensors to the same device as hidden_states.
367367
# When using multi-GPU inference via accelerate these will be on the

src/diffusers/modular_pipelines/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@
5353
"QwenImageModularPipeline",
5454
"QwenImageEditModularPipeline",
5555
"QwenImageEditAutoBlocks",
56+
"QwenImageEditPlusModularPipeline",
57+
"QwenImageEditPlusAutoBlocks",
5658
]
5759
_import_structure["components_manager"] = ["ComponentsManager"]
5860

@@ -79,6 +81,8 @@
7981
QwenImageAutoBlocks,
8082
QwenImageEditAutoBlocks,
8183
QwenImageEditModularPipeline,
84+
QwenImageEditPlusAutoBlocks,
85+
QwenImageEditPlusModularPipeline,
8286
QwenImageModularPipeline,
8387
)
8488
from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline

src/diffusers/modular_pipelines/flux/before_denoise.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,11 +255,13 @@ def inputs(self) -> List[InputParam]:
255255
InputParam(
256256
"prompt_embeds",
257257
required=True,
258+
kwargs_type="denoiser_input_fields",
258259
type_hint=torch.Tensor,
259260
description="Pre-generated text embeddings. Can be generated from text_encoder step.",
260261
),
261262
InputParam(
262263
"pooled_prompt_embeds",
264+
kwargs_type="denoiser_input_fields",
263265
type_hint=torch.Tensor,
264266
description="Pre-generated pooled text embeddings. Can be generated from text_encoder step.",
265267
),
@@ -282,11 +284,13 @@ def intermediate_outputs(self) -> List[str]:
282284
OutputParam(
283285
"prompt_embeds",
284286
type_hint=torch.Tensor,
287+
kwargs_type="denoiser_input_fields",
285288
description="text embeddings used to guide the image generation",
286289
),
287290
OutputParam(
288291
"pooled_prompt_embeds",
289292
type_hint=torch.Tensor,
293+
kwargs_type="denoiser_input_fields",
290294
description="pooled text embeddings used to guide the image generation",
291295
),
292296
# TODO: support negative embeddings?

src/diffusers/modular_pipelines/flux/encoders.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ def inputs(self) -> List[InputParam]:
181181
return [
182182
InputParam("prompt"),
183183
InputParam("prompt_2"),
184+
InputParam("max_sequence_length", type_hint=int, default=512, required=False),
184185
InputParam("joint_attention_kwargs"),
185186
]
186187

@@ -189,16 +190,19 @@ def intermediate_outputs(self) -> List[OutputParam]:
189190
return [
190191
OutputParam(
191192
"prompt_embeds",
193+
kwargs_type="denoiser_input_fields",
192194
type_hint=torch.Tensor,
193195
description="text embeddings used to guide the image generation",
194196
),
195197
OutputParam(
196198
"pooled_prompt_embeds",
199+
kwargs_type="denoiser_input_fields",
197200
type_hint=torch.Tensor,
198201
description="pooled text embeddings used to guide the image generation",
199202
),
200203
OutputParam(
201204
"text_ids",
205+
kwargs_type="denoiser_input_fields",
202206
type_hint=torch.Tensor,
203207
description="ids from the text sequence for RoPE",
204208
),
@@ -404,6 +408,7 @@ def __call__(self, components: FluxModularPipeline, state: PipelineState) -> Pip
404408
pooled_prompt_embeds=None,
405409
device=block_state.device,
406410
num_images_per_prompt=1, # TODO: hardcoded for now.
411+
max_sequence_length=block_state.max_sequence_length,
407412
lora_scale=block_state.text_encoder_lora_scale,
408413
)
409414

src/diffusers/modular_pipelines/flux/modular_blocks.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -106,9 +106,9 @@ def description(self):
106106

107107
# before_denoise: all tasks (text2img, img2img)
108108
class FluxAutoBeforeDenoiseStep(AutoPipelineBlocks):
109-
block_classes = [FluxBeforeDenoiseStep, FluxImg2ImgBeforeDenoiseStep]
110-
block_names = ["text2image", "img2img"]
111-
block_trigger_inputs = [None, "image_latents"]
109+
block_classes = [FluxImg2ImgBeforeDenoiseStep, FluxBeforeDenoiseStep]
110+
block_names = ["img2img", "text2image"]
111+
block_trigger_inputs = ["image_latents", None]
112112

113113
@property
114114
def description(self):
@@ -177,16 +177,32 @@ def description(self):
177177
return "Decode step that decode the denoised latents into image outputs.\n - `FluxDecodeStep`"
178178

179179

180-
# text2image, img2img
180+
class FluxCoreDenoiseStep(SequentialPipelineBlocks):
181+
block_classes = [FluxInputStep, FluxAutoBeforeDenoiseStep, FluxAutoDenoiseStep]
182+
block_names = ["input", "before_denoise", "denoise"]
183+
184+
@property
185+
def description(self):
186+
return (
187+
"Core step that performs the denoising process. \n"
188+
+ " - `FluxInputStep` (input) standardizes the inputs for the denoising step.\n"
189+
+ " - `FluxAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
190+
+ " - `FluxAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
191+
+ "This step support text-to-image and image-to-image tasks for Flux:\n"
192+
+ " - for image-to-image generation, you need to provide `image_latents`\n"
193+
+ " - for text-to-image generation, all you need to provide is prompt embeddings"
194+
)
195+
196+
197+
# text2image, img2img
181198
class FluxAutoBlocks(SequentialPipelineBlocks):
182199
block_classes = [
183200
FluxTextEncoderStep,
184201
FluxAutoVaeEncoderStep,
185-
FluxAutoBeforeDenoiseStep,
186-
FluxAutoDenoiseStep,
202+
FluxCoreDenoiseStep,
187203
FluxAutoDecodeStep,
188204
]
189-
block_names = ["text_encoder", "image_encoder", "before_denoise", "denoise", "decoder"]
205+
block_names = ["text_encoder", "image_encoder", "denoise", "decode"]
190206

191207
@property
192208
def description(self):
@@ -243,8 +259,7 @@ def description(self):
243259
[
244260
("text_encoder", FluxTextEncoderStep),
245261
("image_encoder", FluxAutoVaeEncoderStep),
246-
("before_denoise", FluxAutoBeforeDenoiseStep),
247-
("denoise", FluxAutoDenoiseStep),
262+
("denoise", FluxCoreDenoiseStep),
248263
("decode", FluxAutoDecodeStep),
249264
]
250265
)

0 commit comments

Comments
 (0)