Merged · Changes from all commits
1 change: 0 additions & 1 deletion diffsynth_engine/models/utils.py
@@ -2,7 +2,6 @@
 import torch.nn as nn
 from contextlib import contextmanager

-
 # modified from transformers.modeling_utils
 TORCH_INIT_FUNCTIONS = {
     "uniform_": nn.init.uniform_,
4 changes: 4 additions & 0 deletions diffsynth_engine/models/vae/vae.py
@@ -167,6 +167,8 @@ def __init__(
         self.conv_norm_out = nn.GroupNorm(num_channels=128, num_groups=32, eps=1e-6, device=device, dtype=dtype)
         self.conv_act = nn.SiLU()
         self.conv_out = nn.Conv2d(128, 3, kernel_size=3, padding=1, device=device, dtype=dtype)
+        self.device = device
+        self.dtype = dtype

     def forward(self, sample, tiled=False, tile_size=64, tile_stride=32, **kwargs):
         original_dtype = sample.dtype
@@ -277,6 +279,8 @@ def __init__(
         self.conv_norm_out = nn.GroupNorm(num_channels=512, num_groups=32, eps=1e-6, device=device, dtype=dtype)
         self.conv_act = nn.SiLU()
         self.conv_out = nn.Conv2d(512, 2 * latent_channels, kernel_size=3, padding=1, device=device, dtype=dtype)
+        self.device = device
+        self.dtype = dtype

     def forward(self, sample, tiled=False, tile_size=64, tile_stride=32, **kwargs):
         original_dtype = sample.dtype
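The two attributes stored above are what the base pipeline's encode_image (see base.py below) reads as self.vae_encoder.dtype. A minimal illustrative sketch of the kind of call this enables (the helper name is ours, not from the PR):

import torch

def to_vae(image: torch.Tensor, vae_encoder) -> torch.Tensor:
    # Cast inputs against the encoder module itself instead of inspecting one of its weights.
    return image.to(device=vae_encoder.device, dtype=vae_encoder.dtype)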
9 changes: 6 additions & 3 deletions diffsynth_engine/pipelines/base.py
@@ -8,6 +8,7 @@
 from diffsynth_engine.utils.offload import enable_sequential_cpu_offload
 from diffsynth_engine.utils.gguf import load_gguf_checkpoint
 from diffsynth_engine.utils import logging
+from diffsynth_engine.utils.platform import empty_cache

 logger = logging.get_logger(__name__)

@@ -144,15 +145,17 @@ def generate_noise(shape, seed=None, device="cpu", dtype=torch.float16):
         return noise

     def encode_image(self, image: torch.Tensor) -> torch.Tensor:
+        image = image.to(self.device, self.vae_encoder.dtype)
Copilot AI commented on this line (May 23, 2025): [nitpick] Consider using keyword arguments for the to() method (e.g., image.to(device=self.device, dtype=self.vae_encoder.dtype)) to improve code clarity.

Suggested change:
-        image = image.to(self.device, self.vae_encoder.dtype)
+        image = image.to(device=self.device, dtype=self.vae_encoder.dtype)
         latents = self.vae_encoder(
             image, tiled=self.vae_tiled, tile_size=self.vae_tile_size, tile_stride=self.vae_tile_stride
         )
         return latents

     def decode_image(self, latent: torch.Tensor) -> torch.Tensor:
         vae_dtype = self.vae_decoder.conv_in.weight.dtype
+        latent = latent.to(self.device, vae_dtype)
         image = self.vae_decoder(
-            latent.to(vae_dtype), tiled=self.vae_tiled, tile_size=self.vae_tile_size, tile_stride=self.vae_tile_stride
+            latent, tiled=self.vae_tiled, tile_size=self.vae_tile_size, tile_stride=self.vae_tile_stride
         )
         return image

@@ -233,7 +236,7 @@ def load_models_to_device(self, load_model_names: List[str] | None = None):
             return
         if self.offload_mode == "sequential_cpu_offload":
             # free the cuda cache
-            torch.cuda.empty_cache()
+            empty_cache()
             return

         # offload unnecessary models to cpu
@@ -248,4 +251,4 @@ def load_models_to_device(self, load_model_names: List[str] | None = None):
             if model is not None and (p := next(model.parameters(), None)) is not None and p.device != self.device:
                 model.to(self.device)
         # free the cuda cache
-        torch.cuda.empty_cache()
+        empty_cache()
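For readability, the two VAE helpers as they read after this change (reconstructed from the hunks above; the surrounding pipeline class and its tiling attributes are unchanged):

    def encode_image(self, image: torch.Tensor) -> torch.Tensor:
        image = image.to(self.device, self.vae_encoder.dtype)
        latents = self.vae_encoder(
            image, tiled=self.vae_tiled, tile_size=self.vae_tile_size, tile_stride=self.vae_tile_stride
        )
        return latents

    def decode_image(self, latent: torch.Tensor) -> torch.Tensor:
        vae_dtype = self.vae_decoder.conv_in.weight.dtype
        latent = latent.to(self.device, vae_dtype)  # cast once up front instead of inside the decoder call
        image = self.vae_decoder(
            latent, tiled=self.vae_tiled, tile_size=self.vae_tile_size, tile_stride=self.vae_tile_stride
        )
        return image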
16 changes: 13 additions & 3 deletions diffsynth_engine/pipelines/flux_image.py
@@ -25,6 +25,7 @@
 from diffsynth_engine.utils import logging
 from diffsynth_engine.utils.fp8_linear import enable_fp8_linear
 from diffsynth_engine.utils.download import fetch_model
+from diffsynth_engine.utils.platform import empty_cache

 logger = logging.get_logger(__name__)

@@ -546,6 +547,7 @@ def predict_noise(
             current_step=current_step,
             total_step=total_step,
         )
+        self.load_models_to_device(["dit"])
         noise_pred = self.dit(
             hidden_states=latents,
             timestep=timestep,
@@ -570,15 +572,14 @@ def prepare_latents(
     ):
         # Prepare scheduler
         if input_image is not None:
+            self.load_models_to_device(["vae_encoder"])
             total_steps = num_inference_steps
             sigmas, timesteps = self.noise_scheduler.schedule(
                 total_steps, mu=mu, sigma_min=1 / total_steps, sigma_max=1.0
             )
             t_start = max(total_steps - int(num_inference_steps * denoising_strength), 1)
             sigma_start, sigmas = sigmas[t_start - 1], sigmas[t_start - 1 :]
             timesteps = timesteps[t_start - 1 :]
-
-            self.load_models_to_device(["vae_encoder"])
             noise = latents
             image = self.preprocess_image(input_image).to(device=self.device, dtype=self.dtype)
             latents = self.encode_image(image)
@@ -593,6 +594,7 @@
         return init_latents, latents, sigmas, timesteps

     def prepare_masked_latent(self, image: Image.Image, mask: Image.Image | None, height: int, width: int):
+        self.load_models_to_device(["vae_encoder"])
         if mask is None:
             image = image.resize((width, height))
             image = self.preprocess_image(image).to(device=self.device, dtype=self.dtype)
@@ -637,6 +639,8 @@ def predict_multicontrolnet(
         total_step: int,
     ):
         double_block_output_results, single_block_output_results = None, None
+        if len(controlnet_params) > 0:
+            self.load_models_to_device([])
         for param in controlnet_params:
             current_scale = param.scale
             if not (
@@ -645,6 +649,9 @@
                 # if current_step is not in the control range
                 # skip this controlnet
                 continue
+            if self.offload_mode == "sequential_cpu_offload" or self.offload_mode == "cpu_offload":
+                empty_cache()
+                param.model.to(self.device)
             double_block_output, single_block_output = param.model(
                 latents,
                 param.image,
@@ -656,6 +663,9 @@
                 image_ids,
                 text_ids,
             )
+            if self.offload_mode == "sequential_cpu_offload" or self.offload_mode == "cpu_offload":
+                empty_cache()
+                param.model.to("cpu")
             double_block_output_results = accumulate(double_block_output_results, double_block_output)
             single_block_output_results = accumulate(single_block_output_results, single_block_output)
         return double_block_output_results, single_block_output_results
Expand Down Expand Up @@ -741,7 +751,7 @@ def __call__(
)

# Denoise
self.load_models_to_device(["dit"])
self.load_models_to_device([])
for i, timestep in enumerate(tqdm(timesteps)):
timestep = timestep.unsqueeze(0).to(dtype=self.dtype)
noise_pred = self.predict_noise_with_cfg(
Expand Down
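The last four hunks implement a simple per-ControlNet offload discipline: under "cpu_offload" or "sequential_cpu_offload", each ControlNet is moved onto the accelerator only for its own forward pass and returned to CPU afterwards, with the cache emptied around each move. A standalone sketch of that pattern (the helper name and argument list are ours; param objects with a .model attribute come from the diff):

from diffsynth_engine.utils.platform import empty_cache


def run_controlnets_one_at_a_time(controlnet_params, device, offload_mode, *model_inputs):
    outputs = []
    offloading = offload_mode in ("sequential_cpu_offload", "cpu_offload")
    for param in controlnet_params:
        if offloading:
            empty_cache()                # release cached blocks before loading the next model
            param.model.to(device)       # only this ControlNet occupies accelerator memory
        outputs.append(param.model(*model_inputs))
        if offloading:
            empty_cache()
            param.model.to("cpu")        # hand the memory back before the next ControlNet
    return outputs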
4 changes: 3 additions & 1 deletion diffsynth_engine/tools/flux_inpainting_tool.py
@@ -13,7 +13,9 @@ def __init__(
         dtype: torch.dtype = torch.bfloat16,
         offload_mode: Optional[str] = None,
     ):
-        self.pipe = FluxImagePipeline.from_pretrained(flux_model_path, device=device, offload_mode=offload_mode)
+        self.pipe = FluxImagePipeline.from_pretrained(
+            flux_model_path, device=device, offload_mode=offload_mode, dtype=dtype
+        )
         self.pipe.load_loras(lora_list)
         self.controlnet = FluxControlNet.from_pretrained(
             fetch_model(
4 changes: 3 additions & 1 deletion diffsynth_engine/tools/flux_outpainting_tool.py
@@ -13,7 +13,9 @@ def __init__(
         dtype: torch.dtype = torch.bfloat16,
         offload_mode: Optional[str] = None,
     ):
-        self.pipe = FluxImagePipeline.from_pretrained(flux_model_path, device=device, offload_mode=offload_mode)
+        self.pipe = FluxImagePipeline.from_pretrained(
+            flux_model_path, device=device, offload_mode=offload_mode, dtype=dtype
+        )
         self.pipe.load_loras(lora_list)
         self.controlnet = FluxControlNet.from_pretrained(
             fetch_model(
2 changes: 1 addition & 1 deletion diffsynth_engine/tools/flux_reference_tool.py
@@ -19,7 +19,7 @@ def __init__(
         offload_mode: Optional[str] = None,
     ):
         self.pipe: FluxImagePipeline = FluxImagePipeline.from_pretrained(
-            flux_model_path, load_text_encoder=load_text_encoder, device=device, offload_mode=offload_mode
+            flux_model_path, load_text_encoder=load_text_encoder, device=device, offload_mode=offload_mode, dtype=dtype
         )
         self.pipe.load_loras(lora_list)
         redux_model_path = fetch_model("muse/flux1-redux-dev", path="flux1-redux-dev.safetensors", revision="v1")
2 changes: 1 addition & 1 deletion diffsynth_engine/tools/flux_replace_tool.py
@@ -20,7 +20,7 @@ def __init__(
         offload_mode: Optional[str] = None,
     ):
         self.pipe: FluxImagePipeline = FluxImagePipeline.from_pretrained(
-            flux_model_path, load_text_encoder=load_text_encoder, device=device, offload_mode=offload_mode
+            flux_model_path, load_text_encoder=load_text_encoder, device=device, offload_mode=offload_mode, dtype=dtype
         )
         self.pipe.load_loras(lora_list)
         redux_model_path = fetch_model("muse/flux1-redux-dev", path="flux1-redux-dev.safetensors", revision="v1")
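All four tool changes are the same fix: the dtype the caller passes to the tool constructor is now forwarded to FluxImagePipeline.from_pretrained instead of being silently dropped. A hedged usage sketch (the class name is inferred from the module path and the remaining constructor arguments are assumptions, not confirmed by the diff):

import torch
from diffsynth_engine.tools.flux_inpainting_tool import FluxInpaintingTool  # class name assumed from the module path

tool = FluxInpaintingTool(
    flux_model_path="/path/to/flux-model.safetensors",  # placeholder path
    lora_list=[],                   # assumed parameter, present in the diff as lora_list
    device="cuda",
    dtype=torch.float16,            # now actually reaches the underlying pipeline
    offload_mode="cpu_offload",
)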
12 changes: 12 additions & 0 deletions diffsynth_engine/utils/platform.py
@@ -0,0 +1,12 @@
+import torch
+import gc
+
+# cross-platform utility helpers live here
+
+
+def empty_cache():
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+    if torch.mps.is_available():
+        torch.mps.empty_cache()
+    gc.collect()
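One portability note: depending on the installed PyTorch version, torch.mps.is_available() may not exist, while the long-standing check lives at torch.backends.mps.is_available(). A more defensive variant, offered as an assumption rather than as part of this PR:

import gc

import torch


def empty_cache_defensive():
    # Same intent as empty_cache() above, but tolerant of PyTorch builds that
    # predate torch.mps.is_available() or torch.mps.empty_cache().
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        if hasattr(torch, "mps") and hasattr(torch.mps, "empty_cache"):
            torch.mps.empty_cache()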