feat: turn flow_shift into a generation parameter (leejet#1289)

wbruna · leejet · web-flow · commit b314d80ad051 · 2026-02-26T00:26:04.000+08:00
* feat: turn flow_shift into a generation parameter

* format code

* simplify set_shift/set_parameters

* fix sd_sample_params_to_str

* remove unused variable

* update docs

---------

Co-authored-by: leejet <leejet714@gmail.com>
diff --git a/examples/cli/README.md b/examples/cli/README.md
@@ -44,7 +44,6 @@ Context Options:
                                            CPU physical cores
   --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
   --vae-tile-overlap <float>               tile overlap for vae tiling, in fraction of tile size (default: 0.5)
-  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
   --vae-tiling                             process vae in tiles to reduce memory usage
   --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
   --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
@@ -109,6 +108,7 @@ Generation Options:
   --skip-layer-start <float>               SLG enabling point (default: 0.01)
   --skip-layer-end <float>                 SLG disabling point (default: 0.2)
   --eta <float>                            eta in DDIM, only for DDIM/TCD/res_multistep/res_2s (default: 0)
+  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
   --high-noise-cfg-scale <float>           (high noise) unconditional guidance scale: (default: 7.0)
   --high-noise-img-cfg-scale <float>       (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
   --high-noise-guidance <float>            (high noise) distilled guidance scale for models with guidance input (default: 3.5)
diff --git a/examples/common/common.hpp b/examples/common/common.hpp
@@ -581,10 +581,6 @@ struct SDContextParams {
              "--vae-tile-overlap",
              "tile overlap for vae tiling, in fraction of tile size (default: 0.5)",
              &vae_tiling_params.target_overlap},
-            {"",
-             "--flow-shift",
-             "shift value for Flow models like SD3.x or WAN (default: auto)",
-             &flow_shift},
         };
 
         options.bool_options = {
@@ -903,7 +899,6 @@ struct SDContextParams {
             << "  photo_maker_path: \"" << photo_maker_path << "\",\n"
             << "  rng_type: " << sd_rng_type_name(rng_type) << ",\n"
             << "  sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n"
-            << "  flow_shift: " << (std::isinf(flow_shift) ? "INF" : std::to_string(flow_shift)) << "\n"
             << "  offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n"
             << "  enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n"
             << "  control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n"
@@ -986,7 +981,6 @@ struct SDContextParams {
             chroma_use_t5_mask,
             chroma_t5_mask_pad,
             qwen_image_zero_cond_t,
-            flow_shift,
         };
         return sd_ctx_params;
     }
@@ -1206,6 +1200,10 @@ struct SDGenerationParams {
              "--eta",
              "eta in DDIM, only for DDIM and TCD (default: 0)",
              &sample_params.eta},
+            {"",
+             "--flow-shift",
+             "shift value for Flow models like SD3.x or WAN (default: auto)",
+             &sample_params.flow_shift},
             {"",
              "--high-noise-cfg-scale",
              "(high noise) unconditional guidance scale: (default: 7.0)",
@@ -1606,6 +1604,7 @@ struct SDGenerationParams {
         load_if_exists("cfg_scale", sample_params.guidance.txt_cfg);
         load_if_exists("img_cfg_scale", sample_params.guidance.img_cfg);
         load_if_exists("guidance", sample_params.guidance.distilled_guidance);
+        load_if_exists("flow_shift", sample_params.flow_shift);
 
         auto load_sampler_if_exists = [&](const char* key, enum sample_method_t& out) {
             if (j.contains(key) && j[key].is_string()) {
diff --git a/examples/server/README.md b/examples/server/README.md
@@ -36,7 +36,6 @@ Context Options:
                                            CPU physical cores
   --chroma-t5-mask-pad <int>               t5 mask pad size of chroma
   --vae-tile-overlap <float>               tile overlap for vae tiling, in fraction of tile size (default: 0.5)
-  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
   --vae-tiling                             process vae in tiles to reduce memory usage
   --force-sdxl-vae-conv-scale              force use of conv scale on sdxl vae
   --offload-to-cpu                         place the weights in RAM to save VRAM, and automatically load them into VRAM when needed
@@ -101,6 +100,7 @@ Default Generation Options:
   --skip-layer-start <float>               SLG enabling point (default: 0.01)
   --skip-layer-end <float>                 SLG disabling point (default: 0.2)
   --eta <float>                            eta in DDIM, only for DDIM/TCD/res_multistep/res_2s (default: 0)
+  --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
   --high-noise-cfg-scale <float>           (high noise) unconditional guidance scale: (default: 7.0)
   --high-noise-img-cfg-scale <float>       (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
   --high-noise-guidance <float>            (high noise) distilled guidance scale for models with guidance input (default: 3.5)
diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h
@@ -201,7 +201,6 @@ typedef struct {
     bool chroma_use_t5_mask;
     int chroma_t5_mask_pad;
     bool qwen_image_zero_cond_t;
-    float flow_shift;
 } sd_ctx_params_t;
 
 typedef struct {
@@ -235,6 +234,7 @@ typedef struct {
     int shifted_timestep;
     float* custom_sigmas;
     int custom_sigmas_count;
+    float flow_shift;
 } sd_sample_params_t;
 
 typedef struct {
diff --git a/src/denoiser.hpp b/src/denoiser.hpp
@@ -657,9 +657,8 @@ struct DiscreteFlowDenoiser : public Denoiser {
 
     float sigma_data = 1.0f;
 
-    DiscreteFlowDenoiser(float shift = 3.0f)
-        : shift(shift) {
-        set_parameters();
+    DiscreteFlowDenoiser(float shift = 3.0f) {
+        set_shift(shift);
     }
 
     void set_parameters() {
@@ -668,6 +667,11 @@ struct DiscreteFlowDenoiser : public Denoiser {
         }
     }
 
+    void set_shift(float shift) {  // single entry point for changing the shift after construction
+        this->shift = shift;       // the parameter shadows the member, hence the explicit this->
+        set_parameters();          // re-run so whatever set_parameters() derives from `shift` is refreshed
+    }
+
     float sigma_min() override {
         return sigmas[0];
     }
@@ -710,34 +714,8 @@ float flux_time_shift(float mu, float sigma, float t) {
     return ::expf(mu) / (::expf(mu) + ::powf((1.0f / t - 1.0f), sigma));
 }
 
-struct FluxFlowDenoiser : public Denoiser {
-    float sigmas[TIMESTEPS];
-    float shift = 1.15f;
-
-    float sigma_data = 1.0f;
-
-    FluxFlowDenoiser(float shift = 1.15f) {
-        set_parameters(shift);
-    }
-
-    void set_shift(float shift) {
-        this->shift = shift;
-    }
-
-    void set_parameters(float shift) {
-        set_shift(shift);
-        for (int i = 0; i < TIMESTEPS; i++) {
-            sigmas[i] = t_to_sigma(static_cast<float>(i));
-        }
-    }
-
-    float sigma_min() override {
-        return sigmas[0];
-    }
-
-    float sigma_max() override {
-        return sigmas[TIMESTEPS - 1];
-    }
+struct FluxFlowDenoiser : public DiscreteFlowDenoiser {
+    FluxFlowDenoiser(float shift = 1.15f) { set_shift(shift); }  // keep the pre-refactor Flux default; re-run setup here because the base constructor cannot dispatch to this class's t_to_sigma
 
     float sigma_to_t(float sigma) override {
         return sigma;
@@ -747,26 +725,6 @@ struct FluxFlowDenoiser : public Denoiser {
         t = t + 1;
         return flux_time_shift(shift, 1.0f, t / TIMESTEPS);
     }
-
-    std::vector<float> get_scalings(float sigma) override {
-        float c_skip = 1.0f;
-        float c_out  = -sigma;
-        float c_in   = 1.0f;
-        return {c_skip, c_out, c_in};
-    }
-
-    // this function will modify noise/latent
-    ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) override {
-        ggml_ext_tensor_scale_inplace(noise, sigma);
-        ggml_ext_tensor_scale_inplace(latent, 1.0f - sigma);
-        ggml_ext_tensor_add_inplace(latent, noise);
-        return latent;
-    }
-
-    ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) override {
-        ggml_ext_tensor_scale_inplace(latent, 1.0f / (1.0f - sigma));
-        return latent;
-    }
 };
 
 struct Flux2FlowDenoiser : public FluxFlowDenoiser {
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
@@ -115,6 +115,7 @@ class StableDiffusionGGML {
     int n_threads                    = -1;
     float scale_factor               = 0.18215f;
     float shift_factor               = 0.f;
+    float default_flow_shift         = INFINITY;
 
     std::shared_ptr<Conditioner> cond_stage_model;
     std::shared_ptr<FrozenCLIPVisionEmbedder> clip_vision;  // for svd or wan2.1 i2v
@@ -881,7 +882,6 @@ class StableDiffusionGGML {
         // init denoiser
         {
             prediction_t pred_type = sd_ctx_params->prediction;
-            float flow_shift       = sd_ctx_params->flow_shift;
 
             if (pred_type == PREDICTION_COUNT) {
                 if (sd_version_is_sd2(version)) {
@@ -906,22 +906,19 @@ class StableDiffusionGGML {
                            sd_version_is_qwen_image(version) ||
                            sd_version_is_z_image(version)) {
                     pred_type = FLOW_PRED;
-                    if (flow_shift == INFINITY) {
-                        if (sd_version_is_wan(version)) {
-                            flow_shift = 5.f;
-                        } else {
-                            flow_shift = 3.f;
-                        }
+                    if (sd_version_is_wan(version)) {
+                        default_flow_shift = 5.f;
+                    } else {
+                        default_flow_shift = 3.f;
                     }
                 } else if (sd_version_is_flux(version)) {
                     pred_type = FLUX_FLOW_PRED;
 
-                    if (flow_shift == INFINITY) {
-                        flow_shift = 1.0f;  // TODO: validate
-                        for (const auto& [name, tensor_storage] : tensor_storage_map) {
-                            if (starts_with(name, "model.diffusion_model.guidance_in.in_layer.weight")) {
-                                flow_shift = 1.15f;
-                            }
+                    default_flow_shift = 1.0f;  // TODO: validate
+                    for (const auto& [name, tensor_storage] : tensor_storage_map) {
+                        if (starts_with(name, "model.diffusion_model.guidance_in.in_layer.weight")) {
+                            default_flow_shift = 1.15f;
+                            break;
                         }
                     }
                 } else if (sd_version_is_flux2(version)) {
@@ -945,12 +942,12 @@ class StableDiffusionGGML {
                     break;
                 case FLOW_PRED: {
                     LOG_INFO("running in FLOW mode");
-                    denoiser = std::make_shared<DiscreteFlowDenoiser>(flow_shift);
+                    denoiser = std::make_shared<DiscreteFlowDenoiser>();
                     break;
                 }
                 case FLUX_FLOW_PRED: {
                     LOG_INFO("running in Flux FLOW mode");
-                    denoiser = std::make_shared<FluxFlowDenoiser>(flow_shift);
+                    denoiser = std::make_shared<FluxFlowDenoiser>();
                     break;
                 }
                 case FLUX2_FLOW_PRED: {
@@ -2711,6 +2708,16 @@ class StableDiffusionGGML {
         ggml_ext_tensor_clamp_inplace(result, 0.0f, 1.0f);
         return result;
     }
+
+    // Apply a per-generation flow shift. A non-finite value (INFINITY/NaN) means "auto": use default_flow_shift.
+    void set_flow_shift(float flow_shift = INFINITY) {
+        auto flow_denoiser  = std::dynamic_pointer_cast<DiscreteFlowDenoiser>(denoiser);
+        const float applied = std::isfinite(flow_shift) ? flow_shift : default_flow_shift;
+        // Skip non-flow denoisers, and never hand a non-finite shift to the denoiser (keeps its current shift).
+        if (flow_denoiser && std::isfinite(applied)) {
+            flow_denoiser->set_shift(applied);
+        }
+    }
 };
 
 /*================================================= SD API ==================================================*/
@@ -2931,7 +2938,6 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
     sd_ctx_params->chroma_use_dit_mask     = true;
     sd_ctx_params->chroma_use_t5_mask      = false;
     sd_ctx_params->chroma_t5_mask_pad      = 1;
-    sd_ctx_params->flow_shift              = INFINITY;
 }
 
 char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) {
@@ -3023,6 +3029,7 @@ void sd_sample_params_init(sd_sample_params_t* sample_params) {
     sample_params->sample_steps                = 20;
     sample_params->custom_sigmas               = nullptr;
     sample_params->custom_sigmas_count         = 0;
+    sample_params->flow_shift                  = INFINITY;
 }
 
 char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
@@ -3043,7 +3050,8 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
              "sample_method: %s, "
              "sample_steps: %d, "
              "eta: %.2f, "
-             "shifted_timestep: %d)",
+             "shifted_timestep: %d, "
+             "flow_shift: %.2f)",
              sample_params->guidance.txt_cfg,
              std::isfinite(sample_params->guidance.img_cfg)
                  ? sample_params->guidance.img_cfg
@@ -3057,7 +3065,8 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) {
              sd_sample_method_name(sample_params->sample_method),
              sample_params->sample_steps,
              sample_params->eta,
-             sample_params->shifted_timestep);
+             sample_params->shifted_timestep,
+             sample_params->flow_shift);
 
     return buf;
 }
@@ -3528,6 +3537,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
 
     size_t t0 = ggml_time_ms();
 
+    sd_ctx->sd->set_flow_shift(sd_img_gen_params->sample_params.flow_shift);
+
     // Apply lora
     sd_ctx->sd->apply_loras(sd_img_gen_params->loras, sd_img_gen_params->lora_count);
 
@@ -3803,6 +3814,8 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
     }
     LOG_INFO("generate_video %dx%dx%d", width, height, frames);
 
+    sd_ctx->sd->set_flow_shift(sd_vid_gen_params->sample_params.flow_shift);
+
     enum sample_method_t sample_method = sd_vid_gen_params->sample_params.sample_method;
     if (sample_method == SAMPLE_METHOD_COUNT) {
         sample_method = sd_get_default_sample_method(sd_ctx);