prepare for other pix2pix-like models

stduhpf · stduhpf · commit fa56e2f669ea · 2025-05-18T17:07:31.000+02:00
diff --git a/model.cpp b/model.cpp
@@ -1560,7 +1560,7 @@ SDVersion ModelLoader::get_sd_version() {
             return VERSION_SD1_INPAINT;
         }
         if(is_ip2p) {
-            return VERSION_INSTRUCT_PIX2PIX;
+            return VERSION_SD1_PIX2PIX;
         }
         return VERSION_SD1;
     } else if (token_embedding_weight.ne[0] == 1024) {
diff --git a/model.h b/model.h
@@ -21,7 +21,7 @@
 enum SDVersion {
     VERSION_SD1,
     VERSION_SD1_INPAINT,
-    VERSION_INSTRUCT_PIX2PIX,
+    VERSION_SD1_PIX2PIX,
     VERSION_SD2,
     VERSION_SD2_INPAINT,
     VERSION_SDXL,
@@ -48,7 +48,7 @@ static inline bool sd_version_is_sd3(SDVersion version) {
 }
 
 static inline bool sd_version_is_sd1(SDVersion version) {
-    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_INSTRUCT_PIX2PIX) {
+    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX) {
         return true;
     }
     return false;
@@ -82,8 +82,12 @@ static inline bool sd_version_is_dit(SDVersion version) {
     return false;
 }
 
+static inline bool sd_version_is_edit(SDVersion version) {
+    return version == VERSION_SD1_PIX2PIX;
+}
+
 static bool sd_version_use_concat(SDVersion version) {
-    return version == VERSION_INSTRUCT_PIX2PIX || sd_version_is_inpaint(version);
+    return sd_version_is_edit(version) || sd_version_is_inpaint(version);
 }
 
 enum PMVersion {
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -1422,7 +1422,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                                                                            sd_ctx->sd->diffusion_model->get_adm_in_channels());
 
     SDCondition uncond;
-    if (cfg_scale != 1.0 || sd_ctx->sd->version == VERSION_INSTRUCT_PIX2PIX && cfg_scale != guidance) {
+    if (cfg_scale != 1.0 || sd_version_use_concat(sd_ctx->sd->version) && cfg_scale != guidance) {
         bool force_zero_embeddings = false;
         if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0) {
             force_zero_embeddings = true;
@@ -1493,7 +1493,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
         cond.c_concat   = masked_latent;
         uncond.c_concat = empty_latent;
         // noise_mask = masked_latent;
-    } else if (sd_ctx->sd->version == VERSION_INSTRUCT_PIX2PIX) {
+    } else if (sd_version_is_edit(sd_ctx->sd->version)) {
         cond.c_concat     = masked_latent;
         auto empty_latent = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_latent->ne[0], masked_latent->ne[1], masked_latent->ne[2], masked_latent->ne[3]);
         ggml_set_f32(empty_latent, 0);
@@ -1825,7 +1825,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                 }
             }
         }
-    } else if (sd_ctx->sd->version == VERSION_INSTRUCT_PIX2PIX) {
+    } else if (sd_version_is_edit(sd_ctx->sd->version)) {
         // Not actually masked, we're just highjacking the masked_latent variable since it will be used the same way
         if (!sd_ctx->sd->use_tiny_autoencoder) {
             masked_latent = sd_ctx->sd->get_first_stage_encoding_mode(work_ctx, init_moments);
diff --git a/unet.hpp b/unet.hpp
@@ -207,7 +207,7 @@ class UnetModelBlock : public GGMLBlock {
         }
         if (sd_version_is_inpaint(version)) {
             in_channels = 9;
-        } else if (version == VERSION_INSTRUCT_PIX2PIX) {
+        } else if (version == VERSION_SD1_PIX2PIX) {
             in_channels = 8;
         }
 

Original file line number	Diff line number	Diff line change
`@@ -1560,7 +1560,7 @@ SDVersion ModelLoader::get_sd_version() {`
`1560`	`1560`	`return VERSION_SD1_INPAINT;`
`1561`	`1561`	`}`
`1562`	`1562`	`if(is_ip2p) {`
`1563`		`- return VERSION_INSTRUCT_PIX2PIX;`
	`1563`	`+ return VERSION_SD1_PIX2PIX;`
`1564`	`1564`	`}`
`1565`	`1565`	`return VERSION_SD1;`
`1566`	`1566`	`} else if (token_embedding_weight.ne[0] == 1024) {`
Original file line number	Diff line number	Diff line change
`@@ -207,7 +207,7 @@ class UnetModelBlock : public GGMLBlock {`
`207`	`207`	`}`
`208`	`208`	`if (sd_version_is_inpaint(version)) {`
`209`	`209`	`in_channels = 9;`
`210`		`- } else if (version == VERSION_INSTRUCT_PIX2PIX) {`
	`210`	`+ } else if (version == VERSION_SD1_PIX2PIX) {`
`211`	`211`	`in_channels = 8;`
`212`	`212`	`}`
`213`	`213`