@@ -49,8 +49,7 @@ const char* sampling_methods_str[] = {
4949 " iPNDM_v" ,
5050 " LCM" ,
5151 " DDIM \" trailing\" " ,
52- " TCD"
53- };
52+ " TCD" };
5453
5554/* ================================================== Helper Functions ================================================*/
5655
@@ -683,7 +682,7 @@ class StableDiffusionGGML {
683682 float curr_multiplier = kv.second ;
684683 lora_state_diff[lora_name] -= curr_multiplier;
685684 }
686-
685+
687686 size_t rm = lora_state_diff.size () - lora_state.size ();
688687 if (rm != 0 ) {
689688 LOG_INFO (" Attempting to apply %lu LoRAs (removing %lu applied LoRAs)" , lora_state.size (), rm);
@@ -806,7 +805,6 @@ class StableDiffusionGGML {
806805 float skip_layer_start = 0.01 ,
807806 float skip_layer_end = 0.2 ,
808807 ggml_tensor* noise_mask = nullptr ) {
809-
810808 // TODO (Pix2Pix): separate image guidance params (right now it's reusing distilled guidance)
811809
812810 float img_cfg_scale = guidance;
@@ -834,7 +832,7 @@ class StableDiffusionGGML {
834832
835833 bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL ;
836834 bool has_img_guidance = version == VERSION_INSTRUCT_PIX2PIX && cfg_scale != img_cfg_scale;
837- has_unconditioned = has_unconditioned || has_img_guidance;
835+ has_unconditioned = has_unconditioned || has_img_guidance;
838836 bool has_skiplayer = slg_scale != 0.0 && skip_layers.size () > 0 ;
839837
840838 // denoise wrapper
@@ -988,7 +986,7 @@ class StableDiffusionGGML {
988986 int64_t i3 = i / out_cond->ne [0 ] * out_cond->ne [1 ] * out_cond->ne [2 ];
989987 float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1 .0f / ne3);
990988 } else {
991- if (has_img_guidance){
989+ if (has_img_guidance) {
992990 latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]);
993991 } else {
994992 latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
@@ -1393,7 +1391,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
13931391 sd_ctx->sd ->diffusion_model ->get_adm_in_channels ());
13941392
13951393 SDCondition uncond;
1396- if (cfg_scale != 1.0 || sd_ctx->sd ->version == VERSION_INSTRUCT_PIX2PIX && cfg_scale!= guidance) {
1394+ if (cfg_scale != 1.0 || sd_ctx->sd ->version == VERSION_INSTRUCT_PIX2PIX && cfg_scale != guidance) {
13971395 bool force_zero_embeddings = false ;
13981396 if (sd_version_is_sdxl (sd_ctx->sd ->version ) && negative_prompt.size () == 0 ) {
13991397 force_zero_embeddings = true ;
@@ -1739,6 +1737,14 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
17391737
17401738 sd_image_to_tensor (init_image.data , init_img);
17411739
1740+ ggml_tensor* init_latent = NULL ;
1741+ if (!sd_ctx->sd ->use_tiny_autoencoder ) {
1742+ ggml_tensor* moments = sd_ctx->sd ->encode_first_stage (work_ctx, init_img);
1743+ init_latent = sd_ctx->sd ->get_first_stage_encoding (work_ctx, moments);
1744+ } else {
1745+ init_latent = sd_ctx->sd ->encode_first_stage (work_ctx, init_img);
1746+ }
1747+
17421748 ggml_tensor* masked_image;
17431749
17441750 if (sd_version_is_inpaint (sd_ctx->sd ->version )) {
@@ -1786,12 +1792,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
17861792 }
17871793 } else if (sd_ctx->sd ->version == VERSION_INSTRUCT_PIX2PIX) {
17881794 // Not actually masked, we're just hijacking the masked_image variable since it will be used the same way
1789- if (!sd_ctx->sd ->use_tiny_autoencoder ) {
1790- ggml_tensor* moments = sd_ctx->sd ->encode_first_stage (work_ctx, init_img);
1791- masked_image = sd_ctx->sd ->get_first_stage_encoding (work_ctx, moments);
1792- } else {
1793- masked_image = sd_ctx->sd ->encode_first_stage (work_ctx, init_img);
1794- }
1795+ masked_image = init_latent;
17951796 } else {
17961797 // LOG_WARN("Inpainting with a base model is not great");
17971798 masked_image = ggml_new_tensor_4d (work_ctx, GGML_TYPE_F32, width / 8 , height / 8 , 1 , 1 );
@@ -1805,14 +1806,6 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
18051806 }
18061807 }
18071808
1808- ggml_tensor* init_latent = NULL ;
1809- if (!sd_ctx->sd ->use_tiny_autoencoder ) {
1810- ggml_tensor* moments = sd_ctx->sd ->encode_first_stage (work_ctx, init_img);
1811- init_latent = sd_ctx->sd ->get_first_stage_encoding (work_ctx, moments);
1812- } else {
1813- init_latent = sd_ctx->sd ->encode_first_stage (work_ctx, init_img);
1814- }
1815-
18161809 print_ggml_tensor (init_latent, true );
18171810 size_t t1 = ggml_time_ms ();
18181811 LOG_INFO (" encode_first_stage completed, taking %.2fs" , (t1 - t0) * 1 .0f / 1000 );
0 commit comments