Skip to content
44 changes: 25 additions & 19 deletions otherarch/sdcpp/clip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -545,9 +545,15 @@ class CLIPEmbeddings : public GGMLBlock {
int64_t vocab_size;
int64_t num_positions;

void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
enum ggml_type token_wtype = (tensor_types.find(prefix + "token_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "token_embedding.weight"] : GGML_TYPE_F32;
enum ggml_type position_wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
enum ggml_type token_wtype = GGML_TYPE_F32;
#if 1
// kcpp reduce memory usage (reverts https://github.com/leejet/stable-diffusion.cpp/pull/601)
auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
if (tensor_type != tensor_types.end())
token_wtype = tensor_type->second;
#endif
enum ggml_type position_wtype = GGML_TYPE_F32;

params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
params["position_embedding.weight"] = ggml_new_tensor_2d(ctx, position_wtype, embed_dim, num_positions);
Expand Down Expand Up @@ -594,10 +600,10 @@ class CLIPVisionEmbeddings : public GGMLBlock {
int64_t image_size;
int64_t num_patches;
int64_t num_positions;
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
enum ggml_type patch_wtype = GGML_TYPE_F16; // tensor_types.find(prefix + "patch_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "patch_embedding.weight"] : GGML_TYPE_F16;
enum ggml_type class_wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "class_embedding") != tensor_types.end() ? tensor_types[prefix + "class_embedding"] : GGML_TYPE_F32;
enum ggml_type position_wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end() ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
enum ggml_type patch_wtype = GGML_TYPE_F16;
enum ggml_type class_wtype = GGML_TYPE_F32;
enum ggml_type position_wtype = GGML_TYPE_F32;

params["patch_embedding.weight"] = ggml_new_tensor_4d(ctx, patch_wtype, patch_size, patch_size, num_channels, embed_dim);
params["class_embedding"] = ggml_new_tensor_1d(ctx, class_wtype, embed_dim);
Expand Down Expand Up @@ -657,9 +663,9 @@ enum CLIPVersion {

class CLIPTextModel : public GGMLBlock {
protected:
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
if (version == OPEN_CLIP_VIT_BIGG_14) {
enum ggml_type wtype = GGML_TYPE_F32; // tensor_types.find(prefix + "text_projection") != tensor_types.end() ? tensor_types[prefix + "text_projection"] : GGML_TYPE_F32;
enum ggml_type wtype = GGML_TYPE_F32;
params["text_projection"] = ggml_new_tensor_2d(ctx, wtype, projection_dim, hidden_size);
}
}
Expand All @@ -678,8 +684,8 @@ class CLIPTextModel : public GGMLBlock {
bool with_final_ln = true;

CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
int clip_skip_value = -1,
bool with_final_ln = true)
bool with_final_ln = true,
int clip_skip_value = -1)
: version(version), with_final_ln(with_final_ln) {
if (version == OPEN_CLIP_VIT_H_14) {
hidden_size = 1024;
Expand All @@ -701,7 +707,7 @@ class CLIPTextModel : public GGMLBlock {

void set_clip_skip(int skip) {
if (skip <= 0) {
return;
skip = -1;
}
clip_skip = skip;
}
Expand Down Expand Up @@ -805,8 +811,8 @@ class CLIPProjection : public UnaryBlock {
int64_t out_features;
bool transpose_weight;

void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
enum ggml_type wtype = tensor_types.find(prefix + "weight") != tensor_types.end() ? tensor_types[prefix + "weight"] : GGML_TYPE_F32;
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
enum ggml_type wtype = get_type(prefix + "weight", tensor_types, GGML_TYPE_F32);
if (transpose_weight) {
params["weight"] = ggml_new_tensor_2d(ctx, wtype, out_features, in_features);
} else {
Expand Down Expand Up @@ -868,12 +874,12 @@ struct CLIPTextModelRunner : public GGMLRunner {
CLIPTextModel model;

CLIPTextModelRunner(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types,
const String2GGMLType& tensor_types,
const std::string prefix,
CLIPVersion version = OPENAI_CLIP_VIT_L_14,
int clip_skip_value = 1,
bool with_final_ln = true)
: GGMLRunner(backend), model(version, clip_skip_value, with_final_ln) {
bool with_final_ln = true,
int clip_skip_value = -1)
: GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) {
model.init(params_ctx, tensor_types, prefix);
}

Expand Down Expand Up @@ -949,4 +955,4 @@ struct CLIPTextModelRunner : public GGMLRunner {
}
};

#endif // __CLIP_HPP__
#endif // __CLIP_HPP__
14 changes: 7 additions & 7 deletions otherarch/sdcpp/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ class UpSampleBlock : public GGMLBlock {
// x: [N, channels, h, w]
auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);

x = ggml_upscale(ctx, x, 2, ggml_scale_mode::GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2]
x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2]
x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2]
return x;
}
};
Expand Down Expand Up @@ -182,9 +182,9 @@ class GEGLU : public GGMLBlock {
int64_t dim_in;
int64_t dim_out;

void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
enum ggml_type wtype = (tensor_types.find(prefix + "proj.weight") != tensor_types.end()) ? tensor_types[prefix + "proj.weight"] : GGML_TYPE_F32;
enum ggml_type bias_wtype = GGML_TYPE_F32; //(tensor_types.find(prefix + "proj.bias") != tensor_types.end()) ? tensor_types[prefix + "proj.bias"] : GGML_TYPE_F32;
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
enum ggml_type wtype = get_type(prefix + "proj.weight", tensor_types, GGML_TYPE_F32);
enum ggml_type bias_wtype = GGML_TYPE_F32;
params["proj.weight"] = ggml_new_tensor_2d(ctx, wtype, dim_in, dim_out * 2);
params["proj.bias"] = ggml_new_tensor_1d(ctx, bias_wtype, dim_out * 2);
}
Expand Down Expand Up @@ -440,9 +440,9 @@ class SpatialTransformer : public GGMLBlock {

class AlphaBlender : public GGMLBlock {
protected:
void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, std::string prefix = "") {
void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, std::string prefix = "") {
// Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix
enum ggml_type wtype = GGML_TYPE_F32; //(tensor_types.ypes.find(prefix + "mix_factor") != tensor_types.end()) ? tensor_types[prefix + "mix_factor"] : GGML_TYPE_F32;
enum ggml_type wtype = GGML_TYPE_F32;
params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
}

Expand Down
71 changes: 37 additions & 34 deletions otherarch/sdcpp/conditioner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,29 +57,30 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
std::vector<std::string> readed_embeddings;

FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types,
const String2GGMLType& tensor_types,
const std::string& embd_dir,
SDVersion version = VERSION_SD1,
PMVersion pv = PM_VERSION_1,
int clip_skip = -1)
: version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
if (clip_skip <= 0) {
clip_skip = 1;
if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
clip_skip = 2;
}
}
if (sd_version_is_sd1(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
} else if (sd_version_is_sd2(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
} else if (sd_version_is_sdxl(version)) {
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
}
set_clip_skip(clip_skip);
}

void set_clip_skip(int clip_skip) {
if (clip_skip <= 0) {
clip_skip = 1;
if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
clip_skip = 2;
}
}
text_model->set_clip_skip(clip_skip);
if (sd_version_is_sdxl(version)) {
text_model2->set_clip_skip(clip_skip);
Expand Down Expand Up @@ -458,8 +459,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
if (sd_version_is_sdxl(version)) {
text_model2->compute(n_threads,
input_ids2,
0,
NULL,
num_custom_embeddings,
token_embed_custom.data(),
max_token_idx,
false,
&chunk_hidden_states2, work_ctx);
Expand All @@ -469,8 +470,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
if (chunk_idx == 0) {
text_model2->compute(n_threads,
input_ids2,
0,
NULL,
num_custom_embeddings,
token_embed_custom.data(),
max_token_idx,
true,
&pooled,
Expand Down Expand Up @@ -617,7 +618,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
struct FrozenCLIPVisionEmbedder : public GGMLRunner {
CLIPVisionModelProjection vision_model;

FrozenCLIPVisionEmbedder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
FrozenCLIPVisionEmbedder(ggml_backend_t backend, const String2GGMLType& tensor_types = {})
: vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend) {
vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer");
}
Expand Down Expand Up @@ -662,18 +663,19 @@ struct SD3CLIPEmbedder : public Conditioner {
std::shared_ptr<T5Runner> t5;

SD3CLIPEmbedder(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types,
int clip_skip = -1)
const String2GGMLType& tensor_types = {},
int clip_skip = -1)
: clip_g_tokenizer(0) {
if (clip_skip <= 0) {
clip_skip = 2;
}
clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
set_clip_skip(clip_skip);
}

// Forward a CLIP-skip setting to both CLIP text encoders (clip_l and clip_g).
// A non-positive clip_skip is replaced by 2 before it is forwarded.
void set_clip_skip(int clip_skip) {
    const int effective_skip = (clip_skip > 0) ? clip_skip : 2;
    clip_l->set_clip_skip(effective_skip);
    clip_g->set_clip_skip(effective_skip);
}
Expand Down Expand Up @@ -1008,16 +1010,17 @@ struct FluxCLIPEmbedder : public Conditioner {
size_t chunk_len = 256;

FluxCLIPEmbedder(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types,
int clip_skip = -1) {
if (clip_skip <= 0) {
clip_skip = 2;
}
clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, true);
const String2GGMLType& tensor_types = {},
int clip_skip = -1) {
clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
set_clip_skip(clip_skip);
}

// Forward a CLIP-skip setting to the clip_l text encoder.
// Values below 1 are replaced by 2 before forwarding.
void set_clip_skip(int clip_skip) {
    const bool use_fallback = clip_skip < 1;
    clip_l->set_clip_skip(use_fallback ? 2 : clip_skip);
}

Expand Down Expand Up @@ -1228,10 +1231,10 @@ struct PixArtCLIPEmbedder : public Conditioner {
int mask_pad = 1;

PixArtCLIPEmbedder(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types,
int clip_skip = -1,
bool use_mask = false,
int mask_pad = 1)
const String2GGMLType& tensor_types = {},
int clip_skip = -1,
bool use_mask = false,
int mask_pad = 1)
: use_mask(use_mask), mask_pad(mask_pad) {
t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
}
Expand Down Expand Up @@ -1422,4 +1425,4 @@ struct PixArtCLIPEmbedder : public Conditioner {
}
};

#endif
#endif
15 changes: 13 additions & 2 deletions otherarch/sdcpp/control.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -317,12 +317,23 @@ struct ControlNet : public GGMLRunner {
bool guided_hint_cached = false;

ControlNet(ggml_backend_t backend,
std::map<std::string, enum ggml_type>& tensor_types,
SDVersion version = VERSION_SD1)
const String2GGMLType& tensor_types = {},
SDVersion version = VERSION_SD1)
: GGMLRunner(backend), control_net(version) {
control_net.init(params_ctx, tensor_types, "");
}

void enable_conv2d_direct() {
std::vector<GGMLBlock*> blocks;
control_net.get_all_blocks(blocks);
for (auto block : blocks) {
if (block->get_desc() == "Conv2d") {
auto conv_block = (Conv2d*)block;
conv_block->enable_direct();
}
}
}

// Destructor: calls free_control_ctx() when the ControlNet is destroyed
// (presumably releasing the control context; confirm against its definition).
~ControlNet() {
free_control_ctx();
}
Expand Down
Loading