Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
774 changes: 774 additions & 0 deletions src/anima.hpp

Large diffs are not rendered by default.

136 changes: 136 additions & 0 deletions src/conditioner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1641,6 +1641,142 @@ struct T5CLIPEmbedder : public Conditioner {
}
};

// Conditioner for Anima models: a Qwen3 LLM produces the main cross-attention
// hidden states, while T5 token ids/weights are passed through as auxiliary
// conditioning tensors for the diffusion model's llm_adapter.
struct AnimaConditioner : public Conditioner {
    std::shared_ptr<LLM::BPETokenizer> qwen_tokenizer;
    T5UniGramTokenizer t5_tokenizer;
    std::shared_ptr<LLM::LLMRunner> llm;

    AnimaConditioner(ggml_backend_t backend,
                     bool offload_params_to_cpu,
                     const String2TensorStorage& tensor_storage_map = {}) {
        qwen_tokenizer = std::make_shared<LLM::Qwen2Tokenizer>();
        llm            = std::make_shared<LLM::LLMRunner>(LLM::LLMArch::QWEN3,
                                                          backend,
                                                          offload_params_to_cpu,
                                                          tensor_storage_map,
                                                          "text_encoders.llm",
                                                          false);
    }

    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
        llm->get_param_tensors(tensors, "text_encoders.llm");
    }

    void alloc_params_buffer() override {
        llm->alloc_params_buffer();
    }

    void free_params_buffer() override {
        llm->free_params_buffer();
    }

    size_t get_params_buffer_size() override {
        return llm->get_params_buffer_size();
    }

    void set_flash_attention_enabled(bool enabled) override {
        llm->set_flash_attention_enabled(enabled);
    }

    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
        llm->set_weight_adapter(adapter);
    }

    // Tokenizes `text` with both tokenizers after parsing prompt attention
    // syntax (e.g. "(word:1.2)").
    // Returns {qwen_tokens, qwen_weights, t5_tokens, t5_weights}.
    // Qwen weights are intentionally uniform (all 1.0); only the T5 stream
    // carries the parsed per-segment attention weights.
    std::tuple<std::vector<int>, std::vector<float>, std::vector<int>, std::vector<float>> tokenize(std::string text) {
        auto parsed_attention = parse_prompt_attention(text);

        {
            std::stringstream ss;
            ss << "[";
            for (const auto& item : parsed_attention) {
                ss << "['" << item.first << "', " << item.second << "], ";
            }
            ss << "]";
            LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
        }

        std::vector<int> qwen_tokens;
        std::vector<float> qwen_weights;
        std::vector<int> t5_tokens;
        std::vector<float> t5_weights;

        for (const auto& item : parsed_attention) {
            const std::string& curr_text = item.first;
            std::vector<int> curr_tokens = qwen_tokenizer->tokenize(curr_text, nullptr);
            qwen_tokens.insert(qwen_tokens.end(), curr_tokens.begin(), curr_tokens.end());
            // Anima uses uniform Qwen token weights.
            qwen_weights.insert(qwen_weights.end(), curr_tokens.size(), 1.f);
        }
        if (qwen_tokens.empty()) {
            // Never feed the LLM an empty sequence; fall back to a single pad token.
            qwen_tokens.push_back(151643);  // qwen3 pad token
            qwen_weights.push_back(1.f);
        }

        for (const auto& item : parsed_attention) {
            const std::string& curr_text = item.first;
            float curr_weight            = item.second;
            std::vector<int> curr_tokens = t5_tokenizer.Encode(curr_text, true);
            t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
            t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
        }

        return {qwen_tokens, qwen_weights, t5_tokens, t5_weights};
    }

    SDCondition get_learned_condition(ggml_context* work_ctx,
                                      int n_threads,
                                      const ConditionerParams& conditioner_params) override {
        int64_t t0 = ggml_time_ms();

        auto tokenized     = tokenize(conditioner_params.text);
        auto& qwen_tokens  = std::get<0>(tokenized);
        auto& qwen_weights = std::get<1>(tokenized);
        auto& t5_tokens    = std::get<2>(tokenized);
        auto& t5_weights   = std::get<3>(tokenized);

        auto input_ids = vector_to_ggml_tensor_i32(work_ctx, qwen_tokens);

        struct ggml_tensor* hidden_states = nullptr;  // [N, n_token, 1024]
        llm->compute(n_threads,
                     input_ids,
                     nullptr,
                     {},
                     {},
                     &hidden_states,
                     work_ctx);

        // Per-token prompt weighting of the hidden states. tokenize() currently
        // emits all-1.0 Qwen weights, which made this full-tensor rewrite plus
        // mean renormalization a guaranteed no-op on every call; skip the O(N*D)
        // pass unless a non-uniform weight is actually present, while keeping
        // the machinery in place should non-uniform weights be enabled later.
        bool has_non_uniform_weight = false;
        for (float w : qwen_weights) {
            if (w != 1.f) {
                has_non_uniform_weight = true;
                break;
            }
        }
        if (has_non_uniform_weight) {
            // NOTE(review): indexes qwen_weights by token position; assumes
            // hidden_states->ne[1] == qwen_tokens.size() (no padding added by
            // the runner) — confirm against LLMRunner::compute.
            auto tensor         = hidden_states;
            float original_mean = ggml_ext_tensor_mean(tensor);
            for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
                for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
                    for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
                        float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
                        value *= qwen_weights[i1];
                        ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
                    }
                }
            }
            // Rescale so the overall activation magnitude matches the
            // unweighted embedding.
            float new_mean = ggml_ext_tensor_mean(tensor);
            if (new_mean != 0.f) {
                ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));
            }
        }

        // T5 ids/weights travel through the remaining SDCondition slots;
        // NOTE(review): confirm the downstream consumer unpacks them in this
        // (weights, ids) order.
        struct ggml_tensor* t5_ids_tensor    = nullptr;
        struct ggml_tensor* t5_weight_tensor = nullptr;
        if (!t5_tokens.empty()) {
            t5_ids_tensor    = vector_to_ggml_tensor_i32(work_ctx, t5_tokens);
            t5_weight_tensor = vector_to_ggml_tensor(work_ctx, t5_weights);
        }

        int64_t t1 = ggml_time_ms();
        LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);

        return {hidden_states, t5_weight_tensor, t5_ids_tensor};
    }
};

struct LLMEmbedder : public Conditioner {
SDVersion version;
std::shared_ptr<LLM::BPETokenizer> tokenizer;
Expand Down
67 changes: 67 additions & 0 deletions src/diffusion_model.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef __DIFFUSION_MODEL_H__
#define __DIFFUSION_MODEL_H__

#include "anima.hpp"
#include "flux.hpp"
#include "mmdit.hpp"
#include "qwen_image.hpp"
Expand Down Expand Up @@ -242,6 +243,72 @@ struct FluxModel : public DiffusionModel {
}
};

struct AnimaModel : public DiffusionModel {
std::string prefix;
Anima::AnimaRunner anima;

AnimaModel(ggml_backend_t backend,
bool offload_params_to_cpu,
const String2TensorStorage& tensor_storage_map = {},
const std::string prefix = "model.diffusion_model")
: prefix(prefix), anima(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
}

std::string get_desc() override {
return anima.get_desc();
}

void alloc_params_buffer() override {
anima.alloc_params_buffer();
}

void free_params_buffer() override {
anima.free_params_buffer();
}

void free_compute_buffer() override {
anima.free_compute_buffer();
}

void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
anima.get_param_tensors(tensors, prefix);
}

size_t get_params_buffer_size() override {
return anima.get_params_buffer_size();
}

void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
anima.set_weight_adapter(adapter);
}

int64_t get_adm_in_channels() override {
return 768;
}

void set_flash_attention_enabled(bool enabled) {
anima.set_flash_attention_enabled(enabled);
}

void set_circular_axes(bool circular_x, bool circular_y) override {
anima.set_circular_axes(circular_x, circular_y);
}

bool compute(int n_threads,
DiffusionParams diffusion_params,
struct ggml_tensor** output = nullptr,
struct ggml_context* output_ctx = nullptr) override {
return anima.compute(n_threads,
diffusion_params.x,
diffusion_params.timesteps,
diffusion_params.context,
diffusion_params.c_concat,
diffusion_params.y,
output,
output_ctx);
}
};

struct WanModel : public DiffusionModel {
std::string prefix;
WAN::WanRunner wan;
Expand Down
3 changes: 3 additions & 0 deletions src/model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1057,6 +1057,9 @@ SDVersion ModelLoader::get_sd_version() {
if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) {
return VERSION_QWEN_IMAGE;
}
if (tensor_storage.name.find("model.diffusion_model.net.llm_adapter.blocks.0.cross_attn.q_proj.weight") != std::string::npos) {
return VERSION_ANIMA;
}
if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) {
is_flux2 = true;
}
Expand Down
9 changes: 9 additions & 0 deletions src/model.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ enum SDVersion {
VERSION_WAN2_2_I2V,
VERSION_WAN2_2_TI2V,
VERSION_QWEN_IMAGE,
VERSION_ANIMA,
VERSION_FLUX2,
VERSION_FLUX2_KLEIN,
VERSION_Z_IMAGE,
Expand Down Expand Up @@ -122,6 +123,13 @@ static inline bool sd_version_is_qwen_image(SDVersion version) {
return false;
}

// True exactly when the version identifies an Anima model.
static inline bool sd_version_is_anima(SDVersion version) {
    return version == VERSION_ANIMA;
}

static inline bool sd_version_is_z_image(SDVersion version) {
if (version == VERSION_Z_IMAGE) {
return true;
Expand All @@ -146,6 +154,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
sd_version_is_sd3(version) ||
sd_version_is_wan(version) ||
sd_version_is_qwen_image(version) ||
sd_version_is_anima(version) ||
sd_version_is_z_image(version)) {
return true;
}
Expand Down
8 changes: 8 additions & 0 deletions src/name_conversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1094,6 +1094,14 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
}
}

if (is_lora && sd_version_is_anima(version)) {
static const std::string anima_diffusion_prefix = "model.diffusion_model.";
static const std::string anima_net_prefix = "model.diffusion_model.net.";
if (starts_with(name, anima_diffusion_prefix) && !starts_with(name, anima_net_prefix)) {
name = anima_net_prefix + name.substr(anima_diffusion_prefix.size());
}
}

// cond_stage_model
{
for (const auto& prefix : cond_stage_model_prefix_vec) {
Expand Down
19 changes: 16 additions & 3 deletions src/rope.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ namespace Rope {

__STATIC_INLINE__ std::vector<std::vector<float>> rope(const std::vector<float>& pos,
int dim,
int theta,
float theta,
const std::vector<int>& axis_wrap_dims = {}) {
assert(dim % 2 == 0);
int half_dim = dim / 2;
Expand Down Expand Up @@ -167,7 +167,7 @@ namespace Rope {

__STATIC_INLINE__ std::vector<float> embed_nd(const std::vector<std::vector<float>>& ids,
int bs,
int theta,
const std::vector<float>& axis_thetas,
const std::vector<int>& axes_dim,
const std::vector<std::vector<int>>& wrap_dims = {}) {
std::vector<std::vector<float>> trans_ids = transpose(ids);
Expand All @@ -188,8 +188,12 @@ namespace Rope {
if (!wrap_dims.empty() && i < (int)wrap_dims.size()) {
axis_wrap_dims = wrap_dims[i];
}
float axis_theta = 10000.0f;
if (!axis_thetas.empty()) {
axis_theta = axis_thetas[std::min(i, axis_thetas.size() - 1)];
}
std::vector<std::vector<float>> rope_emb =
rope(trans_ids[i], axes_dim[i], theta, axis_wrap_dims); // [bs*pos_len, axes_dim[i]/2 * 2 * 2]
rope(trans_ids[i], axes_dim[i], axis_theta, axis_wrap_dims); // [bs*pos_len, axes_dim[i]/2 * 2 * 2]
for (int b = 0; b < bs; ++b) {
for (int j = 0; j < pos_len; ++j) {
for (int k = 0; k < rope_emb[0].size(); ++k) {
Expand All @@ -203,6 +207,15 @@ namespace Rope {
return flatten(emb);
}

// Uniform-theta convenience overload: replicates the single base frequency
// `theta` across every rope axis and delegates to the per-axis overload.
__STATIC_INLINE__ std::vector<float> embed_nd(const std::vector<std::vector<float>>& ids,
                                              int bs,
                                              float theta,
                                              const std::vector<int>& axes_dim,
                                              const std::vector<std::vector<int>>& wrap_dims = {}) {
    std::vector<float> per_axis_thetas;
    per_axis_thetas.assign(axes_dim.size(), theta);
    return embed_nd(ids, bs, per_axis_thetas, axes_dim, wrap_dims);
}

__STATIC_INLINE__ std::vector<std::vector<float>> gen_refs_ids(int patch_size,
int bs,
int axes_dim_num,
Expand Down
Loading