src/llama-vision.cpp

#include "llama.h"
#include "llama-vision.h"
#include "llama-impl.h"

#include <string.h>  // memcpy
#include <cstring>   // std::memset
#include <algorithm> // std::clamp
#include <limits>
#include <cmath>
#include <stdexcept>

// Suppress compiler warnings triggered by the stb_image headers.
#if defined(__clang__)
    #pragma clang diagnostic push
    #pragma clang diagnostic ignored "-Weverything"
#elif defined(__GNUC__) || defined(__GNUG__)
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wswitch-default"
#elif defined(_MSC_VER)
    #pragma warning(push)
    #pragma warning(disable : 4244 4996)
#endif

#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"

#define STB_IMAGE_RESIZE_IMPLEMENTATION
#include "stb_image_resize2.h"

// Enable warnings again.
#if defined(__clang__)
    #pragma clang diagnostic pop
#elif defined(__GNUC__) || defined(__GNUG__)
    #pragma GCC diagnostic pop
#elif defined(_MSC_VER)
    #pragma warning(pop)
#endif

#ifndef NDEBUG
// for debugging
#include <fstream>
#include <cstdint>
#include <iostream>

// export clip_image_u8 to bmp file for debugging
// https://codereview.stackexchange.com/questions/195121/writing-a-bitmap-image-from-c
struct clip_image_size;
static int bmp_export(const struct clip_image_u8 &img, const std::string &location);
#endif

// Width/height pair used for image sizes and candidate target resolutions.
// Members default to 0 so a default-constructed value is well defined;
// brace initialisation `{w, h}` keeps working as before.
struct clip_image_size {
    int width  = 0;
    int height = 0;
};

// RGB uint8 image
// Memory layout: RGBRGBRGB...
struct clip_image_u8 {
    int nx = 0; // width in pixels
    int ny = 0; // height in pixels
    std::vector<uint8_t> buf; // nx * ny * 3 bytes
    // An empty image: nx == ny == 0 and no pixel data.
    clip_image_u8() = default;
    // Copies the dimensions and the nx * ny * 3 pixel bytes out of `img`.
    clip_image_u8(const llama_img img) : nx(img.nx), ny(img.ny) {
        // compute the byte count in size_t so large images cannot overflow int
        buf.resize(static_cast<size_t>(nx) * static_cast<size_t>(ny) * 3);
        if (!buf.empty()) {
            memcpy(buf.data(), img.data, buf.size());
        }
    }
};

// Pointer + count describing an array of clip_image_u8.
// Defaults to an empty batch so a default-constructed value never carries
// an indeterminate pointer or size.
struct clip_image_u8_batch {
    struct clip_image_u8 * data = nullptr;
    size_t size = 0; // presumably the number of images reachable through `data` — TODO confirm
};

// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32 {
    int nx = 0; // width in pixels
    int ny = 0; // height in pixels
    std::vector<float> buf; // nx * ny * 3 values
};

// A batch of float32 images.
using clip_image_f32_batch = std::vector<clip_image_f32>;
// A batch of uint8 images.
// NOTE(review): the alias says "f8" although the element type is clip_image_u8 —
// looks like a typo for "u8"; confirm there are no users before renaming.
using clip_image_f8_batch  = std::vector<clip_image_u8>;

// Translates a projector name into its clip_projector_type value.
// "mlp" is the only recognised name; anything else maps to CLIP_PROJECTOR_TYPE_UNKNOWN.
clip_projector_type projector_type_from_name(std::string & name) {
    return name == "mlp" ? CLIP_PROJECTOR_TYPE_MLP : CLIP_PROJECTOR_TYPE_UNKNOWN;
}

// Translates a patch-merge name into its mm_patch_merge value.
// Recognised names are "flat" and "spatial_unpad"; anything else maps to
// MM_PATCH_MERGE_UNKNOWN.
mm_patch_merge mm_patch_merge_from_name(std::string & name) {
    static const struct {
        const char *   key;
        mm_patch_merge value;
    } known[] = {
        { "flat",          MM_PATCH_MERGE_FLAT          },
        { "spatial_unpad", MM_PATCH_MERGE_SPATIAL_UNPAD },
    };

    for (const auto & entry : known) {
        if (name == entry.key) {
            return entry.value;
        }
    }
    return MM_PATCH_MERGE_UNKNOWN;
}

// Number of patches per image for the clip model: (image_size / patch_size) squared.
int clip_n_patches(const clip_context & ctx) {
    const auto & hp = ctx.model->hparams;
    const auto per_side = hp.image_size / hp.patch_size;
    return per_side * per_side;
}

// Number of patches per image for the `ca` model: (image_size / patch_size) squared.
int ca_n_patches(const ca_context & ctx) {
    const auto & hp = ctx.model->hparams;
    const auto per_side = hp.image_size / hp.patch_size;
    return per_side * per_side;
}

// Width of the projector output, read from the first dimension of the mm_2_b tensor.
// Only the MLP projector is handled; any other projector type trips the assertion.
int clip_n_mmproj_embd(const clip_context & ctx) {
    if (ctx.model->hparams.proj_type != CLIP_PROJECTOR_TYPE_MLP) {
        GGML_ASSERT(false && "invalid proj type");
    }
    return ctx.model->mm_2_b->ne[0];
}

// Width of the projector output, read from the first dimension of the mm_1_b tensor.
int ca_n_mmproj_embd(const ca_context & ctx) {
    const auto & bias = ctx.model->mm_1_b;
    return bias->ne[0];
}

/**
 * Selects the best resolution from a list of possible resolutions based on the original size.
 *
 * A candidate wins when it preserves more of the original pixels after an aspect-preserving
 * fit ("effective resolution"); ties are broken by the smaller amount of unused canvas area.
 *
 * @param original_size The original size of the image in the format (width, height).
 * @param possible_resolutions A list of possible resolutions in the format [(width1, height1), (width2, height2), ...].
 * @return The best fit resolution in the format (width, height), or (0, 0) when the list is
 *         empty or the original size is not positive.
 */
static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size>& possible_resolutions) {
    const int original_width  = original_size.width;
    const int original_height = original_size.height;

    // start from a defined value: with no candidates the loop below never assigns best_fit
    clip_image_size best_fit{0, 0};

    // a non-positive size would make the scale infinite/NaN and its int conversion undefined
    if (original_width <= 0 || original_height <= 0) {
        return best_fit;
    }

    int max_effective_resolution = 0;
    int min_wasted_resolution = std::numeric_limits<int>::max();

    for (const auto& resolution : possible_resolutions) {
        const int width   = resolution.width;
        const int height  = resolution.height;
        // largest scale at which the image still fits inside the candidate
        const float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
        const int downscaled_width  = static_cast<int>(original_width * scale);
        const int downscaled_height = static_cast<int>(original_height * scale);
        // upscaling cannot add information, so cap at the original pixel count
        const int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
        const int wasted_resolution = (width * height) - effective_resolution;
        // LOG_DBG("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
        if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
            max_effective_resolution = effective_resolution;
            min_wasted_resolution = wasted_resolution;
            best_fit = resolution;
        }
    }

    return best_fit;
}

// Resizes `img` to target_width x target_height with bicubic interpolation and stores the
// result in `dst`. Source coordinates outside the image are clamped to the nearest edge pixel.
//
// Returns false (leaving `dst` untouched) when the source is empty, its buffer is smaller
// than nx * ny * 3, or a target dimension is not positive; returns true otherwise.
// `dst` may refer to the same object as `img`: the output is assembled in a separate buffer.
//
// Bicubic interpolation; adapted from ViT.cpp, inspired from :
//    -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
//    -> https://en.wikipedia.org/wiki/Bicubic_interpolation
static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
    const int nx = img.nx;
    const int ny = img.ny;

    if (nx <= 0 || ny <= 0 || target_width <= 0 || target_height <= 0) {
        return false;
    }
    if (img.buf.size() < static_cast<size_t>(nx) * static_cast<size_t>(ny) * 3) {
        return false;
    }

    auto clip = [](int x, int lower, int upper) -> int {
        return std::max(lower, std::min(x, upper));
    };

    // Cubic through four equally spaced samples p0..p3, evaluated at offset t measured from p1.
    auto cubic = [](float p0, float p1, float p2, float p3, float t) -> float {
        const float d0 = p0 - p1;
        const float d2 = p2 - p1;
        const float d3 = p3 - p1;
        const float a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
        const float a2 =  1.0 / 2 * d0 +      1.0 / 2 * d2;
        const float a3 = -1.0 / 6 * d0 -      1.0 / 2 * d2 + 1.0 / 6 * d3;
        return p1 + a1 * t + a2 * t * t + a3 * t * t * t;
    };

    // source pixels per destination pixel along each axis
    const float tx = (float)nx / (float)target_width;
    const float ty = (float)ny / (float)target_height;

    std::vector<uint8_t> out(static_cast<size_t>(target_width) * static_cast<size_t>(target_height) * 3);
    const uint8_t * src = img.buf.data();

    for (int i = 0; i < target_height; i++) {
        const int   y  = (int)(ty * i);
        const float dy = ty * i - y;

        // the four source rows contributing to this output row
        const int rows[4] = {
            clip(y - 1, 0, ny - 1),
            clip(y,     0, ny - 1),
            clip(y + 1, 0, ny - 1),
            clip(y + 2, 0, ny - 1),
        };

        for (int j = 0; j < target_width; j++) {
            const int   x  = (int)(tx * j);
            const float dx = tx * j - x;

            // the four source columns contributing to this output pixel
            const int cols[4] = {
                clip(x - 1, 0, nx - 1),
                clip(x,     0, nx - 1),
                clip(x + 1, 0, nx - 1),
                clip(x + 2, 0, nx - 1),
            };

            for (int k = 0; k < 3; k++) {
                // horizontal pass: one interpolated value per contributing row
                float C[4];
                for (int jj = 0; jj < 4; jj++) {
                    const uint8_t * row = src + static_cast<size_t>(rows[jj]) * nx * 3;
                    C[jj] = cubic(row[cols[0] * 3 + k], row[cols[1] * 3 + k],
                                  row[cols[2] * 3 + k], row[cols[3] * 3 + k], dx);
                }

                // vertical pass, run once after all four row values are available
                const float Cc = cubic(C[0], C[1], C[2], C[3], dy);

                out[(static_cast<size_t>(i) * target_width + j) * 3 + k] =
                    static_cast<uint8_t>(std::min(std::max(std::round(Cc), 0.0f), 255.0f));
            }
        }
    }

    dst.nx  = target_width;
    dst.ny  = target_height;
    dst.buf = std::move(out);

    return true;
}

// Cuts `image` into tiles of at most patch_size x patch_size pixels, scanning row-major
// (left to right, then top to bottom). Tiles on the right and bottom edges are smaller
// when the image size is not a multiple of patch_size.
// Returns an empty vector when patch_size is not positive.
static std::vector<clip_image_u8> divide_to_patches_u8(const clip_image_u8 & image, int patch_size) {
    std::vector<clip_image_u8> patches;
    if (patch_size <= 0) {
        // a non-positive step would never advance the loops below
        return patches;
    }

    const int width  = image.nx;
    const int height = image.ny;
    for (int i = 0; i < height; i += patch_size) {
        for (int j = 0; j < width; j += patch_size) {
            clip_image_u8 patch;
            patch.nx = std::min(patch_size, width - j);
            patch.ny = std::min(patch_size, height - i);

            const size_t row_bytes = 3 * static_cast<size_t>(patch.nx);
            patch.buf.resize(row_bytes * static_cast<size_t>(patch.ny));

            // pixels of one tile row are contiguous in the source, so copy row by row
            for (int y = 0; y < patch.ny; ++y) {
                const uint8_t * src_row = image.buf.data() + 3 * (static_cast<size_t>(i + y) * width + j);
                std::copy(src_row, src_row + row_bytes, patch.buf.data() + static_cast<size_t>(y) * row_bytes);
            }
            patches.push_back(std::move(patch)); // move: avoids duplicating the pixel buffer
        }
    }
    return patches;
}

// llava-1.6 type of resize_and_pad (black)
static void resize_and_pad_image(const clip_image_u8 & image, clip_image_u8 & image_output, const clip_image_size & target_resolution) {
    int target_width  = target_resolution.width;
    int target_height = target_resolution.height;

    float scale_w = static_cast<float>(target_width) / image.nx;
    float scale_h = static_cast<float>(target_height) / image.ny;

    int new_width, new_height;

    if (scale_w < scale_h) {
        new_width = target_width;
        new_height = std::min(static_cast<int>(std::ceil(image.ny * scale_w)), target_height);
    } else {
        new_height = target_height;
        new_width = std::min(static_cast<int>(std::ceil(image.nx * scale_h)), target_width);
    }

    clip_image_u8 resized_image;
    // bilinear_resize(image, resized_image, new_width, new_height);
    bicubic_resize(image, resized_image, new_width, new_height);

    clip_image_u8 padded_image;
    padded_image.nx = target_width;
    padded_image.ny = target_height;
    padded_image.buf.resize(3 * target_width * target_height, 0); // Initialize with black

    // Calculate padding offsets
    int pad_x = (target_width - new_width) / 2;
    int pad_y = (target_height - new_height) / 2;

    // Copy the resized image into the center of the padded buffer
    for (int y = 0; y < new_height; ++y) {
        for (int x = 0; x < new_width; ++x) {
            for (int c = 0; c < 3; ++c) {
                padded_image.buf[3 * ((y + pad_y) * target_width + (x + pad_x)) + c] = resized_image.buf[3 * (y * new_width + x) + c];
            }
        }
    }
    image_output = std::move(padded_image);
}

// Converts an interleaved RGB uint8 image into a normalized float image:
//   dst = (src / 255 - mean[c]) / std[c]   with c = channel index (0 = R, 1 = G, 2 = B)
//
// `dst` is taken by reference so the caller actually receives the result (a by-value
// parameter would be filled and then thrown away); `src` is taken by const reference to
// avoid copying the pixel buffer.
static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const std::array<float, 3> & mean, const std::array<float, 3> & std) {
    dst.nx = src.nx;
    dst.ny = src.ny;
    dst.buf.resize(src.buf.size());

    for (size_t i = 0; i < src.buf.size(); ++i) {
        const size_t c = i % 3; // rgb
        dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
    }
}

// Preprocesses `img` into the float image(s) consumed by the encoder and stores them in `output_imgs`.
//  - llava-1.5 path: a single normalized image_size x image_size image
//  - "spatial_unpad" with "anyres" (llava-1.6) path: a vector of normalized patches, with a
//    resized copy of the whole image at index 0 followed by the tiles of the padded image
// output_imgs is resized here; whatever it held before is replaced.
// Every path visible in this function returns true.
static bool clip_image_preprocess(const clip_context & ctx, const clip_image_u8 & img, clip_image_f32_batch & output_imgs) {
    bool pad_to_square = true;
    auto & params = ctx.model->hparams;
    // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
    if (params.mm_patch_merge_type == MM_PATCH_MERGE_SPATIAL_UNPAD) {
        pad_to_square = false;
    }

    // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
    // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156

    clip_image_u8 temp;
    if (pad_to_square && img.nx != img.ny) {
        int longer_side = std::max(img.nx, img.ny);
        temp.nx = longer_side;
        temp.ny = longer_side;
        temp.buf.resize(3 * longer_side * longer_side);
        const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA (this is the mean rgb color * 255)

        // fill with background color
        for (size_t i = 0; i < temp.buf.size(); i++) {
            temp.buf[i] = bc[i % 3];
        }

        // copy from the input image (placed at the top-left corner of the square canvas)
        for (int y = 0; y < img.ny; y++) {
            for (int x = 0; x < img.nx; x++) {
                const int i = 3 * (y * img.nx + x);
                const int j = 3 * (y * temp.nx + x);
                temp.buf[j]   = img.buf[i];
                temp.buf[j+1] = img.buf[i+1];
                temp.buf[j+2] = img.buf[i+2];
            }
        }
    } else {
        // reached when padding is disabled OR the image is already square
        if (params.image_grid_pinpoints[0] != 0) {
            // "spatial_unpad" with "anyres" processing for llava-1.6
            // image_grid_pinpoints is read as (width, height) pairs, terminated by a 0 entry or after 32 values
            std::vector<clip_image_size> possible_resolutions;
            for (int i = 0; i < 32 && params.image_grid_pinpoints[i] != 0; i += 2) {
                clip_image_size s;
                s.width  = params.image_grid_pinpoints[i];
                s.height = params.image_grid_pinpoints[i+1];
                possible_resolutions.push_back(s);
            }
            clip_image_size best_resolution = select_best_resolution({img.nx, img.ny}, possible_resolutions);
            // clip_image_save_to_bmp(*img, "input.bmp");
            resize_and_pad_image(img, temp, best_resolution);  // we do not pad with mean-bg color anymore in llava-1.6
            // clip_image_save_to_bmp(*temp, "resized.bmp");

            std::vector<clip_image_u8> patches = divide_to_patches_u8(temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)

            clip_image_u8 image_original_resize;
            // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
            bicubic_resize(img, image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
            // the downscaled full image goes first, ahead of the tiles
            patches.insert(patches.begin(), image_original_resize);
            // clip_image_f32_batch_init(patches.size());
            output_imgs.resize(patches.size());
            int num = 0;
            for (auto & patch : patches) {
                normalize_image_u8_to_f32(patch, output_imgs[num], params.image_mean, params.image_std);
                num++;
            }
            return true;
        } else {
            // no grid pinpoints configured: use the input image as-is
            temp.nx = img.nx;
            temp.ny = img.ny;
            temp.buf.resize(img.buf.size());
            memcpy(temp.buf.data(), img.buf.data(), temp.buf.size());
        }
    }

    const int nx = temp.nx;
    const int ny = temp.ny;
    // bmp_export(temp, "resized_vanilla.bmp");

    // destination is always image_size x image_size
    const int nx2 = params.image_size;
    const int ny2 = params.image_size;
    clip_image_f32 res;
    res.nx = nx2;
    res.ny = ny2;
    res.buf.resize(3 * nx2 * ny2);

    // source pixels per destination pixel, chosen so the longer source side maps onto image_size
    const float scale = std::max(nx, ny) / (float)params.image_size;

    // size of the scaled image; equals nx2 x ny2 when temp is square
    const int nx3 = int(nx / scale + 0.5f);
    const int ny3 = int(ny / scale + 0.5f);

    const auto & m3 = params.image_mean; // {0.48145466f, 0.4578275f, 0.40821073f};
    const auto & s3 = params.image_std;  // {0.26862954f, 0.26130258f, 0.27577711f};

    for (int y = 0; y < ny3; y++) {
        for (int x = 0; x < nx3; x++) {
            for (int c = 0; c < 3; c++) {
                // linear interpolation
                // (sx, sy) is the centre of destination pixel (x, y) mapped back into source coordinates
                const float sx = (x + 0.5f) * scale - 0.5f;
                const float sy = (y + 0.5f) * scale - 0.5f;

                const int x0 = std::max(0, (int)std::floor(sx));
                const int y0 = std::max(0, (int)std::floor(sy));

                const int x1 = std::min(x0 + 1, nx - 1);
                const int y1 = std::min(y0 + 1, ny - 1);

                const float dx = sx - x0;
                const float dy = sy - y0;

                const int j00 = 3 * (y0 * nx + x0) + c;
                const int j01 = 3 * (y0 * nx + x1) + c;
                const int j10 = 3 * (y1 * nx + x0) + c;
                const int j11 = 3 * (y1 * nx + x1) + c;

                const float v00 = temp.buf[j00];
                const float v01 = temp.buf[j01];
                const float v10 = temp.buf[j10];
                const float v11 = temp.buf[j11];

                const float v0 = v00 * (1.0f - dx) + v01 * dx;
                const float v1 = v10 * (1.0f - dx) + v11 * dx;

                const float v = v0 * (1.0f - dy) + v1 * dy;

                // round and clamp to the uint8 range before normalizing
                const uint8_t v2 = std::min(std::max(std::round(v), 0.0f), 255.0f);

                // NOTE(review): the row stride used here is nx3, while res was allocated as nx2 x ny2;
                // the two only coincide when nx3 == nx2 — confirm that is guaranteed on this path
                const int i = 3 * (y * nx3 + x) + c;

                res.buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
            }
        }
    }

    output_imgs.resize(1);
    output_imgs[0] = std::move(res);

    return true;
}

// Builds the ggml compute graph of the vision encoder for `batch_size` images of `image_size`:
//   patch embedding (conv) -> optional class embedding -> position embeddings -> optional pre-layernorm
//   -> transformer layers -> optional post-layernorm -> projector
//
// Graph inputs that the caller must fill after allocation, looked up by name:
//   "inp_raw"    F32 [width, height, 3, batch_size]
//   "embeddings" F32 [hidden_size, num_positions, batch_size] (only when the model has a class embedding)
//   "positions"  I32 [num_positions]
//   "patches"    I32 [num_patches]
//
// The ggml context is created over ctx.buf_compute_meta with no_alloc = true and is freed
// before returning; presumably the graph metadata lives in that buffer and therefore stays
// usable afterwards — TODO confirm.
static ggml_cgraph * clip_image_build_graph(clip_context & ctx, int batch_size, clip_image_size & image_size) {
    auto & model = *ctx.model;
    auto & hparams = ctx.model->hparams;

    const int hidden_size   = hparams.hidden_size;
    const int n_head        = hparams.n_head;
    const int d_head        = hidden_size / n_head;
    const int patch_size    = hparams.patch_size;
    const float eps         = hparams.eps;
    const int num_patches   = ((image_size.width / patch_size) * (image_size.height / patch_size));
    // one extra position when the model carries a class embedding
    const int num_positions = num_patches + (model.class_embedding ? 1 : 0);

    LLAMA_LOG_DEBUG("%s: num_patches = %d\n", __func__, num_patches);

    struct ggml_init_params params = {
        /*.mem_size   =*/ ctx.buf_compute_meta.size(),
        /*.mem_buffer =*/ ctx.buf_compute_meta.data(),
        /*.no_alloc   =*/ true,
    };

    struct ggml_context * ctx0 = ggml_init(params);
    struct ggml_cgraph * gf = ggml_new_graph(ctx0);

    // input
    struct ggml_tensor * embeddings;
    {
        struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size.width, image_size.height, 3, batch_size);
        ggml_set_name(inp_raw, "inp_raw");
        ggml_set_input(inp_raw);

        // non-overlapping patch embedding: kernel stride == patch_size, no padding
        struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);

        inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

        if (model.patch_bias) {
            inp = ggml_add(ctx0, inp, model.patch_bias);
        }
        // auto * ne = inp->ne; printf("%d %d %d %d\n", ne[0], ne[1], ne[2], ne[3]);

        embeddings = inp;
        if (model.class_embedding) {
            // "embeddings" is a graph input; the class embedding is accumulated at offset 0
            // and the patch embeddings right after it
            embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
            ggml_set_name(embeddings, "embeddings");
            ggml_set_input(embeddings);
            embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
                    embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
            embeddings = ggml_acc(ctx0, embeddings, inp,
                    embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
        }

        struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
        ggml_set_name(positions, "positions");
        ggml_set_input(positions);

        embeddings = ggml_add(ctx0,
            embeddings,
            ggml_get_rows(ctx0, model.position_embeddings, positions));
    }

    // pre-layernorm
    if (model.pre_norm_w) {
        embeddings = ggml_norm(ctx0, embeddings, eps);
        ggml_set_name(embeddings, "pre_ln");

        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_norm_w), model.pre_norm_b);
    }

    // loop over layers
    // NOTE(review): the layer count is n_layer + select_layer; presumably select_layer is <= 0 so that
    // trailing layers are skipped — confirm against where hparams is loaded
    for (int il = 0; il < (int)hparams.n_layer + hparams.select_layer; il++) {
        struct ggml_tensor * cur = embeddings;

        // layernorm1
        {
            cur = ggml_norm(ctx0, cur, eps);
            cur = ggml_add(ctx0,
                ggml_mul(ctx0, cur, model.layers[il].norm_in_w),
                model.layers[il].norm_in_b);
        }

        // self-attention
        {

            struct ggml_tensor * Q = ggml_add(ctx0,
                ggml_mul_mat(ctx0, model.layers[il].q_w, cur),
                model.layers[il].q_b);

            // scale Q by 1/sqrt(d_head) before computing the attention scores
            Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
            Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
            Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
            Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);

            struct ggml_tensor * K = ggml_add(ctx0,
                ggml_mul_mat(ctx0, model.layers[il].k_w, cur),
                model.layers[il].k_b);

            K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
            K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
            K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);

            struct ggml_tensor * V = ggml_add(ctx0,
                ggml_mul_mat(ctx0, model.layers[il].v_w, cur),
                model.layers[il].v_b);

            V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
            V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
            V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size);

            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
            KQ = ggml_soft_max_inplace(ctx0, KQ);
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ);
            KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size);
            KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);

            // merge the heads back into hidden_size
            cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size);
        }

        // attention output
        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].output_w, cur), model.layers[il].output_b);

        // re-add the layer input, e.g., residual
        cur = ggml_add(ctx0, cur, embeddings);

        embeddings = cur; // embeddings = residual, cur = hidden_states

        // layernorm2
        {
            cur = ggml_norm(ctx0, cur, eps);
            cur = ggml_add(ctx0,
                ggml_mul(ctx0, cur, model.layers[il].norm_out_w),
                model.layers[il].norm_out_b);
        }

        // feed-forward: up projection -> activation -> down projection
        cur = ggml_mul_mat(ctx0, model.layers[il].ffn_up_w, cur);
        cur = ggml_add(ctx0, cur, model.layers[il].ffn_up_b);

        if (hparams.use_gelu) {
            cur = ggml_gelu_inplace(ctx0, cur);
        } else {
            cur = ggml_gelu_quick_inplace(ctx0, cur);
        }

        cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down_w, cur);
        cur = ggml_add(ctx0, cur, model.layers[il].ffn_down_b);

        // residual 2
        cur = ggml_add(ctx0, embeddings, cur);

        embeddings = cur;
    }

    // post-layernorm
    if (model.post_norm_w) {
        embeddings = ggml_norm(ctx0, embeddings, eps);
        ggml_set_name(embeddings, "post_ln");

        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_norm_w), model.post_norm_b);
    }

    // llava projector
    {
        // NOTE(review): this reshape keeps only ne[0] and ne[1], so it appears to assume batch_size == 1 — confirm
        embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);

        // row indices selecting which positions are passed on to the projector
        struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
        ggml_set_name(patches, "patches");
        ggml_set_input(patches);

        // shape [1, 576, 1024]
        // ne is whcn, ne = [1024, 576, 1, 1]
        embeddings = ggml_get_rows(ctx0, embeddings, patches);

        if (hparams.proj_type == CLIP_PROJECTOR_TYPE_MLP) {
            // two-layer MLP with a GELU in between
            embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings);
            embeddings = ggml_add(ctx0, embeddings, model.mm_1_b);

            embeddings = ggml_gelu(ctx0, embeddings);
            embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
            embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
        } else {
            GGML_ASSERT(false && "unsupported proj type");
        }
    }

    // build the graph
    ggml_build_forward_expand(gf, embeddings);
    ggml_free(ctx0);
    return gf;
}

// Runs the vision encoder on a batch of preprocessed images and writes the projected
// embeddings to `output`.
//
// @param ctx    clip context providing the model, the scheduler and the graph meta buffer
// @param imgs   preprocessed float images (interleaved RGB); each must be exactly
//               image_size x image_size with nx * ny * 3 values
// @param output resized to clip_n_patches * clip_n_mmproj_embd floats and filled with the result
// @return 0 on success, -1 when an image has the wrong size or graph allocation fails
static int32_t clip_image_batch_encode(clip_context & ctx, const clip_image_f32_batch & imgs, std::vector<float> & output) {
    int batch_size = imgs.size();
    auto & model = *ctx.model;
    auto & hparams = ctx.model->hparams;

    if (hparams.arch == VISION_ARCH_LLAVA) {
        GGML_ASSERT(batch_size == 1); // TODO: support multiple images
    }

    clip_image_size image_size{(int)hparams.image_size, (int)hparams.image_size};
    const int patch_size    = hparams.patch_size;
    const int num_patches   = ((image_size.width / patch_size) * (image_size.height / patch_size));
    const int num_positions = num_patches + (model.class_embedding ? 1 : 0);

    LLAMA_LOG_DEBUG("%s: image_size = %d\n", __func__, hparams.image_size);
    LLAMA_LOG_DEBUG("%s: num_positions = %d\n", __func__, num_positions);

    // The input tensor is laid out for image_size x image_size images; anything else would
    // read or write outside the buffers below, so reject it before any graph work is done.
    const int    nx = image_size.width;
    const int    ny = image_size.height;
    const size_t n  = (size_t)nx * (size_t)ny;
    for (int b = 0; b < batch_size; b++) {
        const auto & img = imgs[b];
        if (img.nx != nx || img.ny != ny || img.buf.size() != 3 * n) {
            LLAMA_LOG_ERROR("%s: image %d is %dx%d with %zu values, expected %dx%d\n",
                    __func__, b, img.nx, img.ny, img.buf.size(), nx, ny);
            return -1;
        }
    }

    // build the inference graph
    ggml_cgraph * gf = clip_image_build_graph(ctx, batch_size, image_size);

    // alloc memory for graph
    bool ok = ggml_backend_sched_alloc_graph(ctx.sched, gf);
    if (!ok) {
        LLAMA_LOG_ERROR("failed to alloc memory for graph\n");
        return -1;
    }

    // set raw input: convert interleaved RGB (HWC) into planar channels (CHW), image after image
    {
        struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
        std::vector<float> data(ggml_nbytes(inp_raw) / sizeof(float));

        for (int b = 0; b < batch_size; b++) {
            const auto & buf = imgs[b].buf;
            for (int k = 0; k < 3; k++) {
                for (int y = 0; y < ny; y++) {
                    for (int x = 0; x < nx; x++) {
                        data[((size_t)b * 3 + k) * n + (size_t)y * nx + x] = buf[3 * ((size_t)y * nx + x) + k];
                    }
                }
            }
        }
        ggml_backend_tensor_set(inp_raw, data.data(), 0, ggml_nbytes(inp_raw));
    }

    if (model.class_embedding) {
        // the graph accumulates the class and patch embeddings into this tensor, so it starts at zero
        struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");

        std::vector<uint8_t> zero_mem(ggml_nbytes(embeddings), 0);
        ggml_backend_tensor_set(embeddings, zero_mem.data(), 0, ggml_nbytes(embeddings));
    }

    {
        // position ids are simply 0 .. num_positions - 1
        struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");

        std::vector<int32_t> positions_data(ggml_nbytes(positions) / sizeof(int32_t));
        for (size_t i = 0; i < positions_data.size(); i++) {
            positions_data[i] = (int32_t)i;
        }
        ggml_backend_tensor_set(positions, positions_data.data(), 0, ggml_nbytes(positions));
    }

    {
        // rows handed to the projector: skip row 0 when it holds the class token;
        // without a class token the patch rows start at 0
        struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
        const int32_t first_patch_row = model.class_embedding ? 1 : 0;

        std::vector<int32_t> patches_data(ggml_nbytes(patches) / sizeof(int32_t));
        for (size_t i = 0; i < patches_data.size(); i++) {
            patches_data[i] = (int32_t)i + first_patch_row;
        }
        ggml_backend_tensor_set(patches, patches_data.data(), 0, ggml_nbytes(patches));
    }

    // compute
    ggml_backend_sched_graph_compute_async(ctx.sched, gf);

    // the last node is the embedding tensor
    struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
    ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(ctx.sched, embeddings);

    // copy the embeddings to the location passed by the user
    size_t out_nbytes = clip_n_patches(ctx)*clip_n_mmproj_embd(ctx)*sizeof(float);
    GGML_ASSERT(out_nbytes == ggml_nbytes(embeddings));
    // `output` holds floats, so its element count is the byte count divided by sizeof(float)
    output.resize(out_nbytes / sizeof(float));
    ggml_backend_tensor_get_async(backend_embd, embeddings, output.data(), 0, ggml_nbytes(embeddings));

    // wait for the asynchronous compute and copy to finish before `output` is used
    ggml_backend_sched_synchronize(ctx.sched);

    return 0;
}

// Encodes a single image by wrapping a copy of it in a one-element batch and forwarding
// to clip_image_batch_encode; returns that call's status.
static int32_t clip_image_encode(clip_context & ctx, const clip_image_f32 & img, std::vector<float> & output) {
    clip_image_f32_batch single(1, img);
    return clip_image_batch_encode(ctx, single, output);
}

// Preprocesses `img` and encodes it according to the model's patch-merge type.
// Returns 0 on success, -2 when preprocessing fails, or the encoder's non-zero status.
// For MM_PATCH_MERGE_SPATIAL_UNPAD nothing is encoded yet and 0 is returned.
static int32_t encode_image_with_clip(clip_context & ctx, const llama_img img, std::vector<float> & output_embd) {
    clip_image_u8 img_u8(img);
    clip_image_f32_batch img_res_v;
    auto & hparams = ctx.model->hparams;
    // bmp_export(img_u8, "test_inp.bmp");

    const bool preprocessed = clip_image_preprocess(ctx, img_u8, img_res_v);
    if (!preprocessed) {
        LLAMA_LOG_ERROR("%s: unable to preprocess image\n", __func__);
        return -2;
    }

    const auto merge_type = hparams.mm_patch_merge_type;

    if (merge_type == MM_PATCH_MERGE_FLAT) {
        // flat / default llava-1.5 type embedding
        // n_output = clip_n_patches(ctx);
        const int32_t status = clip_image_encode(ctx, img_res_v[0], output_embd);
        if (status != 0) {
            LLAMA_LOG_ERROR("Unable to encode image\n");
            return status;
        }
        return 0;
    }

    if (merge_type == MM_PATCH_MERGE_SPATIAL_UNPAD) {
        // TODO: support llava-1.6
        return 0;
    }

    GGML_ASSERT(false && "unsupported mm_patch_merge_type");
    return 0;
}

// TODO(danbev) Incorporate the following into the llama_img struct
// or something similar later. This is only to try to get something working
// and I'm currently cutting corners to not spend too much time on this.
struct mllama_image {
    // Source image; null until assigned so a default-constructed value never
    // carries an indeterminate pointer.
    const llama_img * img = nullptr;

    int n_channels = 3;
    int n_tiles = 4;
};

// Pointer + count describing an array of mllama_image.
// Defaults to an empty batch so a default-constructed value never carries
// an indeterminate pointer or size.
struct mllama_image_batch {
    struct mllama_image * data = nullptr;
    size_t size = 0; // presumably the number of images reachable through `data` — TODO confirm
};

// Default preprocessing parameters for mllama images.
struct MLLamaParams {
    int out_size{560};
    int n_tiles{4};

    // per-channel (R, G, B) normalization constants
    const float mean[3]{0.48145466f, 0.4578275f, 0.40821073f};
    const float std[3]{0.26862954f, 0.26130258f, 0.27577711f};
};

// Lists every (w, h) tile arrangement with w, h >= 1 whose tile count w * h does not
// exceed max_tiles, ordered by w and then by h.
static std::vector<std::pair<int, int>> get_supported_aspect_ratios(int max_tiles) {
    std::vector<std::pair<int, int>> grids;
    for (int cols = 1; cols <= max_tiles; ++cols) {
        for (int rows = 1; rows <= max_tiles; ++rows) {
            if (cols * rows > max_tiles) {
                continue;
            }
            grids.emplace_back(cols, rows);
        }
    }
    return grids;
}

// Picks the tiled canvas (in pixels) that an image of image_w x image_h should be fitted into.
//
// Every tile arrangement allowed by max_tiles is turned into a canvas of tile_size-sized
// tiles, and the aspect-preserving scale needed to fit the image into it is computed.
// If at least one canvas holds the image without shrinking it (scale >= 1), the smallest
// such scale is chosen; otherwise the largest downscale. Among canvases sharing the chosen
// scale the one with the smallest area wins.
//
// A scale of exactly 1 counts as "no shrinking needed", matching the reference Mllama
// image processor; otherwise an image that fits a canvas exactly would be sent to a
// larger canvas than necessary.
//
// @return (canvas_width, canvas_height), or (0, 0) if no canvas matched the selected scale
static std::pair<int, int> get_optimal_tiled_canvas(int image_w, int image_h, int max_tiles, int tile_size) {
    printf("Calculating optimal canvas for image %dx%d with max_tiles=%d, tile_size=%d\n",
           image_w, image_h, max_tiles, tile_size);

    auto possible_ratios = get_supported_aspect_ratios(max_tiles);
    std::vector<std::pair<int, int>> possible_canvas_sizes;
    std::vector<float> scales;

    // Get possible canvas sizes and their scales
    printf("Possible ratios and their canvas sizes:\n");
    for (const auto& ratio : possible_ratios) {
        int canvas_w = ratio.first * tile_size;
        int canvas_h = ratio.second * tile_size;
        possible_canvas_sizes.push_back({canvas_w, canvas_h});

        // the smaller of the two per-axis scales is the one that fits without distortion
        float scale_h = static_cast<float>(canvas_h) / image_h;
        float scale_w = static_cast<float>(canvas_w) / image_w;
        float scale = (scale_w > scale_h) ? scale_h : scale_w;
        scales.push_back(scale);

        printf("  Ratio %dx%d -> Canvas %dx%d (scale_w=%.3f scale_h=%.3f selected=%.3f)\n",
               ratio.first, ratio.second, canvas_w, canvas_h,
               scale_w, scale_h, scale);
    }

    float min_upscale = 0.0f;   // 0 means "none seen yet"
    float max_downscale = 0.0f;
    bool upscale = false;

    for (float s : scales) {
        if (s >= 1.0f) {
            upscale = true;
            if (min_upscale == 0.0f) {
                min_upscale = s;
            } else {
                min_upscale = std::min(min_upscale, s);
            }
        } else {
            max_downscale = std::max(max_downscale, s);
        }
    }

    float selected_scale = upscale ? min_upscale : max_downscale;
    printf("Selected scale: %f (upscale=%d)\n", selected_scale, upscale);

    // several canvases can share the selected scale; keep the one with the smallest area
    std::pair<int, int> selected_canvas{0, 0};
    for (size_t n = 0; n < possible_canvas_sizes.size(); n++) {
        if (std::abs(scales[n] - selected_scale) < 1e-6) {
            const auto& canvas = possible_canvas_sizes[n];
            if (selected_canvas.first == 0 && selected_canvas.second == 0) {
                selected_canvas = canvas;
            } else if (canvas.first * canvas.second <
                      selected_canvas.first * selected_canvas.second) {
                selected_canvas = canvas;
            }
            printf("Candidate canvas %dx%d (area=%d)\n",
                   canvas.first, canvas.second,
                   canvas.first * canvas.second);
        }
    }

    printf("Final selected canvas %dx%d\n",
           selected_canvas.first, selected_canvas.second);
    return selected_canvas;
}

// Computes a tile-aligned size for a w x h image placed on a canvas_w x canvas_h canvas.
//
// Each side is first bounded to [tile_size, canvas side]; the smaller of the two resulting scale
// factors is applied to both sides (preserving aspect ratio), and each side is then rounded up to
// the next multiple of tile_size.
//
// Returns {width, height}.
static std::pair<int, int> scale_to_fit_canvas(int w, int h, int canvas_w, int canvas_h, int tile_size) {
    const int bounded_w = std::clamp(w, tile_size, canvas_w);
    const int bounded_h = std::clamp(h, tile_size, canvas_h);

    // Use the more restrictive of the two per-axis scale factors for both axes.
    const float ratio_w = static_cast<float>(bounded_w) / w;
    const float ratio_h = static_cast<float>(bounded_h) / h;
    const float ratio   = (ratio_h < ratio_w) ? ratio_h : ratio_w;

    const int scaled_w = static_cast<int>(std::floor(w * ratio));
    const int scaled_h = static_cast<int>(std::floor(h * ratio));

    // Never exceed the bounded size, even if floating point rounding overshoots.
    const int fitted_w = (bounded_w < scaled_w) ? bounded_w : scaled_w;
    const int fitted_h = (bounded_h < scaled_h) ? bounded_h : scaled_h;

    // Round up to nearest tile size
    const auto round_up_to_tile = [tile_size](int value) {
        return ((value + tile_size - 1) / tile_size) * tile_size;
    };

    return {round_up_to_tile(fitted_w), round_up_to_tile(fitted_h)};
}

// Returns the 1-based position of the canvas' tile grid (columns x rows) within the list produced
// by get_supported_aspect_ratios(max_tiles).
//
// Falls back to 1 when no entry matches, which is also the value returned for a match on the
// first entry.
static int get_aspect_ratio_index(int canvas_w, int canvas_h, int tile_size, int max_tiles) {
    const auto supported = get_supported_aspect_ratios(max_tiles);
    const int want_cols = canvas_w / tile_size;
    const int want_rows = canvas_h / tile_size;

    size_t slot = 0;
    for (const auto & candidate : supported) {
        ++slot; // slots are reported 1-based
        if (candidate.first == want_cols && candidate.second == want_rows) {
            return slot;
        }
    }

    return 1;
}

// Cuts an interleaved RGB8 image into a tiles_x by tiles_y grid of tile_width x tile_height tiles.
//
// Tiles are returned in row-major order (all tiles of the first tile row, then the second, ...).
// Every tile buffer holds tile_width * tile_height * 3 bytes; any part of a tile that lies outside
// the source image is left at zero.
//
// Throws std::invalid_argument if img_data is null or any dimension/count is not positive.
// Prints debugging information about every tile to stdout.
static std::vector<std::vector<unsigned char>> split_to_tiles(
    const unsigned char* img_data,
    int img_width, int img_height,
    int tile_width, int tile_height,
    int tiles_x, int tiles_y
) {
    const bool args_ok = img_data != nullptr &&
                         img_width > 0 && img_height > 0 &&
                         tile_width > 0 && tile_height > 0 &&
                         tiles_x > 0 && tiles_y > 0;
    if (!args_ok) {
        throw std::invalid_argument("Invalid input parameters");
    }

    printf("split_to_tiles: img_width=%d, img_height=%d, tile_width=%d, tile_height=%d, tiles_x=%d, tiles_y=%d\n",
           img_width, img_height, tile_width, tile_height, tiles_x, tiles_y);

    std::vector<std::vector<unsigned char>> result;
    result.reserve(tiles_x * tiles_y);

    for (int grid_row = 0; grid_row < tiles_y; ++grid_row) {
        const int origin_y = grid_row * tile_height;

        for (int grid_col = 0; grid_col < tiles_x; ++grid_col) {
            const int origin_x = grid_col * tile_width;

            // Portion of this tile that actually overlaps the source image.
            const int copy_cols = std::min(tile_width, img_width - origin_x);
            const int copy_rows = std::min(tile_height, img_height - origin_y);

            printf("\nProcessing tile [%d,%d], source region: x=%d-%d, y=%d-%d\n",
                   grid_col, grid_row,
                   origin_x, origin_x + copy_cols - 1,
                   origin_y, origin_y + copy_rows - 1);

            // Starts fully zeroed so uncovered pixels stay black.
            std::vector<unsigned char> pixels(tile_width * tile_height * 3, 0);

            if (copy_cols > 0) {
                for (int row = 0; row < copy_rows; ++row) {
                    const unsigned char * src_row = img_data + ((origin_y + row) * img_width + origin_x) * 3;
                    unsigned char * dst_row = pixels.data() + (row * tile_width) * 3;

                    // One contiguous run of RGB triplets per row.
                    std::copy_n(src_row, copy_cols * 3, dst_row);

                    // Dump the top-left 3x3 pixels of each tile.
                    if (row < 3) {
                        const int dump_cols = std::min(3, copy_cols);
                        for (int col = 0; col < dump_cols; ++col) {
                            printf("  Tile[%d,%d] at (%d,%d): src=(%d,%d,%d) -> dst=(%d,%d,%d)\n",
                                   grid_col, grid_row, col, row,
                                   src_row[col * 3 + 0],
                                   src_row[col * 3 + 1],
                                   src_row[col * 3 + 2],
                                   dst_row[col * 3 + 0],
                                   dst_row[col * 3 + 1],
                                   dst_row[col * 3 + 2]);
                        }
                    }
                }
            }

            result.push_back(std::move(pixels));
        }
    }

    return result;
}

// Pair of integers used by pad_image for a tile size (X = width, Y = height) and a tile grid
// (X = columns, Y = rows).
struct Point {
    int X;
    int Y;
};

// Places an interleaved image in the top-left corner of a zero-filled canvas.
//
// img          source pixels, imgWidth * imgHeight * channels bytes (may be null: nothing is copied)
// outputSize   size of one tile
// aspectRatio  number of tiles per axis; the canvas is outputSize * aspectRatio on each axis
// paddedWidth  [out] canvas width
// paddedHeight [out] canvas height
//
// Returns a buffer of paddedWidth * paddedHeight * channels bytes allocated with new[]; the caller
// owns it and must release it with delete[]. If the source is larger than the canvas on either
// axis it is cropped rather than written past the end of the buffer.
static unsigned char* pad_image(
    const unsigned char* img,
    int imgWidth, int imgHeight, int channels,
    Point outputSize, Point aspectRatio,
    int& paddedWidth, int& paddedHeight
) {
    printf("Padding image to size %dx%d with aspect ratio %dx%d\n",
           outputSize.X, outputSize.Y, aspectRatio.X, aspectRatio.Y);

    // Calculate padded size
    paddedWidth = outputSize.X * aspectRatio.X;
    paddedHeight = outputSize.Y * aspectRatio.Y;

    // Size arithmetic is done in size_t so large canvases cannot overflow int.
    const bool has_area = paddedWidth > 0 && paddedHeight > 0 && channels > 0;
    const size_t dst_stride = has_area ? static_cast<size_t>(paddedWidth) * channels : 0;
    const size_t total_bytes = has_area ? dst_stride * paddedHeight : 0;

    // Create padded canvas, value-initialized to black (0)
    unsigned char* paddedImg = new unsigned char[total_bytes]();

    if (img == nullptr || !has_area) {
        return paddedImg;
    }

    // Only the region that fits on the canvas is copied.
    const int copy_w = std::min(imgWidth, paddedWidth);
    const int copy_h = std::min(imgHeight, paddedHeight);
    if (copy_w <= 0 || copy_h <= 0) {
        return paddedImg;
    }

    // Copy the original image to the canvas, one row at a time
    const size_t src_stride = static_cast<size_t>(imgWidth) * channels;
    const size_t row_bytes = static_cast<size_t>(copy_w) * channels;
    for (int y = 0; y < copy_h; ++y) {
        std::memcpy(paddedImg + y * dst_stride, img + y * src_stride, row_bytes);
    }

    return paddedImg;
}

// Computes the size an image should be resized to so that it fits the canvas while keeping its
// aspect ratio.
//
// Each side is first bounded to [tile_size, canvas side] (the canvas bound wins if the two
// conflict). The axis with the smaller resulting scale factor keeps its bounded size; the other
// axis is scaled by that same factor and rounded down.
//
// The scaled side is kept within [1, bounded size] so that an extreme aspect ratio can never
// produce a zero-pixel side.
//
// Returns {width, height}.
// Throws std::invalid_argument if any image or canvas dimension is not positive (the scale
// factors would otherwise involve a division by zero).
static std::pair<int, int> get_image_size_fit_to_canvas(
    int img_width, int img_height,
    int canvas_width, int canvas_height,
    int tile_size
) {
    if (img_width <= 0 || img_height <= 0 || canvas_width <= 0 || canvas_height <= 0) {
        throw std::invalid_argument("Invalid image or canvas dimensions");
    }

    printf("Get image size fit to canvas: img=%dx%d, canvas=%dx%d, tile=%d\n",
           img_width, img_height, canvas_width, canvas_height, tile_size);

    // Clamp to minimum tile size first, then to the canvas size.
    const int target_width = std::min(std::max(img_width, tile_size), canvas_width);
    const int target_height = std::min(std::max(img_height, tile_size), canvas_height);

    // Calculate scale factors
    const double scale_width = static_cast<double>(target_width) / img_width;
    const double scale_height = static_cast<double>(target_height) / img_height;

    int final_width = target_width;
    int final_height = target_height;

    if (scale_width < scale_height) {
        // Width is the limiting axis: derive the height from it.
        const int scaled = static_cast<int>(std::floor(img_height * scale_width));
        final_height = std::clamp(scaled, 1, target_height);
    } else {
        // Height is the limiting axis: derive the width from it.
        const int scaled = static_cast<int>(std::floor(img_width * scale_height));
        final_width = std::clamp(scaled, 1, target_width);
    }

    return {final_width, final_height};
}

// Images handed between the mllama preprocessing, graph-building and encoding steps in this file.
using ca_image_batch = std::vector<mllama_image>;

static bool mllama_image_preprocess(const mllama_image & mllama_img, ca_image_batch & output_imgs) {
    MLLamaParams params;

    const llama_img * img = mllama_img.img;
    int i_w = img->nx;
    int i_h = img->ny;

    auto canvas_size = get_optimal_tiled_canvas(i_w, i_h, params.n_tiles, params.out_size);
    int aspect_ratio_idx = get_aspect_ratio_index(canvas_size.first, canvas_size.second,
	                                          params.out_size, params.n_tiles);

    auto [final_w, final_h] = get_image_size_fit_to_canvas(i_w, i_h,
	                                                   canvas_size.first,
							   canvas_size.second,
							   params.out_size);

    printf("Now resize image to size: %dx%d\n", final_w, final_h);
    std::vector<unsigned char> resized_whc(final_w * final_h * 3);
    stbir_resize_uint8_srgb(
        img->data, i_w, i_h, i_w * 3,
        resized_whc.data(), final_w, final_h, final_w * 3,
        STBIR_RGB
    );
    stbi_image_free(img->data);

    auto padded_image = pad_image(resized_whc.data(), final_w, final_h, 3,
                                  {params.out_size, params.out_size},
                                  {canvas_size.first / params.out_size, canvas_size.second / params.out_size},
                                  final_w, final_h);
    printf("Padded image to size %dx%d\n", final_w, final_h);

    int tiles_x = final_w / params.out_size;
    int tiles_y = final_h / params.out_size;
    printf("Splitting into %dx%d tiles\n", tiles_x, tiles_y);

    auto tiles = split_to_tiles(padded_image, final_w, final_h,
                                params.out_size, params.out_size,
                                tiles_x, tiles_y);

    size_t pixels_per_tile = params.out_size * params.out_size;
    size_t total_pixels = pixels_per_tile * 4;
    size_t total_values = total_pixels * 3;

    const size_t actual_tiles = tiles.size();
    std::vector<float> all_rVals(pixels_per_tile * actual_tiles, 0.0f);
    std::vector<float> all_gVals(pixels_per_tile * actual_tiles, 0.0f);
    std::vector<float> all_bVals(pixels_per_tile * actual_tiles, 0.0f);

    for (size_t tile_idx = 0; tile_idx < actual_tiles; tile_idx++) {
        const auto& tile_data = tiles[tile_idx];
        size_t tile_offset = tile_idx * pixels_per_tile;
        printf("Processing tile %zu\n", tile_idx);

        for (int y = 0; y < params.out_size; y++) {
            for (int x = 0; x < params.out_size; x++) {
                int src_idx = (y * params.out_size + x) * 3;

                // Normalization from 0-255 to 0-1.
                float rVal = static_cast<float>(tile_data[src_idx + 0]) / 255.0f;
                float gVal = static_cast<float>(tile_data[src_idx + 1]) / 255.0f;
                float bVal = static_cast<float>(tile_data[src_idx + 2]) / 255.0f;

                // Apply mean/std normalization
                rVal = (rVal - params.mean[0]) / params.std[0];
                gVal = (gVal - params.mean[1]) / params.std[1];
                bVal = (bVal - params.mean[2]) / params.std[2];

                size_t dst_idx = tile_offset + y * params.out_size + x;
                all_rVals[dst_idx] = rVal;
                all_gVals[dst_idx] = gVal;
                all_bVals[dst_idx] = bVal;
            }
        }
    }

    // Create result image
    llama_img* result = new llama_img();
    result->nx = params.out_size;
    result->ny = params.out_size * 4;
    result->aspect_ratio = aspect_ratio_idx;
    printf("aspect_ratio=%d\n", result->aspect_ratio);

    float* float_data = static_cast<float*>(calloc(total_values, sizeof(float)));
    if (!float_data) {
        delete result;
        throw std::runtime_error("Failed to allocate memory for float data");
    }

    // Pack the tiles in channel order have, keeping RGB channels separate within each tile.
    for (size_t tile_idx = 0; tile_idx < actual_tiles; tile_idx++) {
        size_t tile_offset = tile_idx * pixels_per_tile;
        size_t tile_start = tile_idx * pixels_per_tile * 3;

        for (size_t i = 0; i < pixels_per_tile; i++) {
            float_data[tile_start + i] = all_rVals[tile_offset + i];
        }
        for (size_t i = 0; i < pixels_per_tile; i++) {
            float_data[tile_start + pixels_per_tile + i] = all_gVals[tile_offset + i];
        }
        for (size_t i = 0; i < pixels_per_tile; i++) {
            float_data[tile_start + 2 * pixels_per_tile + i] = all_bVals[tile_offset + i];
        }
    }

    result->data = reinterpret_cast<unsigned char*>(float_data);
    output_imgs.resize(1);
    output_imgs[0].img = result;

    return true;
}


// Appends one encoder layer to the graph and returns its output tensor.
//
// The layer is: layer norm -> multi-head self-attention -> residual add, followed by
// layer norm -> feed forward (up, GELU, down) -> residual add.
//
// ctx0        context the new tensors are created in
// il          layer index, used only to name the tensors
// layer       weights of this layer; the q/k/v/output biases and the attn/ffn gates are optional
//             and are skipped when null
// embeddings  input to the layer
// eps         epsilon for both layer norms
// hidden_size, batch_size, n_head, d_head   dimensions used by the attention reshapes
//
// The layer index is a size_t, so tensor names are formatted with %zu (%ld is only correct where
// long and size_t happen to have the same width).
static ggml_tensor * mllama_image_build_encoder_layer(
    struct ggml_context * ctx0, const size_t il, const struct ca_layer & layer, struct ggml_tensor * embeddings,
    const float eps, const int hidden_size, const int batch_size, const int n_head, const int d_head) {

    struct ggml_tensor *cur = embeddings;

    {
        // layernorm1
        cur = ggml_norm(ctx0, cur, eps);
        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.norm_in_w), layer.norm_in_b);
        ggml_format_name(cur, "layer_norm_in-%zu", il);
    }

    {
        // self-attention
        struct ggml_tensor *Q = ggml_mul_mat(ctx0, layer.q_w, cur);
        ggml_format_name(Q, "Q-%zu", il);
        if (layer.q_b != nullptr) {
            Q = ggml_add(ctx0, Q, layer.q_b);
            ggml_format_name(Q, "Q-bias-%zu", il);
        }

        // split the projection into heads and move the head dimension outwards
        Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, Q->ne[1], batch_size);
        ggml_format_name(Q, "Q-reshape-%zu", il);
        Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
        ggml_format_name(Q, "Q-cont-%zu", il);

        struct ggml_tensor *K = ggml_mul_mat(ctx0, layer.k_w, cur);
        ggml_format_name(K, "K-%zu", il);
        if (layer.k_b != nullptr) {
            K = ggml_add(ctx0, K, layer.k_b);
            ggml_format_name(K, "K-bias-%zu", il);
        }

        K = ggml_reshape_4d(ctx0, K, d_head, n_head, K->ne[1], batch_size);
        ggml_format_name(K, "K-reshape-%zu", il);
        K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
        ggml_format_name(K, "K-cont-%zu", il);

        struct ggml_tensor *V = ggml_mul_mat(ctx0, layer.v_w, cur);
        ggml_format_name(V, "V-%zu", il);
        if (layer.v_b != nullptr) {
            V = ggml_add(ctx0, V, layer.v_b);
            ggml_format_name(V, "V-bias-%zu", il);
        }

        // V uses a different permutation than Q/K so it can be multiplied with the attention
        // weights below
        V = ggml_reshape_4d(ctx0, V, d_head, n_head, V->ne[1], batch_size);
        ggml_format_name(V, "V-reshape-%zu", il);
        V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
        ggml_format_name(V, "V-cont-%zu", il);

        // scaled dot-product attention: softmax(K*Q / sqrt(d_head))
        struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q);
        ggml_format_name(KQ, "KQ-%zu", il);
        KQ = ggml_scale(ctx0, KQ, 1.0f / sqrtf((float)d_head));
        ggml_format_name(KQ, "KQ-scale-%zu", il);
        KQ = ggml_soft_max(ctx0, KQ);
        ggml_format_name(KQ, "KQ-softmax-%zu", il);

        // merge the heads back into hidden_size
        struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ);
        ggml_format_name(KQV, "KQV-%zu", il);
        KQV = ggml_reshape_4d(ctx0, KQV, d_head, KQV->ne[1], n_head, batch_size);
        ggml_format_name(KQV, "KQV-reshape-%zu", il);
        KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
        ggml_format_name(KQV, "KQV-permute-%zu", il);
        KQV = ggml_cont_3d(ctx0, KQV, hidden_size, KQV->ne[2], batch_size);
        ggml_format_name(KQV, "KQV-cont-%zu", il);

        cur = ggml_mul_mat(ctx0, layer.output_w, KQV);
        ggml_format_name(cur, "output-%zu", il);
        if (layer.output_b != nullptr) {
            cur = ggml_add(ctx0, cur, layer.output_b);
            ggml_format_name(cur, "output-bias-%zu", il);
        }

        if (layer.attn_gate != nullptr) {
            cur = ggml_mul(ctx0, cur, layer.attn_gate);
            ggml_format_name(cur, "output-gate-%zu", il);
        }
    }

    // residual 1
    cur = ggml_add(ctx0, cur, embeddings);
    ggml_format_name(cur, "residual-%zu", il);

    embeddings = cur;

    {
        // layernorm2
        cur = ggml_norm(ctx0, cur, eps);
        ggml_format_name(cur, "layer_norm_out-%zu", il);
        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.norm_out_w), layer.norm_out_b);
        ggml_format_name(cur, "layer_norm_out-bias-%zu", il);
    }

    {
        // feed forward
        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ffn_up_w, cur), layer.ffn_up_b);
        ggml_format_name(cur, "ffn_up-%zu", il);
        cur = ggml_gelu(ctx0, cur);
        ggml_format_name(cur, "ffn_up-gelu-%zu", il);
        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, layer.ffn_down_w, cur), layer.ffn_down_b);
        ggml_format_name(cur, "ffn_down-%zu", il);

        if (layer.ffn_gate != nullptr) {
            cur = ggml_mul(ctx0, cur, layer.ffn_gate);
            ggml_format_name(cur, "ffn_down-gate-%zu", il);
        }
    }

    // residual 2
    cur = ggml_add(ctx0, cur, embeddings);
    ggml_format_name(cur, "residual2-%zu", il);

    embeddings = cur;

    return embeddings;
}


// Builds the compute graph of the mllama vision encoder for a single preprocessed image.
//
// The graph consists of: patch embedding (conv) -> optional pre-tile position embedding ->
// optional class embedding -> position and tile position embeddings -> optional pre layer norm ->
// padding of the token dimension to a multiple of 8 -> encoder layers (collecting the inputs of
// the layers listed in hparams.intermediate_layers) -> optional post layer norm -> optional
// post-tile position embedding -> global encoder layers -> concatenation with the collected
// intermediate states -> projection ("mmproj").
//
// Graph inputs that the caller must fill before computing: "inp_raw", "aspect_ratios",
// "positions" and, when the model has a class embedding, "embeddings".
//
// ctx        provides the model and the metadata buffer the graph is built in
// img_batch  must contain exactly one image; its n_tiles / n_channels are used when positive,
//            otherwise 4 tiles / 3 channels are assumed
//
// The returned graph lives in ctx->buf_compute_meta, so it is only valid until that buffer is
// reused for another graph.
static ggml_cgraph * mllama_image_build_graph(ca_context * ctx, const ca_image_batch img_batch) {
    const auto & model = *ctx->model;
    const auto &hparams = model.hparams;

    const int image_size = hparams.image_size;
    const int image_size_width = image_size;
    const int image_size_height = image_size;

    const int patch_size = hparams.patch_size;
    const int n_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
    const int n_positions = n_patches + (model.class_embedding == nullptr ? 0 : 1);
    const int hidden_size = hparams.hidden_size;
    const int n_head = hparams.n_head;
    const int d_head = hidden_size / n_head;

    const int batch_size = img_batch.size();
    GGML_ASSERT(img_batch.size() == 1);

    const int n_tiles = img_batch[0].n_tiles > 0 ? img_batch[0].n_tiles : 4;
    const int n_channels = img_batch[0].n_channels > 0 ? img_batch[0].n_channels : 3;

    struct ggml_init_params params = {
        /* mem_size   */  ctx->buf_compute_meta.size(),
        /* mem buffer */  ctx->buf_compute_meta.data(),
        /* no_alloc   */  true,
    };

    struct ggml_context * ctx0 = ggml_init(params);
    struct ggml_cgraph * gf = ggml_new_graph(ctx0);

    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, n_channels, n_tiles);
    ggml_set_input(inp_raw);
    ggml_set_name(inp_raw, "inp_raw");

    // patch embedding
    struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
    inp = ggml_reshape_3d(ctx0, inp, n_patches, hidden_size, n_tiles);
    inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
    inp = ggml_cont(ctx0, inp);

    struct ggml_tensor * aspect_ratios = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, img_batch.size());
    ggml_set_name(aspect_ratios, "aspect_ratios");
    ggml_set_input(aspect_ratios);

    if (model.pre_tile_position_embeddings != nullptr) {
        struct ggml_tensor *pre_tile_position_embeddings = ggml_get_rows(ctx0, model.pre_tile_position_embeddings, aspect_ratios);

        pre_tile_position_embeddings = ggml_reshape_3d(ctx0, pre_tile_position_embeddings, hidden_size, 1, n_tiles);
        if (model.pre_tile_position_embeddings_gate != nullptr) {
            pre_tile_position_embeddings = ggml_mul(ctx0, pre_tile_position_embeddings, model.pre_tile_position_embeddings_gate);
        }

        inp = ggml_add(ctx0, inp, pre_tile_position_embeddings);
    }

    struct ggml_tensor * embeddings = inp;

    if (model.class_embedding != nullptr) {
        // "embeddings" is an input the caller fills; the class embedding and the patch
        // embeddings are accumulated into it
        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, n_positions, n_tiles);
        ggml_set_name(embeddings, "embeddings");
        ggml_set_input(embeddings);

        for (int i = 0; i < n_tiles; ++i) {
            // repeat class embeddings for each tile
            embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], i * embeddings->nb[2]);
            ggml_set_input(embeddings);
            ggml_format_name(embeddings, "embeddings-acc-%d", i);
        }

        embeddings = ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
        ggml_set_input(embeddings);
        ggml_set_name(embeddings, "embeddings-acc-inp");
    }

    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_positions);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    struct ggml_tensor * position_embd = ggml_get_rows(ctx0, model.position_embeddings, positions);
    ggml_set_name(position_embd, "position_embd");

    if (model.position_embeddings_gate != nullptr) {
        position_embd = ggml_mul(ctx0, position_embd, model.position_embeddings_gate);
        ggml_set_name(position_embd, "position_embd_gated");
    }

    embeddings = ggml_add(ctx0, embeddings, position_embd);
    ggml_set_name(embeddings, "embeddings_after_position_embd");

    if (model.tile_position_embeddings != nullptr) {
        struct ggml_tensor * tile_position_embeddings = ggml_get_rows(ctx0, model.tile_position_embeddings, aspect_ratios);
        ggml_set_name(tile_position_embeddings, "tile_position_embeddings");

        tile_position_embeddings = ggml_reshape_3d(ctx0, tile_position_embeddings, hidden_size, n_positions, n_tiles);
        ggml_set_name(tile_position_embeddings, "tile_position_embeddings_reshaped");

        if (model.tile_position_embeddings_gate != nullptr) {
            tile_position_embeddings = ggml_mul(ctx0, tile_position_embeddings, model.tile_position_embeddings_gate);
            ggml_set_name(tile_position_embeddings, "tile_position_embeddings_gated");
        }

        embeddings = ggml_add(ctx0, embeddings, tile_position_embeddings);
        ggml_set_name(embeddings, "embeddings_after_tile_position_embd");
    }

    // pre-layernorm
    if (model.pre_norm_w != nullptr) {
        embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.pre_norm_w);
        ggml_set_name(embeddings, "pre layernorm");
        if (model.pre_norm_b != nullptr) {
            embeddings = ggml_add(ctx0, embeddings, model.pre_norm_b);
            ggml_set_name(embeddings, "pre layernorm with bias");
        }

        ggml_set_name(embeddings, "pre layernorm");
    }

    // Number of rows needed to make the token dimension a multiple of 8. The outer "% 8" must
    // apply to the whole difference so that an already aligned count pads by 0 rather than 8.
    const int num_padding_patches = static_cast<int>((8 - (embeddings->ne[1] % 8)) % 8);

    embeddings = ggml_pad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
    ggml_set_name(embeddings, "embeddings_pad");

    embeddings = ggml_view_3d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1] * embeddings->ne[2], batch_size, embeddings->nb[1], embeddings->nb[2] * embeddings->ne[3], 0);
    ggml_set_name(embeddings, "embeddings_view");

    std::vector<struct ggml_tensor *> intermediate_embeddings;

    // encoder
    // NOTE(review): intermediate_layers[iil] is still read after the last listed layer has been
    // matched; confirm the container is large enough (or terminated) for iil == number of entries.
    for (size_t il = 0, iil = 0; il < model.layers.size(); il++) {
        if (hparams.intermediate_layers[iil] == (int) il) {
            intermediate_embeddings.push_back(embeddings);
            iil++;
        }

        embeddings = mllama_image_build_encoder_layer(
            ctx0, il, model.layers[il], embeddings,
            hparams.eps, hidden_size, batch_size, n_head, d_head);
        ggml_format_name(embeddings, "layer-%zu", il);
    }

    // post-layernorm
    if (model.post_norm_w != nullptr) {
        embeddings = ggml_mul(ctx0, ggml_norm(ctx0, embeddings, hparams.eps), model.post_norm_w);
        ggml_set_name(embeddings, "post layernorm");
        if (model.post_norm_b != nullptr) {
            embeddings = ggml_add(ctx0, embeddings, model.post_norm_b);
            ggml_set_name(embeddings, "post_layernorm_bias");
        }

        ggml_set_name(embeddings, "post layernorm");
    }

    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, n_positions + num_padding_patches, n_tiles);
    ggml_set_name(embeddings, "embeddings_reshaped");

    if (model.post_tile_position_embeddings != nullptr) {
        struct ggml_tensor * post_tile_position_embeddings = ggml_get_rows(ctx0, model.post_tile_position_embeddings, aspect_ratios);

        post_tile_position_embeddings = ggml_reshape_3d(ctx0, post_tile_position_embeddings, hidden_size, 1, n_tiles);
        if (model.post_tile_position_embeddings_gate != nullptr) {
            post_tile_position_embeddings = ggml_mul(ctx0, post_tile_position_embeddings, model.post_tile_position_embeddings_gate);
        }

        embeddings = ggml_add(ctx0, embeddings, post_tile_position_embeddings);
    }

    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, n_tiles * (n_positions + num_padding_patches), 1);

    // global encoder
    for (size_t il = 0; il < model.global_layers.size(); il++) {
        embeddings = mllama_image_build_encoder_layer(
            ctx0, il, model.global_layers[il], embeddings,
            hparams.eps, hidden_size, batch_size, n_head, d_head);
        ggml_format_name(embeddings, "global_layer-%zu", il);
    }

    // stack the collected intermediate states along a new leading dimension
    struct ggml_tensor * stacked_embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 0, hidden_size, (n_positions + num_padding_patches) * n_tiles);
    for (size_t i = 0; i < intermediate_embeddings.size(); ++i) {
        stacked_embeddings = ggml_concat(ctx0, stacked_embeddings, ggml_reshape_3d(ctx0, intermediate_embeddings[i], 1, intermediate_embeddings[i]->ne[0], intermediate_embeddings[i]->ne[1]), 0);
    }

    stacked_embeddings = ggml_reshape_4d(ctx0, stacked_embeddings, intermediate_embeddings.size() * hidden_size, n_positions + num_padding_patches, n_tiles, batch_size);
    stacked_embeddings = ggml_unpad(ctx0, stacked_embeddings, 0, num_padding_patches, 0, 0);

    // drop the padding rows again and append the intermediate states to the final state
    embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, n_positions + num_padding_patches, n_tiles);
    embeddings = ggml_unpad(ctx0, embeddings, 0, num_padding_patches, 0, 0);
    embeddings = ggml_concat(ctx0, embeddings, stacked_embeddings, 0);

    // mllama projector
    embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_1_w, embeddings), model.mm_1_b);
    ggml_set_name(embeddings, "mmproj");

    // build the graph
    ggml_build_forward_expand(gf, embeddings);

    ggml_free(ctx0);

    return gf;
}

// Runs the mllama vision encoder on one image and stores the projected embeddings in `output`.
//
// The image is preprocessed into tiles, a graph is built for it, the graph inputs ("inp_raw",
// "embeddings", "positions", "aspect_ratios") are filled, and the "mmproj" tensor is read back.
//
// Returns 0 on success, -1 if the graph could not be allocated, -2 if preprocessing failed.
//
// The graph is rebuilt on every call: it lives in the context's metadata buffer, so a graph
// cached in a function-local static would keep pointing into the first context ever used.
//
// NOTE(review): `output` is resized to the tensor's size in BYTES, i.e. it ends up with four
// times as many floats as the tensor holds (the tail stays zero). This is kept unchanged because
// shrinking it requires auditing every caller that indexes the vector.
static int32_t encode_image_with_ca_vision(ca_context & ctx,
        llama_img img, std::vector<float> & output) {
    const auto & model = *ctx.model;
    auto & hparams = model.hparams;

    mllama_image mllama_img = {&img, 3, 4};
    ca_image_batch img_batch;

    // The preprocessed image (allocated with new, pixel data with calloc) is only needed inside
    // this function; release it on every exit path.
    struct batch_releaser {
        ca_image_batch & batch;
        explicit batch_releaser(ca_image_batch & b) : batch(b) {}
        ~batch_releaser() {
            for (auto & entry : batch) {
                if (entry.img != nullptr) {
                    free(entry.img->data);
                    delete entry.img;
                    entry.img = nullptr;
                }
            }
        }
    } release_batch(img_batch);

    if (!mllama_image_preprocess(mllama_img, img_batch) ||
        img_batch.empty() || img_batch[0].img == nullptr) {
        LLAMA_LOG_ERROR("%s: unable to preprocess mllama image\n", __func__);
        return -2;
    }

    ggml_cgraph * gf = mllama_image_build_graph(&ctx, img_batch);
    ggml_backend_sched_reset(ctx.sched);
    bool ok = ggml_backend_sched_alloc_graph(ctx.sched, gf);
    if (!ok) {
        LLAMA_LOG_ERROR("failed to alloc memory for graph\n");
        return -1;
    }

    // Set inputs
    const int image_size = hparams.image_size;
    int image_size_width = image_size;
    int image_size_height = image_size;

    const int patch_size = hparams.patch_size;
    const int n_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
    const int n_positions = n_patches + (model.class_embedding == nullptr ? 0 : 1);

    // set raw input
    {
        struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
        ggml_backend_tensor_set(inp_raw, img_batch[0].img->data, 0, ggml_nbytes(inp_raw));

        // Debug dump of the first values of every tile. The tile geometry is taken from the
        // tensor that was just filled, so the dump never reads more than was uploaded.
        const float * float_data = reinterpret_cast<const float *>(img_batch[0].img->data);
        const size_t values_per_tile = static_cast<size_t>(inp_raw->ne[0] * inp_raw->ne[1] * inp_raw->ne[2]);
        const int n_dump_tiles = static_cast<int>(inp_raw->ne[3]);
        const int n_dump_values = static_cast<int>(std::min<size_t>(10, values_per_tile));
        for (int tile = 0; tile < n_dump_tiles; tile++) {
            size_t offset = values_per_tile * tile;
            printf("\nTile %d first 10 values:\n", tile);
            for (int i = 0; i < n_dump_values; i++) {
                printf("  [%d] = %f\n", i, float_data[offset + i]);
            }
        }
    }

    if (model.class_embedding) {
        // the graph accumulates into this tensor, so it has to start out as all zeros
        struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");

        const std::vector<uint8_t> zero_mem(ggml_nbytes(embeddings), 0);
        ggml_backend_tensor_set(embeddings, zero_mem.data(), 0, zero_mem.size());
    }

    {
        struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
        if (positions != nullptr) {
            printf("n_positions bytes: %zu, n_positions: %d\n", ggml_nbytes(positions), n_positions);
            // positions are simply 0, 1, 2, ...
            std::vector<int32_t> positions_data(ggml_nbytes(positions) / sizeof(int32_t));
            for (size_t i = 0; i < positions_data.size(); i++) {
                positions_data[i] = static_cast<int32_t>(i);
            }
            ggml_backend_tensor_set(positions, positions_data.data(), 0, ggml_nbytes(positions));
        }
    }

    {
        struct ggml_tensor * aspect_ratios = ggml_graph_get_tensor(gf, "aspect_ratios");
        if (aspect_ratios != nullptr) {
            std::vector<int32_t> aspect_ratios_data(ggml_nbytes(aspect_ratios) / sizeof(int32_t), 0);
            if (!aspect_ratios_data.empty()) {
                aspect_ratios_data[0] = img_batch[0].img->aspect_ratio;
            }
            ggml_backend_tensor_set(aspect_ratios, aspect_ratios_data.data(), 0, ggml_nbytes(aspect_ratios));
        }
    }

    ggml_backend_sched_graph_compute_async(ctx.sched, gf);

    struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "mmproj");

    size_t out_nbytes = n_positions * embeddings->ne[0] * hparams.n_tiles * sizeof(float);
    GGML_ASSERT(out_nbytes == ggml_nbytes(embeddings));
    output.resize(out_nbytes);

    ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(ctx.sched, embeddings);
    ggml_backend_tensor_get_async(backend_embd, embeddings, output.data(), 0, ggml_nbytes(embeddings));
    ggml_backend_sched_synchronize(ctx.sched);

    for (int i = 0; i < 10; i++) {
        printf("vision encoder output[%d] = %f\n", i, output[i]);
    }

    ggml_backend_sched_reset(ctx.sched);
    return 0;
}


////////////////////////////////////////////////////////////////////////////////////////
// public API

// Encodes every image of `batch` with the clip encoder and stores the results in the context.
//
// ctx.out_embd receives the embeddings of all images back to back; ctx.out_pos receives, for
// every output token, the image's start position from batch->pos plus the token's index within
// the image.
//
// Returns 0 on success (including an empty batch) or the first non-zero status reported by
// encode_image_with_clip.
int32_t llama_encode_vision_internal(clip_context & ctx, llama_batch_img * batch) {
    if (batch->n_imgs == 0) {
        return 0;
    }

    // TODO: batching is not working atm, should be fixed later
    const int n_embd = clip_n_mmproj_embd(ctx);
    const int n_tokens_per_img = clip_n_patches(ctx);
    const int n_pos = n_tokens_per_img*batch->n_imgs;

    ctx.out_embd.resize(n_embd*n_pos);
    ctx.out_pos.resize(n_pos);

    for (int img_idx = 0; img_idx < batch->n_imgs; ++img_idx) {
        std::vector<float> img_embd;
        const int32_t status = encode_image_with_clip(ctx, *batch->imgs[img_idx], img_embd);
        if (status != 0) {
            return status;
        }

        // append this image's embeddings after those of the previous images
        const int embd_base = n_embd*n_tokens_per_img*img_idx;
        for (int k = 0; k < n_embd*n_tokens_per_img; ++k) {
            ctx.out_embd[embd_base + k] = img_embd[k];
        }

        // consecutive positions, starting at the position requested for this image
        const int pos_base = n_tokens_per_img*img_idx;
        for (int tok = 0; tok < n_tokens_per_img; ++tok) {
            ctx.out_pos[pos_base + tok] = batch->pos[img_idx] + tok;
        }
    }

    return 0;
}

// Encodes every image of `batch` with the mllama vision encoder and stores the results in the
// context.
//
// Per image the encoder yields n_positions * n_tiles tokens of n_embd floats, where n_positions
// is the number of patches plus one if the model has a class embedding. ctx.out_embd receives
// these floats for all images back to back; ctx.out_pos receives n_positions entries per image,
// counting up from the image's start position in batch->pos.
//
// Both outputs are sized in elements and indexed with the same per-image stride that was used to
// size them, so images after the first stay inside the buffers.
//
// Returns 0 on success (including an empty batch), the first non-zero status reported by
// encode_image_with_ca_vision, or -1 if the encoder returned fewer values than expected.
int32_t ca_llama_encode_vision_internal(ca_context & ctx, llama_batch_img * batch) {
    if (batch->n_imgs == 0) {
        return 0;
    }
    const int n_embd = ca_n_mmproj_embd(ctx);

    const auto & hparams = ctx.model->hparams;
    const int image_size = hparams.image_size;
    const int patch_size = hparams.patch_size;
    const int n_patches = ((image_size / patch_size) * (image_size / patch_size));
    const int n_positions = n_patches + (ctx.model->class_embedding == nullptr ? 0 : 1);

    const size_t n_imgs = static_cast<size_t>(batch->n_imgs);
    const size_t out_ntokens = static_cast<size_t>(n_positions) * hparams.n_tiles;
    const size_t floats_per_img = out_ntokens * n_embd;

    ctx.out_embd.resize(floats_per_img * n_imgs);
    ctx.out_pos.resize(static_cast<size_t>(n_positions) * n_imgs);

    for (int i = 0; i < batch->n_imgs; i++) {
        std::vector<float> output_single;
        int32_t status = encode_image_with_ca_vision(ctx, *batch->imgs[i], output_single);
        if (status != 0) {
            return status;
        }

        if (output_single.size() < floats_per_img) {
            LLAMA_LOG_ERROR("%s: encoder returned %zu values, expected at least %zu\n",
                            __func__, output_single.size(), floats_per_img);
            return -1;
        }

        // copy output embeddings to result
        std::copy_n(output_single.begin(), floats_per_img,
                    ctx.out_embd.begin() + floats_per_img * i);

        // fill position for all output tokens
        const size_t pos_base = static_cast<size_t>(n_positions) * i;
        for (int p = 0; p < n_positions; p++) {
            ctx.out_pos[pos_base + p] = batch->pos[i] + p;
        }
    }

    return 0;
}

// Empties the embedding and position outputs stored in the clip context.
void llama_vision_clear_output(clip_context & ctx) {
    ctx.out_pos.clear();
    ctx.out_embd.clear();
}

// Empties the embedding and position outputs stored in the mllama vision context.
void ca_llama_vision_clear_output(ca_context & ctx) {
    ctx.out_pos.clear();
    ctx.out_embd.clear();
}

////////////////////////////////////////////////////////////////////////////////////////
// for debugging
#ifndef NDEBUG

// Writes an RGB8 image to `location` as an uncompressed 24-bit BMP (debug helper).
//
// The file consists of the 14-byte file header, a 124-byte info header and the pixel rows stored
// bottom-up in BGR order, each row padded with zeros to a multiple of 4 bytes.
//
// All 32-bit header fields are written in full, and the row padding is written explicitly for
// every row (including the last one), so the sizes stored in the header match the file contents.
//
// Returns 1 on success; 0 if the image is empty, its buffer is smaller than nx * ny * 3 bytes,
// or the file cannot be opened or written.
static int bmp_export(const struct clip_image_u8 &img, const std::string &location) {
    if (img.nx <= 0 || img.ny <= 0) {
        return 0;
    }

    const uint32_t width = img.nx;
    const uint32_t height = img.ny;

    const size_t row_bytes = static_cast<size_t>(width) * 3;
    if (img.buf.size() < row_bytes * height) {
        return 0;
    }

    // Rows are padded to a multiple of 4 bytes.
    const size_t padding = (4 - row_bytes % 4) % 4;
    const size_t stride = row_bytes + padding;

    std::ofstream fout(location, std::ios::out | std::ios::binary);

    if (fout.fail()) {
        return 0;
    }

    const uint32_t file_header_size = 14;
    const uint32_t dib_header_size = 124;
    const uint32_t pixel_offset = file_header_size + dib_header_size;
    const uint32_t pixel_bytes = static_cast<uint32_t>(stride * height);
    const uint32_t file_size = pixel_offset + pixel_bytes;

    // Fields that are not set below stay zero.
    std::vector<uint8_t> header(pixel_offset, 0);

    // little-endian writers
    const auto put_u16 = [&header](size_t at, uint16_t value) {
        header[at]     = static_cast<uint8_t>(value);
        header[at + 1] = static_cast<uint8_t>(value >> 8);
    };
    const auto put_u32 = [&header](size_t at, uint32_t value) {
        for (int byte = 0; byte < 4; ++byte) {
            header[at + byte] = static_cast<uint8_t>(value >> (8 * byte));
        }
    };

    // Bitmap file header.
    header[0] = 'B';
    header[1] = 'M';
    put_u32(2, file_size);
    put_u32(10, pixel_offset);

    // Bitmap information header.
    put_u32(14, dib_header_size);
    put_u32(18, width);
    put_u32(22, height);
    put_u16(26, 1);            // number of planes
    put_u16(28, 24);           // bits per pixel
    put_u32(30, 0);            // compression: BI_RGB
    put_u32(34, pixel_bytes);  // size of the pixel array, including row padding
    put_u32(38, 2834);         // horizontal resolution
    put_u32(42, 2834);         // vertical resolution

    fout.write(reinterpret_cast<const char *>(header.data()), static_cast<std::streamsize>(header.size()));

    // Writing the pixel array: bottom row first, red and blue swapped.
    std::vector<uint8_t> row(stride, 0);
    for (uint32_t y = height; y-- > 0; ) {
        const uint8_t * src = img.buf.data() + static_cast<size_t>(y) * row_bytes;
        for (uint32_t x = 0; x < width; x++) {
            row[x*3 + 0] = src[x*3 + 2];
            row[x*3 + 1] = src[x*3 + 1];
            row[x*3 + 2] = src[x*3 + 0];
        }
        fout.write(reinterpret_cast<const char *>(row.data()), static_cast<std::streamsize>(row.size()));
    }

    fout.close();
    return fout.fail() ? 0 : 1;
}

#endif