Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

llama : initial ggml-backend integration #4520

Merged
merged 24 commits into from
Dec 21, 2023
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
8e6735e
llama : initial ggml-backend integration
slaren Dec 17, 2023
0808aa5
add ggml-metal
slaren Dec 19, 2023
9450791
Merge remote-tracking branch 'origin/master' into sl/ggml-backend-int
slaren Dec 19, 2023
0c5ee7c
cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_…
slaren Dec 19, 2023
1ac01fb
add ggml_backend_buffer_clear
slaren Dec 19, 2023
c8bd5d8
add ggml_backend_buffer_is_hos, used to avoid copies if possible when…
slaren Dec 19, 2023
72a0c96
disable gpu backends with ngl 0
slaren Dec 20, 2023
d3e7242
more accurate mlock
slaren Dec 20, 2023
c3678ca
unmap offloaded part of the model
slaren Dec 20, 2023
5241045
use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance…
slaren Dec 20, 2023
bcd87ca
update quantize and lora
slaren Dec 20, 2023
24cc321
update session copy/set to use ggml-backend
slaren Dec 20, 2023
f70f94d
use posix_fadvise instead of posix_fadvise64
slaren Dec 20, 2023
6c045a8
ggml_backend_alloc_ctx_tensors_from_buft : remove old print
slaren Dec 20, 2023
5834a25
llama_mmap::align_offset : use pointers instead of references for out…
slaren Dec 20, 2023
ecb23d4
restore progress_callback behavior
slaren Dec 20, 2023
8ed2a8e
move final progress_callback call to load_all_data
slaren Dec 20, 2023
a4e191f
cuda : fix fprintf format string (minor)
ggerganov Dec 21, 2023
a74b1a8
do not offload scales
slaren Dec 21, 2023
6a72c7f
Merge remote-tracking branch 'origin/master' into sl/ggml-backend-int
slaren Dec 21, 2023
cd4167b
llama_mmap : avoid unmapping the same fragments again in the destructor
slaren Dec 21, 2023
16582cd
Merge remote-tracking branch 'origin/master' into sl/ggml-backend-int
slaren Dec 21, 2023
323881e
remove unnecessary unmap
slaren Dec 21, 2023
f4d884f
metal : add default log function that prints to stderr, cleanup code
slaren Dec 21, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ test: $(TEST_TARGETS)
./$$test_target; \
fi; \
if [ $$? -ne 0 ]; then \
printf 'Test $$test_target FAILED!\n\n' $$test_target; \
printf 'Test %s FAILED!\n\n' $$test_target; \
failures=$$(( failures + 1 )); \
else \
printf 'Test %s passed.\n\n' $$test_target; \
Expand Down
16 changes: 12 additions & 4 deletions ggml-alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
if (update_backend) {
view->backend = view->view_src->backend;
}
view->buffer = view->view_src->buffer;
// views are initialized in the alloc buffer rather than the view_src buffer
view->buffer = alloc->buffer;
view->data = (char *)view->view_src->data + view->view_offs;

// FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
// due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);

if (!alloc->measure) {
Expand Down Expand Up @@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
}

void ggml_allocr_free(ggml_allocr_t alloc) {
if (alloc == NULL) {
return;
}

ggml_gallocr_free(alloc->galloc);
ggml_tallocr_free(alloc->talloc);
free(alloc);
Expand Down Expand Up @@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
}

if (nbytes == 0) {
fprintf(stderr, "%s: no tensors to allocate\n", __func__);
// all the tensors in the context are already allocated
return NULL;
}

Expand All @@ -789,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
} else {
ggml_backend_view_init(buffer, t);
}
} else {
if (t->view_src != NULL) {
// view of a pre-allocated tensor
ggml_backend_view_init(buffer, t);
}
}
}

Expand Down
20 changes: 12 additions & 8 deletions ggml-backend-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ extern "C" {
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
// check if tensor data is in host memory
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
bool (*is_host) (ggml_backend_buffer_type_t buft);
};

struct ggml_backend_buffer_type {
Expand All @@ -31,15 +34,16 @@ extern "C" {
typedef void * ggml_backend_buffer_context_t;

struct ggml_backend_buffer_i {
void (*free_buffer)(ggml_backend_buffer_t buffer);
void (*free_buffer) (ggml_backend_buffer_t buffer);
//void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
void * (*get_base) (ggml_backend_buffer_t buffer);
void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
void * (*get_base) (ggml_backend_buffer_t buffer);
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// (optional) copy tensor between different buffer-type, allow for single-copy tranfers
void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
};

struct ggml_backend_buffer {
Expand Down Expand Up @@ -78,7 +82,7 @@ extern "C" {
void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);

void (*synchronize) (ggml_backend_t backend);
void (*synchronize)(ggml_backend_t backend);

// compute graph with a plan
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
Expand Down
80 changes: 75 additions & 5 deletions ggml-backend.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba
return buft->iface.supports_backend(buft, backend);
}

bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
if (buft->iface.is_host) {
return buft->iface.is_host(buft);
}
return false;
}

// backend buffer

ggml_backend_buffer_t ggml_backend_buffer_init(
Expand Down Expand Up @@ -94,6 +101,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
}

void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
buffer->iface.clear(buffer, value);
}

bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
}

ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
return buffer->buft;
}
Expand Down Expand Up @@ -378,7 +393,6 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {

static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
GGML_UNUSED(buffer);
}

static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
Expand Down Expand Up @@ -411,6 +425,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer,
GGML_UNUSED(buffer);
}

static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
memset(buffer->context, value, buffer->size);
}

static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
Expand All @@ -419,6 +437,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
/* .clear = */ ggml_backend_cpu_buffer_clear,
};

// for buffers from ptr, free is not called
Expand All @@ -430,6 +449,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
/* .clear = */ ggml_backend_cpu_buffer_clear,
};

static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
Expand All @@ -455,20 +475,70 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
GGML_UNUSED(buft);
}

static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return true;

GGML_UNUSED(buft);
}

ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = {
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
/* .iface = */ {
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
},
/* .context = */ NULL,
};

return &ggml_backend_buffer_type_cpu;
return &ggml_backend_cpu_buffer_type;
}

#ifdef GGML_USE_CPU_HBM

// buffer type HBM

#include <hbwmalloc.h>

static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
hbw_free(buffer->context);
}

static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
//void * ptr = hbw_malloc(size);
void * ptr;
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
if (result != 0) {
fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
return NULL;
}

// FIXME: this is a hack to avoid having to implement a new buffer type
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft;
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;

return buffer;
}

ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
/* .iface = */ {
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
},
/* .context = */ NULL,
};

return &ggml_backend_cpu_buffer_type_hbm;
}
#endif

struct ggml_backend_cpu_context {
int n_threads;
void * work_data;
Expand Down Expand Up @@ -505,7 +575,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));

cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
cpu_plan->cgraph = *cgraph;
cpu_plan->cgraph = *cgraph; // FIXME: deep copy

if (cpu_plan->cplan.work_size > 0) {
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
Expand Down Expand Up @@ -1180,7 +1250,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
// utils
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
GGML_ASSERT(tensor->buffer == NULL);
GGML_ASSERT(tensor->data == NULL);
//GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
GGML_ASSERT(tensor->view_src != NULL);
GGML_ASSERT(tensor->view_src->buffer != NULL);
GGML_ASSERT(tensor->view_src->data != NULL);
Expand Down
7 changes: 7 additions & 0 deletions ggml-backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ extern "C" {
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);

// buffer
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
Expand All @@ -29,6 +30,8 @@ extern "C" {
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);

//
Expand Down Expand Up @@ -76,6 +79,10 @@ extern "C" {

GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif

//
// Backend registry
//
Expand Down
Loading
Loading