Skip to content

Commit

Permalink
cuda : improve cuda pool efficiency using virtual memory (ggerganov#4606
Browse files Browse the repository at this point in the history
)

* cuda : improve cuda pool efficiency using virtual memory

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci

* fix hip build

* clarify granularity

* move all caps to g_device_caps

* refactor error checking

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
  • Loading branch information
slaren and JohannesGaessler authored Dec 24, 2023
1 parent 708e179 commit 5bf3953
Show file tree
Hide file tree
Showing 8 changed files with 328 additions and 208 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,8 @@ if (LLAMA_CUBLAS)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)

if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
Expand Down
6 changes: 2 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -367,17 +367,15 @@ endif # LLAMA_BLIS

ifdef LLAMA_CUBLAS
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
OBJS += ggml-cuda.o
MK_NVCCFLAGS = -use_fast_math
ifndef JETSON_EOL_MODULE_DETECT
MK_NVCCFLAGS += --forward-unknown-to-host-compiler
endif # JETSON_EOL_MODULE_DETECT

ifdef LLAMA_DEBUG
MK_NVCCFLAGS += -lineinfo
endif

endif # LLAMA_DEBUG
ifdef LLAMA_CUDA_NVCC
NVCC = $(LLAMA_CUDA_NVCC)
else
Expand Down
16 changes: 6 additions & 10 deletions ggml-backend.c
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ static void ggml_backend_registry_init(void) {
void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);

int id = ggml_backend_registry_count;
size_t id = ggml_backend_registry_count;

ggml_backend_registry[id] = (struct ggml_backend_reg) {
/* .name = */ {0},
Expand Down Expand Up @@ -330,6 +330,8 @@ size_t ggml_backend_reg_find_by_name(const char * name) {
return i;
}
}

// not found
return SIZE_MAX;
}

Expand All @@ -340,15 +342,15 @@ ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str)
const char * params = strchr(backend_str, ':');
char backend_name[128];
if (params == NULL) {
strcpy(backend_name, backend_str);
snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
params = "";
} else {
strncpy(backend_name, backend_str, params - backend_str);
backend_name[params - backend_str] = '\0';
snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
params++;
}

size_t backend_i = ggml_backend_reg_find_by_name(backend_name);

if (backend_i == SIZE_MAX) {
fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
return NULL;
Expand Down Expand Up @@ -396,18 +398,12 @@ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
}

static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");

memcpy((char *)tensor->data + offset, data, size);

GGML_UNUSED(buffer);
}

static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");

memcpy(data, (const char *)tensor->data + offset, size);

GGML_UNUSED(buffer);
Expand Down
Loading

0 comments on commit 5bf3953

Please sign in to comment.