Skip to content

Commit

Permalink
llama : refactor model loader with backend registry (#10026)
Browse files Browse the repository at this point in the history
  • Loading branch information
slaren authored Oct 30, 2024
1 parent 8f275a7 commit c5b0f4b
Show file tree
Hide file tree
Showing 18 changed files with 1,914 additions and 2,030 deletions.
132 changes: 22 additions & 110 deletions examples/llama-bench/llama-bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,6 @@
#include "ggml.h"
#include "llama.h"
#include "common.h"
#include "ggml-cuda.h"
#include "ggml-sycl.h"

#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif

#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
Expand Down Expand Up @@ -82,95 +76,27 @@ static T stdev(const std::vector<T> & v) {
}

static std::string get_cpu_info() {
std::string id;
#ifdef __linux__
FILE * f = fopen("/proc/cpuinfo", "r");
if (f) {
char buf[1024];
while (fgets(buf, sizeof(buf), f)) {
if (strncmp(buf, "model name", 10) == 0) {
char * p = strchr(buf, ':');
if (p) {
p++;
while (std::isspace(*p)) {
p++;
}
while (std::isspace(p[strlen(p) - 1])) {
p[strlen(p) - 1] = '\0';
}
id = p;
break;
}
}
std::vector<std::string> cpu_list;
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
auto * dev = ggml_backend_dev_get(i);
auto dev_type = ggml_backend_dev_type(dev);
if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
cpu_list.push_back(ggml_backend_dev_description(dev));
}
fclose(f);
}
#elif defined(_WIN32)
HKEY hKey;
if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
0,
KEY_READ,
&hKey) != ERROR_SUCCESS) {
// fail to open registry key
return "";
}
char cpu_brand[256];
DWORD cpu_brand_size = sizeof(cpu_brand);
if (RegQueryValueExA(hKey,
TEXT("ProcessorNameString"),
NULL,
NULL,
(LPBYTE)cpu_brand,
&cpu_brand_size) == ERROR_SUCCESS) {
id.assign(cpu_brand, cpu_brand_size);
if (id.find('\0') != std::string::npos) {
id.resize(id.find('\0'));
}
}
RegCloseKey(hKey);
#endif
// TODO: other platforms
return id;
return join(cpu_list, ", ");
}

static std::string get_gpu_info() {
std::string id;
#ifdef GGML_USE_CUDA
int count = ggml_backend_cuda_get_device_count();
for (int i = 0; i < count; i++) {
char buf[128];
ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
id += buf;
if (i < count - 1) {
id += "/";
std::vector<std::string> gpu_list;
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
auto * dev = ggml_backend_dev_get(i);
auto dev_type = ggml_backend_dev_type(dev);
if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
gpu_list.push_back(ggml_backend_dev_description(dev));
}
}
#endif
#ifdef GGML_USE_SYCL
int count = ggml_backend_sycl_get_device_count();
for (int i = 0; i < count; i++) {
char buf[128];
ggml_backend_sycl_get_device_description(i, buf, sizeof(buf));
id += buf;
if (i < count - 1) {
id += "/";
}
}
#endif
#ifdef GGML_USE_CANN
uint32_t count = ggml_backend_cann_get_device_count();
for (uint32_t i = 0; i < count; i++) {
char buf[128];
ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
id += buf;
if (i < count - 1) {
id += "/";
}
}
#endif
// TODO: other backends
return id;
return join(gpu_list, ", ");
}

// command line params
Expand Down Expand Up @@ -938,29 +864,15 @@ struct test {
}

static std::string get_backend() {
if (cuda) {
return GGML_CUDA_NAME;
}
if (vulkan) {
return "Vulkan";
}
if (kompute) {
return "Kompute";
}
if (metal) {
return "Metal";
}
if (sycl) {
return GGML_SYCL_NAME;
}
if (gpu_blas) {
return "GPU BLAS";
}
if (blas) {
return "BLAS";
std::vector<std::string> backends;
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
auto * reg = ggml_backend_reg_get(i);
std::string name = ggml_backend_reg_name(reg);
if (name != "CPU") {
backends.push_back(ggml_backend_reg_name(reg));
}
}

return "CPU";
return backends.empty() ? "CPU" : join(backends, ",");
}

static const std::vector<std::string> & get_fields() {
Expand Down
19 changes: 12 additions & 7 deletions ggml/include/ggml-backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,12 @@ extern "C" {
//

enum ggml_backend_dev_type {
// CPU device using system memory
GGML_BACKEND_DEVICE_TYPE_CPU,
// GPU device using dedicated memory
GGML_BACKEND_DEVICE_TYPE_GPU,
// devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
GGML_BACKEND_DEVICE_TYPE_GPU_FULL
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL
};

// functionality supported by the device
Expand Down Expand Up @@ -167,10 +168,14 @@ extern "C" {
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);

// Common functions that may be obtained using ggml_backend_reg_get_proc_address

// Functions that may be obtained using ggml_backend_reg_get_proc_address
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t, int);
// Split buffer type for tensor parallelism
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
// Set the number of threads for the backend
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
// Get additional buffer types provided by the device (returns a NULL-terminated array)
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);

//
// Backend registry
Expand All @@ -192,7 +197,7 @@ extern "C" {
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void);

//
Expand Down
2 changes: 1 addition & 1 deletion ggml/include/ggml-cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);

// split tensor buffer that splits matrices by rows across multiple devices
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);

// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
Expand Down
33 changes: 8 additions & 25 deletions ggml/src/ggml-amx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,6 @@
#if defined(__AMX_INT8__)

// AMX buffer interface
static const char * ggml_backend_amx_buffer_get_name(ggml_backend_buffer_t buffer) {
return "AMX";

GGML_UNUSED(buffer);
}

static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
}
Expand Down Expand Up @@ -72,7 +66,6 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
}

static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
/* .get_name = */ ggml_backend_amx_buffer_get_name,
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
/* .get_base = */ ggml_backend_amx_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
Expand Down Expand Up @@ -121,14 +114,14 @@ static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
/* .iface = */ {
/* .get_name = */ ggml_backend_amx_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
/* .is_host = */ ggml_backend_amx_buffer_type_is_host,
/* .get_name = */ ggml_backend_amx_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
/* .is_host = */ ggml_backend_amx_buffer_type_is_host,
},
/* .device = */ NULL,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
/* .context = */ NULL,
};

Expand All @@ -149,12 +142,6 @@ static void ggml_backend_amx_free(ggml_backend_t backend) {
delete backend;
}

static ggml_backend_buffer_type_t ggml_backend_amx_get_default_buffer_type(ggml_backend_t backend) {
return ggml_backend_amx_buffer_type();

GGML_UNUSED(backend);
}

static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;

Expand Down Expand Up @@ -187,7 +174,6 @@ static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, s
static struct ggml_backend_i ggml_backend_amx_i = {
/* .get_name = */ ggml_backend_amx_name,
/* .free = */ ggml_backend_amx_free,
/* .get_default_buffer_type = */ ggml_backend_amx_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
Expand All @@ -197,9 +183,6 @@ static struct ggml_backend_i ggml_backend_amx_i = {
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_amx_graph_compute,
/* .supports_op = */ NULL,
/* .supports_buft = */ NULL,
/* .offload_op = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
Expand Down Expand Up @@ -279,7 +262,7 @@ static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t *
}

static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
return GGML_BACKEND_DEVICE_TYPE_CPU;
return GGML_BACKEND_DEVICE_TYPE_ACCEL;

GGML_UNUSED(dev);
}
Expand Down
19 changes: 4 additions & 15 deletions ggml/src/ggml-backend-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ extern "C" {
size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
// (optional) check if tensor data is in host memory (defaults to false)
// (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
bool (*is_host) (ggml_backend_buffer_type_t buft);
};

Expand All @@ -37,7 +37,6 @@ extern "C" {
//

struct ggml_backend_buffer_i {
const char * (*get_name) (ggml_backend_buffer_t buffer);
// (optional) free the buffer
void (*free_buffer) (ggml_backend_buffer_t buffer);
// base address of the buffer
Expand Down Expand Up @@ -88,19 +87,16 @@ extern "C" {

void (*free)(ggml_backend_t backend);

// Will be moved to the device interface
// buffer allocation
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);

// (optional) asynchronous tensor data access
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);

// (optional) complete all pending operations
// (optional) complete all pending operations (required if the backend supports async operations)
void (*synchronize)(ggml_backend_t backend);

// (optional) compute graph with a plan (not used currently)
// (optional) graph plans (not used currently)
// compute graph with a plan
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
Expand All @@ -111,13 +107,6 @@ extern "C" {
// compute graph (always async if supported by the backend)
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);

// IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
// new backends should implement the device interface instead
// These functions are being moved to the device interface
bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);

// (optional) event synchronization
// record an event on this stream
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
Expand Down
Loading

0 comments on commit c5b0f4b

Please sign in to comment.