Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

metal : reusing llama.cpp logging #3152

Merged
merged 13 commits into from
Sep 27, 2023
2 changes: 1 addition & 1 deletion examples/llama-bench/llama-bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -911,7 +911,7 @@ static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads)
}
}

static void llama_null_log_callback(enum llama_log_level level, const char * text, void * user_data) {
static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) text;
(void) user_data;
Expand Down
4 changes: 4 additions & 0 deletions ggml-metal.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

#pragma once

#include "ggml.h"

#include <stddef.h>
#include <stdbool.h>

Expand All @@ -33,6 +35,8 @@ struct ggml_cgraph;
extern "C" {
#endif

void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);

struct ggml_metal_context;

// number of command buffers to use
Expand Down
116 changes: 74 additions & 42 deletions ggml-metal.m
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,14 @@
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

// TODO: temporary - reuse llama.cpp logging
#ifdef GGML_METAL_NDEBUG
#define metal_printf(...)
#define GGML_METAL_LOG_INFO(...)
#define GGML_METAL_LOG_WARN(...)
#define GGML_METAL_LOG_ERROR(...)
#else
#define metal_printf(...) fprintf(stderr, __VA_ARGS__)
#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
#endif

#define UNUSED(x) (void)(x)
Expand Down Expand Up @@ -118,8 +121,37 @@ @interface GGMLMetalClass : NSObject
@implementation GGMLMetalClass
@end

ggml_log_callback ggml_metal_log_callback = NULL;
void * ggml_metal_log_user_data = NULL;

void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
ggml_metal_log_callback = log_callback;
ggml_metal_log_user_data = user_data;
}

static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){
if (ggml_metal_log_callback != NULL) {
va_list args;
va_start(args, format);
char buffer[128];
int len = vsnprintf(buffer, 128, format, args);
if (len < 128) {
ggml_metal_log_callback(level, buffer, ggml_metal_log_user_data);
} else {
char* buffer2 = malloc(len+1);
vsnprintf(buffer2, len+1, format, args);
buffer2[len] = 0;
ggml_metal_log_callback(level, buffer2, ggml_metal_log_user_data);
free(buffer2);
}
va_end(args);
}
}



struct ggml_metal_context * ggml_metal_init(int n_cb) {
metal_printf("%s: allocating\n", __func__);
GGML_METAL_LOG_INFO("%s: allocating\n", __func__);

id <MTLDevice> device;
NSString * s;
Expand All @@ -129,14 +161,14 @@ @implementation GGMLMetalClass
NSArray * devices = MTLCopyAllDevices();
for (device in devices) {
s = [device name];
metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
}
#endif

// Pick and show default Metal device
device = MTLCreateSystemDefaultDevice();
s = [device name];
metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);

// Configure context
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
Expand All @@ -163,7 +195,7 @@ @implementation GGMLMetalClass
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];

if (error) {
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
}
Expand All @@ -177,11 +209,11 @@ @implementation GGMLMetalClass
//NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]);
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]);

NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
if (error) {
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}

Expand All @@ -193,7 +225,7 @@ @implementation GGMLMetalClass
ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
#endif
if (error) {
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
return NULL;
}
}
Expand All @@ -205,11 +237,11 @@ @implementation GGMLMetalClass
#define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
(int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
(int) ctx->pipeline_##name.threadExecutionWidth); \
if (error) { \
metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
return NULL; \
}

Expand Down Expand Up @@ -266,21 +298,21 @@ @implementation GGMLMetalClass
#undef GGML_METAL_ADD_KERNEL
}

metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
#if TARGET_OS_OSX
metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
if (ctx->device.maxTransferRate != 0) {
metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
} else {
metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
}
#endif

return ctx;
}

void ggml_metal_free(struct ggml_metal_context * ctx) {
metal_printf("%s: deallocating\n", __func__);
GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
#define GGML_METAL_DEL_KERNEL(name) \
[ctx->function_##name release]; \
[ctx->pipeline_##name release];
Expand Down Expand Up @@ -354,7 +386,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
void * data = NULL;
const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
if (result != 0) {
metal_printf("%s: error: posix_memalign failed\n", __func__);
GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
return NULL;
}

Expand Down Expand Up @@ -382,7 +414,7 @@ int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
// Metal buffer based on the host memory pointer
//
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
//metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
//GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);

const int64_t tsize = ggml_nbytes(t);

Expand All @@ -394,13 +426,13 @@ int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
*offs = (size_t) ioffs;

//metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
//GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);

return ctx->buffers[i].metal;
}
}

metal_printf("%s: error: buffer is nil\n", __func__);
GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__);

return nil;
}
Expand All @@ -412,7 +444,7 @@ bool ggml_metal_add_buffer(
size_t size,
size_t max_size) {
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
metal_printf("%s: too many buffers\n", __func__);
GGML_METAL_LOG_ERROR("%s: error: too many buffers\n", __func__);
return false;
}

Expand All @@ -422,7 +454,7 @@ bool ggml_metal_add_buffer(
const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;

if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
GGML_METAL_LOG_ERROR("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
return false;
}
}
Expand All @@ -443,11 +475,11 @@ bool ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];

if (ctx->buffers[ctx->n_buffers].metal == nil) {
metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
return false;
}

metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);

++ctx->n_buffers;
} else {
Expand All @@ -467,31 +499,31 @@ bool ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];

if (ctx->buffers[ctx->n_buffers].metal == nil) {
metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
return false;
}

metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
if (i + size_step < size) {
metal_printf("\n");
GGML_METAL_LOG_INFO("\n");
}

++ctx->n_buffers;
}
}

#if TARGET_OS_OSX
metal_printf(", (%8.2f / %8.2f)",
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);

if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
metal_printf(", warning: current allocated size is greater than the recommended max working set size\n");
GGML_METAL_LOG_WARN(", warning: current allocated size is greater than the recommended max working set size\n", __func__);
} else {
metal_printf("\n");
GGML_METAL_LOG_INFO("\n");
}
#else
metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
#endif
}

Expand Down Expand Up @@ -604,7 +636,7 @@ void ggml_metal_graph_find_concurrency(
}

if (ctx->concur_list_len > GGML_MAX_CONCUR) {
metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__);
GGML_METAL_LOG_WARN("%s: too many elements for metal ctx->concur_list!\n", __func__);
}
}

Expand Down Expand Up @@ -658,7 +690,7 @@ void ggml_metal_graph_compute(
continue;
}

//metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
//GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));

struct ggml_tensor * src0 = gf->nodes[i]->src[0];
struct ggml_tensor * src1 = gf->nodes[i]->src[1];
Expand Down Expand Up @@ -702,17 +734,17 @@ void ggml_metal_graph_compute(
id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil;

//metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
//GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
//if (src0) {
// metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
// GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
// ggml_is_contiguous(src0), src0->name);
//}
//if (src1) {
// metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
// GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
// ggml_is_contiguous(src1), src1->name);
//}
//if (dst) {
// metal_printf("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2,
// GGML_METAL_LOG_INFO("%s: dst - %4s [%5lld, %5lld, %5lld], 1, %s\n", __func__, ggml_type_name(dstt), ne0, ne1, ne2,
// dst->name);
//}

Expand Down Expand Up @@ -824,7 +856,7 @@ void ggml_metal_graph_compute(
} break;
default:
{
metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false);
}
} break;
Expand Down Expand Up @@ -1007,7 +1039,7 @@ void ggml_metal_graph_compute(
} break;
default:
{
metal_printf("Asserting on type %d\n",(int)src0t);
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
GGML_ASSERT(false && "not implemented");
}
};
Expand Down Expand Up @@ -1249,7 +1281,7 @@ void ggml_metal_graph_compute(
} break;
default:
{
metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false);
}
}
Expand All @@ -1274,7 +1306,7 @@ void ggml_metal_graph_compute(

MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
if (status != MTLCommandBufferStatusCompleted) {
metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status);
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
GGML_ASSERT(false);
}
}
Expand Down
7 changes: 7 additions & 0 deletions ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,12 @@ extern "C" {
GGML_OBJECT_WORK_BUFFER
};

enum ggml_log_level {
GGML_LOG_LEVEL_ERROR = 2,
GGML_LOG_LEVEL_WARN = 3,
GGML_LOG_LEVEL_INFO = 4
};

// ggml object
struct ggml_object {
size_t offs;
Expand Down Expand Up @@ -1682,6 +1688,7 @@ extern "C" {
};

typedef void (*ggml_opt_callback)(void * data, float * sched);
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);

// optimization parameters
//
Expand Down
Loading