Skip to content

Commit

Permalink
Change how we queue metal ops.
Browse files Browse the repository at this point in the history
  • Loading branch information
liuliu committed Nov 9, 2024
1 parent a60a38b commit 4bad02e
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 12 deletions.
11 changes: 8 additions & 3 deletions lib/nnc/ccv_nnc.h
Original file line number Diff line number Diff line change
Expand Up @@ -885,10 +885,15 @@ CCV_WARN_UNUSED(int) ccv_nnc_cmd_enforce_inplace(const ccv_nnc_cmd_t cmd, const
*/
void ccv_nnc_set_profiler(int state);
/**
* When have choices between doing things, prefer to be more memory efficient and take performance hit. This is relevant to MPSGraph because if we dispatch all command buffers at full speed, we risk of holding a lot of resources up until all of them executed. Alternatively, we can wait previous one done before proceed, with obvious performance penalties.
* @param state 1 is on, 0 is off. Default to off.
* Set the queue watermark when queueing up GPU commands. This is a Metal-only option.
* @param state The watermark value; when > 0, it bounds how many in-flight GPU commands there can be.
*/
void ccv_nnc_set_memory_efficient(int state);
void ccv_nnc_set_queue_watermark(int state);
/**
* Get the queue watermark when queueing up GPU commands. This is a Metal-only option.
* @return How many in-flight GPU commands there can be.
*/
CCV_WARN_UNUSED(int) ccv_nnc_queue_watermark(void);
/**
* Quantize a given memory region of a given datatype / memory resides, into nbits palette.
* @param input The input memory region, it can be CCV_64F, CCV_32F or CCV_16F.
Expand Down
13 changes: 11 additions & 2 deletions lib/nnc/ccv_nnc_cmd.c
Original file line number Diff line number Diff line change
Expand Up @@ -714,10 +714,19 @@ void ccv_nnc_set_profiler(int state)
#endif
}

void ccv_nnc_set_memory_efficient(int state)
// Returns the current queue watermark — how many in-flight GPU command
// buffers are allowed. Metal (MPS) only; returns 0 when built without MPS.
int ccv_nnc_queue_watermark(void)
{
#ifdef HAVE_MPS
return ccv_nnc_mps_queue_watermark();
#else
return 0;
#endif
}

void ccv_nnc_set_queue_watermark(int watermark)
{
#ifdef HAVE_MPS
// If we need to be memory efficient, we need to bound how many in-flight command buffers there are.
ccv_nnc_mps_unbounded_command_buffers(!state);
ccv_nnc_mps_set_queue_watermark(watermark);
#endif
}
3 changes: 2 additions & 1 deletion lib/nnc/mps/ccv_nnc_mps.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ void ccv_nnc_stream_compat_emit_signal(const ccv_nnc_stream_context_t* const str
void ccv_nnc_stream_compat_wait_signal(const ccv_nnc_stream_context_t* const stream, const ccv_nnc_stream_signal_t* const signal);
void ccv_nnc_deinit_stream_signal(ccv_nnc_stream_signal_t* const signal);
CCV_WARN_UNUSED(int) ccv_nnc_gpu_device_count(void);
void ccv_nnc_mps_unbounded_command_buffers(int state);
void ccv_nnc_mps_set_queue_watermark(int watermark);
CCV_WARN_UNUSED(int) ccv_nnc_mps_queue_watermark(void);
void ccv_nnc_mps_clear_graph_executable_cache(void);
void ccv_nnc_mps_depalettize(const void* input, const int datatype, const size_t input_length, const int qbits, const int number_in_blocks, void* output, const size_t output_length, void* const command_buffer);

Expand Down
16 changes: 10 additions & 6 deletions lib/nnc/mps/ccv_nnc_mps.m
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ @interface MTLFileBackedBuffer: NSObject

static os_unfair_lock queue_lock;
#define OLD_MAX_COMMAND_BUFFER_SIZE (32)
#define OLD_LIMITED_COMMAND_BUFFER_SIZE (8)
static id<MTLCommandBuffer> old_last_command_buffers[OLD_MAX_COMMAND_BUFFER_SIZE];
static id<MTLCommandBuffer> last_command_buffer;

Expand Down Expand Up @@ -560,15 +559,15 @@ ccv_nnc_mps_graph_key_t ccv_nnc_mps_graph_key_new(const ccv_nnc_cmd_t cmd, const
return stream_context;
}

static int enable_unbounded_command_buffers = 1;
static int command_buffers_watermark = 8;

void ccv_nnc_synchronize_stream_context(const ccv_nnc_stream_context_t* const stream_context)
{
os_unfair_lock_lock(&queue_lock);
id<MTLCommandBuffer> command_buffer = last_command_buffer;
last_command_buffer = nil;
int i;
const int buffer_size = enable_unbounded_command_buffers ? OLD_MAX_COMMAND_BUFFER_SIZE : OLD_LIMITED_COMMAND_BUFFER_SIZE;
const int buffer_size = ccv_min(command_buffers_watermark, OLD_MAX_COMMAND_BUFFER_SIZE);
id<MTLCommandBuffer> old_buffers[buffer_size];
for (i = 0; i < buffer_size; i++)
{
Expand Down Expand Up @@ -707,9 +706,14 @@ int ccv_nnc_gpu_device_count(void)
return [MPSCommandBuffer commandBufferFromCommandQueue:_ccv_nnc_default_queue()];
}

void ccv_nnc_mps_unbounded_command_buffers(int state)
int ccv_nnc_mps_queue_watermark(void)
{
enable_unbounded_command_buffers = state;
return command_buffers_watermark;
}

// Sets the in-flight command buffer watermark, clamped to the range
// [0, OLD_MAX_COMMAND_BUFFER_SIZE] so it never exceeds the capacity of the
// fixed-size old_last_command_buffers ring declared above.
void ccv_nnc_mps_set_queue_watermark(int watermark)
{
command_buffers_watermark = ccv_max(ccv_min(watermark, OLD_MAX_COMMAND_BUFFER_SIZE), 0);
}

void ccv_nnc_stream_context_finish_command_buffer(ccv_nnc_stream_context_t* const stream_context, MPSCommandBuffer* mps_command_buffer, MTLCommandBatch* command_batch)
Expand All @@ -722,7 +726,7 @@ void ccv_nnc_stream_context_finish_command_buffer(ccv_nnc_stream_context_t* cons
}

int i;
const int buffer_size = enable_unbounded_command_buffers ? OLD_MAX_COMMAND_BUFFER_SIZE : OLD_LIMITED_COMMAND_BUFFER_SIZE;
const int buffer_size = ccv_min(command_buffers_watermark, OLD_MAX_COMMAND_BUFFER_SIZE);
if (!stream_context)
{
id<MTLCommandBuffer> committed_command_buffer = [mtl_command_buffer retain];
Expand Down

0 comments on commit 4bad02e

Please sign in to comment.