Skip to content

Commit

Permalink
Per-pipeline-invocation profiling (#8153)
Browse files Browse the repository at this point in the history
* Profiler tracks per-invocation state, instead of global state

This should give better results when multiple Halide pipelines are
running at the same time.

* Profiler improvements

- Don't profile bounds queries
- Simplify layout calculation
- Bill time after decrementing main thread as overhead, not waiting on
parallel tasks
- Change waiting on parallel tasks label

* name hygiene

* Fix signature

* Fix tracking of pipeline-level memory statistics

* Address review comments

* Pacify clang-tidy

* [Hexagon] Profiling changes for abadams/per_instance_profiling (#8187)

* Get abadams/per_instance_profiling working on hvx

* More changes

* Add Hexagon libraries

* Fix multiple instances of profiler_state

* Update hexagon libraries

* clang-format

---------

Co-authored-by: Steven Johnson <srj@google.com>
Co-authored-by: aankit-quic <166656642+aankit-quic@users.noreply.github.com>
  • Loading branch information
3 people authored Jun 25, 2024
1 parent 1449692 commit 8ff261e
Show file tree
Hide file tree
Showing 27 changed files with 538 additions and 291 deletions.
7 changes: 7 additions & 0 deletions src/BoundsInference.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1388,6 +1388,13 @@ Stmt bounds_inference(Stmt s,
Expr marker = Call::make(Int(32), Call::skip_stages_marker, {}, Call::Intrinsic);
s = Block::make(Evaluate::make(marker), s);

if (target.has_feature(Target::Profile) || target.has_feature(Target::ProfileByTimer)) {
// Add a note in the IR for what profiling should cover, so that it doesn't
// include bounds queries as pipeline executions.
marker = Call::make(Int(32), Call::profiling_enable_instance_marker, {}, Call::Intrinsic);
s = Block::make(Evaluate::make(marker), s);
}

// Add a note in the IR for where assertions on input images
// should go. Those are handled by a later lowering pass.
marker = Call::make(Int(32), Call::add_image_checks_marker, {}, Call::Intrinsic);
Expand Down
4 changes: 2 additions & 2 deletions src/CodeGen_Internal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ bool function_takes_user_context(const std::string &name) {
"halide_print",
"halide_profiler_memory_allocate",
"halide_profiler_memory_free",
"halide_profiler_pipeline_start",
"halide_profiler_pipeline_end",
"halide_profiler_instance_start",
"halide_profiler_instance_end",
"halide_profiler_stack_peak_update",
"halide_spawn_thread",
"halide_device_release",
Expand Down
1 change: 1 addition & 0 deletions src/IR.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -653,6 +653,7 @@ const char *const intrinsic_op_names[] = {
"mux",
"popcount",
"prefetch",
"profiling_enable_instance_marker",
"promise_clamped",
"random",
"register_destructor",
Expand Down
1 change: 1 addition & 0 deletions src/IR.h
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,7 @@ struct Call : public ExprNode<Call> {
mux,
popcount,
prefetch,
profiling_enable_instance_marker,
promise_clamped,
random,
register_destructor,
Expand Down
207 changes: 135 additions & 72 deletions src/Profiling.cpp

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/Profiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class Function;
* storage flattening, but after all bounds inference.
*
*/
Stmt inject_profiling(Stmt, const std::string &, const std::map<std::string, Function> &env);
Stmt inject_profiling(const Stmt &, const std::string &, const std::map<std::string, Function> &env);

} // namespace Internal
} // namespace Halide
Expand Down
130 changes: 85 additions & 45 deletions src/runtime/HalideRuntime.h
Original file line number Diff line number Diff line change
Expand Up @@ -1265,6 +1265,9 @@ enum halide_error_code_t {
/** "vscale" value of Scalable Vector detected in runtime does not match
* the vscale value used in compilation. */
halide_error_code_vscale_invalid = -47,

/** Profiling failed for a pipeline invocation. */
halide_error_code_cannot_profile_pipeline = -48,
};

/** Halide calls the functions below on various error conditions. The
Expand Down Expand Up @@ -1849,7 +1852,7 @@ struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_func_stats {
/** Per-pipeline state tracked by the sampling profiler. These exist
* in a linked list. */
struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_pipeline_stats {
/** Total time spent inside this pipeline (in nanoseconds) */
/** Total time spent in this pipeline (in nanoseconds) */
uint64_t time;

/** The current memory allocation of funcs in this pipeline. */
Expand Down Expand Up @@ -1878,9 +1881,6 @@ struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_pipeline_stats {
/** The number of funcs in this pipeline. */
int num_funcs;

/** An internal base id used to identify the funcs in this pipeline. */
int first_func_id;

/** The number of times this pipeline has been run. */
int runs;

Expand All @@ -1891,48 +1891,98 @@ struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_pipeline_stats {
int num_allocs;
};

/** The global state of the profiler. */
/** Per-invocation-of-a-pipeline state. Lives on the stack of the Halide
* code. Exists in a doubly-linked list so that it can be cleanly
* removed. */
struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_instance_state {
/** Time billed to funcs in this instance by the sampling thread. */
uint64_t billed_time;

struct halide_profiler_state {
/** Guards access to the fields below. If not locked, the sampling
* profiler thread is free to modify things below (including
* reordering the linked list of pipeline stats). */
struct halide_mutex lock;
/** Wall clock time of the start of the instance. */
uint64_t start_time;

/** The amount of time the profiler thread sleeps between samples
* in milliseconds. Defaults to 1 */
int sleep_time;
/** The current memory allocation of funcs in this instance. */
uint64_t memory_current;

/** An internal id used for bookkeeping. */
int first_free_id;
/** The peak memory allocation of funcs in this instance. */
uint64_t memory_peak;

/** The total memory allocation of funcs in this instance. */
uint64_t memory_total;

/** The average number of thread pool worker threads doing useful
* work while computing this instance. */
uint64_t active_threads_numerator, active_threads_denominator;

/** A pointer to the next running instance, so that the running instances
* can exist in a linked list. */
struct halide_profiler_instance_state *next;

/** A pointer to the address of the next pointer of the previous instance,
* so that this can be removed from the linked list when the instance
* terminates. */
struct halide_profiler_instance_state **prev_next;

/** Information shared across all instances. The stats above are merged into
* it when the instance is retired. */
struct halide_profiler_pipeline_stats *pipeline_stats;

/** An array containing states for each Func in this instance of this pipeline. */
struct halide_profiler_func_stats *funcs;

/** The id of the current running Func. Set by the pipeline, read
* periodically by the profiler thread. */
int current_func;

/** The number of threads currently doing work. */
/** The number of threads currently doing work on this pipeline instance. */
int active_threads;

/** The number of samples taken by this instance. */
int samples;

/** The total number of memory allocations of funcs in this instance. */
int num_allocs;

/** Whether or not this instance should count towards pipeline
* statistics. */
int should_collect_statistics;
};

/** The global state of the profiler. */
struct halide_profiler_state {
/** Guards access to the fields below. If not locked, the sampling
* profiler thread is free to modify things below (including
* reordering the linked list of pipeline stats). */
struct halide_mutex lock;

/** A linked list of stats gathered for each pipeline. */
struct halide_profiler_pipeline_stats *pipelines;

/** Retrieve remote profiler state. Used so that the sampling
* profiler can follow along with execution that occurs elsewhere,
* e.g. on a DSP. If null, it reads from the int above instead. */
void (*get_remote_profiler_state)(int *func, int *active_workers);

/** Sampling thread reference to be joined at shutdown. */
struct halide_thread *sampling_thread;
};

/** Profiler func ids with special meanings. */
enum {
/// current_func takes on this value when not inside Halide code
halide_profiler_outside_of_halide = -1,
/// Set current_func to this value to tell the profiling thread to
/// halt. It will start up again next time you run a pipeline with
/// profiling enabled.
halide_profiler_please_stop = -2
/** The running instances of Halide pipelines. */
struct halide_profiler_instance_state *instances;

/** If this callback is defined, the profiler asserts that there is a single
* live instance, and then uses it to get the current func and number of
* active threads instead of reading the fields in the instance. This is used
* so that the profiler can follow along with execution that occurs
* elsewhere (e.g. on an accelerator). */
void (*get_remote_profiler_state)(int *func, int *active_workers);

/** The amount of time the profiler thread sleeps between samples in
* microseconds. Defaults to 1000. To change it call
* halide_profiler_get_state and mutate this field. */
int sleep_time;

/** Set to 1 when you want the profiler to wait for all running instances to
* finish and then stop gracefully. */
int shutdown;
};

/** Get a pointer to the global profiler state for programmatic
Expand All @@ -1950,34 +2000,24 @@ extern struct halide_profiler_pipeline_stats *halide_profiler_get_pipeline_state
* accurate time interval if desired. */
extern int halide_profiler_sample(struct halide_profiler_state *s, uint64_t *prev_t);

/** Reset profiler state cheaply. May leave threads running or some
* memory allocated but all accumluated statistics are reset.
* WARNING: Do NOT call this method while any halide pipeline is
* running; halide_profiler_memory_allocate/free and
* halide_profiler_stack_peak_update update the profiler pipeline's
* state without grabbing the global profiler state's lock. */
/** Reset profiler state cheaply. May leave threads running or some memory
* allocated but all accumulated statistics are reset. Blocks until all running
* profiled Halide pipelines exit. */
extern void halide_profiler_reset(void);

/** Reset all profiler state.
* WARNING: Do NOT call this method while any halide pipeline is
* running; halide_profiler_memory_allocate/free and
* halide_profiler_stack_peak_update update the profiler pipeline's
* state without grabbing the global profiler state's lock. */
void halide_profiler_shutdown(void);
/** Reset all profiler state. Blocks until all running profiled Halide
* pipelines exit. */
extern void halide_profiler_shutdown(void);

/** Print out timing statistics for everything run since the last
* reset. Also happens at process exit. */
extern void halide_profiler_report(void *user_context);

/** For timer based profiling, this routine starts the timer chain running.
* halide_get_profiler_state can be called to get the current timer interval.
*/
extern void halide_start_timer_chain(void);
/** These routines are called to temporarily disable and then reenable
* timer interuppts for profiling */
* the profiler. */
//@{
extern void halide_disable_timer_interrupt(void);
extern void halide_enable_timer_interrupt(void);
extern void halide_profiler_lock(struct halide_profiler_state *);
extern void halide_profiler_unlock(struct halide_profiler_state *);
//@}

/// \name "Float16" functions
Expand Down
4 changes: 2 additions & 2 deletions src/runtime/fuchsia_clock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ WEAK int64_t halide_current_time_ns(void *user_context) {
return zx_clock_get_monotonic() - halide_reference_clock;
}

WEAK void halide_sleep_ms(void *user_context, int ms) {
zx_nanosleep(zx_deadline_after(ms * 1000));
WEAK void halide_sleep_us(void *user_context, int us) {
zx_nanosleep(zx_deadline_after(us));
}
}
23 changes: 18 additions & 5 deletions src/runtime/hexagon_host.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,6 @@ WEAK int halide_hexagon_initialize_kernels(void *user_context, void **state_ptr,
halide_abort_if_false(user_context, state_ptr != nullptr);

#ifdef DEBUG_RUNTIME
halide_start_clock(user_context);
uint64_t t_before = halide_current_time_ns(user_context);
#endif

Expand Down Expand Up @@ -478,11 +477,22 @@ WEAK int halide_hexagon_run(void *user_context,
// get_remote_profiler_state to retrieve the current
// func. Otherwise leave it alone - the cost of remote running
// will be billed to the calling Func.
halide_profiler_state *s = halide_profiler_get_state();
if (remote_poll_profiler_state) {
halide_profiler_get_state()->get_remote_profiler_state = get_remote_profiler_state;
if (remote_profiler_set_current_func) {
remote_profiler_set_current_func(halide_profiler_get_state()->current_func);
halide_profiler_lock(s);
const halide_profiler_instance_state *instance = s->instances;
if (instance) {
if (instance->next) {
halide_profiler_unlock(s);
error(user_context) << "Hexagon: multiple simultaneous profiled pipelines is unsupported.";
return halide_error_code_cannot_profile_pipeline;
}
s->get_remote_profiler_state = get_remote_profiler_state;
if (remote_profiler_set_current_func) {
remote_profiler_set_current_func(instance->current_func);
}
}
halide_profiler_unlock(s);
}

// Call the pipeline on the device side.
Expand All @@ -498,7 +508,9 @@ WEAK int halide_hexagon_run(void *user_context,
return halide_error_code_generic_error;
}

halide_profiler_get_state()->get_remote_profiler_state = nullptr;
halide_profiler_lock(s);
s->get_remote_profiler_state = nullptr;
halide_profiler_unlock(s);

#ifdef DEBUG_RUNTIME
uint64_t t_after = halide_current_time_ns(user_context);
Expand Down Expand Up @@ -580,6 +592,7 @@ WEAK int halide_hexagon_device_malloc(void *user_context, halide_buffer_t *buf)
debug(user_context) << " allocating buffer of " << (uint64_t)size << " bytes\n";

#ifdef DEBUG_RUNTIME
halide_start_clock(user_context);
uint64_t t_before = halide_current_time_ns(user_context);
#endif

Expand Down
Binary file modified src/runtime/hexagon_remote/bin/host/libhalide_hexagon_host.so
Binary file not shown.
Binary file modified src/runtime/hexagon_remote/bin/v65/hexagon_sim_remote
Binary file not shown.
Binary file not shown.
Binary file modified src/runtime/hexagon_remote/bin/v65/libsim_qurt.a
Binary file not shown.
Binary file not shown.
15 changes: 8 additions & 7 deletions src/runtime/hexagon_remote/qurt/halide_remote.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -419,23 +419,24 @@ int halide_hexagon_remote_release_library(handle_t module_ptr) {
return 0;
}

// Returns the address of a single function-local static
// halide_profiler_instance_state, so every caller receives a pointer to the
// same object.
halide_profiler_instance_state *halide_hexagon_remote_profiler_get_global_instance() {
// Function-local static: one object shared by all calls to this accessor.
static halide_profiler_instance_state hvx_profiler_instance;
return &hvx_profiler_instance;
}

int halide_hexagon_remote_poll_profiler_state(int *func, int *threads) {
// Increase the current thread priority to match working threads priorities,
// so profiler can access the remote state without extra latency.
qurt_thread_t current_thread_id = qurt_thread_get_id();
qurt_thread_set_priority(current_thread_id, 100);

*func = halide_profiler_get_state()->current_func;
*threads = halide_profiler_get_state()->active_threads;
*func = halide_hexagon_remote_profiler_get_global_instance()->current_func;
*threads = halide_hexagon_remote_profiler_get_global_instance()->active_threads;
return 0;
}
int halide_hexagon_remote_profiler_set_current_func(int current_func) {
halide_profiler_get_state()->current_func = current_func;
halide_hexagon_remote_profiler_get_global_instance()->current_func = current_func;
return 0;
}
halide_profiler_state *halide_profiler_get_state() {
static halide_profiler_state hvx_profiler_state;
return &hvx_profiler_state;
}

} // extern "C"
1 change: 0 additions & 1 deletion src/runtime/hexagon_remote/qurt/known_symbols.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,6 @@ void *get_known_symbol(const char *name) {

{"halide_error", (char *)(&halide_error)},
{"halide_print", (char *)(&halide_print)},
{"halide_profiler_get_state", (char *)(&halide_profiler_get_state)},
{"qurt_hvx_lock", (char *)(&qurt_hvx_lock)},
{"qurt_hvx_unlock", (char *)(&qurt_hvx_unlock)},

Expand Down
10 changes: 5 additions & 5 deletions src/runtime/hexagon_remote/qurt/sim_remote.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,12 +152,12 @@ int release_library(handle_t module_ptr) {
}

extern "C" {
halide_profiler_state profiler_state;
int *profiler_current_func_addr = &profiler_state.current_func;
}
halide_profiler_instance_state hvx_profiler_instance;
int *profiler_current_func_addr = &hvx_profiler_instance.current_func;

halide_profiler_state *halide_profiler_get_state() {
return (halide_profiler_state *)(&profiler_state);
halide_profiler_instance_state *halide_hexagon_remote_profiler_get_global_instance() {
return &hvx_profiler_instance;
}
}

extern "C" {
Expand Down
4 changes: 2 additions & 2 deletions src/runtime/linux_clock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ WEAK int64_t halide_current_time_ns(void *user_context) {
}

extern int usleep(int);
WEAK void halide_sleep_ms(void *user_context, int ms) {
usleep(ms * 1000);
WEAK void halide_sleep_us(void *user_context, int us) {
usleep(us);
}
}
4 changes: 2 additions & 2 deletions src/runtime/osx_clock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ WEAK int64_t halide_current_time_ns(void *user_context) {
}

extern int usleep(int);
WEAK void halide_sleep_ms(void *user_context, int ms) {
usleep(ms * 1000);
WEAK void halide_sleep_us(void *user_context, int us) {
usleep(us);
}
}
4 changes: 2 additions & 2 deletions src/runtime/posix_clock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ WEAK int64_t halide_current_time_ns(void *user_context) {
}

extern int usleep(int);
WEAK void halide_sleep_ms(void *user_context, int ms) {
usleep(ms * 1000);
WEAK void halide_sleep_us(void *user_context, int us) {
usleep(us);
}
}
Loading

0 comments on commit 8ff261e

Please sign in to comment.