diff --git a/src/BoundsInference.cpp b/src/BoundsInference.cpp
index 21ca06e07285..84f076d94537 100644
--- a/src/BoundsInference.cpp
+++ b/src/BoundsInference.cpp
@@ -1388,6 +1388,13 @@ Stmt bounds_inference(Stmt s,
     Expr marker = Call::make(Int(32), Call::skip_stages_marker, {}, Call::Intrinsic);
     s = Block::make(Evaluate::make(marker), s);
 
+    if (target.has_feature(Target::Profile) || target.has_feature(Target::ProfileByTimer)) {
+        // Add a note in the IR for what profiling should cover, so that it doesn't
+        // include bounds queries as pipeline executions.
+        marker = Call::make(Int(32), Call::profiling_enable_instance_marker, {}, Call::Intrinsic);
+        s = Block::make(Evaluate::make(marker), s);
+    }
+
     // Add a note in the IR for where assertions on input images
     // should go. Those are handled by a later lowering pass.
     marker = Call::make(Int(32), Call::add_image_checks_marker, {}, Call::Intrinsic);
diff --git a/src/CodeGen_Internal.cpp b/src/CodeGen_Internal.cpp
index b68373f23ccf..21a0cea839df 100644
--- a/src/CodeGen_Internal.cpp
+++ b/src/CodeGen_Internal.cpp
@@ -51,8 +51,8 @@ bool function_takes_user_context(const std::string &name) {
         "halide_print",
         "halide_profiler_memory_allocate",
         "halide_profiler_memory_free",
-        "halide_profiler_pipeline_start",
-        "halide_profiler_pipeline_end",
+        "halide_profiler_instance_start",
+        "halide_profiler_instance_end",
         "halide_profiler_stack_peak_update",
         "halide_spawn_thread",
         "halide_device_release",
diff --git a/src/IR.cpp b/src/IR.cpp
index 81cf0a0f41ff..804e41234f71 100644
--- a/src/IR.cpp
+++ b/src/IR.cpp
@@ -653,6 +653,7 @@ const char *const intrinsic_op_names[] = {
     "mux",
     "popcount",
    "prefetch",
+    "profiling_enable_instance_marker",
     "promise_clamped",
     "random",
     "register_destructor",
diff --git a/src/IR.h b/src/IR.h
index d3f6af596f31..f21f3a9a52ba 100644
--- a/src/IR.h
+++ b/src/IR.h
@@ -569,6 +569,7 @@ struct Call : public ExprNode<Call> {
         mux,
         popcount,
         prefetch,
+        profiling_enable_instance_marker,
         promise_clamped,
         random,
         register_destructor,
diff --git a/src/Profiling.cpp b/src/Profiling.cpp
index 414578299df6..b0178b4a073a 100644
--- a/src/Profiling.cpp
+++ b/src/Profiling.cpp
@@ -23,14 +23,38 @@ using std::vector;
 
 namespace {
 
-Stmt incr_active_threads(const Expr &profiler_state) {
+// All names that need to be unique, just in case someone does something
+// perverse like naming a func "profiler_instance".
+struct Names {
+    const std::string &pipeline_name;
+    std::string profiler_instance;
+    std::string profiler_local_sampling_token;
+    std::string profiler_shared_sampling_token;
+    std::string hvx_profiler_instance;
+    std::string profiler_func_names;
+    std::string profiler_func_stack_peak_buf;
+    std::string profiler_start_error_code;
+
+    Names(const std::string &pipeline_name)
+        : pipeline_name(pipeline_name),
+          profiler_instance(unique_name("profiler_instance")),
+          profiler_local_sampling_token(unique_name("profiler_local_sampling_token")),
+          profiler_shared_sampling_token(unique_name("profiler_shared_sampling_token")),
+          hvx_profiler_instance(unique_name("hvx_profiler_instance")),
+          profiler_func_names(unique_name("profiler_func_names")),
+          profiler_func_stack_peak_buf(unique_name("profiler_func_stack_peak_buf")),
+          profiler_start_error_code(unique_name("profiler_start_error_code")) {
+    }
+};
+
+Stmt incr_active_threads(const Expr &profiler_instance) {
     return Evaluate::make(Call::make(Int(32), "halide_profiler_incr_active_threads",
-                                     {profiler_state}, Call::Extern));
+                                     {profiler_instance}, Call::Extern));
 }
 
-Stmt decr_active_threads(const Expr &profiler_state) {
+Stmt decr_active_threads(const Expr &profiler_instance) {
     return Evaluate::make(Call::make(Int(32), "halide_profiler_decr_active_threads",
-                                     {profiler_state}, Call::Extern));
+                                     {profiler_instance}, Call::Extern));
 }
 
 Stmt acquire_sampling_token(const Expr &shared_token, const Expr &local_token) {
@@ -43,18 +67,6 @@ Stmt release_sampling_token(const Expr &shared_token, const Expr &local_token) {
                                      {shared_token, local_token}, Call::Extern));
 }
 
-Stmt activate_thread(const Stmt &s, const Expr &profiler_state) {
-    return Block::make({incr_active_threads(profiler_state),
-                        s,
-                        decr_active_threads(profiler_state)});
-}
-
-Stmt suspend_thread(const Stmt &s, const Expr &profiler_state) {
-    return Block::make({decr_active_threads(profiler_state),
-                        s,
-                        incr_active_threads(profiler_state)});
-}
-
 Stmt claim_sampling_token(const Stmt &s, const Expr &shared_token, const Expr &local_token) {
     return LetStmt::make(local_token.as<Variable>()->name,
                          Call::make(Handle(), Call::alloca, {Int(32).bytes()}, Call::Intrinsic),
@@ -70,38 +82,62 @@ class InjectProfiling : public IRMutator {
 
     vector<int> stack;  // What produce nodes are we currently inside of.
-    string pipeline_name;
+    const Names &names;
     const map<string, Function> &env;
     bool in_fork = false;
     bool in_parallel = false;
     bool in_leaf_task = false;
 
-    InjectProfiling(const string &pipeline_name, const map<string, Function> &env)
-        : pipeline_name(pipeline_name), env(env) {
+    InjectProfiling(const Names &names, const map<string, Function> &env)
+        : names(names), env(env) {
         stack.push_back(get_func_id("overhead"));
         // ID 0 is treated specially in the runtime as overhead
         internal_assert(stack.back() == 0);
+        waiting_on_tasks_id = get_func_id("waiting for parallel tasks to finish");
         malloc_id = get_func_id("halide_malloc");
         free_id = get_func_id("halide_free");
-        profiler_pipeline_state = Variable::make(Handle(), "profiler_pipeline_state");
-        profiler_state = Variable::make(Handle(), "profiler_state");
-        profiler_token = Variable::make(Int(32), "profiler_token");
-        profiler_local_sampling_token = Variable::make(Handle(), "profiler_local_sampling_token");
-        profiler_shared_sampling_token = Variable::make(Handle(), "profiler_shared_sampling_token");
+
+        profiler_instance = Variable::make(Handle(), names.profiler_instance);
+        profiler_local_sampling_token = Variable::make(Handle(), names.profiler_local_sampling_token);
+        profiler_shared_sampling_token = Variable::make(Handle(), names.profiler_shared_sampling_token);
     }
 
     map<int, uint64_t> func_stack_current;  // map from func id -> current stack allocation
     map<int, uint64_t> func_stack_peak;     // map from func id -> peak stack allocation
 
+    Stmt activate_thread(const Stmt &s) {
+        return activate_thread_helper(s, waiting_on_tasks_id);
+    }
+
+    Stmt activate_main_thread(const Stmt &s) {
+        // The same as a child task, except when we finish (but before the
+        // instances get popped), bill anything as overhead.
+        return activate_thread_helper(s, 0);
+    }
+
+    Stmt activate_thread_helper(const Stmt &s, int final_id) {
+        return Block::make({incr_active_threads(profiler_instance),
+                            unconditionally_set_current_func(stack.back()),
+                            s,
+                            decr_active_threads(profiler_instance),
+                            unconditionally_set_current_func(final_id)});
+    }
+
+    Stmt suspend_thread(const Stmt &s) {
+        return Block::make({decr_active_threads(profiler_instance),
+                            unconditionally_set_current_func(waiting_on_tasks_id),
+                            s,
+                            incr_active_threads(profiler_instance),
+                            unconditionally_set_current_func(stack.back())});
+    }
+
 private:
     using IRMutator::visit;
 
-    int malloc_id, free_id;
-    Expr profiler_pipeline_state;
-    Expr profiler_state;
-    Expr profiler_token;
+    int malloc_id, free_id, waiting_on_tasks_id;
+    Expr profiler_instance;
     Expr profiler_local_sampling_token;
     Expr profiler_shared_sampling_token;
 
@@ -157,6 +193,12 @@ class InjectProfiling : public IRMutator {
         return idx;
     }
 
+    Stmt unconditionally_set_current_func(int id) {
+        Stmt s = Evaluate::make(Call::make(Int(32), "halide_profiler_set_current_func",
+                                           {profiler_instance, id, reinterpret(Handle(), cast<uint64_t>(0))}, Call::Extern));
+        return s;
+    }
+
     Stmt set_current_func(int id) {
         if (most_recently_set_func == id) {
             return Evaluate::make(0);
@@ -165,7 +207,7 @@ class InjectProfiling : public IRMutator {
         Expr last_arg = in_leaf_task ? profiler_local_sampling_token : reinterpret(Handle(), cast<uint64_t>(0));
         // This call gets inlined and becomes a single store instruction.
         Stmt s = Evaluate::make(Call::make(Int(32), "halide_profiler_set_current_func",
-                                           {profiler_state, profiler_token, id, last_arg}, Call::Extern));
+                                           {profiler_instance, id, last_arg}, Call::Extern));
         return s;
     }
 
@@ -203,6 +245,17 @@ class InjectProfiling : public IRMutator {
         return size;
     }
 
+    Expr visit(const Call *op) override {
+        if (op->is_intrinsic(Call::profiling_enable_instance_marker)) {
+            // We're out of the bounds query code. This instance should be
+            // tracked (including any samples taken before this point).
+            return Call::make(Int(32), "halide_profiler_enable_instance",
+                              {profiler_instance}, Call::Extern);
+        } else {
+            return IRMutator::visit(op);
+        }
+    }
+
     Stmt visit(const Allocate *op) override {
         auto [new_extents, changed] = mutate_with_changes(op->extents);
@@ -229,7 +282,7 @@ class InjectProfiling : public IRMutator {
                 func_stack_current[idx] += *int_size;
                 func_stack_peak[idx] = std::max(func_stack_peak[idx], func_stack_current[idx]);
                 debug(3) << "  Allocation on stack: " << op->name
-                         << "(" << size << ") in pipeline " << pipeline_name
+                         << "(" << size << ") in pipeline " << names.pipeline_name
                          << "; current: " << func_stack_current[idx]
                          << "; peak: " << func_stack_peak[idx] << "\n";
             }
@@ -240,11 +293,11 @@ class InjectProfiling : public IRMutator {
                 int idx = get_func_id(op->name);
                 debug(3) << "  Allocation on heap: " << op->name
                          << "(" << size << ") in pipeline "
-                         << pipeline_name << "\n";
+                         << names.pipeline_name << "\n";
 
                 tasks.push_back(set_current_func(malloc_id));
                 tasks.push_back(Evaluate::make(Call::make(Int(32), "halide_profiler_memory_allocate",
-                                                          {profiler_pipeline_state, idx, size}, Call::Extern)));
+                                                          {profiler_instance, idx, size}, Call::Extern)));
             }
 
             Stmt body = mutate(op->body);
@@ -281,12 +334,12 @@ class InjectProfiling : public IRMutator {
         if (!alloc.on_stack) {
             if (profiling_memory) {
                 int idx = get_func_id(op->name);
-                debug(3) << "  Free on heap: " << op->name << "(" << alloc.size << ") in pipeline " << pipeline_name << "\n";
+                debug(3) << "  Free on heap: " << op->name << "(" << alloc.size << ") in pipeline " << names.pipeline_name << "\n";
 
                 vector<Stmt> tasks{
                     set_current_func(free_id),
                     Evaluate::make(Call::make(Int(32), "halide_profiler_memory_free",
-                                              {profiler_pipeline_state, idx, alloc.size}, Call::Extern)),
+                                              {profiler_instance, idx, alloc.size}, Call::Extern)),
                     stmt,
                     set_current_func(stack.back())};
@@ -304,8 +357,10 @@ class InjectProfiling : public IRMutator {
                     idx = get_func_id(op->name);
                 }
                 func_stack_current[idx] -= *int_size;
-                debug(3) << "  Free on stack: " << op->name << "(" << alloc.size << ") in pipeline " << pipeline_name
-                         << "; current: " << func_stack_current[idx] << "; peak: " << func_stack_peak[idx] << "\n";
+                debug(3) << "  Free on stack: " << op->name
+                         << "(" << alloc.size << ") in pipeline " << names.pipeline_name
+                         << "; current: " << func_stack_current[idx]
+                         << "; peak: " << func_stack_peak[idx] << "\n";
             }
         }
         return stmt;
@@ -345,7 +400,7 @@ class InjectProfiling : public IRMutator {
         } else if (const Acquire *a = s.as<Acquire>()) {
             s = Acquire::make(a->semaphore, a->count, visit_parallel_task(a->body));
         } else {
-            s = activate_thread(mutate(s), profiler_state);
+            s = activate_thread(mutate(s));
         }
         if (most_recently_set_func != old) {
             most_recently_set_func = -1;
@@ -355,13 +410,13 @@ class InjectProfiling : public IRMutator {
     Stmt visit(const Acquire *op) override {
         Stmt s = visit_parallel_task(op);
-        return suspend_thread(s, profiler_state);
+        return suspend_thread(s);
     }
 
     Stmt visit(const Fork *op) override {
         ScopedValue<bool> bind(in_fork, true);
         Stmt s = visit_parallel_task(op);
-        return suspend_thread(s, profiler_state);
+        return suspend_thread(s);
     }
 
     Stmt visit(const For *op) override {
@@ -378,7 +433,6 @@ class InjectProfiling : public IRMutator {
         bool leaf_task = false;
         if (update_active_threads) {
-            body = activate_thread(body, profiler_state);
 
             class ContainsParallelOrBlockingNode : public IRVisitor {
                 using IRVisitor::visit;
@@ -404,6 +458,8 @@ class InjectProfiling : public IRMutator {
             if (leaf_task) {
                 body = claim_sampling_token(body, profiler_shared_sampling_token, profiler_local_sampling_token);
             }
+
+            body = activate_thread(body);
         }
 
         ScopedValue<bool> bind_leaf_task(in_leaf_task, in_leaf_task || leaf_task);
@@ -423,9 +479,9 @@ class InjectProfiling : public IRMutator {
             // Get the profiler state pointer from scratch inside the
             // kernel. There will be a separate copy of the state on
             // the DSP that the host side will periodically query.
-            Expr get_state = Call::make(Handle(), "halide_profiler_get_state", {}, Call::Extern);
-            body = substitute("profiler_state", Variable::make(Handle(), "hvx_profiler_state"), body);
-            body = LetStmt::make("hvx_profiler_state", get_state, body);
+            Expr get_state = Call::make(Handle(), "halide_hexagon_remote_profiler_get_global_instance", {}, Call::Extern);
+            body = substitute(names.profiler_instance, Variable::make(Handle(), names.hvx_profiler_instance), body);
+            body = LetStmt::make(names.hvx_profiler_instance, get_state, body);
         } else if (op->device_api == DeviceAPI::None ||
                    op->device_api == DeviceAPI::Host) {
             body = mutate(body);
@@ -440,7 +496,7 @@ class InjectProfiling : public IRMutator {
         Stmt stmt = For::make(op->name, op->min, op->extent, op->for_type, op->partition_policy, op->device_api, body);
 
         if (update_active_threads) {
-            stmt = suspend_thread(stmt, profiler_state);
+            stmt = suspend_thread(stmt);
         }
 
         return stmt;
@@ -538,77 +594,84 @@ class InjectProfiling : public IRMutator {
 
 }  // namespace
 
-Stmt inject_profiling(Stmt s, const string &pipeline_name, const std::map<string, Function> &env) {
-    InjectProfiling profiling(pipeline_name, env);
-    s = profiling.mutate(s);
+Stmt inject_profiling(const Stmt &stmt, const string &pipeline_name, const std::map<string, Function> &env) {
+    Names names(pipeline_name);
+
+    InjectProfiling profiling(names, env);
+    Stmt s = profiling.mutate(stmt);
 
     int num_funcs = (int)(profiling.indices.size());
 
-    Expr func_names_buf = Variable::make(Handle(), "profiling_func_names");
+    // TODO: unique_name all these strings
 
-    Expr start_profiler = Call::make(Int(32), "halide_profiler_pipeline_start",
-                                     {pipeline_name, num_funcs, func_names_buf}, Call::Extern);
+    Expr instance = Variable::make(Handle(), names.profiler_instance);
 
-    Expr get_state = Call::make(Handle(), "halide_profiler_get_state", {}, Call::Extern);
+    Expr func_names_buf = Variable::make(Handle(), names.profiler_func_names);
 
-    Expr get_pipeline_state = Call::make(Handle(), "halide_profiler_get_pipeline_state", {pipeline_name}, Call::Extern);
+    Expr start_profiler = Call::make(Int(32), "halide_profiler_instance_start",
+                                     {pipeline_name, num_funcs, func_names_buf, instance}, Call::Extern);
 
-    Expr profiler_token = Variable::make(Int(32), "profiler_token");
+    Expr profiler_start_error_code = Variable::make(Int(32), names.profiler_start_error_code);
 
     Expr stop_profiler = Call::make(Handle(), Call::register_destructor,
-                                    {Expr("halide_profiler_pipeline_end"), get_state}, Call::Intrinsic);
+                                    {Expr("halide_profiler_instance_end"), instance}, Call::Intrinsic);
 
     bool no_stack_alloc = profiling.func_stack_peak.empty();
     if (!no_stack_alloc) {
-        Expr func_stack_peak_buf = Variable::make(Handle(), "profiling_func_stack_peak_buf");
+        Expr func_stack_peak_buf = Variable::make(Handle(), names.profiler_func_stack_peak_buf);
 
-        Expr profiler_pipeline_state = Variable::make(Handle(), "profiler_pipeline_state");
         Stmt update_stack = Evaluate::make(Call::make(Int(32), "halide_profiler_stack_peak_update",
-                                                      {profiler_pipeline_state, func_stack_peak_buf}, Call::Extern));
+                                                      {instance, func_stack_peak_buf}, Call::Extern));
         s = Block::make(update_stack, s);
     }
 
-    Expr profiler_state = Variable::make(Handle(), "profiler_state");
-
-    s = activate_thread(s, profiler_state);
+    s = profiling.activate_main_thread(s);
 
     // Initialize the shared sampling token
-    Expr shared_sampling_token_var = Variable::make(Handle(), "profiler_shared_sampling_token");
+    Expr shared_sampling_token_var = Variable::make(Handle(), names.profiler_shared_sampling_token);
     Expr init_sampling_token =
         Call::make(Int(32), "halide_profiler_init_sampling_token", {shared_sampling_token_var, 0}, Call::Extern);
     s = Block::make({Evaluate::make(init_sampling_token), s});
 
-    s = LetStmt::make("profiler_shared_sampling_token",
+    s = LetStmt::make(names.profiler_shared_sampling_token,
                       Call::make(Handle(), Call::alloca, {Int(32).bytes()}, Call::Intrinsic), s);
-    s = LetStmt::make("profiler_pipeline_state", get_pipeline_state, s);
-    s = LetStmt::make("profiler_state", get_state, s);
 
     // If there was a problem starting the profiler, it will call an
     // appropriate halide error function and then return the
     // (negative) error code as the token.
-    s = Block::make(AssertStmt::make(profiler_token >= 0, profiler_token), s);
-    s = LetStmt::make("profiler_token", start_profiler, s);
+    s = Block::make(AssertStmt::make(profiler_start_error_code == 0, profiler_start_error_code), s);
+    s = LetStmt::make(names.profiler_start_error_code, start_profiler, s);
 
     if (!no_stack_alloc) {
         for (int i = num_funcs - 1; i >= 0; --i) {
-            s = Block::make(Store::make("profiling_func_stack_peak_buf",
+            s = Block::make(Store::make(names.profiler_func_stack_peak_buf,
                                         make_const(UInt(64), profiling.func_stack_peak[i]),
                                         i, Parameter(), const_true(), ModulusRemainder()),
                             s);
        }
-        s = Block::make(s, Free::make("profiling_func_stack_peak_buf"));
-        s = Allocate::make("profiling_func_stack_peak_buf", UInt(64),
+        s = Block::make(s, Free::make(names.profiler_func_stack_peak_buf));
+        s = Allocate::make(names.profiler_func_stack_peak_buf, UInt(64),
                            MemoryType::Auto, {num_funcs}, const_true(), s);
     }
 
     for (const auto &p : profiling.indices) {
-        s = Block::make(Store::make("profiling_func_names", p.first, p.second, Parameter(), const_true(), ModulusRemainder()), s);
+        s = Block::make(Store::make(names.profiler_func_names, p.first, p.second, Parameter(), const_true(), ModulusRemainder()), s);
     }
 
-    s = Block::make(s, Free::make("profiling_func_names"));
-    s = Allocate::make("profiling_func_names", Handle(),
+    s = Block::make(s, Free::make(names.profiler_func_names));
+    s = Allocate::make(names.profiler_func_names, Handle(),
                        MemoryType::Auto, {num_funcs}, const_true(), s);
     s = Block::make(Evaluate::make(stop_profiler), s);
 
+    // Allocate memory for the profiler instance state
+
+    // Check there isn't going to be end-of-struct padding to worry about.
+    static_assert((sizeof(halide_profiler_func_stats) & 7) == 0);
+
+    const int instance_size_bytes = sizeof(halide_profiler_instance_state) + num_funcs * sizeof(halide_profiler_func_stats);
+
+    s = Allocate::make(names.profiler_instance, UInt(64), MemoryType::Auto,
+                       {(instance_size_bytes + 7) / 8}, const_true(), s);
+
     // We have nested definitions of the sampling token
     s = uniquify_variable_names(s);
 
diff --git a/src/Profiling.h b/src/Profiling.h
index afaa47fe6d6e..b91644409655 100644
--- a/src/Profiling.h
+++ b/src/Profiling.h
@@ -40,7 +40,7 @@ class Function;
 * storage flattening, but after all bounds inference.
 *
 */
-Stmt inject_profiling(Stmt, const std::string &, const std::map<std::string, Function> &env);
+Stmt inject_profiling(const Stmt &, const std::string &, const std::map<std::string, Function> &env);
 
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h
index 4ddb52cd89bf..736d64478c8b 100644
--- a/src/runtime/HalideRuntime.h
+++ b/src/runtime/HalideRuntime.h
@@ -1265,6 +1265,9 @@ enum halide_error_code_t {
     /** "vscale" value of Scalable Vector detected in runtime does not match
      * the vscale value used in compilation. */
     halide_error_code_vscale_invalid = -47,
+
+    /** Profiling failed for a pipeline invocation. */
+    halide_error_code_cannot_profile_pipeline = -48,
 };
 
 /** Halide calls the functions below on various error conditions. The
@@ -1849,7 +1852,7 @@ struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_func_stats {
 /** Per-pipeline state tracked by the sampling profiler. These exist
  * in a linked list. */
 struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_pipeline_stats {
-    /** Total time spent inside this pipeline (in nanoseconds) */
+    /** Total time spent in this pipeline (in nanoseconds) */
     uint64_t time;
 
     /** The current memory allocation of funcs in this pipeline. */
@@ -1878,9 +1881,6 @@ struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_pipeline_stats {
     /** The number of funcs in this pipeline. */
     int num_funcs;
 
-    /** An internal base id used to identify the funcs in this pipeline. */
-    int first_func_id;
-
     /** The number of times this pipeline has been run. */
     int runs;
 
@@ -1891,48 +1891,98 @@ struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_pipeline_stats {
     int num_allocs;
 };
 
-/** The global state of the profiler. */
+/** Per-invocation-of-a-pipeline state. Lives on the stack of the Halide
+ * code. Exists in a doubly-linked list so that it can be cleanly
+ * removed. */
+struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_instance_state {
+    /** Time billed to funcs in this instance by the sampling thread. */
+    uint64_t billed_time;
 
-struct halide_profiler_state {
-    /** Guards access to the fields below. If not locked, the sampling
-     * profiler thread is free to modify things below (including
-     * reordering the linked list of pipeline stats). */
-    struct halide_mutex lock;
+    /** Wall clock time of the start of the instance. */
+    uint64_t start_time;
 
-    /** The amount of time the profiler thread sleeps between samples
-     * in milliseconds. Defaults to 1 */
-    int sleep_time;
+    /** The current memory allocation of funcs in this instance. */
+    uint64_t memory_current;
 
-    /** An internal id used for bookkeeping. */
-    int first_free_id;
+    /** The peak memory allocation of funcs in this instance. */
+    uint64_t memory_peak;
+
+    /** The total memory allocation of funcs in this instance. */
+    uint64_t memory_total;
+
+    /** The average number of thread pool worker threads doing useful
+     * work while computing this instance. */
+    uint64_t active_threads_numerator, active_threads_denominator;
+
+    /** A pointer to the next running instance, so that the running instances
+     * can exist in a linked list. */
+    struct halide_profiler_instance_state *next;
+
+    /** A pointer to the address of the next pointer of the previous instance,
+     * so that this can be removed from the linked list when the instance
+     * terminates. */
+    struct halide_profiler_instance_state **prev_next;
+
+    /** Information shared across all instances. The stats above are merged into
+     * it when the instance is retired. */
+    struct halide_profiler_pipeline_stats *pipeline_stats;
+
+    /** An array containing states for each Func in this instance of this pipeline. */
+    struct halide_profiler_func_stats *funcs;
 
     /** The id of the current running Func. Set by the pipeline, read
      * periodically by the profiler thread. */
     int current_func;
 
-    /** The number of threads currently doing work. */
+    /** The number of threads currently doing work on this pipeline instance. */
     int active_threads;
 
+    /** The number of samples taken by this instance. */
+    int samples;
+
+    /** The total number of memory allocations of funcs in this instance. */
+    int num_allocs;
+
+    /** Whether or not this instance should count towards pipeline
+     * statistics. */
+    int should_collect_statistics;
+};
+
+/** The global state of the profiler. */
+struct halide_profiler_state {
+    /** Guards access to the fields below. If not locked, the sampling
+     * profiler thread is free to modify things below (including
+     * reordering the linked list of pipeline stats). */
+    struct halide_mutex lock;
+
     /** A linked list of stats gathered for each pipeline. */
     struct halide_profiler_pipeline_stats *pipelines;
 
     /** Retrieve remote profiler state. Used so that the sampling
      * profiler can follow along with execution that occurs elsewhere,
      * e.g. on a DSP. If null, it reads from the int above instead. */
-    void (*get_remote_profiler_state)(int *func, int *active_workers);
 
     /** Sampling thread reference to be joined at shutdown. */
     struct halide_thread *sampling_thread;
-};
 
-/** Profiler func ids with special meanings. */
-enum {
-    /// current_func takes on this value when not inside Halide code
-    halide_profiler_outside_of_halide = -1,
-    /// Set current_func to this value to tell the profiling thread to
-    /// halt. It will start up again next time you run a pipeline with
-    /// profiling enabled.
-    halide_profiler_please_stop = -2
+    /** The running instances of Halide pipelines. */
+    struct halide_profiler_instance_state *instances;
+
+    /** If this callback is defined, the profiler asserts that there is a single
+     * live instance, and then uses it to get the current func and number of
+     * active threads instead of reading the fields in the instance. This is used
+     * so that the profiler can follow along with execution that occurs
+     * elsewhere (e.g. on an accelerator). */
+    void (*get_remote_profiler_state)(int *func, int *active_workers);
+
+    /** The amount of time the profiler thread sleeps between samples in
+     * microseconds. Defaults to 1000. To change it call
+     * halide_profiler_get_state and mutate this field. */
+    int sleep_time;
+
+    /** Set to 1 when you want the profiler to wait for all running instances to
+     * finish and then stop gracefully. */
+    int shutdown;
 };
 
 /** Get a pointer to the global profiler state for programmatic
@@ -1950,34 +2000,24 @@ extern struct halide_profiler_pipeline_stats *halide_profiler_get_pipeline_state
 * accurate time interval if desired.
 */
 extern int halide_profiler_sample(struct halide_profiler_state *s, uint64_t *prev_t);
 
-/** Reset profiler state cheaply. May leave threads running or some
- * memory allocated but all accumluated statistics are reset.
- * WARNING: Do NOT call this method while any halide pipeline is
- * running; halide_profiler_memory_allocate/free and
- * halide_profiler_stack_peak_update update the profiler pipeline's
- * state without grabbing the global profiler state's lock. */
+/** Reset profiler state cheaply. May leave threads running or some memory
+ * allocated but all accumulated statistics are reset. Blocks until all running
+ * profiled Halide pipelines exit. */
 extern void halide_profiler_reset(void);
 
-/** Reset all profiler state.
- * WARNING: Do NOT call this method while any halide pipeline is
- * running; halide_profiler_memory_allocate/free and
- * halide_profiler_stack_peak_update update the profiler pipeline's
- * state without grabbing the global profiler state's lock. */
-void halide_profiler_shutdown(void);
+/** Reset all profiler state. Blocks until all running profiled Halide
+ * pipelines exit. */
+extern void halide_profiler_shutdown(void);
 
 /** Print out timing statistics for everything run since the last
  * reset. Also happens at process exit. */
 extern void halide_profiler_report(void *user_context);
 
-/** For timer based profiling, this routine starts the timer chain running.
- * halide_get_profiler_state can be called to get the current timer interval.
- */
-extern void halide_start_timer_chain(void);
 /** These routines are called to temporarily disable and then reenable
- * timer interuppts for profiling */
+ * the profiler. */
 //@{
-extern void halide_disable_timer_interrupt(void);
-extern void halide_enable_timer_interrupt(void);
+extern void halide_profiler_lock(struct halide_profiler_state *);
+extern void halide_profiler_unlock(struct halide_profiler_state *);
 //@}
 
 /// \name "Float16" functions
diff --git a/src/runtime/fuchsia_clock.cpp b/src/runtime/fuchsia_clock.cpp
index b532867e0788..9733d6e135bc 100644
--- a/src/runtime/fuchsia_clock.cpp
+++ b/src/runtime/fuchsia_clock.cpp
@@ -30,7 +30,7 @@ WEAK int64_t halide_current_time_ns(void *user_context) {
     return zx_clock_get_monotonic() - halide_reference_clock;
 }
 
-WEAK void halide_sleep_ms(void *user_context, int ms) {
-    zx_nanosleep(zx_deadline_after(ms * 1000));
+WEAK void halide_sleep_us(void *user_context, int us) {
+    zx_nanosleep(zx_deadline_after(us));
 }
 }
diff --git a/src/runtime/hexagon_host.cpp b/src/runtime/hexagon_host.cpp
index 7035741a9fdf..80dcca1dc707 100644
--- a/src/runtime/hexagon_host.cpp
+++ b/src/runtime/hexagon_host.cpp
@@ -275,7 +275,6 @@ WEAK int halide_hexagon_initialize_kernels(void *user_context, void **state_ptr,
     halide_abort_if_false(user_context, state_ptr != nullptr);
 
 #ifdef DEBUG_RUNTIME
-    halide_start_clock(user_context);
     uint64_t t_before = halide_current_time_ns(user_context);
 #endif
 
@@ -478,11 +477,22 @@ WEAK int halide_hexagon_run(void *user_context,
     // get_remote_profiler_func to retrieve the current
     // func. Otherwise leave it alone - the cost of remote running
     // will be billed to the calling Func.
+    halide_profiler_state *s = halide_profiler_get_state();
     if (remote_poll_profiler_state) {
-        halide_profiler_get_state()->get_remote_profiler_state = get_remote_profiler_state;
-        if (remote_profiler_set_current_func) {
-            remote_profiler_set_current_func(halide_profiler_get_state()->current_func);
+        halide_profiler_lock(s);
+        const halide_profiler_instance_state *instance = s->instances;
+        if (instance) {
+            if (instance->next) {
+                halide_profiler_unlock(s);
+                error(user_context) << "Hexagon: profiling multiple pipelines simultaneously is unsupported.";
+                return halide_error_code_cannot_profile_pipeline;
+            }
+            s->get_remote_profiler_state = get_remote_profiler_state;
+            if (remote_profiler_set_current_func) {
+                remote_profiler_set_current_func(instance->current_func);
+            }
         }
+        halide_profiler_unlock(s);
     }
 
     // Call the pipeline on the device side.
@@ -498,7 +508,9 @@ WEAK int halide_hexagon_run(void *user_context,
         return halide_error_code_generic_error;
     }
 
-    halide_profiler_get_state()->get_remote_profiler_state = nullptr;
+    halide_profiler_lock(s);
+    s->get_remote_profiler_state = nullptr;
+    halide_profiler_unlock(s);
 
 #ifdef DEBUG_RUNTIME
     uint64_t t_after = halide_current_time_ns(user_context);
@@ -580,6 +592,7 @@ WEAK int halide_hexagon_device_malloc(void *user_context, halide_buffer_t *buf)
     debug(user_context) << "    allocating buffer of " << (uint64_t)size << " bytes\n";
 
 #ifdef DEBUG_RUNTIME
+    halide_start_clock(user_context);
     uint64_t t_before = halide_current_time_ns(user_context);
 #endif
diff --git a/src/runtime/hexagon_remote/bin/host/libhalide_hexagon_host.so b/src/runtime/hexagon_remote/bin/host/libhalide_hexagon_host.so
index 148388d21adf..ee503bb14686 100755
Binary files a/src/runtime/hexagon_remote/bin/host/libhalide_hexagon_host.so and b/src/runtime/hexagon_remote/bin/host/libhalide_hexagon_host.so differ
diff --git a/src/runtime/hexagon_remote/bin/v65/hexagon_sim_remote b/src/runtime/hexagon_remote/bin/v65/hexagon_sim_remote
index c807c5134edd..e6f36f58084a 100755
Binary files a/src/runtime/hexagon_remote/bin/v65/hexagon_sim_remote and b/src/runtime/hexagon_remote/bin/v65/hexagon_sim_remote differ
diff --git a/src/runtime/hexagon_remote/bin/v65/libhalide_hexagon_remote_skel.so b/src/runtime/hexagon_remote/bin/v65/libhalide_hexagon_remote_skel.so
index 852b143d9adf..7c58f73b8dec 100755
Binary files a/src/runtime/hexagon_remote/bin/v65/libhalide_hexagon_remote_skel.so and b/src/runtime/hexagon_remote/bin/v65/libhalide_hexagon_remote_skel.so differ
diff --git a/src/runtime/hexagon_remote/bin/v65/libsim_qurt.a b/src/runtime/hexagon_remote/bin/v65/libsim_qurt.a
index 032a59aa14a4..f0c666e5bb60 100644
Binary files a/src/runtime/hexagon_remote/bin/v65/libsim_qurt.a and b/src/runtime/hexagon_remote/bin/v65/libsim_qurt.a differ
diff --git a/src/runtime/hexagon_remote/bin/v65/signed_by_debug/libhalide_hexagon_remote_skel.so b/src/runtime/hexagon_remote/bin/v65/signed_by_debug/libhalide_hexagon_remote_skel.so
deleted file mode 100644
index 7d8a61050ba5..000000000000
Binary files a/src/runtime/hexagon_remote/bin/v65/signed_by_debug/libhalide_hexagon_remote_skel.so and /dev/null differ
diff --git a/src/runtime/hexagon_remote/qurt/halide_remote.cpp b/src/runtime/hexagon_remote/qurt/halide_remote.cpp
index 2af8b8606cdb..d110803e1a95 100644
--- a/src/runtime/hexagon_remote/qurt/halide_remote.cpp
+++ b/src/runtime/hexagon_remote/qurt/halide_remote.cpp
@@ -419,23 +419,24 @@ int halide_hexagon_remote_release_library(handle_t module_ptr) {
     return 0;
 }
 
+halide_profiler_instance_state *halide_hexagon_remote_profiler_get_global_instance() {
+    static halide_profiler_instance_state hvx_profiler_instance;
+    return &hvx_profiler_instance;
+}
+
 int halide_hexagon_remote_poll_profiler_state(int *func, int *threads) {
     // Increase the current thread priority to match working threads priorities,
     // so profiler can access the remote state without extra latency.
     qurt_thread_t current_thread_id = qurt_thread_get_id();
     qurt_thread_set_priority(current_thread_id, 100);
-    *func = halide_profiler_get_state()->current_func;
-    *threads = halide_profiler_get_state()->active_threads;
+    *func = halide_hexagon_remote_profiler_get_global_instance()->current_func;
+    *threads = halide_hexagon_remote_profiler_get_global_instance()->active_threads;
     return 0;
 }
 
 int halide_hexagon_remote_profiler_set_current_func(int current_func) {
-    halide_profiler_get_state()->current_func = current_func;
+    halide_hexagon_remote_profiler_get_global_instance()->current_func = current_func;
     return 0;
 }
 
-halide_profiler_state *halide_profiler_get_state() {
-    static halide_profiler_state hvx_profiler_state;
-    return &hvx_profiler_state;
-}
 }  // extern "C"
diff --git a/src/runtime/hexagon_remote/qurt/known_symbols.cpp b/src/runtime/hexagon_remote/qurt/known_symbols.cpp
index 48c60b22fd85..0ed27597769e 100644
--- a/src/runtime/hexagon_remote/qurt/known_symbols.cpp
+++ b/src/runtime/hexagon_remote/qurt/known_symbols.cpp
@@ -105,7 +105,6 @@ void *get_known_symbol(const char *name) {
 
         {"halide_error", (char *)(&halide_error)},
         {"halide_print", (char *)(&halide_print)},
-        {"halide_profiler_get_state", (char *)(&halide_profiler_get_state)},
 
         {"qurt_hvx_lock", (char *)(&qurt_hvx_lock)},
         {"qurt_hvx_unlock", (char *)(&qurt_hvx_unlock)},
diff --git a/src/runtime/hexagon_remote/qurt/sim_remote.cpp b/src/runtime/hexagon_remote/qurt/sim_remote.cpp
index 45b89edc2416..771f7fa02e39 100644
--- a/src/runtime/hexagon_remote/qurt/sim_remote.cpp
+++ b/src/runtime/hexagon_remote/qurt/sim_remote.cpp
@@ -152,12 +152,12 @@ int release_library(handle_t module_ptr) {
 }
 
 extern "C" {
-halide_profiler_state profiler_state;
-int *profiler_current_func_addr = &profiler_state.current_func;
-}
+halide_profiler_instance_state hvx_profiler_instance;
+int *profiler_current_func_addr = &hvx_profiler_instance.current_func;
 
-halide_profiler_state *halide_profiler_get_state() {
-    return (halide_profiler_state *)(&profiler_state);
+halide_profiler_instance_state *halide_hexagon_remote_profiler_get_global_instance() {
+    return &hvx_profiler_instance;
+}
 }
 
 extern "C" {
diff --git a/src/runtime/linux_clock.cpp b/src/runtime/linux_clock.cpp
index 4729b2b2bc6d..dbb8fc186cd4 100644
--- a/src/runtime/linux_clock.cpp
+++ b/src/runtime/linux_clock.cpp
@@ -76,7 +76,7 @@ WEAK int64_t halide_current_time_ns(void *user_context) {
 }
 
 extern int usleep(int);
-WEAK void halide_sleep_ms(void *user_context, int ms) {
-    usleep(ms * 1000);
+WEAK void halide_sleep_us(void *user_context, int us) {
+    usleep(us);
 }
 }
diff --git a/src/runtime/osx_clock.cpp b/src/runtime/osx_clock.cpp
index 9da5f99d1c11..a604a76eaf16 100644
--- a/src/runtime/osx_clock.cpp
+++ b/src/runtime/osx_clock.cpp
@@ -45,7 +45,7 @@ WEAK int64_t halide_current_time_ns(void *user_context) {
 }
 
 extern int usleep(int);
-WEAK void halide_sleep_ms(void *user_context, int ms) {
-    usleep(ms * 1000);
+WEAK void halide_sleep_us(void *user_context, int us) {
+    usleep(us);
 }
 }
diff --git a/src/runtime/posix_clock.cpp b/src/runtime/posix_clock.cpp
index ddf258d3e0b2..7f0359b2ef6f 100644
--- a/src/runtime/posix_clock.cpp
+++ b/src/runtime/posix_clock.cpp
@@ -39,7 +39,7 @@ WEAK int64_t halide_current_time_ns(void *user_context) {
 }
 
 extern int usleep(int);
-WEAK void halide_sleep_ms(void *user_context, int ms) {
-    usleep(ms * 1000);
+WEAK void halide_sleep_us(void *user_context, int us) {
+    usleep(us);
 }
 }
diff --git a/src/runtime/posix_timer_profiler.cpp b/src/runtime/posix_timer_profiler.cpp
index 83dcfdfde2e8..9fde50ee0611 100644
--- a/src/runtime/posix_timer_profiler.cpp
+++ b/src/runtime/posix_timer_profiler.cpp
@@ -50,7 +50,7 @@ WEAK extern "C" void halide_start_timer_chain() {
     halide_profiler_state *s = halide_profiler_get_state();
     itimerval timer_state;
     timer_state.it_interval.tv_sec = 0;
-    timer_state.it_interval.tv_usec = s->sleep_time * 1000.0;
+    timer_state.it_interval.tv_usec = s->sleep_time;
     timer_state.it_value = timer_state.it_interval;
 
     signal(SIGPROF, &profiler_handler);
diff --git a/src/runtime/profiler_common.cpp b/src/runtime/profiler_common.cpp
index ccbe0bf11ecb..a5633898b8a9 100644
--- a/src/runtime/profiler_common.cpp
+++ b/src/runtime/profiler_common.cpp
@@ -10,7 +10,16 @@ extern "C" {
 
 // Returns the address of the global halide_profiler state
 WEAK halide_profiler_state *halide_profiler_get_state() {
-    static halide_profiler_state s = {{{0}}, 1, 0, 0, 0, nullptr, nullptr, nullptr};
+    static halide_profiler_state s = {
+        {{0}},    // The mutex
+        nullptr,  // pipeline stats
+        nullptr,  // sampling thread
+        nullptr,  // running instances
+        nullptr,  // get_remote_profiler_state callback
+        1000,     // Sampling interval in us
+        0         // Flag that tells us to shut down when set to 1
+    };
+
     return &s;
 }
 
@@ -19,10 +28,25 @@ extern "C" void halide_start_timer_chain();
 extern "C" void halide_disable_timer_interrupt();
 extern "C" void halide_enable_timer_interrupt();
 #endif
+
+WEAK void halide_profiler_lock(struct halide_profiler_state *state) {
+#if TIMER_PROFILING
+    halide_disable_timer_interrupt();
+#endif
+    halide_mutex_lock(&state->lock);
+}
+
+WEAK void halide_profiler_unlock(struct halide_profiler_state *state) {
+#if TIMER_PROFILING
+    halide_enable_timer_interrupt();
+#endif
+    halide_mutex_unlock(&state->lock);
+}
 }
 
 namespace Halide {
 namespace Runtime {
+
 namespace Internal {
 
 class LockProfiler {
@@ -31,16 +55,10 @@ class LockProfiler {
 public:
     explicit LockProfiler(halide_profiler_state *s)
         : state(s) {
-#if TIMER_PROFILING
-        halide_disable_timer_interrupt();
-#endif
-        halide_mutex_lock(&state->lock);
+        halide_profiler_lock(s);
    }
     ~LockProfiler() {
-        halide_mutex_unlock(&state->lock);
-#if TIMER_PROFILING
-        halide_enable_timer_interrupt();
-#endif
+        halide_profiler_unlock(state);
     }
 };
 
@@ -64,7 +82,6 @@ WEAK halide_profiler_pipeline_stats *find_or_create_pipeline(const char *pipelin
     }
     p->next = s->pipelines;
     p->name = pipeline_name;
-    p->first_func_id = s->first_free_id;
     p->num_funcs = num_funcs;
     p->runs = 0;
     p->time = 0;
@@ -91,60 +108,48 @@ WEAK halide_profiler_pipeline_stats *find_or_create_pipeline(const char *pipelin
         p->funcs[i].active_threads_numerator = 0;
         p->funcs[i].active_threads_denominator = 0;
     }
-    s->first_free_id += num_funcs;
     s->pipelines = p;
 
     return p;
 }
 
-WEAK void bill_func(halide_profiler_state *s, int func_id, uint64_t time, int active_threads) {
-    halide_profiler_pipeline_stats *p_prev = nullptr;
-    for (halide_profiler_pipeline_stats *p = s->pipelines; p;
-         p = (halide_profiler_pipeline_stats *)(p->next)) {
-        if (func_id >= p->first_func_id && func_id < p->first_func_id + p->num_funcs) {
-            if (p_prev) {
-                // Bubble the pipeline to the top to speed up future queries.
-                p_prev->next = (halide_profiler_pipeline_stats *)(p->next);
-                p->next = s->pipelines;
-                s->pipelines = p;
-            }
-            halide_profiler_func_stats *f = p->funcs + func_id - p->first_func_id;
-            f->time += time;
-            f->active_threads_numerator += active_threads;
-            f->active_threads_denominator += 1;
-            p->time += time;
-            p->samples++;
-            p->active_threads_numerator += active_threads;
-            p->active_threads_denominator += 1;
-            return;
-        }
-        p_prev = p;
-    }
-    // Someone must have called reset_state while a kernel was running. Do nothing.
+WEAK void update_running_instance(halide_profiler_instance_state *instance, uint64_t time) {
+    halide_profiler_func_stats *f = instance->funcs + instance->current_func;
+    f->time += time;
+    f->active_threads_numerator += instance->active_threads;
+    f->active_threads_denominator += 1;
+    instance->samples++;
+    instance->active_threads_numerator += instance->active_threads;
+    instance->active_threads_denominator += 1;
+    instance->billed_time += time;
 }
 
 extern "C" WEAK int halide_profiler_sample(struct halide_profiler_state *s, uint64_t *prev_t) {
-    int func, active_threads;
+    if (!s->instances) {
+        // No Halide code is currently running
+        return 0;
+    }
+    halide_profiler_instance_state *instance = s->instances;
+
     if (s->get_remote_profiler_state) {
         // Execution has disappeared into remote code running
         // on an accelerator (e.g. Hexagon DSP)
-        s->get_remote_profiler_state(&func, &active_threads);
-    } else {
-        func = s->current_func;
-        active_threads = s->active_threads;
+
+        // It shouldn't be possible to get into a state where multiple
+        // pipelines are being profiled and one or both of them uses
+        // get_remote_profiler_state.
+        halide_debug_assert(nullptr, s->instances->next == nullptr);
+
+        s->get_remote_profiler_state(&(instance->current_func), &(instance->active_threads));
     }
+
     uint64_t t_now = halide_current_time_ns(nullptr);
-    if (func == halide_profiler_please_stop) {
-#if TIMER_PROFILING
-        s->sampling_thread = nullptr;
-#endif
-        return -1;
-    } else if (func >= 0) {
-        // Assume all time since I was last awake is due to
-        // the currently running func.
-        bill_func(s, func, t_now - *prev_t, active_threads);
+    uint64_t dt = t_now - *prev_t;
+    while (instance) {
+        update_running_instance(instance, dt);
+        instance = instance->next;
     }
     *prev_t = t_now;
-    return s->sleep_time;
+    return 0;
 }
 
 WEAK void sampling_profiler_thread(void *) {
@@ -153,19 +158,17 @@ WEAK void sampling_profiler_thread(void *) {
     // grab the lock
     halide_mutex_lock(&s->lock);
 
-    while (s->current_func != halide_profiler_please_stop) {
-        uint64_t t1 = halide_current_time_ns(nullptr);
-        uint64_t t = t1;
-        while (true) {
-            int sleep_ms = halide_profiler_sample(s, &t);
-            if (sleep_ms < 0) {
-                break;
-            }
-            // Release the lock, sleep, reacquire.
-            halide_mutex_unlock(&s->lock);
-            halide_sleep_ms(nullptr, sleep_ms);
-            halide_mutex_lock(&s->lock);
+    uint64_t t1 = halide_current_time_ns(nullptr);
+    uint64_t t = t1;
+    while (!s->shutdown || s->instances) {
+        int err = halide_profiler_sample(s, &t);
+        if (err < 0) {
+            break;
         }
+        // Release the lock, sleep, reacquire.
+        halide_mutex_unlock(&s->lock);
+        halide_sleep_us(nullptr, s->sleep_time);
+        halide_mutex_lock(&s->lock);
     }
 
     halide_mutex_unlock(&s->lock);
@@ -209,43 +212,135 @@ WEAK halide_profiler_pipeline_stats *halide_profiler_get_pipeline_state(const ch
     return nullptr;
 }
 
-// Returns a token identifying this pipeline instance.
-WEAK int halide_profiler_pipeline_start(void *user_context,
+// Populates the instance state struct
+WEAK int halide_profiler_instance_start(void *user_context,
                                         const char *pipeline_name,
                                         int num_funcs,
-                                        const uint64_t *func_names) {
+                                        const uint64_t *func_names,
+                                        halide_profiler_instance_state *instance) {
+    // Tell the instance where we stashed the per-func state - just after the
+    // instance itself.
+
+    // First check that the layout agrees with the amount of stack space
+    // allocated in the pipeline
+    static_assert((sizeof(halide_profiler_func_stats) & 7) == 0);
+    halide_profiler_func_stats *funcs = (halide_profiler_func_stats *)(instance + 1);
+
+    // Zero initialize the instance and func state
+    memset(instance, 0, (uint8_t *)(funcs + num_funcs) - (uint8_t *)instance);
+
+    instance->funcs = funcs;
+
     halide_profiler_state *s = halide_profiler_get_state();
+    {
+        LockProfiler lock(s);
+
+        // Push this instance onto the running instances list
+        if (s->instances) {
+            s->instances->prev_next = &(instance->next);
+
+            // If there was something already running using the remote polling
+            // method, we can't profile something else at the same time.
+            if (s->get_remote_profiler_state) {
+                error(user_context) << "Cannot profile pipeline " << pipeline_name
+                                    << " while pipeline " << s->instances->pipeline_stats->name
+                                    << " is running, because it is running on a device.";
+                return halide_error_code_cannot_profile_pipeline;
+            }
+        }
+        instance->next = s->instances;
+        instance->prev_next = &(s->instances);
+        s->instances = instance;
+
+        // Find or create the pipeline statistics for this pipeline.
+        halide_profiler_pipeline_stats *p =
+            find_or_create_pipeline(pipeline_name, num_funcs, func_names);
+        if (!p) {
+            // Allocating space to track the statistics failed.
+            return halide_error_out_of_memory(user_context);
+        }
 
-    LockProfiler lock(s);
+        // Tell the instance the pipeline to which it belongs.
+        instance->pipeline_stats = p;
 
-    if (!s->sampling_thread) {
+        if (!s->sampling_thread) {
 #if TIMER_PROFILING
-        halide_start_clock(user_context);
-        halide_start_timer_chain();
-        s->sampling_thread = (halide_thread *)1;
+            halide_start_clock(user_context);
+            halide_start_timer_chain();
+            s->sampling_thread = (halide_thread *)1;
 #else
-        halide_start_clock(user_context);
-        s->sampling_thread = halide_spawn_thread(sampling_profiler_thread, nullptr);
+            halide_start_clock(user_context);
+            s->sampling_thread = halide_spawn_thread(sampling_profiler_thread, nullptr);
 #endif
+        }
     }
 
-    halide_profiler_pipeline_stats *p =
-        find_or_create_pipeline(pipeline_name, num_funcs, func_names);
-    if (!p) {
-        // Allocating space to track the statistics failed.
-        return halide_error_out_of_memory(user_context);
+    instance->start_time = halide_current_time_ns(user_context);
+
+    return 0;
+}
+
+WEAK int halide_profiler_instance_end(void *user_context, halide_profiler_instance_state *instance) {
+    uint64_t end_time = halide_current_time_ns(user_context);
+    halide_profiler_state *s = halide_profiler_get_state();
+    LockProfiler lock(s);
+
+    if (instance->should_collect_statistics) {
+
+        uint64_t true_duration = end_time - instance->start_time;
+        halide_profiler_pipeline_stats *p = instance->pipeline_stats;
+
+        // Retire the instance, accumulating statistics onto the statistics for this
+        // pipeline. Fields related to memory usage are tracked in the pipeline stats.
+        p->samples += instance->samples;
+        p->time += true_duration;
+        p->active_threads_numerator += instance->active_threads_numerator;
+        p->active_threads_denominator += instance->active_threads_denominator;
+        p->memory_total += instance->memory_total;
+        p->memory_peak = max(p->memory_peak, instance->memory_peak);
+        p->num_allocs += instance->num_allocs;
+        p->runs++;
+
+        // Compute an adjustment factor to account for the fact that the billed
+        // time is not equal to the duration between start and end calls. We
+        // could avoid this by just making sure there is a sampling event at the
+        // start and end of the pipeline, but this would overcount whatever the
+        // last value of current_func is at the end of the pipeline, and is
+        // likely to undercount time spent in the first func in a
+        // pipeline. Sampling events need to happen independently (in the random
+        // variable sense) of any changes in current_func.
+        double adjustment = 1;
+        if (instance->billed_time > 0) {
+            adjustment = (double)true_duration / instance->billed_time;
+        }
+
+        for (int f = 0; f < p->num_funcs; f++) {
+            halide_profiler_func_stats *func = p->funcs + f;
+            const halide_profiler_func_stats *instance_func = instance->funcs + f;
+            // clang-tidy wants me to use a c standard library function to do
+            // the rounding below, but those aren't guaranteed to be available
+            // when compiling the runtime.
+            func->time += (uint64_t)(instance_func->time * adjustment + 0.5);  // NOLINT
+            func->active_threads_numerator += instance_func->active_threads_numerator;
+            func->active_threads_denominator += instance_func->active_threads_denominator;
+            func->num_allocs += instance_func->num_allocs;
+            func->stack_peak = max(func->stack_peak, instance_func->stack_peak);
+            func->memory_peak = max(func->memory_peak, instance_func->memory_peak);
+            func->memory_total += instance_func->memory_total;
+        }
     }
 
-    p->runs++;
-    return p->first_func_id;
+    // Remove myself from the doubly-linked list
+    *(instance->prev_next) = instance->next;
+    if (instance->next) {
+        instance->next->prev_next = instance->prev_next;
+    }
+    return 0;
 }
 
 WEAK void halide_profiler_stack_peak_update(void *user_context,
-                                            void *pipeline_state,
+                                            halide_profiler_instance_state *instance,
                                             uint64_t *f_values) {
-    halide_profiler_pipeline_stats *p_stats = (halide_profiler_pipeline_stats *)pipeline_state;
-    halide_abort_if_false(user_context, p_stats != nullptr);
-
     // Note: Update to the counter is done without grabbing the state's lock to
     // reduce lock contention. One potential issue is that other call that frees the
     // pipeline and function stats structs may be running in parallel. However, the
     // current desctructor (called on profiler shutdown) does not free the structs
     // unless user specifically calls halide_profiler_reset().
 
     // Update per-func memory stats
-    for (int i = 0; i < p_stats->num_funcs; ++i) {
+    for (int i = 0; i < instance->pipeline_stats->num_funcs; ++i) {
         if (f_values[i] != 0) {
-            sync_compare_max_and_swap(&(p_stats->funcs[i]).stack_peak, f_values[i]);
+            sync_compare_max_and_swap(&(instance->funcs[i]).stack_peak, f_values[i]);
         }
     }
 }
 
 WEAK void halide_profiler_memory_allocate(void *user_context,
-                                          void *pipeline_state,
+                                          halide_profiler_instance_state *instance,
                                           int func_id,
                                           uint64_t incr) {
     using namespace Halide::Runtime::Internal::Synchronization;
@@ -272,34 +367,34 @@ WEAK void halide_profiler_memory_allocate(void *user_context,
         return;
     }
 
-    halide_profiler_pipeline_stats *p_stats = (halide_profiler_pipeline_stats *)pipeline_state;
-    halide_abort_if_false(user_context, p_stats != nullptr);
+    halide_abort_if_false(user_context, instance != nullptr);
     halide_abort_if_false(user_context, func_id >= 0);
-    halide_abort_if_false(user_context, func_id < p_stats->num_funcs);
+    halide_abort_if_false(user_context, func_id < instance->pipeline_stats->num_funcs);
 
-    halide_profiler_func_stats *f_stats = &p_stats->funcs[func_id];
+    halide_profiler_func_stats *func = &instance->funcs[func_id];
 
     // Note: Update to the counter is done without grabbing the state's lock to
-    // reduce lock contention. One potential issue is that other call that frees the
-    // pipeline and function stats structs may be running in parallel. However, the
-    // current desctructor (called on profiler shutdown) does not free the structs
-    // unless user specifically calls halide_profiler_reset().
-
-    // Update per-pipeline memory stats
-    atomic_add_fetch_sequentially_consistent(&p_stats->num_allocs, 1);
-    atomic_add_fetch_sequentially_consistent(&p_stats->memory_total, incr);
-    uint64_t p_mem_current = atomic_add_fetch_sequentially_consistent(&p_stats->memory_current, incr);
-    sync_compare_max_and_swap(&p_stats->memory_peak, p_mem_current);
+    // reduce lock contention. One potential issue is that another call that
+    // frees the pipeline and function stats structs may be running in
+    // parallel. However, the current destructor (called on profiler shutdown)
+    // does not free the structs unless user specifically calls
+    // halide_profiler_reset().
+
+    // Update per-instance memory stats
+    atomic_add_fetch_sequentially_consistent(&instance->num_allocs, 1);
+    atomic_add_fetch_sequentially_consistent(&instance->memory_total, incr);
+    uint64_t p_mem_current = atomic_add_fetch_sequentially_consistent(&instance->memory_current, incr);
+    sync_compare_max_and_swap(&instance->memory_peak, p_mem_current);
 
     // Update per-func memory stats
-    atomic_add_fetch_sequentially_consistent(&f_stats->num_allocs, 1);
-    atomic_add_fetch_sequentially_consistent(&f_stats->memory_total, incr);
-    uint64_t f_mem_current = atomic_add_fetch_sequentially_consistent(&f_stats->memory_current, incr);
-    sync_compare_max_and_swap(&f_stats->memory_peak, f_mem_current);
+    atomic_add_fetch_sequentially_consistent(&func->num_allocs, 1);
+    atomic_add_fetch_sequentially_consistent(&func->memory_total, incr);
+    uint64_t f_mem_current = atomic_add_fetch_sequentially_consistent(&func->memory_current, incr);
+    sync_compare_max_and_swap(&func->memory_peak, f_mem_current);
 }
 
 WEAK void halide_profiler_memory_free(void *user_context,
-                                      void *pipeline_state,
+                                      halide_profiler_instance_state *instance,
                                       int func_id,
                                       uint64_t decr) {
     using namespace Halide::Runtime::Internal::Synchronization;
@@ -310,12 +405,11 @@ WEAK void halide_profiler_memory_free(void *user_context,
         return;
     }
 
-    halide_profiler_pipeline_stats *p_stats = (halide_profiler_pipeline_stats *)pipeline_state;
-    halide_abort_if_false(user_context, p_stats != nullptr);
+    halide_abort_if_false(user_context, instance != nullptr);
     halide_abort_if_false(user_context, func_id >= 0);
-    halide_abort_if_false(user_context, func_id < p_stats->num_funcs);
+    halide_abort_if_false(user_context, func_id < instance->pipeline_stats->num_funcs);
 
-    halide_profiler_func_stats *f_stats = &p_stats->funcs[func_id];
+    halide_profiler_func_stats *func = &instance->funcs[func_id];
 
     // Note: Update to the counter is done without grabbing the state's lock to
     // reduce lock contention. One potential issue is that other call that frees the
     // pipeline and function stats structs may be running in parallel. However, the
     // current desctructor (called on profiler shutdown) does not free the structs
     // unless user specifically calls halide_profiler_reset().
 
     // Update per-pipeline memory stats
-    atomic_sub_fetch_sequentially_consistent(&p_stats->memory_current, decr);
+    atomic_sub_fetch_sequentially_consistent(&instance->memory_current, decr);
 
     // Update per-func memory stats
-    atomic_sub_fetch_sequentially_consistent(&f_stats->memory_current, decr);
+    atomic_sub_fetch_sequentially_consistent(&func->memory_current, decr);
 }
 
 WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_state *s) {
@@ -360,7 +454,7 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
 
     for (halide_profiler_pipeline_stats *p = s->pipelines; p;
         p = (halide_profiler_pipeline_stats *)(p->next)) {
-        float t = p->time / 1000000.0f;
+        float total_time = p->time / 1000000.0f;
         if (!p->runs) {
             continue;
         }
@@ -368,10 +462,10 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
         bool serial = p->active_threads_numerator == p->active_threads_denominator;
         float threads = p->active_threads_numerator / (p->active_threads_denominator + 1e-10);
         sstr << p->name << "\n"
-             << " total time: " << t << " ms"
+             << " total time: " << total_time << " ms"
             << "  samples: " << p->samples
             << "  runs: " << p->runs
-             << "  time/run: " << t / p->runs << " ms\n";
+             << "  time per run: " << total_time / p->runs << " ms\n";
         if (!serial) {
             sstr << " average threads used: " << threads << "\n";
         }
@@ -423,12 +517,18 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
         for (int i = 0; i < p->num_funcs; i++) {
             halide_profiler_func_stats *fs = p->funcs + i;
 
-            // The first func is always a catch-all overhead
+            // The first id is always a catch-all overhead
             // slot. Only report overhead time if it's non-zero
             if (i == 0 && fs->time == 0) {
                 continue;
             }
 
+            // These two ids are malloc and free. Don't print them if there
+            // were no heap allocations.
+            if ((i == 2 || i == 3) && p->num_allocs == 0) {
+                continue;
+            }
+
             f_stats[f_stats_count++] = fs;
         }
 
@@ -606,7 +706,6 @@ WEAK void halide_profiler_reset_unlocked(halide_profiler_state *s) {
         free(p->funcs);
         free(p);
     }
-    s->first_free_id = 0;
 }
 
 WEAK void halide_profiler_reset() {
@@ -616,6 +715,7 @@ WEAK void halide_profiler_reset() {
     // state without grabbing the global profiler state's lock.
     halide_profiler_state *s = halide_profiler_get_state();
     LockProfiler lock(s);
+    halide_abort_if_false(nullptr, s->instances == nullptr);
     halide_profiler_reset_unlocked(s);
 }
 
@@ -624,12 +724,16 @@ __attribute__((destructor))
 #endif
 WEAK void
 halide_profiler_shutdown() {
+    using namespace Halide::Runtime::Internal::Synchronization;
+
     halide_profiler_state *s = halide_profiler_get_state();
     if (!s->sampling_thread) {
         return;
     }
 
-    s->current_func = halide_profiler_please_stop;
+    int one = 1;
+    atomic_store_relaxed(&(s->shutdown), &one);
+
 #if TIMER_PROFILING
     // Wait for timer interrupt to fire and notice things are shutdown.
     // volatile should be the right tool to use to wait for storage to be
@@ -643,7 +747,9 @@ halide_profiler_shutdown() {
     s->sampling_thread = nullptr;
 #endif
 
-    s->current_func = halide_profiler_outside_of_halide;
+    // The join_thread should have waited for any running instances to
+    // terminate.
+    halide_debug_assert(nullptr, s->instances == nullptr);
 
     // Print results. No need to lock anything because we just shut
     // down the thread.
@@ -664,7 +770,7 @@ WEAK void halide_windows_profiler_shutdown() {
     // sections in a static destructor as it may run after threads
    // have been killed by the OS. Furthermore, may calls, even things
    // like EnterCriticalSection may be set to kill the process if
-    // called during process shutdown. Hence kthis routine doesn't attmept
+    // called during process shutdown. Hence this routine doesn't attempt
    // to clean up state as the destructor does on other platforms.
 
     // Print results. Avoid locking as it will cause problems and
@@ -674,8 +780,4 @@ WEAK void halide_windows_profiler_shutdown() {
 #endif
 }  // namespace
 
-WEAK void halide_profiler_pipeline_end(void *user_context, void *state) {
-    ((halide_profiler_state *)state)->current_func = halide_profiler_outside_of_halide;
-}
-
 }  // extern "C"
diff --git a/src/runtime/profiler_inlined.cpp b/src/runtime/profiler_inlined.cpp
index f31289cbcb36..fc5bda1b1bcc 100644
--- a/src/runtime/profiler_inlined.cpp
+++ b/src/runtime/profiler_inlined.cpp
@@ -3,21 +3,27 @@
 
 extern "C" {
 
-WEAK_INLINE int halide_profiler_set_current_func(halide_profiler_state *state, int pipeline, int func, int *sampling_token) {
+WEAK_INLINE int halide_profiler_set_current_func(halide_profiler_instance_state *instance, int func, int *sampling_token) {
     if (sampling_token == nullptr || *sampling_token == 0) {
 
         // Use empty volatile asm blocks to prevent code motion. Otherwise
         // llvm reorders or elides the stores.
-        volatile int *ptr = &(state->current_func);
+        volatile int *ptr = &(instance->current_func);
         // clang-format off
         asm volatile ("":::);
-        *ptr = pipeline + func;
+        *ptr = func;
         asm volatile ("":::);
         // clang-format on
     }
     return 0;
 }
 
+// Called once we're sure we're not in bounds query code
+WEAK_INLINE int halide_profiler_enable_instance(halide_profiler_instance_state *instance) {
+    instance->should_collect_statistics = 1;
+    return 0;
+}
+
 // Invariant: shared xor local, and both are either 0 or 1. 0 means acquired.
 WEAK_INLINE int halide_profiler_acquire_sampling_token(int32_t *shared, int32_t *local) {
     using namespace Halide::Runtime::Internal::Synchronization;
@@ -42,15 +48,15 @@ WEAK_INLINE int halide_profiler_init_sampling_token(int32_t *sampling_token, int
     return 0;
 }
 
-WEAK_INLINE int halide_profiler_incr_active_threads(halide_profiler_state *state) {
+WEAK_INLINE int halide_profiler_incr_active_threads(halide_profiler_instance_state *instance) {
     using namespace Halide::Runtime::Internal::Synchronization;
-    return atomic_fetch_add_sequentially_consistent(&(state->active_threads), 1);
+    return atomic_fetch_add_sequentially_consistent(&(instance->active_threads), 1);
 }
 
-WEAK_INLINE int halide_profiler_decr_active_threads(halide_profiler_state *state) {
+WEAK_INLINE int halide_profiler_decr_active_threads(halide_profiler_instance_state *instance) {
     using namespace Halide::Runtime::Internal::Synchronization;
-    return atomic_fetch_sub_sequentially_consistent(&(state->active_threads), 1);
+    return atomic_fetch_sub_sequentially_consistent(&(instance->active_threads), 1);
 }
 }
diff --git a/src/runtime/runtime_api.cpp b/src/runtime/runtime_api.cpp
index 7955e8749df7..4ce1092264c9 100644
--- a/src/runtime/runtime_api.cpp
+++ b/src/runtime/runtime_api.cpp
@@ -165,9 +165,10 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = {
     (void *)&halide_print,
     (void *)&halide_profiler_get_pipeline_state,
     (void *)&halide_profiler_get_state,
+    (void *)&halide_profiler_instance_start,
+    (void *)&halide_profiler_instance_end,
     (void *)&halide_profiler_memory_allocate,
     (void *)&halide_profiler_memory_free,
-    (void *)&halide_profiler_pipeline_start,
     (void *)&halide_profiler_report,
     (void *)&halide_profiler_reset,
     (void *)&halide_profiler_stack_peak_update,
@@ -195,7 +196,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = {
     (void *)&halide_set_trace_file,
     (void *)&halide_shutdown_thread_pool,
     (void *)&halide_shutdown_trace,
-    (void *)&halide_sleep_ms,
+    (void *)&halide_sleep_us,
     (void *)&halide_spawn_thread,
     (void *)&halide_start_clock,
     (void *)&halide_start_timer_chain,
diff --git a/src/runtime/runtime_atomics.h b/src/runtime/runtime_atomics.h
index 5c82e0a66008..61139a622d75 100644
--- a/src/runtime/runtime_atomics.h
+++ b/src/runtime/runtime_atomics.h
@@ -128,7 +128,8 @@ ALWAYS_INLINE uintptr_t atomic_or_fetch_relaxed(uintptr_t *addr, uintptr_t val)
     return __sync_or_and_fetch(addr, val);
 }
 
-ALWAYS_INLINE void atomic_store_relaxed(uintptr_t *addr, uintptr_t *val) {
+template<typename T>
+ALWAYS_INLINE void atomic_store_relaxed(T *addr, T *val) {
     *addr = *val;
 }
 
@@ -247,7 +248,8 @@ ALWAYS_INLINE uintptr_t atomic_or_fetch_relaxed(uintptr_t *addr, uintptr_t val)
     return __atomic_or_fetch(addr, val, __ATOMIC_RELAXED);
 }
 
-ALWAYS_INLINE void atomic_store_relaxed(uintptr_t *addr, uintptr_t *val) {
+template<typename T>
+ALWAYS_INLINE void atomic_store_relaxed(T *addr, T *val) {
     __atomic_store(addr, val, __ATOMIC_RELAXED);
 }
 
diff --git a/src/runtime/runtime_internal.h b/src/runtime/runtime_internal.h
index 8df9dcb8eb2c..b42728be27e6 100644
--- a/src/runtime/runtime_internal.h
+++ b/src/runtime/runtime_internal.h
@@ -153,28 +153,35 @@ WEAK void *halide_get_library_symbol(void *lib, const char *name);
 WEAK int halide_start_clock(void *user_context);
 WEAK int64_t halide_current_time_ns(void *user_context);
-WEAK void halide_sleep_ms(void *user_context, int ms);
+WEAK void halide_sleep_us(void *user_context, int us);
 WEAK void halide_device_free_as_destructor(void *user_context, void *obj);
 WEAK void halide_device_and_host_free_as_destructor(void *user_context, void *obj);
 WEAK void halide_device_host_nop_free(void *user_context, void *obj);
 
-// The pipeline_state is declared as void* type since halide_profiler_pipeline_stats
-// is defined inside HalideRuntime.h which includes this header file.
+struct halide_profiler_instance_state;
 WEAK void halide_profiler_stack_peak_update(void *user_context,
-                                            void *pipeline_state,
+                                            halide_profiler_instance_state *instance,
                                             uint64_t *f_values);
 WEAK void halide_profiler_memory_allocate(void *user_context,
-                                          void *pipeline_state,
+                                          halide_profiler_instance_state *instance,
                                           int func_id,
                                           uint64_t incr);
 WEAK void halide_profiler_memory_free(void *user_context,
-                                      void *pipeline_state,
+                                      halide_profiler_instance_state *instance,
                                       int func_id,
                                       uint64_t decr);
-WEAK int halide_profiler_pipeline_start(void *user_context,
+WEAK int halide_profiler_instance_start(void *user_context,
                                         const char *pipeline_name,
                                         int num_funcs,
-                                        const uint64_t *func_names);
+                                        const uint64_t *func_names,
+                                        halide_profiler_instance_state *instance);
+WEAK int halide_profiler_instance_end(void *user_context,
+                                      halide_profiler_instance_state *instance);
+
+WEAK void halide_start_timer_chain();
+WEAK void halide_disable_timer_interrupt();
+WEAK void halide_enable_timer_interrupt();
+
 WEAK int halide_host_cpu_count();
 WEAK int halide_device_and_host_malloc(void *user_context, struct halide_buffer_t *buf,
diff --git a/src/runtime/windows_clock.cpp b/src/runtime/windows_clock.cpp
index 717681070c68..94609f63642c 100644
--- a/src/runtime/windows_clock.cpp
+++ b/src/runtime/windows_clock.cpp
@@ -37,7 +37,11 @@ WEAK int64_t halide_current_time_ns(void *user_context) {
     return (int64_t)(ns_per_tick * clock);
 }
 
-WEAK void halide_sleep_ms(void *user_context, int ms) {
-    Sleep(ms);
+WEAK void halide_sleep_us(void *user_context, int us) {
+    // Unfortunately you can't sleep on windows in microsecond amounts. The
+    // below call results in calling Sleep(0) for times less than 1 ms, which is
+    // documented to yield the rest of the time slice. If there is no other
+    // thread waiting to run, it may get rescheduled immediately.
+    Sleep(us / 1000);
 }
 }
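
Note on the empty-asm idiom in profiler_inlined.cpp above: the technique is easy to get wrong, so here is a minimal standalone sketch of it, assuming a gcc/clang-compatible compiler. The names record_func and g_current_func are illustrative stand-ins for instance->current_func, not part of this patch.

    #include <cstdio>

    // Stand-in for the per-instance current_func slot in the patch.
    static volatile int g_current_func = 0;

    static inline void record_func(int func) {
        volatile int *ptr = &g_current_func;
        // The asm blocks emit no instructions, but because they are volatile
        // the compiler may not move or delete the store across them, so the
        // sampling thread observes a genuine write for every call.
        asm volatile("":::);
        *ptr = func;
        asm volatile("":::);
    }

    int main() {
        record_func(42);
        printf("current func id: %d\n", g_current_func);
        return 0;
    }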
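Why atomic_store_relaxed in runtime_atomics.h had to become a template: the shutdown flag written in halide_profiler_shutdown is an int, and the old uintptr_t-only overload would not compile at that call site. A sketch under that assumption (profiler_state_sketch is a stand-in, not the real halide_profiler_state layout):

    template<typename T>
    inline void atomic_store_relaxed(T *addr, T *val) {
        // Generic gcc/clang builtin; works for any trivially-copyable T.
        __atomic_store(addr, val, __ATOMIC_RELAXED);
    }

    struct profiler_state_sketch {
        int shutdown;  // stand-in for the flag polled by the sampling thread
    };

    void request_shutdown(profiler_state_sketch *s) {
        int one = 1;
        // Mirrors the call added in halide_profiler_shutdown; T deduces to
        // int, which the old (uintptr_t *, uintptr_t *) signature rejected.
        atomic_store_relaxed(&s->shutdown, &one);
    }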
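On the halide_sleep_ms to halide_sleep_us rename: the windows_clock.cpp version above can only approximate microsecond sleeps via Sleep(us / 1000). For contrast, a POSIX implementation can honor the finer granularity directly. This is a hedged sketch of what that might look like, not the actual posix-side change from this patch:

    #include <time.h>

    // Hypothetical POSIX counterpart of the Windows version above.
    extern "C" void halide_sleep_us_sketch(void *user_context, int us) {
        struct timespec ts;
        ts.tv_sec = us / 1000000;
        ts.tv_nsec = (long)(us % 1000000) * 1000;  // microseconds -> nanoseconds
        // nanosleep may return early on EINTR; a sampling loop tolerates that.
        nanosleep(&ts, nullptr);
    }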