diff --git a/src/BoundsInference.cpp b/src/BoundsInference.cpp
index 21ca06e07285..84f076d94537 100644
--- a/src/BoundsInference.cpp
+++ b/src/BoundsInference.cpp
@@ -1388,6 +1388,13 @@ Stmt bounds_inference(Stmt s,
     Expr marker = Call::make(Int(32), Call::skip_stages_marker, {}, Call::Intrinsic);
     s = Block::make(Evaluate::make(marker), s);
 
+    if (target.has_feature(Target::Profile) || target.has_feature(Target::ProfileByTimer)) {
+        // Add a note in the IR for what profiling should cover, so that it doesn't
+        // include bounds queries as pipeline executions.
+        marker = Call::make(Int(32), Call::profiling_enable_instance_marker, {}, Call::Intrinsic);
+        s = Block::make(Evaluate::make(marker), s);
+    }
+
     // Add a note in the IR for where assertions on input images
     // should go. Those are handled by a later lowering pass.
     marker = Call::make(Int(32), Call::add_image_checks_marker, {}, Call::Intrinsic);
diff --git a/src/CodeGen_Internal.cpp b/src/CodeGen_Internal.cpp
index b68373f23ccf..21a0cea839df 100644
--- a/src/CodeGen_Internal.cpp
+++ b/src/CodeGen_Internal.cpp
@@ -51,8 +51,8 @@ bool function_takes_user_context(const std::string &name) {
         "halide_print",
         "halide_profiler_memory_allocate",
         "halide_profiler_memory_free",
-        "halide_profiler_pipeline_start",
-        "halide_profiler_pipeline_end",
+        "halide_profiler_instance_start",
+        "halide_profiler_instance_end",
         "halide_profiler_stack_peak_update",
         "halide_spawn_thread",
         "halide_device_release",
diff --git a/src/IR.cpp b/src/IR.cpp
index 81cf0a0f41ff..804e41234f71 100644
--- a/src/IR.cpp
+++ b/src/IR.cpp
@@ -653,6 +653,7 @@ const char *const intrinsic_op_names[] = {
     "mux",
     "popcount",
    "prefetch",
+    "profiling_enable_instance_marker",
     "promise_clamped",
     "random",
     "register_destructor",
diff --git a/src/IR.h b/src/IR.h
index d3f6af596f31..f21f3a9a52ba 100644
--- a/src/IR.h
+++ b/src/IR.h
@@ -569,6 +569,7 @@ struct Call : public ExprNode<Call> {
         mux,
         popcount,
         prefetch,
+        profiling_enable_instance_marker,
         promise_clamped,
         random,
         register_destructor,
diff --git a/src/Profiling.cpp b/src/Profiling.cpp
index 414578299df6..b0178b4a073a 100644
--- a/src/Profiling.cpp
+++ b/src/Profiling.cpp
@@ -23,14 +23,38 @@ using std::vector;
 
 namespace {
 
-Stmt incr_active_threads(const Expr &profiler_state) {
+// All names that need to be unique, just in case someone does something
+// perverse like naming a func "profiler_instance".
+struct Names {
+    const std::string &pipeline_name;
+    std::string profiler_instance;
+    std::string profiler_local_sampling_token;
+    std::string profiler_shared_sampling_token;
+    std::string hvx_profiler_instance;
+    std::string profiler_func_names;
+    std::string profiler_func_stack_peak_buf;
+    std::string profiler_start_error_code;
+
+    Names(const std::string &pipeline_name)
+        : pipeline_name(pipeline_name),
+          profiler_instance(unique_name("profiler_instance")),
+          profiler_local_sampling_token(unique_name("profiler_local_sampling_token")),
+          profiler_shared_sampling_token(unique_name("profiler_shared_sampling_token")),
+          hvx_profiler_instance(unique_name("hvx_profiler_instance")),
+          profiler_func_names(unique_name("profiler_func_names")),
+          profiler_func_stack_peak_buf(unique_name("profiler_func_stack_peak_buf")),
+          profiler_start_error_code(unique_name("profiler_start_error_code")) {
+    }
+};
+
+Stmt incr_active_threads(const Expr &profiler_instance) {
     return Evaluate::make(Call::make(Int(32), "halide_profiler_incr_active_threads",
-                                     {profiler_state}, Call::Extern));
+                                     {profiler_instance}, Call::Extern));
 }
 
-Stmt decr_active_threads(const Expr &profiler_state) {
+Stmt decr_active_threads(const Expr &profiler_instance) {
     return Evaluate::make(Call::make(Int(32), "halide_profiler_decr_active_threads",
-                                     {profiler_state}, Call::Extern));
+                                     {profiler_instance}, Call::Extern));
 }
 
 Stmt acquire_sampling_token(const Expr &shared_token, const Expr &local_token) {
@@ -43,18 +67,6 @@ Stmt release_sampling_token(const Expr &shared_token, const Expr &local_token) {
                                      {shared_token, local_token}, Call::Extern));
 }
 
-Stmt activate_thread(const Stmt &s, const Expr &profiler_state) {
-    return Block::make({incr_active_threads(profiler_state),
-                        s,
-                        decr_active_threads(profiler_state)});
-}
-
-Stmt suspend_thread(const Stmt &s, const Expr &profiler_state) {
-    return Block::make({decr_active_threads(profiler_state),
-                        s,
-                        incr_active_threads(profiler_state)});
-}
-
 Stmt claim_sampling_token(const Stmt &s, const Expr &shared_token, const Expr &local_token) {
     return LetStmt::make(local_token.as<Variable>()->name,
                          Call::make(Handle(), Call::alloca, {Int(32).bytes()}, Call::Intrinsic),
@@ -70,38 +82,62 @@ class InjectProfiling : public IRMutator {
 
     vector<int> stack;  // What produce nodes are we currently inside of.
-    string pipeline_name;
+    const Names &names;
     const map<string, Function> &env;
     bool in_fork = false;
     bool in_parallel = false;
     bool in_leaf_task = false;
 
-    InjectProfiling(const string &pipeline_name, const map<string, Function> &env)
-        : pipeline_name(pipeline_name), env(env) {
+    InjectProfiling(const Names &names, const map<string, Function> &env)
+        : names(names), env(env) {
         stack.push_back(get_func_id("overhead"));
         // ID 0 is treated specially in the runtime as overhead
         internal_assert(stack.back() == 0);
+        waiting_on_tasks_id = get_func_id("waiting for parallel tasks to finish");
         malloc_id = get_func_id("halide_malloc");
         free_id = get_func_id("halide_free");
-        profiler_pipeline_state = Variable::make(Handle(), "profiler_pipeline_state");
-        profiler_state = Variable::make(Handle(), "profiler_state");
-        profiler_token = Variable::make(Int(32), "profiler_token");
-        profiler_local_sampling_token = Variable::make(Handle(), "profiler_local_sampling_token");
-        profiler_shared_sampling_token = Variable::make(Handle(), "profiler_shared_sampling_token");
+
+        profiler_instance = Variable::make(Handle(), names.profiler_instance);
+        profiler_local_sampling_token = Variable::make(Handle(), names.profiler_local_sampling_token);
+        profiler_shared_sampling_token = Variable::make(Handle(), names.profiler_shared_sampling_token);
     }
 
     map<int, uint64_t> func_stack_current;  // map from func id -> current stack allocation
     map<int, uint64_t> func_stack_peak;     // map from func id -> peak stack allocation
 
+    Stmt activate_thread(const Stmt &s) {
+        return activate_thread_helper(s, waiting_on_tasks_id);
+    }
+
+    Stmt activate_main_thread(const Stmt &s) {
+        // The same as a child task, except when we finish (but before the
+        // instances get popped), bill anything as overhead.
+        return activate_thread_helper(s, 0);
+    }
+
+    Stmt activate_thread_helper(const Stmt &s, int final_id) {
+        return Block::make({incr_active_threads(profiler_instance),
+                            unconditionally_set_current_func(stack.back()),
+                            s,
+                            decr_active_threads(profiler_instance),
+                            unconditionally_set_current_func(final_id)});
+    }
+
+    Stmt suspend_thread(const Stmt &s) {
+        return Block::make({decr_active_threads(profiler_instance),
+                            unconditionally_set_current_func(waiting_on_tasks_id),
+                            s,
+                            incr_active_threads(profiler_instance),
+                            unconditionally_set_current_func(stack.back())});
+    }
+
 private:
     using IRMutator::visit;
 
-    int malloc_id, free_id;
-    Expr profiler_pipeline_state;
-    Expr profiler_state;
-    Expr profiler_token;
+    int malloc_id, free_id, waiting_on_tasks_id;
+    Expr profiler_instance;
     Expr profiler_local_sampling_token;
     Expr profiler_shared_sampling_token;
 
@@ -157,6 +193,12 @@ class InjectProfiling : public IRMutator {
         return idx;
     }
 
+    Stmt unconditionally_set_current_func(int id) {
+        Stmt s = Evaluate::make(Call::make(Int(32), "halide_profiler_set_current_func",
+                                           {profiler_instance, id, reinterpret(Handle(), cast<uint64_t>(0))}, Call::Extern));
+        return s;
+    }
+
     Stmt set_current_func(int id) {
         if (most_recently_set_func == id) {
             return Evaluate::make(0);
@@ -165,7 +207,7 @@ class InjectProfiling : public IRMutator {
         Expr last_arg = in_leaf_task ? profiler_local_sampling_token : reinterpret(Handle(), cast<uint64_t>(0));
         // This call gets inlined and becomes a single store instruction.
         Stmt s = Evaluate::make(Call::make(Int(32), "halide_profiler_set_current_func",
-                                           {profiler_state, profiler_token, id, last_arg}, Call::Extern));
+                                           {profiler_instance, id, last_arg}, Call::Extern));
         return s;
     }
 
@@ -203,6 +245,17 @@ class InjectProfiling : public IRMutator {
         return size;
     }
 
+    Expr visit(const Call *op) override {
+        if (op->is_intrinsic(Call::profiling_enable_instance_marker)) {
+            // We're out of the bounds query code. This instance should be
+            // tracked (including any samples taken before this point).
+            return Call::make(Int(32), "halide_profiler_enable_instance",
+                              {profiler_instance}, Call::Extern);
+        } else {
+            return IRMutator::visit(op);
+        }
+    }
+
     Stmt visit(const Allocate *op) override {
         auto [new_extents, changed] = mutate_with_changes(op->extents);
@@ -229,7 +282,7 @@ class InjectProfiling : public IRMutator {
                 func_stack_current[idx] += *int_size;
                 func_stack_peak[idx] = std::max(func_stack_peak[idx], func_stack_current[idx]);
                 debug(3) << "  Allocation on stack: " << op->name
-                         << "(" << size << ") in pipeline " << pipeline_name
+                         << "(" << size << ") in pipeline " << names.pipeline_name
                          << "; current: " << func_stack_current[idx]
                          << "; peak: " << func_stack_peak[idx] << "\n";
             }
@@ -240,11 +293,11 @@ class InjectProfiling : public IRMutator {
                 int idx = get_func_id(op->name);
                 debug(3) << "  Allocation on heap: " << op->name
                          << "(" << size << ") in pipeline "
-                         << pipeline_name << "\n";
+                         << names.pipeline_name << "\n";
 
                 tasks.push_back(set_current_func(malloc_id));
                 tasks.push_back(Evaluate::make(Call::make(Int(32), "halide_profiler_memory_allocate",
-                                                          {profiler_pipeline_state, idx, size}, Call::Extern)));
+                                                          {profiler_instance, idx, size}, Call::Extern)));
             }
 
             Stmt body = mutate(op->body);
@@ -281,12 +334,12 @@ class InjectProfiling : public IRMutator {
         if (!alloc.on_stack) {
             if (profiling_memory) {
                 int idx = get_func_id(op->name);
-                debug(3) << "  Free on heap: " << op->name << "(" << alloc.size << ") in pipeline " << pipeline_name << "\n";
+                debug(3) << "  Free on heap: " << op->name << "(" << alloc.size << ") in pipeline " << names.pipeline_name << "\n";
 
                 vector<Stmt> tasks{
                     set_current_func(free_id),
                     Evaluate::make(Call::make(Int(32), "halide_profiler_memory_free",
-                                              {profiler_pipeline_state, idx, alloc.size}, Call::Extern)),
+                                              {profiler_instance, idx, alloc.size}, Call::Extern)),
                     stmt,
                     set_current_func(stack.back())};
@@ -304,8 +357,10 @@ class InjectProfiling : public IRMutator {
                     idx = get_func_id(op->name);
                 }
                 func_stack_current[idx] -= *int_size;
-                debug(3) << "  Free on stack: " << op->name << "(" << alloc.size << ") in pipeline " << pipeline_name
-                         << "; current: " << func_stack_current[idx] << "; peak: " << func_stack_peak[idx] << "\n";
+                debug(3) << "  Free on stack: " << op->name
+                         << "(" << alloc.size << ") in pipeline " << names.pipeline_name
+                         << "; current: " << func_stack_current[idx]
+                         << "; peak: " << func_stack_peak[idx] << "\n";
             }
         }
         return stmt;
@@ -345,7 +400,7 @@ class InjectProfiling : public IRMutator {
         } else if (const Acquire *a = s.as<Acquire>()) {
             s = Acquire::make(a->semaphore, a->count, visit_parallel_task(a->body));
         } else {
-            s = activate_thread(mutate(s), profiler_state);
+            s = activate_thread(mutate(s));
         }
         if (most_recently_set_func != old) {
             most_recently_set_func = -1;
@@ -355,13 +410,13 @@ class InjectProfiling : public IRMutator {
     Stmt visit(const Acquire *op) override {
         Stmt s = visit_parallel_task(op);
-        return suspend_thread(s, profiler_state);
+        return suspend_thread(s);
     }
 
     Stmt visit(const Fork *op) override {
         ScopedValue<bool> bind(in_fork, true);
         Stmt s = visit_parallel_task(op);
-        return suspend_thread(s, profiler_state);
+        return suspend_thread(s);
     }
 
     Stmt visit(const For *op) override {
@@ -378,7 +433,6 @@ class InjectProfiling : public IRMutator {
         bool leaf_task = false;
         if (update_active_threads) {
-            body = activate_thread(body, profiler_state);
 
             class ContainsParallelOrBlockingNode : public IRVisitor {
                 using IRVisitor::visit;
@@ -404,6 +458,8 @@ class InjectProfiling : public IRMutator {
             if (leaf_task) {
                 body = claim_sampling_token(body, profiler_shared_sampling_token, profiler_local_sampling_token);
             }
+
+            body = activate_thread(body);
         }
 
         ScopedValue<bool> bind_leaf_task(in_leaf_task, in_leaf_task || leaf_task);
@@ -423,9 +479,9 @@ class InjectProfiling : public IRMutator {
             // Get the profiler state pointer from scratch inside the
             // kernel. There will be a separate copy of the state on
             // the DSP that the host side will periodically query.
-            Expr get_state = Call::make(Handle(), "halide_profiler_get_state", {}, Call::Extern);
-            body = substitute("profiler_state", Variable::make(Handle(), "hvx_profiler_state"), body);
-            body = LetStmt::make("hvx_profiler_state", get_state, body);
+            Expr get_state = Call::make(Handle(), "halide_hexagon_remote_profiler_get_global_instance", {}, Call::Extern);
+            body = substitute(names.profiler_instance, Variable::make(Handle(), names.hvx_profiler_instance), body);
+            body = LetStmt::make(names.hvx_profiler_instance, get_state, body);
         } else if (op->device_api == DeviceAPI::None ||
                    op->device_api == DeviceAPI::Host) {
             body = mutate(body);
@@ -440,7 +496,7 @@ class InjectProfiling : public IRMutator {
         Stmt stmt = For::make(op->name, op->min, op->extent, op->for_type, op->partition_policy, op->device_api, body);
 
         if (update_active_threads) {
-            stmt = suspend_thread(stmt, profiler_state);
+            stmt = suspend_thread(stmt);
         }
 
         return stmt;
@@ -538,77 +594,84 @@ class InjectProfiling : public IRMutator {
 
 }  // namespace
 
-Stmt inject_profiling(Stmt s, const string &pipeline_name, const std::map<string, Function> &env) {
-    InjectProfiling profiling(pipeline_name, env);
-    s = profiling.mutate(s);
+Stmt inject_profiling(const Stmt &stmt, const string &pipeline_name, const std::map<string, Function> &env) {
+    Names names(pipeline_name);
+
+    InjectProfiling profiling(names, env);
+    Stmt s = profiling.mutate(stmt);
 
     int num_funcs = (int)(profiling.indices.size());
 
-    Expr func_names_buf = Variable::make(Handle(), "profiling_func_names");
+    // TODO: unique_name all these strings
 
-    Expr start_profiler = Call::make(Int(32), "halide_profiler_pipeline_start",
-                                     {pipeline_name, num_funcs, func_names_buf}, Call::Extern);
+    Expr instance = Variable::make(Handle(), names.profiler_instance);
 
-    Expr get_state = Call::make(Handle(), "halide_profiler_get_state", {}, Call::Extern);
+    Expr func_names_buf = Variable::make(Handle(), names.profiler_func_names);
 
-    Expr get_pipeline_state = Call::make(Handle(), "halide_profiler_get_pipeline_state", {pipeline_name}, Call::Extern);
+    Expr start_profiler = Call::make(Int(32), "halide_profiler_instance_start",
+                                     {pipeline_name, num_funcs, func_names_buf, instance}, Call::Extern);
 
-    Expr profiler_token = Variable::make(Int(32), "profiler_token");
+    Expr profiler_start_error_code = Variable::make(Int(32), names.profiler_start_error_code);
 
     Expr stop_profiler = Call::make(Handle(), Call::register_destructor,
-                                    {Expr("halide_profiler_pipeline_end"), get_state}, Call::Intrinsic);
+                                    {Expr("halide_profiler_instance_end"), instance}, Call::Intrinsic);
 
     bool no_stack_alloc = profiling.func_stack_peak.empty();
     if (!no_stack_alloc) {
-        Expr func_stack_peak_buf = Variable::make(Handle(), "profiling_func_stack_peak_buf");
+        Expr func_stack_peak_buf = Variable::make(Handle(), names.profiler_func_stack_peak_buf);
 
-        Expr profiler_pipeline_state = Variable::make(Handle(), "profiler_pipeline_state");
         Stmt update_stack = Evaluate::make(Call::make(Int(32), "halide_profiler_stack_peak_update",
-                                                      {profiler_pipeline_state, func_stack_peak_buf}, Call::Extern));
+                                                      {instance, func_stack_peak_buf}, Call::Extern));
         s = Block::make(update_stack, s);
     }
 
-    Expr profiler_state = Variable::make(Handle(), "profiler_state");
-
-    s = activate_thread(s, profiler_state);
+    s = profiling.activate_main_thread(s);
 
     // Initialize the shared sampling token
-    Expr shared_sampling_token_var = Variable::make(Handle(), "profiler_shared_sampling_token");
+    Expr shared_sampling_token_var = Variable::make(Handle(), names.profiler_shared_sampling_token);
     Expr init_sampling_token =
         Call::make(Int(32), "halide_profiler_init_sampling_token", {shared_sampling_token_var, 0}, Call::Extern);
     s = Block::make({Evaluate::make(init_sampling_token), s});
 
-    s = LetStmt::make("profiler_shared_sampling_token",
+    s = LetStmt::make(names.profiler_shared_sampling_token,
                       Call::make(Handle(), Call::alloca, {Int(32).bytes()}, Call::Intrinsic), s);
-    s = LetStmt::make("profiler_pipeline_state", get_pipeline_state, s);
-    s = LetStmt::make("profiler_state", get_state, s);
 
     // If there was a problem starting the profiler, it will call an
     // appropriate halide error function and then return the
     // (negative) error code as the token.
-    s = Block::make(AssertStmt::make(profiler_token >= 0, profiler_token), s);
-    s = LetStmt::make("profiler_token", start_profiler, s);
+    s = Block::make(AssertStmt::make(profiler_start_error_code == 0, profiler_start_error_code), s);
+    s = LetStmt::make(names.profiler_start_error_code, start_profiler, s);
 
     if (!no_stack_alloc) {
         for (int i = num_funcs - 1; i >= 0; --i) {
-            s = Block::make(Store::make("profiling_func_stack_peak_buf",
+            s = Block::make(Store::make(names.profiler_func_stack_peak_buf,
                                         make_const(UInt(64), profiling.func_stack_peak[i]),
                                         i, Parameter(), const_true(), ModulusRemainder()),
                             s);
        }
-        s = Block::make(s, Free::make("profiling_func_stack_peak_buf"));
-        s = Allocate::make("profiling_func_stack_peak_buf", UInt(64),
+        s = Block::make(s, Free::make(names.profiler_func_stack_peak_buf));
+        s = Allocate::make(names.profiler_func_stack_peak_buf, UInt(64),
                            MemoryType::Auto, {num_funcs}, const_true(), s);
     }
 
     for (const auto &p : profiling.indices) {
-        s = Block::make(Store::make("profiling_func_names", p.first, p.second, Parameter(), const_true(), ModulusRemainder()), s);
+        s = Block::make(Store::make(names.profiler_func_names, p.first, p.second, Parameter(), const_true(), ModulusRemainder()), s);
     }
 
-    s = Block::make(s, Free::make("profiling_func_names"));
-    s = Allocate::make("profiling_func_names", Handle(),
+    s = Block::make(s, Free::make(names.profiler_func_names));
+    s = Allocate::make(names.profiler_func_names, Handle(),
                        MemoryType::Auto, {num_funcs}, const_true(), s);
     s = Block::make(Evaluate::make(stop_profiler), s);
 
+    // Allocate memory for the profiler instance state
+
+    // Check there isn't going to be end-of-struct padding to worry about.
+    static_assert((sizeof(halide_profiler_func_stats) & 7) == 0);
+
+    const int instance_size_bytes = sizeof(halide_profiler_instance_state) + num_funcs * sizeof(halide_profiler_func_stats);
+
+    s = Allocate::make(names.profiler_instance, UInt(64), MemoryType::Auto,
+                       {(instance_size_bytes + 7) / 8}, const_true(), s);
+
     // We have nested definitions of the sampling token
     s = uniquify_variable_names(s);
 
diff --git a/src/Profiling.h b/src/Profiling.h
index afaa47fe6d6e..b91644409655 100644
--- a/src/Profiling.h
+++ b/src/Profiling.h
@@ -40,7 +40,7 @@ class Function;
 * storage flattening, but after all bounds inference.
 *
 */
-Stmt inject_profiling(Stmt, const std::string &, const std::map<std::string, Function> &env);
+Stmt inject_profiling(const Stmt &, const std::string &, const std::map<std::string, Function> &env);
 
 }  // namespace Internal
 }  // namespace Halide
diff --git a/src/runtime/HalideRuntime.h b/src/runtime/HalideRuntime.h
index 4ddb52cd89bf..736d64478c8b 100644
--- a/src/runtime/HalideRuntime.h
+++ b/src/runtime/HalideRuntime.h
@@ -1265,6 +1265,9 @@ enum halide_error_code_t {
     /** "vscale" value of Scalable Vector detected in runtime does not match
      * the vscale value used in compilation. */
     halide_error_code_vscale_invalid = -47,
+
+    /** Profiling failed for a pipeline invocation. */
+    halide_error_code_cannot_profile_pipeline = -48,
 };
 
 /** Halide calls the functions below on various error conditions. The
@@ -1849,7 +1852,7 @@ struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_func_stats {
 /** Per-pipeline state tracked by the sampling profiler. These exist
  * in a linked list. */
 struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_pipeline_stats {
-    /** Total time spent inside this pipeline (in nanoseconds) */
+    /** Total time spent in this pipeline (in nanoseconds) */
     uint64_t time;
 
     /** The current memory allocation of funcs in this pipeline. */
@@ -1878,9 +1881,6 @@ struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_pipeline_stats {
     /** The number of funcs in this pipeline. */
     int num_funcs;
 
-    /** An internal base id used to identify the funcs in this pipeline. */
-    int first_func_id;
-
     /** The number of times this pipeline has been run. */
     int runs;
 
@@ -1891,48 +1891,98 @@ struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_pipeline_stats {
     int num_allocs;
 };
 
-/** The global state of the profiler. */
+/** Per-invocation-of-a-pipeline state. Lives on the stack of the Halide
+ * code. Exists in a doubly-linked list so that it can be cleanly
+ * removed. */
+struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_instance_state {
+    /** Time billed to funcs in this instance by the sampling thread. */
+    uint64_t billed_time;
 
-struct halide_profiler_state {
-    /** Guards access to the fields below. If not locked, the sampling
-     * profiler thread is free to modify things below (including
-     * reordering the linked list of pipeline stats). */
-    struct halide_mutex lock;
+    /** Wall clock time of the start of the instance. */
+    uint64_t start_time;
 
-    /** The amount of time the profiler thread sleeps between samples
-     * in milliseconds. Defaults to 1 */
-    int sleep_time;
+    /** The current memory allocation of funcs in this instance. */
+    uint64_t memory_current;
 
-    /** An internal id used for bookkeeping. */
-    int first_free_id;
+    /** The peak memory allocation of funcs in this instance. */
+    uint64_t memory_peak;
+
+    /** The total memory allocation of funcs in this instance. */
+    uint64_t memory_total;
+
+    /** The average number of thread pool worker threads doing useful
+     * work while computing this instance. */
+    uint64_t active_threads_numerator, active_threads_denominator;
+
+    /** A pointer to the next running instance, so that the running instances
+     * can exist in a linked list. */
+    struct halide_profiler_instance_state *next;
+
+    /** A pointer to the address of the next pointer of the previous instance,
+     * so that this can be removed from the linked list when the instance
+     * terminates. */
+    struct halide_profiler_instance_state **prev_next;
+
+    /** Information shared across all instances. The stats above are merged into
+     * it when the instance is retired. */
+    struct halide_profiler_pipeline_stats *pipeline_stats;
+
+    /** An array containing states for each Func in this instance of this pipeline. */
+    struct halide_profiler_func_stats *funcs;
 
     /** The id of the current running Func. Set by the pipeline, read
      * periodically by the profiler thread. */
     int current_func;
 
-    /** The number of threads currently doing work. */
+    /** The number of threads currently doing work on this pipeline instance. */
     int active_threads;
 
+    /** The number of samples taken by this instance. */
+    int samples;
+
+    /** The total number of memory allocations of funcs in this instance. */
+    int num_allocs;
+
+    /** Whether or not this instance should count towards pipeline
+     * statistics. */
+    int should_collect_statistics;
+};
+
+/** The global state of the profiler. */
+struct halide_profiler_state {
+    /** Guards access to the fields below. If not locked, the sampling
+     * profiler thread is free to modify things below (including
+     * reordering the linked list of pipeline stats). */
+    struct halide_mutex lock;
+
     /** A linked list of stats gathered for each pipeline. */
     struct halide_profiler_pipeline_stats *pipelines;
 
     /** Retrieve remote profiler state. Used so that the sampling
      * profiler can follow along with execution that occurs elsewhere,
      * e.g. on a DSP. If null, it reads from the int above instead. */
-    void (*get_remote_profiler_state)(int *func, int *active_workers);
 
     /** Sampling thread reference to be joined at shutdown. */
     struct halide_thread *sampling_thread;
-};
 
-/** Profiler func ids with special meanings. */
-enum {
-    /// current_func takes on this value when not inside Halide code
-    halide_profiler_outside_of_halide = -1,
-    /// Set current_func to this value to tell the profiling thread to
-    /// halt. It will start up again next time you run a pipeline with
-    /// profiling enabled.
-    halide_profiler_please_stop = -2
+    /** The running instances of Halide pipelines. */
+    struct halide_profiler_instance_state *instances;
+
+    /** If this callback is defined, the profiler asserts that there is a single
+     * live instance, and then uses it to get the current func and number of
+     * active threads instead of reading the fields in the instance. This is used
+     * so that the profiler can follow along with execution that occurs
+     * elsewhere (e.g. on an accelerator). */
+    void (*get_remote_profiler_state)(int *func, int *active_workers);
+
+    /** The amount of time the profiler thread sleeps between samples in
+     * microseconds. Defaults to 1000. To change it call
+     * halide_profiler_get_state and mutate this field. */
+    int sleep_time;
+
+    /** Set to 1 when you want the profiler to wait for all running instances to
+     * finish and then stop gracefully. */
+    int shutdown;
 };
 
 /** Get a pointer to the global profiler state for programmatic
@@ -1950,34 +2000,24 @@ extern struct halide_profiler_pipeline_stats *halide_profiler_get_pipeline_state
 * accurate time interval if desired.
 */
 extern int halide_profiler_sample(struct halide_profiler_state *s, uint64_t *prev_t);
 
-/** Reset profiler state cheaply. May leave threads running or some
- * memory allocated but all accumluated statistics are reset.
- * WARNING: Do NOT call this method while any halide pipeline is
- * running; halide_profiler_memory_allocate/free and
- * halide_profiler_stack_peak_update update the profiler pipeline's
- * state without grabbing the global profiler state's lock. */
+/** Reset profiler state cheaply. May leave threads running or some memory
+ * allocated but all accumulated statistics are reset. Blocks until all running
+ * profiled Halide pipelines exit. */
 extern void halide_profiler_reset(void);
 
-/** Reset all profiler state.
- * WARNING: Do NOT call this method while any halide pipeline is
- * running; halide_profiler_memory_allocate/free and
- * halide_profiler_stack_peak_update update the profiler pipeline's
- * state without grabbing the global profiler state's lock. */
-void halide_profiler_shutdown(void);
+/** Reset all profiler state. Blocks until all running profiled Halide
+ * pipelines exit. */
+extern void halide_profiler_shutdown(void);
 
 /** Print out timing statistics for everything run since the last
  * reset. Also happens at process exit. */
 extern void halide_profiler_report(void *user_context);
 
-/** For timer based profiling, this routine starts the timer chain running.
- * halide_get_profiler_state can be called to get the current timer interval.
- */
-extern void halide_start_timer_chain(void);
 /** These routines are called to temporarily disable and then reenable
- * timer interuppts for profiling */
+ * the profiler. */
 //@{
-extern void halide_disable_timer_interrupt(void);
-extern void halide_enable_timer_interrupt(void);
+extern void halide_profiler_lock(struct halide_profiler_state *);
+extern void halide_profiler_unlock(struct halide_profiler_state *);
 //@}
 
 /// \name "Float16" functions
diff --git a/src/runtime/fuchsia_clock.cpp b/src/runtime/fuchsia_clock.cpp
index b532867e0788..9733d6e135bc 100644
--- a/src/runtime/fuchsia_clock.cpp
+++ b/src/runtime/fuchsia_clock.cpp
@@ -30,7 +30,7 @@ WEAK int64_t halide_current_time_ns(void *user_context) {
     return zx_clock_get_monotonic() - halide_reference_clock;
 }
 
-WEAK void halide_sleep_ms(void *user_context, int ms) {
-    zx_nanosleep(zx_deadline_after(ms * 1000));
+WEAK void halide_sleep_us(void *user_context, int us) {
+    zx_nanosleep(zx_deadline_after(us));
 }
 }
diff --git a/src/runtime/hexagon_host.cpp b/src/runtime/hexagon_host.cpp
index 7035741a9fdf..80dcca1dc707 100644
--- a/src/runtime/hexagon_host.cpp
+++ b/src/runtime/hexagon_host.cpp
@@ -275,7 +275,6 @@ WEAK int halide_hexagon_initialize_kernels(void *user_context, void **state_ptr,
     halide_abort_if_false(user_context, state_ptr != nullptr);
 
 #ifdef DEBUG_RUNTIME
-    halide_start_clock(user_context);
     uint64_t t_before = halide_current_time_ns(user_context);
 #endif
 
@@ -478,11 +477,22 @@ WEAK int halide_hexagon_run(void *user_context,
     // get_remote_profiler_func to retrieve the current
     // func. Otherwise leave it alone - the cost of remote running
     // will be billed to the calling Func.
+    halide_profiler_state *s = halide_profiler_get_state();
     if (remote_poll_profiler_state) {
-        halide_profiler_get_state()->get_remote_profiler_state = get_remote_profiler_state;
-        if (remote_profiler_set_current_func) {
-            remote_profiler_set_current_func(halide_profiler_get_state()->current_func);
+        halide_profiler_lock(s);
+        const halide_profiler_instance_state *instance = s->instances;
+        if (instance) {
+            if (instance->next) {
+                halide_profiler_unlock(s);
+                error(user_context) << "Hexagon: profiling multiple pipelines simultaneously is unsupported.";
+                return halide_error_code_cannot_profile_pipeline;
+            }
+            s->get_remote_profiler_state = get_remote_profiler_state;
+            if (remote_profiler_set_current_func) {
+                remote_profiler_set_current_func(instance->current_func);
+            }
         }
+        halide_profiler_unlock(s);
     }
 
     // Call the pipeline on the device side.
@@ -498,7 +508,9 @@ WEAK int halide_hexagon_run(void *user_context,
         return halide_error_code_generic_error;
     }
 
-    halide_profiler_get_state()->get_remote_profiler_state = nullptr;
+    halide_profiler_lock(s);
+    s->get_remote_profiler_state = nullptr;
+    halide_profiler_unlock(s);
 
 #ifdef DEBUG_RUNTIME
     uint64_t t_after = halide_current_time_ns(user_context);
@@ -580,6 +592,7 @@ WEAK int halide_hexagon_device_malloc(void *user_context, halide_buffer_t *buf)
     debug(user_context) << "    allocating buffer of " << (uint64_t)size << " bytes\n";
 
 #ifdef DEBUG_RUNTIME
+    halide_start_clock(user_context);
     uint64_t t_before = halide_current_time_ns(user_context);
 #endif
diff --git a/src/runtime/hexagon_remote/bin/host/libhalide_hexagon_host.so b/src/runtime/hexagon_remote/bin/host/libhalide_hexagon_host.so
index 148388d21adf..ee503bb14686 100755
Binary files a/src/runtime/hexagon_remote/bin/host/libhalide_hexagon_host.so and b/src/runtime/hexagon_remote/bin/host/libhalide_hexagon_host.so differ
diff --git a/src/runtime/hexagon_remote/bin/v65/hexagon_sim_remote b/src/runtime/hexagon_remote/bin/v65/hexagon_sim_remote
index c807c5134edd..e6f36f58084a 100755
Binary files a/src/runtime/hexagon_remote/bin/v65/hexagon_sim_remote and b/src/runtime/hexagon_remote/bin/v65/hexagon_sim_remote differ
diff --git a/src/runtime/hexagon_remote/bin/v65/libhalide_hexagon_remote_skel.so b/src/runtime/hexagon_remote/bin/v65/libhalide_hexagon_remote_skel.so
index 852b143d9adf..7c58f73b8dec 100755
Binary files a/src/runtime/hexagon_remote/bin/v65/libhalide_hexagon_remote_skel.so and b/src/runtime/hexagon_remote/bin/v65/libhalide_hexagon_remote_skel.so differ
diff --git a/src/runtime/hexagon_remote/bin/v65/libsim_qurt.a b/src/runtime/hexagon_remote/bin/v65/libsim_qurt.a
index 032a59aa14a4..f0c666e5bb60 100644
Binary files a/src/runtime/hexagon_remote/bin/v65/libsim_qurt.a and b/src/runtime/hexagon_remote/bin/v65/libsim_qurt.a differ
diff --git a/src/runtime/hexagon_remote/bin/v65/signed_by_debug/libhalide_hexagon_remote_skel.so b/src/runtime/hexagon_remote/bin/v65/signed_by_debug/libhalide_hexagon_remote_skel.so
deleted file mode 100644
index 7d8a61050ba5..000000000000
Binary files a/src/runtime/hexagon_remote/bin/v65/signed_by_debug/libhalide_hexagon_remote_skel.so and /dev/null differ
diff --git a/src/runtime/hexagon_remote/qurt/halide_remote.cpp b/src/runtime/hexagon_remote/qurt/halide_remote.cpp
index 2af8b8606cdb..d110803e1a95 100644
--- a/src/runtime/hexagon_remote/qurt/halide_remote.cpp
+++ b/src/runtime/hexagon_remote/qurt/halide_remote.cpp
@@ -419,23 +419,24 @@ int halide_hexagon_remote_release_library(handle_t module_ptr) {
     return 0;
 }
 
+halide_profiler_instance_state *halide_hexagon_remote_profiler_get_global_instance() {
+    static halide_profiler_instance_state hvx_profiler_instance;
+    return &hvx_profiler_instance;
+}
+
 int halide_hexagon_remote_poll_profiler_state(int *func, int *threads) {
     // Increase the current thread priority to match working threads priorities,
     // so profiler can access the remote state without extra latency.
     qurt_thread_t current_thread_id = qurt_thread_get_id();
     qurt_thread_set_priority(current_thread_id, 100);
-    *func = halide_profiler_get_state()->current_func;
-    *threads = halide_profiler_get_state()->active_threads;
+    *func = halide_hexagon_remote_profiler_get_global_instance()->current_func;
+    *threads = halide_hexagon_remote_profiler_get_global_instance()->active_threads;
     return 0;
 }
 
 int halide_hexagon_remote_profiler_set_current_func(int current_func) {
-    halide_profiler_get_state()->current_func = current_func;
+    halide_hexagon_remote_profiler_get_global_instance()->current_func = current_func;
     return 0;
 }
 
-halide_profiler_state *halide_profiler_get_state() {
-    static halide_profiler_state hvx_profiler_state;
-    return &hvx_profiler_state;
-}
 }  // extern "C"
diff --git a/src/runtime/hexagon_remote/qurt/known_symbols.cpp b/src/runtime/hexagon_remote/qurt/known_symbols.cpp
index 48c60b22fd85..0ed27597769e 100644
--- a/src/runtime/hexagon_remote/qurt/known_symbols.cpp
+++ b/src/runtime/hexagon_remote/qurt/known_symbols.cpp
@@ -105,7 +105,6 @@ void *get_known_symbol(const char *name) {
 
         {"halide_error", (char *)(&halide_error)},
         {"halide_print", (char *)(&halide_print)},
-        {"halide_profiler_get_state", (char *)(&halide_profiler_get_state)},
 
         {"qurt_hvx_lock", (char *)(&qurt_hvx_lock)},
         {"qurt_hvx_unlock", (char *)(&qurt_hvx_unlock)},
diff --git a/src/runtime/hexagon_remote/qurt/sim_remote.cpp b/src/runtime/hexagon_remote/qurt/sim_remote.cpp
index 45b89edc2416..771f7fa02e39 100644
--- a/src/runtime/hexagon_remote/qurt/sim_remote.cpp
+++ b/src/runtime/hexagon_remote/qurt/sim_remote.cpp
@@ -152,12 +152,12 @@ int release_library(handle_t module_ptr) {
 }
 
 extern "C" {
-halide_profiler_state profiler_state;
-int *profiler_current_func_addr = &profiler_state.current_func;
-}
+halide_profiler_instance_state hvx_profiler_instance;
+int *profiler_current_func_addr = &hvx_profiler_instance.current_func;
 
-halide_profiler_state *halide_profiler_get_state() {
-    return (halide_profiler_state *)(&profiler_state);
+halide_profiler_instance_state *halide_hexagon_remote_profiler_get_global_instance() {
+    return &hvx_profiler_instance;
+}
 }
 
 extern "C" {
diff --git a/src/runtime/linux_clock.cpp b/src/runtime/linux_clock.cpp
index 4729b2b2bc6d..dbb8fc186cd4 100644
--- a/src/runtime/linux_clock.cpp
+++ b/src/runtime/linux_clock.cpp
@@ -76,7 +76,7 @@ WEAK int64_t halide_current_time_ns(void *user_context) {
 }
 
 extern int usleep(int);
-WEAK void halide_sleep_ms(void *user_context, int ms) {
-    usleep(ms * 1000);
+WEAK void halide_sleep_us(void *user_context, int us) {
+    usleep(us);
 }
 }
diff --git a/src/runtime/osx_clock.cpp b/src/runtime/osx_clock.cpp
index 9da5f99d1c11..a604a76eaf16 100644
--- a/src/runtime/osx_clock.cpp
+++ b/src/runtime/osx_clock.cpp
@@ -45,7 +45,7 @@ WEAK int64_t halide_current_time_ns(void *user_context) {
 }
 
 extern int usleep(int);
-WEAK void halide_sleep_ms(void *user_context, int ms) {
-    usleep(ms * 1000);
+WEAK void halide_sleep_us(void *user_context, int us) {
+    usleep(us);
 }
 }
diff --git a/src/runtime/posix_clock.cpp b/src/runtime/posix_clock.cpp
index ddf258d3e0b2..7f0359b2ef6f 100644
--- a/src/runtime/posix_clock.cpp
+++ b/src/runtime/posix_clock.cpp
@@ -39,7 +39,7 @@ WEAK int64_t halide_current_time_ns(void *user_context) {
 }
 
 extern int usleep(int);
-WEAK void halide_sleep_ms(void *user_context, int ms) {
-    usleep(ms * 1000);
+WEAK void halide_sleep_us(void *user_context, int us) {
+    usleep(us);
 }
 }
diff --git a/src/runtime/posix_timer_profiler.cpp b/src/runtime/posix_timer_profiler.cpp
index 83dcfdfde2e8..9fde50ee0611 100644
--- a/src/runtime/posix_timer_profiler.cpp
+++ b/src/runtime/posix_timer_profiler.cpp
@@ -50,7 +50,7 @@ WEAK extern "C" void halide_start_timer_chain() {
     halide_profiler_state *s = halide_profiler_get_state();
     itimerval timer_state;
     timer_state.it_interval.tv_sec = 0;
-    timer_state.it_interval.tv_usec = s->sleep_time * 1000.0;
+    timer_state.it_interval.tv_usec = s->sleep_time;
     timer_state.it_value = timer_state.it_interval;
 
     signal(SIGPROF, &profiler_handler);
diff --git a/src/runtime/profiler_common.cpp b/src/runtime/profiler_common.cpp
index ccbe0bf11ecb..a5633898b8a9 100644
--- a/src/runtime/profiler_common.cpp
+++ b/src/runtime/profiler_common.cpp
@@ -10,7 +10,16 @@ extern "C" {
 
 // Returns the address of the global halide_profiler state
 WEAK halide_profiler_state *halide_profiler_get_state() {
-    static halide_profiler_state s = {{{0}}, 1, 0, 0, 0, nullptr, nullptr, nullptr};
+    static halide_profiler_state s = {
+        {{0}},    // The mutex
+        nullptr,  // pipeline stats
+        nullptr,  // sampling thread
+        nullptr,  // running instances
+        nullptr,  // get_remote_profiler_state callback
+        1000,     // Sampling interval in us
+        0         // Flag that tells us to shut down when set to 1
+    };
+
     return &s;
 }
 
@@ -19,10 +28,25 @@ extern "C" void halide_start_timer_chain();
 extern "C" void halide_disable_timer_interrupt();
 extern "C" void halide_enable_timer_interrupt();
 #endif
+
+WEAK void halide_profiler_lock(struct halide_profiler_state *state) {
+#if TIMER_PROFILING
+    halide_disable_timer_interrupt();
+#endif
+    halide_mutex_lock(&state->lock);
+}
+
+WEAK void halide_profiler_unlock(struct halide_profiler_state *state) {
+#if TIMER_PROFILING
+    halide_enable_timer_interrupt();
+#endif
+    halide_mutex_unlock(&state->lock);
+}
 }
 
 namespace Halide {
 namespace Runtime {
+
 namespace Internal {
 
 class LockProfiler {
@@ -31,16 +55,10 @@ class LockProfiler {
 public:
     explicit LockProfiler(halide_profiler_state *s)
         : state(s) {
-#if TIMER_PROFILING
-        halide_disable_timer_interrupt();
-#endif
-        halide_mutex_lock(&state->lock);
+        halide_profiler_lock(s);
    }
     ~LockProfiler() {
-        halide_mutex_unlock(&state->lock);
-#if TIMER_PROFILING
-        halide_enable_timer_interrupt();
-#endif
+        halide_profiler_unlock(state);
     }
 };
 
@@ -64,7 +82,6 @@ WEAK halide_profiler_pipeline_stats *find_or_create_pipeline(const char *pipelin
     }
     p->next = s->pipelines;
     p->name = pipeline_name;
-    p->first_func_id = s->first_free_id;
     p->num_funcs = num_funcs;
     p->runs = 0;
     p->time = 0;
@@ -91,60 +108,48 @@ WEAK halide_profiler_pipeline_stats *find_or_create_pipeline(const char *pipelin
         p->funcs[i].active_threads_numerator = 0;
         p->funcs[i].active_threads_denominator = 0;
     }
-    s->first_free_id += num_funcs;
     s->pipelines = p;
 
     return p;
 }
 
-WEAK void bill_func(halide_profiler_state *s, int func_id, uint64_t time, int active_threads) {
-    halide_profiler_pipeline_stats *p_prev = nullptr;
-    for (halide_profiler_pipeline_stats *p = s->pipelines; p;
-         p = (halide_profiler_pipeline_stats *)(p->next)) {
-        if (func_id >= p->first_func_id && func_id < p->first_func_id + p->num_funcs) {
-            if (p_prev) {
-                // Bubble the pipeline to the top to speed up future queries.
-                p_prev->next = (halide_profiler_pipeline_stats *)(p->next);
-                p->next = s->pipelines;
-                s->pipelines = p;
-            }
-            halide_profiler_func_stats *f = p->funcs + func_id - p->first_func_id;
-            f->time += time;
-            f->active_threads_numerator += active_threads;
-            f->active_threads_denominator += 1;
-            p->time += time;
-            p->samples++;
-            p->active_threads_numerator += active_threads;
-            p->active_threads_denominator += 1;
-            return;
-        }
-        p_prev = p;
-    }
-    // Someone must have called reset_state while a kernel was running. Do nothing.
+WEAK void update_running_instance(halide_profiler_instance_state *instance, uint64_t time) {
+    halide_profiler_func_stats *f = instance->funcs + instance->current_func;
+    f->time += time;
+    f->active_threads_numerator += instance->active_threads;
+    f->active_threads_denominator += 1;
+    instance->samples++;
+    instance->active_threads_numerator += instance->active_threads;
+    instance->active_threads_denominator += 1;
+    instance->billed_time += time;
 }
 
 extern "C" WEAK int halide_profiler_sample(struct halide_profiler_state *s, uint64_t *prev_t) {
-    int func, active_threads;
+    if (!s->instances) {
+        // No Halide code is currently running
+        return 0;
+    }
+    halide_profiler_instance_state *instance = s->instances;
+
     if (s->get_remote_profiler_state) {
         // Execution has disappeared into remote code running
         // on an accelerator (e.g. Hexagon DSP)
-        s->get_remote_profiler_state(&func, &active_threads);
-    } else {
-        func = s->current_func;
-        active_threads = s->active_threads;
+
+        // It shouldn't be possible to get into a state where multiple
+        // pipelines are being profiled and one or both of them uses
+        // get_remote_profiler_state.
+        halide_debug_assert(nullptr, s->instances->next == nullptr);
+
+        s->get_remote_profiler_state(&(instance->current_func), &(instance->active_threads));
     }
+
     uint64_t t_now = halide_current_time_ns(nullptr);
-    if (func == halide_profiler_please_stop) {
-#if TIMER_PROFILING
-        s->sampling_thread = nullptr;
-#endif
-        return -1;
-    } else if (func >= 0) {
-        // Assume all time since I was last awake is due to
-        // the currently running func.
-        bill_func(s, func, t_now - *prev_t, active_threads);
+    uint64_t dt = t_now - *prev_t;
+    while (instance) {
+        update_running_instance(instance, dt);
+        instance = instance->next;
     }
     *prev_t = t_now;
-    return s->sleep_time;
+    return 0;
 }
 
 WEAK void sampling_profiler_thread(void *) {
@@ -153,19 +158,17 @@ WEAK void sampling_profiler_thread(void *) {
     // grab the lock
     halide_mutex_lock(&s->lock);
 
-    while (s->current_func != halide_profiler_please_stop) {
-        uint64_t t1 = halide_current_time_ns(nullptr);
-        uint64_t t = t1;
-        while (true) {
-            int sleep_ms = halide_profiler_sample(s, &t);
-            if (sleep_ms < 0) {
-                break;
-            }
-            // Release the lock, sleep, reacquire.
-            halide_mutex_unlock(&s->lock);
-            halide_sleep_ms(nullptr, sleep_ms);
-            halide_mutex_lock(&s->lock);
+    uint64_t t1 = halide_current_time_ns(nullptr);
+    uint64_t t = t1;
+    while (!s->shutdown || s->instances) {
+        int err = halide_profiler_sample(s, &t);
+        if (err < 0) {
+            break;
         }
+        // Release the lock, sleep, reacquire.
+        halide_mutex_unlock(&s->lock);
+        halide_sleep_us(nullptr, s->sleep_time);
+        halide_mutex_lock(&s->lock);
     }
 
     halide_mutex_unlock(&s->lock);
@@ -209,43 +212,135 @@ WEAK halide_profiler_pipeline_stats *halide_profiler_get_pipeline_state(const ch
     return nullptr;
 }
 
-// Returns a token identifying this pipeline instance.
-WEAK int halide_profiler_pipeline_start(void *user_context,
+// Populates the instance state struct
+WEAK int halide_profiler_instance_start(void *user_context,
                                         const char *pipeline_name,
                                         int num_funcs,
-                                        const uint64_t *func_names) {
+                                        const uint64_t *func_names,
+                                        halide_profiler_instance_state *instance) {
+    // Tell the instance where we stashed the per-func state - just after the
+    // instance itself.
+
+    // First check that the layout agrees with the amount of stack space
+    // allocated in the pipeline
+    static_assert((sizeof(halide_profiler_func_stats) & 7) == 0);
+    halide_profiler_func_stats *funcs = (halide_profiler_func_stats *)(instance + 1);
+
+    // Zero initialize the instance and func state
+    memset(instance, 0, (uint8_t *)(funcs + num_funcs) - (uint8_t *)instance);
+
+    instance->funcs = funcs;
+
     halide_profiler_state *s = halide_profiler_get_state();
+    {
+        LockProfiler lock(s);
+
+        // Push this instance onto the running instances list
+        if (s->instances) {
+            s->instances->prev_next = &(instance->next);
+
+            // If there was something already running using the remote polling
+            // method, we can't profile something else at the same time.
+            if (s->get_remote_profiler_state) {
+                error(user_context) << "Cannot profile pipeline " << pipeline_name
+                                    << " while pipeline " << s->instances->pipeline_stats->name
+                                    << " is running, because it is running on a device.";
+                return halide_error_code_cannot_profile_pipeline;
+            }
+        }
+        instance->next = s->instances;
+        instance->prev_next = &(s->instances);
+        s->instances = instance;
+
+        // Find or create the pipeline statistics for this pipeline.
+        halide_profiler_pipeline_stats *p =
+            find_or_create_pipeline(pipeline_name, num_funcs, func_names);
+        if (!p) {
+            // Allocating space to track the statistics failed.
+            return halide_error_out_of_memory(user_context);
+        }
 
-    LockProfiler lock(s);
+        // Tell the instance the pipeline to which it belongs.
+        instance->pipeline_stats = p;
 
-    if (!s->sampling_thread) {
+        if (!s->sampling_thread) {
 #if TIMER_PROFILING
-        halide_start_clock(user_context);
-        halide_start_timer_chain();
-        s->sampling_thread = (halide_thread *)1;
+            halide_start_clock(user_context);
+            halide_start_timer_chain();
+            s->sampling_thread = (halide_thread *)1;
 #else
-        halide_start_clock(user_context);
-        s->sampling_thread = halide_spawn_thread(sampling_profiler_thread, nullptr);
+            halide_start_clock(user_context);
+            s->sampling_thread = halide_spawn_thread(sampling_profiler_thread, nullptr);
 #endif
+        }
     }
 
-    halide_profiler_pipeline_stats *p =
-        find_or_create_pipeline(pipeline_name, num_funcs, func_names);
-    if (!p) {
-        // Allocating space to track the statistics failed.
-        return halide_error_out_of_memory(user_context);
+    instance->start_time = halide_current_time_ns(user_context);
+
+    return 0;
+}
+
+WEAK int halide_profiler_instance_end(void *user_context, halide_profiler_instance_state *instance) {
+    uint64_t end_time = halide_current_time_ns(user_context);
+    halide_profiler_state *s = halide_profiler_get_state();
+    LockProfiler lock(s);
+
+    if (instance->should_collect_statistics) {
+
+        uint64_t true_duration = end_time - instance->start_time;
+        halide_profiler_pipeline_stats *p = instance->pipeline_stats;
+
+        // Retire the instance, accumulating statistics onto the statistics for this
+        // pipeline. Fields related to memory usage are tracked in the pipeline stats.
+        p->samples += instance->samples;
+        p->time += true_duration;
+        p->active_threads_numerator += instance->active_threads_numerator;
+        p->active_threads_denominator += instance->active_threads_denominator;
+        p->memory_total += instance->memory_total;
+        p->memory_peak = max(p->memory_peak, instance->memory_peak);
+        p->num_allocs += instance->num_allocs;
+        p->runs++;
+
+        // Compute an adjustment factor to account for the fact that the billed
+        // time is not equal to the duration between start and end calls. We
+        // could avoid this by just making sure there is a sampling event at the
+        // start and end of the pipeline, but this would overcount whatever the
+        // last value of current_func is at the end of the pipeline, and is
+        // likely to undercount time spent in the first func in a
+        // pipeline. Sampling events need to happen independently (in the random
+        // variable sense) of any changes in current_func.
+        double adjustment = 1;
+        if (instance->billed_time > 0) {
+            adjustment = (double)true_duration / instance->billed_time;
+        }
+
+        for (int f = 0; f < p->num_funcs; f++) {
+            halide_profiler_func_stats *func = p->funcs + f;
+            const halide_profiler_func_stats *instance_func = instance->funcs + f;
+            // clang-tidy wants me to use a c standard library function to do
+            // the rounding below, but those aren't guaranteed to be available
+            // when compiling the runtime.
+            func->time += (uint64_t)(instance_func->time * adjustment + 0.5);  // NOLINT
+            func->active_threads_numerator += instance_func->active_threads_numerator;
+            func->active_threads_denominator += instance_func->active_threads_denominator;
+            func->num_allocs += instance_func->num_allocs;
+            func->stack_peak = max(func->stack_peak, instance_func->stack_peak);
+            func->memory_peak = max(func->memory_peak, instance_func->memory_peak);
+            func->memory_total += instance_func->memory_total;
+        }
     }
 
-    p->runs++;
-    return p->first_func_id;
+    // Remove myself from the doubly-linked list
+    *(instance->prev_next) = instance->next;
+    if (instance->next) {
+        instance->next->prev_next = instance->prev_next;
+    }
+    return 0;
 }
 
 WEAK void halide_profiler_stack_peak_update(void *user_context,
-                                            void *pipeline_state,
+                                            halide_profiler_instance_state *instance,
                                             uint64_t *f_values) {
-    halide_profiler_pipeline_stats *p_stats = (halide_profiler_pipeline_stats *)pipeline_state;
-    halide_abort_if_false(user_context, p_stats != nullptr);
-
     // Note: Update to the counter is done without grabbing the state's lock to
     // reduce lock contention. One potential issue is that other call that frees the
     // pipeline and function stats structs may be running in parallel. However, the
     // current desctructor (called on profiler shutdown) does not free the structs
     // unless user specifically calls halide_profiler_reset().
 
     // Update per-func memory stats
-    for (int i = 0; i < p_stats->num_funcs; ++i) {
+    for (int i = 0; i < instance->pipeline_stats->num_funcs; ++i) {
         if (f_values[i] != 0) {
-            sync_compare_max_and_swap(&(p_stats->funcs[i]).stack_peak, f_values[i]);
+            sync_compare_max_and_swap(&(instance->funcs[i]).stack_peak, f_values[i]);
         }
     }
 }
 
 WEAK void halide_profiler_memory_allocate(void *user_context,
-                                          void *pipeline_state,
+                                          halide_profiler_instance_state *instance,
                                           int func_id,
                                           uint64_t incr) {
     using namespace Halide::Runtime::Internal::Synchronization;
@@ -272,34 +367,34 @@ WEAK void halide_profiler_memory_allocate(void *user_context,
         return;
     }
 
-    halide_profiler_pipeline_stats *p_stats = (halide_profiler_pipeline_stats *)pipeline_state;
-    halide_abort_if_false(user_context, p_stats != nullptr);
+    halide_abort_if_false(user_context, instance != nullptr);
     halide_abort_if_false(user_context, func_id >= 0);
-    halide_abort_if_false(user_context, func_id < p_stats->num_funcs);
+    halide_abort_if_false(user_context, func_id < instance->pipeline_stats->num_funcs);
 
-    halide_profiler_func_stats *f_stats = &p_stats->funcs[func_id];
+    halide_profiler_func_stats *func = &instance->funcs[func_id];
 
     // Note: Update to the counter is done without grabbing the state's lock to
-    // reduce lock contention. One potential issue is that other call that frees the
-    // pipeline and function stats structs may be running in parallel. However, the
-    // current desctructor (called on profiler shutdown) does not free the structs
-    // unless user specifically calls halide_profiler_reset().
-
-    // Update per-pipeline memory stats
-    atomic_add_fetch_sequentially_consistent(&p_stats->num_allocs, 1);
-    atomic_add_fetch_sequentially_consistent(&p_stats->memory_total, incr);
-    uint64_t p_mem_current = atomic_add_fetch_sequentially_consistent(&p_stats->memory_current, incr);
-    sync_compare_max_and_swap(&p_stats->memory_peak, p_mem_current);
+    // reduce lock contention. One potential issue is that another call that
+    // frees the pipeline and function stats structs may be running in
+    // parallel. However, the current destructor (called on profiler shutdown)
+    // does not free the structs unless user specifically calls
+    // halide_profiler_reset().
+
+    // Update per-instance memory stats
+    atomic_add_fetch_sequentially_consistent(&instance->num_allocs, 1);
+    atomic_add_fetch_sequentially_consistent(&instance->memory_total, incr);
+    uint64_t p_mem_current = atomic_add_fetch_sequentially_consistent(&instance->memory_current, incr);
+    sync_compare_max_and_swap(&instance->memory_peak, p_mem_current);
 
     // Update per-func memory stats
-    atomic_add_fetch_sequentially_consistent(&f_stats->num_allocs, 1);
-    atomic_add_fetch_sequentially_consistent(&f_stats->memory_total, incr);
-    uint64_t f_mem_current = atomic_add_fetch_sequentially_consistent(&f_stats->memory_current, incr);
-    sync_compare_max_and_swap(&f_stats->memory_peak, f_mem_current);
+    atomic_add_fetch_sequentially_consistent(&func->num_allocs, 1);
+    atomic_add_fetch_sequentially_consistent(&func->memory_total, incr);
+    uint64_t f_mem_current = atomic_add_fetch_sequentially_consistent(&func->memory_current, incr);
+    sync_compare_max_and_swap(&func->memory_peak, f_mem_current);
 }
 
 WEAK void halide_profiler_memory_free(void *user_context,
-                                      void *pipeline_state,
+                                      halide_profiler_instance_state *instance,
                                       int func_id,
                                       uint64_t decr) {
     using namespace Halide::Runtime::Internal::Synchronization;
@@ -310,12 +405,11 @@ WEAK void halide_profiler_memory_free(void *user_context,
         return;
     }
 
-    halide_profiler_pipeline_stats *p_stats = (halide_profiler_pipeline_stats *)pipeline_state;
-    halide_abort_if_false(user_context, p_stats != nullptr);
+    halide_abort_if_false(user_context, instance != nullptr);
     halide_abort_if_false(user_context, func_id >= 0);
-    halide_abort_if_false(user_context, func_id < p_stats->num_funcs);
+    halide_abort_if_false(user_context, func_id < instance->pipeline_stats->num_funcs);
 
-    halide_profiler_func_stats *f_stats = &p_stats->funcs[func_id];
+    halide_profiler_func_stats *func = &instance->funcs[func_id];
 
     // Note: Update to the counter is done without grabbing the state's lock to
     // reduce lock contention. One potential issue is that other call that frees the
     // pipeline and function stats structs may be running in parallel. However, the
     // current desctructor (called on profiler shutdown) does not free the structs
     // unless user specifically calls halide_profiler_reset().
 
     // Update per-pipeline memory stats
-    atomic_sub_fetch_sequentially_consistent(&p_stats->memory_current, decr);
+    atomic_sub_fetch_sequentially_consistent(&instance->memory_current, decr);
 
     // Update per-func memory stats
-    atomic_sub_fetch_sequentially_consistent(&f_stats->memory_current, decr);
+    atomic_sub_fetch_sequentially_consistent(&func->memory_current, decr);
 }
 
 WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_state *s) {
@@ -360,7 +454,7 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
 
     for (halide_profiler_pipeline_stats *p = s->pipelines; p;
         p = (halide_profiler_pipeline_stats *)(p->next)) {
-        float t = p->time / 1000000.0f;
+        float total_time = p->time / 1000000.0f;
         if (!p->runs) {
             continue;
         }
@@ -368,10 +462,10 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
         bool serial = p->active_threads_numerator == p->active_threads_denominator;
         float threads = p->active_threads_numerator / (p->active_threads_denominator + 1e-10);
         sstr << p->name << "\n"
-             << " total time: " << t << " ms"
+             << " total time: " << total_time << " ms"
             << "  samples: " << p->samples
             << "  runs: " << p->runs
-             << "  time/run: " << t / p->runs << " ms\n";
+             << "  time per run: " << total_time / p->runs << " ms\n";
         if (!serial) {
             sstr << " average threads used: " << threads << "\n";
         }
@@ -423,12 +517,18 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
         for (int i = 0; i < p->num_funcs; i++) {
             halide_profiler_func_stats *fs = p->funcs + i;
 
-            // The first func is always a catch-all overhead
+            // The first id is always a catch-all overhead
             // slot. Only report overhead time if it's non-zero
             if (i == 0 && fs->time == 0) {
                 continue;
             }
 
+            // These two ids are malloc and free. Don't print them if there
+            // were no heap allocations.
+            if ((i == 2 || i == 3) && p->num_allocs == 0) {
+                continue;
+            }
+
             f_stats[f_stats_count++] = fs;
         }
 
@@ -606,7 +706,6 @@ WEAK void halide_profiler_reset_unlocked(halide_profiler_state *s) {
         free(p->funcs);
         free(p);
     }
-    s->first_free_id = 0;
 }
 
 WEAK void halide_profiler_reset() {
@@ -616,6 +715,7 @@ WEAK void halide_profiler_reset() {
     // state without grabbing the global profiler state's lock.
     halide_profiler_state *s = halide_profiler_get_state();
     LockProfiler lock(s);
+    halide_abort_if_false(nullptr, s->instances == nullptr);
     halide_profiler_reset_unlocked(s);
 }
 
@@ -624,12 +724,16 @@ __attribute__((destructor))
 #endif
 WEAK void
 halide_profiler_shutdown() {
+    using namespace Halide::Runtime::Internal::Synchronization;
+
     halide_profiler_state *s = halide_profiler_get_state();
     if (!s->sampling_thread) {
         return;
     }
 
-    s->current_func = halide_profiler_please_stop;
+    int one = 1;
+    atomic_store_relaxed(&(s->shutdown), &one);
+
 #if TIMER_PROFILING
     // Wait for timer interrupt to fire and notice things are shutdown.
     // volatile should be the right tool to use to wait for storage to be
@@ -643,7 +747,9 @@ halide_profiler_shutdown() {
     s->sampling_thread = nullptr;
 #endif
 
-    s->current_func = halide_profiler_outside_of_halide;
+    // The join_thread should have waited for any running instances to
+    // terminate.
+    halide_debug_assert(nullptr, s->instances == nullptr);
 
     // Print results. No need to lock anything because we just shut
     // down the thread.
@@ -664,7 +770,7 @@ WEAK void halide_windows_profiler_shutdown() {
     // sections in a static destructor as it may run after threads
    // have been killed by the OS. Furthermore, may calls, even things
    // like EnterCriticalSection may be set to kill the process if
-    // called during process shutdown. Hence kthis routine doesn't attmept
+    // called during process shutdown. Hence this routine doesn't attempt
    // to clean up state as the destructor does on other platforms.
 
     // Print results. Avoid locking as it will cause problems and
@@ -674,8 +780,4 @@ WEAK void halide_windows_profiler_shutdown() {
 #endif
 }  // namespace
 
-WEAK void halide_profiler_pipeline_end(void *user_context, void *state) {
-    ((halide_profiler_state *)state)->current_func = halide_profiler_outside_of_halide;
-}
-
 }  // extern "C"
diff --git a/src/runtime/profiler_inlined.cpp b/src/runtime/profiler_inlined.cpp
index f31289cbcb36..fc5bda1b1bcc 100644
--- a/src/runtime/profiler_inlined.cpp
+++ b/src/runtime/profiler_inlined.cpp
@@ -3,21 +3,27 @@
 
 extern "C" {
 
-WEAK_INLINE int halide_profiler_set_current_func(halide_profiler_state *state, int pipeline, int func, int *sampling_token) {
+WEAK_INLINE int halide_profiler_set_current_func(halide_profiler_instance_state *instance, int func, int *sampling_token) {
     if (sampling_token == nullptr || *sampling_token == 0) {
 
         // Use empty volatile asm blocks to prevent code motion. Otherwise
         // llvm reorders or elides the stores.
-        volatile int *ptr = &(state->current_func);
+        volatile int *ptr = &(instance->current_func);
         // clang-format off
         asm volatile ("":::);
-        *ptr = pipeline + func;
+        *ptr = func;
         asm volatile ("":::);
         // clang-format on
     }
     return 0;
 }
 
+// Called once we're sure we're not in bounds query code
+WEAK_INLINE int halide_profiler_enable_instance(halide_profiler_instance_state *instance) {
+    instance->should_collect_statistics = 1;
+    return 0;
+}
+
 // Invariant: shared xor local, and both are either 0 or 1. 0 means acquired.
 WEAK_INLINE int halide_profiler_acquire_sampling_token(int32_t *shared, int32_t *local) {
     using namespace Halide::Runtime::Internal::Synchronization;
@@ -42,15 +48,15 @@ WEAK_INLINE int halide_profiler_init_sampling_token(int32_t *sampling_token, int
     return 0;
 }
 
-WEAK_INLINE int halide_profiler_incr_active_threads(halide_profiler_state *state) {
+WEAK_INLINE int halide_profiler_incr_active_threads(halide_profiler_instance_state *instance) {
     using namespace Halide::Runtime::Internal::Synchronization;
-    return atomic_fetch_add_sequentially_consistent(&(state->active_threads), 1);
+    return atomic_fetch_add_sequentially_consistent(&(instance->active_threads), 1);
 }
 
-WEAK_INLINE int halide_profiler_decr_active_threads(halide_profiler_state *state) {
+WEAK_INLINE int halide_profiler_decr_active_threads(halide_profiler_instance_state *instance) {
     using namespace Halide::Runtime::Internal::Synchronization;
-    return atomic_fetch_sub_sequentially_consistent(&(state->active_threads), 1);
+    return atomic_fetch_sub_sequentially_consistent(&(instance->active_threads), 1);
 }
 }
diff --git a/src/runtime/runtime_api.cpp b/src/runtime/runtime_api.cpp
index 7955e8749df7..4ce1092264c9 100644
--- a/src/runtime/runtime_api.cpp
+++ b/src/runtime/runtime_api.cpp
@@ -165,9 +165,10 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = {
     (void *)&halide_print,
     (void *)&halide_profiler_get_pipeline_state,
     (void *)&halide_profiler_get_state,
+    (void *)&halide_profiler_instance_start,
+    (void *)&halide_profiler_instance_end,
     (void *)&halide_profiler_memory_allocate,
     (void *)&halide_profiler_memory_free,
-    (void *)&halide_profiler_pipeline_start,
     (void *)&halide_profiler_report,
     (void *)&halide_profiler_reset,
     (void *)&halide_profiler_stack_peak_update,
@@ -195,7 +196,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = {
     (void *)&halide_set_trace_file,
     (void *)&halide_shutdown_thread_pool,
     (void *)&halide_shutdown_trace,
-    (void *)&halide_sleep_ms,
+    (void *)&halide_sleep_us,
     (void *)&halide_spawn_thread,
     (void *)&halide_start_clock,
     (void *)&halide_start_timer_chain,
diff --git a/src/runtime/runtime_atomics.h b/src/runtime/runtime_atomics.h
index 5c82e0a66008..61139a622d75 100644
--- a/src/runtime/runtime_atomics.h
+++ b/src/runtime/runtime_atomics.h
@@ -128,7 +128,8 @@ ALWAYS_INLINE uintptr_t atomic_or_fetch_relaxed(uintptr_t *addr, uintptr_t val)
     return __sync_or_and_fetch(addr, val);
 }
 
-ALWAYS_INLINE void atomic_store_relaxed(uintptr_t *addr, uintptr_t *val) {
+template<typename T>
+ALWAYS_INLINE void atomic_store_relaxed(T *addr, T *val) {
     *addr = *val;
 }
 
@@ -247,7 +248,8 @@ ALWAYS_INLINE uintptr_t atomic_or_fetch_relaxed(uintptr_t *addr, uintptr_t val)
     return __atomic_or_fetch(addr, val, __ATOMIC_RELAXED);
 }
 
-ALWAYS_INLINE void atomic_store_relaxed(uintptr_t *addr, uintptr_t *val) {
+template<typename T>
+ALWAYS_INLINE void atomic_store_relaxed(T *addr, T *val) {
     __atomic_store(addr, val, __ATOMIC_RELAXED);
 }
 
diff --git a/src/runtime/runtime_internal.h b/src/runtime/runtime_internal.h
index 8df9dcb8eb2c..b42728be27e6 100644
--- a/src/runtime/runtime_internal.h
+++ b/src/runtime/runtime_internal.h
@@ -153,28 +153,35 @@ WEAK void *halide_get_library_symbol(void *lib, const char *name);
 WEAK int halide_start_clock(void *user_context);
 WEAK int64_t halide_current_time_ns(void *user_context);
-WEAK void halide_sleep_ms(void *user_context, int ms);
+WEAK void halide_sleep_us(void *user_context, int us);
 WEAK void halide_device_free_as_destructor(void *user_context, void *obj);
 WEAK void halide_device_and_host_free_as_destructor(void *user_context, void *obj);
 WEAK void halide_device_host_nop_free(void *user_context, void *obj);
 
-// The pipeline_state is declared as void* type since halide_profiler_pipeline_stats
-// is defined inside HalideRuntime.h which includes this header file.
+struct halide_profiler_instance_state;
 WEAK void halide_profiler_stack_peak_update(void *user_context,
-                                            void *pipeline_state,
+                                            halide_profiler_instance_state *instance,
                                             uint64_t *f_values);
 WEAK void halide_profiler_memory_allocate(void *user_context,
-                                          void *pipeline_state,
+                                          halide_profiler_instance_state *instance,
                                           int func_id,
                                           uint64_t incr);
 WEAK void halide_profiler_memory_free(void *user_context,
-                                      void *pipeline_state,
+                                      halide_profiler_instance_state *instance,
                                       int func_id,
                                       uint64_t decr);
-WEAK int halide_profiler_pipeline_start(void *user_context,
+WEAK int halide_profiler_instance_start(void *user_context,
                                         const char *pipeline_name,
                                         int num_funcs,
-                                        const uint64_t *func_names);
+                                        const uint64_t *func_names,
+                                        halide_profiler_instance_state *instance);
+WEAK int halide_profiler_instance_end(void *user_context,
+                                      halide_profiler_instance_state *instance);
+
+WEAK void halide_start_timer_chain();
+WEAK void halide_disable_timer_interrupt();
+WEAK void halide_enable_timer_interrupt();
+
 WEAK int halide_host_cpu_count();
 WEAK int halide_device_and_host_malloc(void *user_context, struct halide_buffer_t *buf,
diff --git a/src/runtime/windows_clock.cpp b/src/runtime/windows_clock.cpp
index 717681070c68..94609f63642c 100644
--- a/src/runtime/windows_clock.cpp
+++ b/src/runtime/windows_clock.cpp
@@ -37,7 +37,11 @@ WEAK int64_t halide_current_time_ns(void *user_context) {
     return (int64_t)(ns_per_tick * clock);
 }
 
-WEAK void halide_sleep_ms(void *user_context, int ms) {
-    Sleep(ms);
+WEAK void halide_sleep_us(void *user_context, int us) {
+    // Unfortunately you can't sleep on windows in microsecond amounts. The
+    // below call results in calling Sleep(0) for times less than 1 ms, which is
+    // documented to yield the rest of the time slice. If there is no other
+    // thread waiting to run, it may get rescheduled immediately.
+    Sleep(us / 1000);
 }
 }
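
Note on the empty-asm idiom in profiler_inlined.cpp above: the technique is easy to get wrong, so here is a minimal standalone sketch of it, assuming a gcc/clang-compatible compiler. The names record_func and g_current_func are illustrative stand-ins for instance->current_func, not part of this patch.

    #include <cstdio>

    // Stand-in for the per-instance current_func slot in the patch.
    static volatile int g_current_func = 0;

    static inline void record_func(int func) {
        volatile int *ptr = &g_current_func;
        // The asm blocks emit no instructions, but because they are volatile
        // the compiler may not move or delete the store across them, so the
        // sampling thread observes a genuine write for every call.
        asm volatile("":::);
        *ptr = func;
        asm volatile("":::);
    }

    int main() {
        record_func(42);
        printf("current func id: %d\n", g_current_func);
        return 0;
    }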
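Why atomic_store_relaxed in runtime_atomics.h had to become a template: the shutdown flag written in halide_profiler_shutdown is an int, and the old uintptr_t-only overload would not compile at that call site. A sketch under that assumption (profiler_state_sketch is a stand-in, not the real halide_profiler_state layout):

    template<typename T>
    inline void atomic_store_relaxed(T *addr, T *val) {
        // Generic gcc/clang builtin; works for any trivially-copyable T.
        __atomic_store(addr, val, __ATOMIC_RELAXED);
    }

    struct profiler_state_sketch {
        int shutdown;  // stand-in for the flag polled by the sampling thread
    };

    void request_shutdown(profiler_state_sketch *s) {
        int one = 1;
        // Mirrors the call added in halide_profiler_shutdown; T deduces to
        // int, which the old (uintptr_t *, uintptr_t *) signature rejected.
        atomic_store_relaxed(&s->shutdown, &one);
    }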
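On the halide_sleep_ms to halide_sleep_us rename: the windows_clock.cpp version above can only approximate microsecond sleeps via Sleep(us / 1000). For contrast, a POSIX implementation can honor the finer granularity directly. This is a hedged sketch of what that might look like, not the actual posix-side change from this patch:

    #include <time.h>

    // Hypothetical POSIX counterpart of the Windows version above.
    extern "C" void halide_sleep_us_sketch(void *user_context, int us) {
        struct timespec ts;
        ts.tv_sec = us / 1000000;
        ts.tv_nsec = (long)(us % 1000000) * 1000;  // microseconds -> nanoseconds
        // nanosleep may return early on EINTR; a sampling loop tolerates that.
        nanosleep(&ts, nullptr);
    }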