diff --git a/.lldbinit b/.lldbinit new file mode 100644 index 000000000000..a07959c1b025 --- /dev/null +++ b/.lldbinit @@ -0,0 +1 @@ +command script import ./tools/lldbhalide.py diff --git a/src/IRPrinter.cpp b/src/IRPrinter.cpp index a42431f232d0..ac203e74dc62 100644 --- a/src/IRPrinter.cpp +++ b/src/IRPrinter.cpp @@ -567,6 +567,11 @@ void IRPrinter::print(const Stmt &ir) { ir.accept(this); } +void IRPrinter::print_summary(const Stmt &ir) { + ScopedValue old(is_summary, true); + ir.accept(this); +} + void IRPrinter::print_list(const std::vector<Expr> &exprs) { for (size_t i = 0; i < exprs.size(); i++) { print_no_parens(exprs[i]); @@ -865,7 +870,9 @@ void IRPrinter::visit(const Let *op) { stream << "let " << op->name << " = "; print(op->value); stream << " in "; - print(op->body); + if (!is_summary) { + print(op->body); + } close(); } @@ -875,7 +882,9 @@ void IRPrinter::visit(const LetStmt *op) { print_no_parens(op->value); stream << "\n"; - print(op->body); + if (!is_summary) { + print(op->body); + } } void IRPrinter::visit(const AssertStmt *op) { @@ -905,13 +914,9 @@ void IRPrinter::visit(const For *op) { print_no_parens(op->min); stream << ", "; print_no_parens(op->extent); - stream << ") {\n"; + stream << ") "; - indent++; - print(op->body); - indent--; - - stream << get_indent() << "}\n"; + print_braced_stmt(op->body); } void IRPrinter::visit(const Acquire *op) { @@ -919,11 +924,8 @@ void IRPrinter::visit(const Acquire *op) { print_no_parens(op->semaphore); stream << ", "; print_no_parens(op->count); - stream << ") {\n"; - indent++; - print(op->body); - indent--; - stream << get_indent() << "}\n"; + stream << ") "; + print_braced_stmt(op->body, 1); } void IRPrinter::print_lets(const Let *let) { @@ -932,7 +934,9 @@ void IRPrinter::print_lets(const Let *let) { stream << "let " << let->name << " = "; print_no_parens(let->value); stream << " in\n"; - if (const Let *next = let->body.as<Let>()) { + if (is_summary) { + stream << get_indent() << "...\n"; + } else if (const Let 
*next = let->body.as<Let>()) { print_lets(next); } else { stream << get_indent(); @@ -941,6 +945,19 @@ void IRPrinter::print_lets(const Let *let) { } } +void IRPrinter::print_braced_stmt(const Stmt &stmt, int extra_indent) { + if (is_summary) { + stream << "{ ... }\n"; + return; + } + + stream << "{\n"; + indent += extra_indent; + print(stmt); + indent -= extra_indent; + stream << get_indent() << "}\n"; +} + void IRPrinter::visit(const Store *op) { stream << get_indent(); const bool has_pred = !is_const_one(op->predicate); @@ -1038,7 +1055,10 @@ void IRPrinter::visit(const Allocate *op) { stream << get_indent() << " custom_delete { " << op->free_function << "(" << op->name << "); }"; } stream << "\n"; - print(op->body); + + if (!is_summary) { + print(op->body); + } } void IRPrinter::visit(const Free *op) { @@ -1067,13 +1087,9 @@ void IRPrinter::visit(const Realize *op) { stream << " if "; print(op->condition); } - stream << " {\n"; - - indent++; - print(op->body); - indent--; - stream << get_indent() << "}\n"; + stream << " "; + print_braced_stmt(op->body); } void IRPrinter::visit(const Prefetch *op) { @@ -1102,12 +1118,16 @@ void IRPrinter::visit(const Prefetch *op) { indent--; stream << get_indent() << "}\n"; } - print(op->body); + if (!is_summary) { + print(op->body); + } } void IRPrinter::visit(const Block *op) { - print(op->first); - print(op->rest); + if (!is_summary) { + print(op->first); + print(op->rest); + } } void IRPrinter::visit(const Fork *op) { @@ -1121,14 +1141,23 @@ void IRPrinter::visit(const Fork *op) { stmts.push_back(rest); stream << get_indent() << "fork "; - for (const Stmt &s : stmts) { - stream << "{\n"; - indent++; - print(s); - indent--; - stream << get_indent() << "} "; + if (is_summary) { + stream << "[" << stmts.size(); + if (stmts.size() == 1) { + stream << " child]"; + } else { + stream << " children]"; + } + } else { + for (const Stmt &s : stmts) { + stream << "{\n"; + indent++; + print(s); + indent--; + stream << get_indent() << "} "; + 
} + stream << "\n"; } - stream << "\n"; } void IRPrinter::visit(const IfThenElse *op) { @@ -1209,32 +1238,43 @@ void IRPrinter::visit(const VectorReduce *op) { } void IRPrinter::visit(const Atomic *op) { + stream << get_indent(); + if (op->mutex_name.empty()) { - stream << get_indent() << "atomic (" - << op->producer_name << ") {\n"; + stream << "atomic (" << op->producer_name << ") "; } else { - stream << get_indent() << "atomic (" - << op->producer_name << ", " - << op->mutex_name << ") {\n"; + stream << "atomic (" << op->producer_name << ", " << op->mutex_name << ") "; } - indent += 2; - print(op->body); - indent -= 2; - stream << get_indent() << "}\n"; + + print_braced_stmt(op->body); } void IRPrinter::visit(const HoistedStorage *op) { if (op->name.empty()) { - stream << get_indent() << "hoisted_storage {\n"; + stream << get_indent() << "hoisted_storage "; } else { - stream << get_indent() << "hoisted_storage ("; - stream << op->name; - stream << ") {\n"; + stream << get_indent() << "hoisted_storage (" << op->name << ") "; } - indent += 2; - print(op->body); - indent -= 2; - stream << get_indent() << "}\n"; + + print_braced_stmt(op->body); +} + +std::string lldb_string(const Expr &ir) { + std::stringstream s{}; + IRPrinter p(s); + p.print_no_parens(ir); + return s.str(); +} + +std::string lldb_string(const Internal::BaseExprNode *n) { + return lldb_string(Expr(n)); +} + +std::string lldb_string(const Stmt &ir) { + std::stringstream s{}; + IRPrinter p(s); + p.print_summary(ir); + return s.str(); } } // namespace Internal diff --git a/src/IRPrinter.h b/src/IRPrinter.h index 48afef8603d3..b7b5084c1eff 100644 --- a/src/IRPrinter.h +++ b/src/IRPrinter.h @@ -134,6 +134,9 @@ class IRPrinter : public IRVisitor { /** emit a statement on the output stream */ void print(const Stmt &); + /** emit a statement summary on the output stream */ + void print_summary(const Stmt &); + /** emit a comma delimited list of exprs, without any leading or * trailing punctuation. 
*/ void print_list(const std::vector<Expr> &exprs); @@ -157,6 +160,10 @@ class IRPrinter : public IRVisitor { * surrounding set of parens. */ bool implicit_parens = false; + /** Print only a summary of a statement, with sub-statements replaced by + * ellipses (...). */ + bool is_summary = false; + /** Either emits "(" or "", depending on the value of implicit_parens */ void open(); @@ -170,6 +177,9 @@ class IRPrinter : public IRVisitor { /** A helper for printing a chain of lets with line breaks */ void print_lets(const Let *let); + /** A helper for printing a braced statement */ + void print_braced_stmt(const Stmt &, int extra_indent = 2); + void visit(const IntImm *) override; void visit(const UIntImm *) override; void visit(const FloatImm *) override; @@ -220,6 +230,13 @@ class IRPrinter : public IRVisitor { void visit(const HoistedStorage *) override; }; +/** Debugging helpers for LLDB */ +/// @{ +std::string lldb_string(const Expr &); +std::string lldb_string(const Internal::BaseExprNode *); +std::string lldb_string(const Stmt &); +/// @} + } // namespace Internal } // namespace Halide diff --git a/test/performance/parallel_scenarios.cpp b/test/performance/parallel_scenarios.cpp index 41e590599eb3..2bd29acd6b2f 100644 --- a/test/performance/parallel_scenarios.cpp +++ b/test/performance/parallel_scenarios.cpp @@ -31,32 +31,36 @@ int main(int argc, char **argv) { int native_threads = Halide::Internal::JITSharedRuntime::get_num_threads(); + std::map, std::vector> results; + auto bench = [&](bool m, bool c, int i, int o) { - const int num_samples = 128; const int memory_limit = m ? 
max_memory : 128; + auto now = std::chrono::high_resolution_clock::now; + auto to_ns = [](auto delta) { return 1e9 * std::chrono::duration(delta).count(); }; + auto bench_one = [&]() { - auto t1 = std::chrono::high_resolution_clock::now(); - // Ignore error code because default halide_error() will abort on failure + auto t1 = now(); (void)callable(i, o, memory_limit, in, out); - auto t2 = std::chrono::high_resolution_clock::now(); - return 1e9 * std::chrono::duration(t2 - t1).count() / (i * o); + auto t2 = now(); + return to_ns(t2 - t1) / (i * o); }; - std::vector times(num_samples); + const int num_tasks = 8; + const int min_samples = 32; + + std::vector times[num_tasks]; if (c) { Halide::Tools::ThreadPool thread_pool; - const int num_tasks = 8; - const int samples_per_task = num_samples / num_tasks; Halide::Internal::JITSharedRuntime::set_num_threads(num_tasks * native_threads); std::vector> futures(num_tasks); for (size_t t = 0; t < futures.size(); t++) { futures[t] = thread_pool.async( [&](size_t t) { bench_one(); - for (int s = 0; s < samples_per_task; s++) { - size_t idx = t * samples_per_task + s; - times[idx] = bench_one(); + auto t_start = now(); + while (to_ns(now() - t_start) < 1e7 || times[t].size() < min_samples / num_tasks) { + times[t].push_back(bench_one()); } }, t); @@ -67,32 +71,43 @@ int main(int argc, char **argv) { } else { Halide::Internal::JITSharedRuntime::set_num_threads(native_threads); bench_one(); - for (int s = 0; s < num_samples; s++) { - times[s] = bench_one(); + auto t_start = now(); + while (to_ns(now() - t_start) < 1e7 || times[0].size() < min_samples) { + times[0].push_back(bench_one()); } } - std::sort(times.begin(), times.end()); - printf("%d %d %d %d ", m, c, i, o); - const int n = 8; - int off = (num_samples / n) / 2; - for (int i = 0; i < n; i++) { - printf("%g ", times[off + (num_samples * i) / n]); + + std::vector &r = results[{m, c, i, o}]; + for (int i = 0; i < num_tasks; i++) { + r.insert(r.end(), times[i].begin(), 
times[i].end()); } - printf("\n"); }; // The output is designed to be copy-pasted into a spreadsheet, not read by a human - printf("memory_bound contended inner outer t0 t1 t2 t3 t4 t5 t7\n"); - for (bool contended : {false, true}) { - for (bool memory_bound : {false, true}) { - for (int i : {1 << 0, 1 << 6, 1 << 12, 1 << 18}) { - for (int o : {1, 2, 4, 8, 16, 32, 64, 128, 256}) { - bench(memory_bound, contended, i, o); + printf("memory_bound contended inner outer num_samples 10%% 20%% 30%% 40%% 50%% 60%% 70%% 80%% 90%%\n"); + for (int repeat = 0; repeat < 10; repeat++) { + for (bool contended : {false, true}) { + for (bool memory_bound : {false, true}) { + for (int i : {1 << 6, 1 << 9, 1 << 12, 1 << 15}) { + for (int o : {1, 2, 4, 8, 16, 32, 64, 128, 256}) { + bench(memory_bound, contended, i, o); + } } } } } + for (auto p : results) { + auto &times = p.second; + std::sort(times.begin(), times.end()); + auto [m, c, i, o] = p.first; + printf("%d %d %d %d %d ", m, c, i, o, (int)times.size()); + for (int decile = 10; decile <= 90; decile += 10) { + printf("%g ", times[(decile * times.size()) / 100]); + } + printf("\n"); + } + printf("Success!\n"); return 0; diff --git a/tools/lldbhalide.py b/tools/lldbhalide.py new file mode 100644 index 000000000000..ba83ffd318ba --- /dev/null +++ b/tools/lldbhalide.py @@ -0,0 +1,111 @@ +# Load this module into LLDB by running: +# command script import /path/to/Halide/tools/lldbhalide.py +import functools + +import lldb + + +def normalize(raw): + return raw.lstrip('"').rstrip('"').replace(r'\n', ' ').replace(' ', ' ') + + +def addr(value): + if ptr := value.GetValueAsUnsigned(0): + return f"0x{ptr:x}" + if ptr := value.AddressOf().GetValueAsUnsigned(0): + return f"0x{ptr:x}" + raise ValueError(f'Could not determine address for: {value}') + + +def summary_string(summary_fn): + @functools.wraps(summary_fn) + def wrapper(value, _): + if value is None or not value.IsValid(): + return "" + + try: + return 
normalize(summary_fn(value).GetSummary()) + except Exception as e: + return f"" + + return wrapper + + +@summary_string +def call_name(value): + return value.EvaluateExpression("this->name()", lldb.SBExpressionOptions()) + + +@summary_string +def call_lldb_string(value): + return value.EvaluateExpression(f"Halide::Internal::lldb_string(*this)", lldb.SBExpressionOptions()) + + +class ProxyChildrenProvider: + def __init__(self, valobj, _): + self.inner = valobj + self.update() + + def update(self): + pass + + def num_children(self): + return self.inner.GetNumChildren() + + def get_child_index(self, name): + return self.inner.GetIndexOfChildWithName(name) + + def get_child_at_index(self, index): + return self.inner.GetChildAtIndex(index) + + +class IRChildrenProvider(ProxyChildrenProvider): + def __init__(self, valobj, _): + super().__init__(valobj.GetChildMemberWithName("ptr"), None) + + +class BoxChildrenProvider(IRChildrenProvider): + def __init__(self, valobj, _): + super().__init__(valobj.GetChildMemberWithName("contents"), None) + + +class FunctionChildrenProvider(ProxyChildrenProvider): + def __init__(self, valobj, _): + contents = valobj.EvaluateExpression("*this->contents.get()", lldb.SBExpressionOptions()) + print(contents) + super().__init__(contents, None) + + +def __lldb_init_module(debugger, _): + base_exprs = ["Add", "And", "Broadcast", "Call", "Cast", "Div", "EQ", "GE", "GT", "LE", "LT", "Let", "Load", "Max", + "Min", "Mod", "Mul", "NE", "Not", "Or", "Ramp", "Reinterpret", "Select", "Shuffle", "Sub", "Variable", + "VectorReduce"] + + for ty in base_exprs: + debugger.HandleCommand( + f"type summary add Halide::Internal::{ty} --python-function lldbhalide.call_lldb_string" + ) + + for ty in ('Expr', 'Internal::Stmt'): + debugger.HandleCommand( + f"type summary add Halide::{ty} --python-function lldbhalide.call_lldb_string" + ) + debugger.HandleCommand( + f'type synthetic add Halide::{ty} -l lldbhalide.IRChildrenProvider' + ) + + for ty in ("Definition", 
"FuncSchedule", "ReductionDomain", "StageSchedule"): + debugger.HandleCommand( + f"type synthetic add Halide::Internal::{ty} -l lldbhalide.BoxChildrenProvider" + ) + + debugger.HandleCommand( + f'type synthetic add Halide::Internal::Function -l lldbhalide.FunctionChildrenProvider' + ) + + debugger.HandleCommand("type summary add Halide::Internal::Dim -s '${var.var%S}'") + debugger.HandleCommand("type summary add Halide::RVar --python-function lldbhalide.call_name") + debugger.HandleCommand("type summary add Halide::Var --python-function lldbhalide.call_name") + + debugger.HandleCommand("type summary add halide_type_t -s '${var.code%S} bits=${var.bits%u} lanes=${var.lanes%u}'") + debugger.HandleCommand("type summary add Halide::Internal::RefCount -s ${var.count.Value%S}")