diff --git a/profiling/space-time-stack/kp_space_time_stack.cpp b/profiling/space-time-stack/kp_space_time_stack.cpp index 897ce9252..5fb1c342b 100644 --- a/profiling/space-time-stack/kp_space_time_stack.cpp +++ b/profiling/space-time-stack/kp_space_time_stack.cpp @@ -34,22 +34,28 @@ #include "utils/demangle.hpp" #include "kp_core.hpp" +#include "kp_space_time_stack.hpp" #if USE_MPI #include #endif -#include - namespace KokkosTools { namespace SpaceTimeStack { // Threshold to use for output (can be set via CLI options) double output_threshold = 0.1; -enum Space { SPACE_HOST, SPACE_CUDA, SPACE_HIP, SPACE_SYCL, SPACE_OMPT }; - -enum { NSPACES = 5 }; +const char* getStackKindAsStr(const StackKind kind) { + switch (kind) { + case StackKind::FOR: return "for"; + case StackKind::REDUCE: return "reduce"; + case StackKind::SCAN: return "scan"; + case StackKind::REGION: return "region"; + case StackKind::COPY: return "copy"; + default: std::abort(); + }; +} Space get_space(SpaceHandle const& handle) { // check that name starts with "Cuda" @@ -66,7 +72,7 @@ Space get_space(SpaceHandle const& handle) { return SPACE_HOST; } -const char* get_space_name(int space) { +const char* get_space_name(const Space space) { switch (space) { case SPACE_HOST: return "HOST"; case SPACE_CUDA: return "CUDA"; @@ -78,31 +84,6 @@ const char* get_space_name(int space) { return nullptr; } -struct Now { - typedef std::chrono::time_point Impl; - Impl impl; -}; - -Now now() { - Now t; - t.impl = std::chrono::high_resolution_clock::now(); - return t; -} - -double operator-(Now b, Now a) { - return std::chrono::duration_cast(b.impl - a.impl) - .count() * - 1e-9; -} - -enum StackKind { - STACK_FOR, - STACK_REDUCE, - STACK_SCAN, - STACK_REGION, - STACK_COPY -}; - void print_process_hwm(bool mpi_usable) { struct rusage sys_resources; getrusage(RUSAGE_SELF, &sys_resources); @@ -145,476 +126,392 @@ void print_process_hwm(bool mpi_usable) { } } -struct StackNode { - StackNode* parent; - std::string name; - StackKind kind; - std::set children; - double total_runtime; - double total_kokkos_runtime; - double max_runtime; - double avg_runtime; - std::int64_t number_of_calls; - std::int64_t total_number_of_kernel_calls; // Counts all kernel calls (but - // not region calls) this node and - // below this node in the tree - Now start_time; - StackNode(StackNode* parent_in, std::string&& name_in, StackKind kind_in) - : parent(parent_in), - name(std::move(name_in)), - kind(kind_in), - total_runtime(0.), - total_kokkos_runtime(0.), - number_of_calls(0), - total_number_of_kernel_calls(0) {} - StackNode* get_child(std::string&& child_name, StackKind child_kind) { - StackNode candidate(this, std::move(child_name), child_kind); - auto it = children.find(candidate); - if (it == children.end()) { - auto res = children.emplace(std::move(candidate)); - it = res.first; - assert(res.second); - } - return const_cast(&(*(it))); +StackNode::StackNode(StackNode* parent_in, std::string&& name_in, + StackKind kind_in) + : parent(parent_in), + name(std::move(name_in)), + kind(kind_in), + total_runtime(0.), + total_kokkos_runtime(0.), + number_of_calls(0), + total_number_of_kernel_calls(0) {} + +StackNode* StackNode::get_child(std::string&& child_name, + StackKind child_kind) { + StackNode candidate(this, std::move(child_name), child_kind); + auto it = children.find(candidate); + if (it == children.end()) { + auto res = children.emplace(std::move(candidate)); + it = res.first; + assert(res.second); } - bool operator<(StackNode const& other) const { - if (kind != other.kind) { - return int(kind) < int(other.kind); - } - return name < other.name; + return const_cast(&(*(it))); +} + +bool StackNode::operator<(StackNode const& other) const { + if (kind != other.kind) { + return int(kind) < int(other.kind); } - std::string get_full_name() const { - std::string full_name = this->name; - for (auto p = this->parent; p; p = p->parent) { - if (p->name.empty() && !p->parent) continue; - full_name = p->name + '/' + full_name; - } - return full_name; + return name < other.name; +} + +std::string StackNode::get_full_name() const { + std::string full_name = this->name; + for (auto p = this->parent; p; p = p->parent) { + if (p->name.empty() && !p->parent) continue; + full_name = p->name + '/' + full_name; } - void begin() { - number_of_calls++; - - // Regions are not kernels, so we don't tally those - if (kind == STACK_FOR || kind == STACK_REDUCE || kind == STACK_SCAN || - kind == STACK_COPY) - total_number_of_kernel_calls++; - start_time = now(); + return full_name; +} + +void StackNode::begin() { + number_of_calls++; + + // Regions are not kernels, so we don't tally those + if (kind == StackKind::FOR || kind == StackKind::REDUCE || + kind == StackKind::SCAN || kind == StackKind::COPY) + total_number_of_kernel_calls++; + start_time = clock_type::now(); +} + +void StackNode::end(time_point const& end_time) { + total_runtime += + std::chrono::duration_cast(end_time - start_time).count(); +} + +void StackNode::adopt() { + if (this->kind != StackKind::REGION) { + this->total_kokkos_runtime += this->total_runtime; } - void end(Now const& end_time) { - auto runtime = (end_time - start_time); - total_runtime += runtime; + for (auto& child : this->children) { + const_cast(child).adopt(); + this->total_kokkos_runtime += child.total_kokkos_runtime; + this->total_number_of_kernel_calls += child.total_number_of_kernel_calls; } - void adopt() { - if (this->kind != STACK_REGION) { - this->total_kokkos_runtime += this->total_runtime; - } - for (auto& child : this->children) { - const_cast(child).adopt(); - this->total_kokkos_runtime += child.total_kokkos_runtime; - this->total_number_of_kernel_calls += child.total_number_of_kernel_calls; + assert(this->total_kokkos_runtime >= 0.); +} + +StackNode StackNode::invert() const { + StackNode inv_root(nullptr, "", StackKind::REGION); + std::queue q; + q.push(this); + while (!q.empty()) { + auto node = q.front(); + q.pop(); + auto self_time = node->total_runtime; + auto self_kokkos_time = node->total_kokkos_runtime; + auto calls = node->number_of_calls; + for (auto& child : node->children) { + self_time -= child.total_runtime; + self_kokkos_time -= child.total_kokkos_runtime; + q.push(&child); } - assert(this->total_kokkos_runtime >= 0.); - } - StackNode invert() const { - StackNode inv_root(nullptr, "", STACK_REGION); - std::queue q; - q.push(this); - while (!q.empty()) { - auto node = q.front(); - q.pop(); - auto self_time = node->total_runtime; - auto self_kokkos_time = node->total_kokkos_runtime; - auto calls = node->number_of_calls; - for (auto& child : node->children) { - self_time -= child.total_runtime; - self_kokkos_time -= child.total_kokkos_runtime; - q.push(&child); - } - self_time = std::max( - self_time, - 0.); // floating-point may give negative epsilon instead of zero - self_kokkos_time = std::max( - self_kokkos_time, - 0.); // floating-point may give negative epsilon instead of zero - auto inv_node = &inv_root; + self_time = std::max( + self_time, + 0.); // floating-point may give negative epsilon instead of zero + self_kokkos_time = std::max( + self_kokkos_time, + 0.); // floating-point may give negative epsilon instead of zero + auto inv_node = &inv_root; + inv_node->total_runtime += self_time; + inv_node->number_of_calls += calls; + inv_node->total_kokkos_runtime += self_kokkos_time; + for (; node; node = node->parent) { + std::string name = node->name; + inv_node = inv_node->get_child(std::move(name), node->kind); inv_node->total_runtime += self_time; inv_node->number_of_calls += calls; inv_node->total_kokkos_runtime += self_kokkos_time; - for (; node; node = node->parent) { - std::string name = node->name; - inv_node = inv_node->get_child(std::move(name), node->kind); - inv_node->total_runtime += self_time; - inv_node->number_of_calls += calls; - inv_node->total_kokkos_runtime += self_kokkos_time; - } } - return inv_root; } - void print_recursive_json(std::ostream& os, StackNode const* parent, - double tree_time) const { - static bool add_comma = false; - auto percent = (total_runtime / tree_time) * 100.0; - - if (percent < output_threshold) return; - if (!name.empty()) { - if (add_comma) os << ",\n"; - add_comma = true; - os << "{\n"; - auto imbalance = (max_runtime / avg_runtime - 1.0) * 100.0; - os << "\"average-time\" : "; - os << std::scientific << std::setprecision(2); - os << avg_runtime << ",\n"; - os << std::fixed << std::setprecision(1); - auto percent_kokkos = (total_kokkos_runtime / total_runtime) * 100.0; - - os << "\"percent\" : " << percent << ",\n"; - os << "\"percent-kokkos\" : " << percent_kokkos << ",\n"; - os << "\"imbalance\" : " << imbalance << ",\n"; - - // Sum over kids if we're a region - if (kind == STACK_REGION) { - double child_runtime = 0.0; - for (auto& child : children) { - child_runtime += child.total_runtime; - } - auto remainder = (1.0 - child_runtime / total_runtime) * 100.0; - double kps = total_number_of_kernel_calls / avg_runtime; - os << "\"remainder\" : " << remainder << ",\n"; - os << std::scientific << std::setprecision(2); - os << "\"kernels-per-second\" : " << kps << ",\n"; - } else { - os << "\"remainder\" : \"N/A\",\n"; - os << "\"kernels-per-second\" : \"N/A\",\n"; - } - os << "\"number-of-calls\" : " << number_of_calls << ",\n"; - auto name_escape_double_quote_twices = - std::regex_replace(name, std::regex("\""), "\\\""); - os << "\"name\" : \"" << name_escape_double_quote_twices << "\",\n"; - os << "\"parent-id\" : \"" << parent << "\",\n"; - os << "\"id\" : \"" << this << "\",\n"; - - os << "\"kernel-type\" : "; - switch (kind) { - case STACK_FOR: os << "\"for\""; break; - case STACK_REDUCE: os << "\"reduce\""; break; - case STACK_SCAN: os << "\"scan\""; break; - case STACK_REGION: os << "\"region\""; break; - case STACK_COPY: os << "\"copy\""; break; - }; - - os << "\n}"; - } - if (children.empty()) return; - auto by_time = [](StackNode const* a, StackNode const* b) { - if (a->total_runtime != b->total_runtime) { - return a->total_runtime > b->total_runtime; - } - return a->name < b->name; - }; - std::set children_by_time(by_time); - for (auto& child : children) { - children_by_time.insert(&child); - } - auto last = children_by_time.end(); - --last; - for (auto it = children_by_time.begin(); it != children_by_time.end(); - ++it) { - auto child = *it; - child->print_recursive_json(os, this, tree_time); - } - } - void print_json(std::ostream& os) const { - std::ios saved_state(nullptr); - saved_state.copyfmt(os); + return inv_root; +} + +void StackNode::print_recursive_json(std::ostream& os, StackNode const* parent, + double tree_time) const { + static bool add_comma = false; + auto percent = (total_runtime / tree_time) * 100.0; + + if (percent < output_threshold) return; + if (!name.empty()) { + if (add_comma) os << ",\n"; + add_comma = true; os << "{\n"; - os << "\"space-time-stack-data\" : [\n"; - print_recursive_json(os, nullptr, total_runtime); - os << '\n'; - os << "]\n}\n"; - os.copyfmt(saved_state); - } - void print_recursive(std::ostream& os, std::string my_indent, - std::string const& child_indent, - double tree_time) const { - auto percent = (total_runtime / tree_time) * 100.0; - - if (percent < output_threshold) return; - if (!name.empty()) { - os << my_indent; - auto imbalance = (max_runtime / avg_runtime - 1.0) * 100.0; - os << std::scientific << std::setprecision(2); - os << avg_runtime << " sec "; - os << std::fixed << std::setprecision(1); - auto percent_kokkos = (total_kokkos_runtime / total_runtime) * 100.0; - - // Sum over kids if we're a region - if (kind == STACK_REGION) { - double child_runtime = 0.0; - for (auto& child : children) { - child_runtime += child.total_runtime; - } - auto remainder = (1.0 - child_runtime / total_runtime) * 100.0; - double kps = total_number_of_kernel_calls / avg_runtime; - os << percent << "% " << percent_kokkos << "% " << imbalance << "% " - << remainder << "% " << std::scientific << std::setprecision(2) - << kps << " " << number_of_calls << " " << name; - } else - os << percent << "% " << percent_kokkos << "% " << imbalance << "% " - << "------ " << number_of_calls << " " << name; - - switch (kind) { - case STACK_FOR: os << " [for]"; break; - case STACK_REDUCE: os << " [reduce]"; break; - case STACK_SCAN: os << " [scan]"; break; - case STACK_REGION: os << " [region]"; break; - case STACK_COPY: os << " [copy]"; break; - }; - - os << '\n'; - } - if (children.empty()) return; - auto by_time = [](StackNode const* a, StackNode const* b) { - if (a->total_runtime != b->total_runtime) { - return a->total_runtime > b->total_runtime; + auto imbalance = (max_runtime / avg_runtime - 1.0) * 100.0; + os << "\"average-time\" : "; + os << std::scientific << std::setprecision(2); + os << avg_runtime << ",\n"; + os << std::fixed << std::setprecision(1); + auto percent_kokkos = (total_kokkos_runtime / total_runtime) * 100.0; + + os << "\"percent\" : " << percent << ",\n"; + os << "\"percent-kokkos\" : " << percent_kokkos << ",\n"; + os << "\"imbalance\" : " << imbalance << ",\n"; + + // Sum over kids if we're a region + if (kind == StackKind::REGION) { + double child_runtime = 0.0; + for (auto& child : children) { + child_runtime += child.total_runtime; } - return a->name < b->name; - }; - std::set children_by_time(by_time); - for (auto& child : children) { - children_by_time.insert(&child); + auto remainder = (1.0 - child_runtime / total_runtime) * 100.0; + double kps = total_number_of_kernel_calls / avg_runtime; + os << "\"remainder\" : " << remainder << ",\n"; + os << std::scientific << std::setprecision(2); + os << "\"kernels-per-second\" : " << kps << ",\n"; + } else { + os << "\"remainder\" : \"N/A\",\n"; + os << "\"kernels-per-second\" : \"N/A\",\n"; } - auto last = children_by_time.end(); - --last; - for (auto it = children_by_time.begin(); it != children_by_time.end(); - ++it) { - std::string grandchild_indent; - if (it == last) { - grandchild_indent = child_indent + " "; - } else { - grandchild_indent = child_indent + "| "; - } - auto child = *it; - child->print_recursive(os, child_indent + "|-> ", grandchild_indent, - tree_time); + os << "\"number-of-calls\" : " << number_of_calls << ",\n"; + auto name_escape_double_quote_twices = + std::regex_replace(name, std::regex("\""), "\\\""); + os << "\"name\" : \"" << name_escape_double_quote_twices << "\",\n"; + os << "\"parent-id\" : \"" << parent << "\",\n"; + os << "\"id\" : \"" << this << "\",\n"; + + os << "\"kernel-type\" : \"" << getStackKindAsStr(kind) << "\""; + + os << "\n}"; + } + if (children.empty()) return; + auto by_time = [](StackNode const* a, StackNode const* b) { + if (a->total_runtime != b->total_runtime) { + return a->total_runtime > b->total_runtime; } + return a->name < b->name; + }; + std::set children_by_time(by_time); + for (auto& child : children) { + children_by_time.insert(&child); } - void print(std::ostream& os) const { - std::ios saved_state(nullptr); - saved_state.copyfmt(os); - print_recursive(os, "", "", total_runtime); - os << '\n'; - os.copyfmt(saved_state); + auto last = children_by_time.end(); + --last; + for (auto it = children_by_time.begin(); it != children_by_time.end(); ++it) { + auto child = *it; + child->print_recursive_json(os, this, tree_time); } - void reduce_over_mpi(bool mpi_usable) { -#if USE_MPI - if (mpi_usable) { - int rank, comm_size; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &comm_size); - std::queue q; - std::set> children_to_process; - q.push(this); - while (!q.empty()) { - auto node = q.front(); - q.pop(); - node->max_runtime = node->total_runtime; - node->avg_runtime = node->total_runtime; - MPI_Allreduce(MPI_IN_PLACE, &(node->total_runtime), 1, MPI_DOUBLE, - MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(MPI_IN_PLACE, &(node->max_runtime), 1, MPI_DOUBLE, - MPI_MAX, MPI_COMM_WORLD); - MPI_Allreduce(MPI_IN_PLACE, &(node->avg_runtime), 1, MPI_DOUBLE, - MPI_SUM, MPI_COMM_WORLD); - node->avg_runtime /= comm_size; - MPI_Allreduce(MPI_IN_PLACE, &(node->total_kokkos_runtime), 1, - MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - /* Not all children necessarily exist on every rank. To handle this we - will: 1) Build a set of the child node names on each rank. 2) Start - with rank 0, broadcast all of it's child names and add them to the - queue, removing them from the set of names to be processed. - If a child doesn't exist on a rank we add an empty node for it. - 3) Do a check for the lowest rank that has any remaining unprocessed - children and repeat step 2 broadcasting from that rank until we - process all children from all ranks. - */ - children_to_process.clear(); - for (auto& child : node->children) { - children_to_process.emplace(child.name, child.kind); - } +} - int bcast_rank = 0; - do { - int nchildren_to_process = int(children_to_process.size()); - MPI_Bcast(&nchildren_to_process, 1, MPI_INT, bcast_rank, - MPI_COMM_WORLD); - if (rank == bcast_rank) { - for (auto& child_info : children_to_process) { - std::string child_name = child_info.first; - int kind = child_info.second; - int name_len = child_name.length(); - MPI_Bcast(&name_len, 1, MPI_INT, bcast_rank, MPI_COMM_WORLD); - MPI_Bcast(&child_name[0], name_len, MPI_CHAR, bcast_rank, - MPI_COMM_WORLD); - MPI_Bcast(&kind, 1, MPI_INT, bcast_rank, MPI_COMM_WORLD); - auto* child = - node->get_child(std::move(child_name), StackKind(kind)); - q.push(child); - } - children_to_process.clear(); - } else { - for (int i = 0; i < nchildren_to_process; ++i) { - int name_len; - MPI_Bcast(&name_len, 1, MPI_INT, bcast_rank, MPI_COMM_WORLD); - std::string name(size_t(name_len), '?'); - MPI_Bcast(&name[0], name_len, MPI_CHAR, bcast_rank, - MPI_COMM_WORLD); - int kind; - MPI_Bcast(&kind, 1, MPI_INT, bcast_rank, MPI_COMM_WORLD); - auto child = node->get_child(std::move(name), StackKind(kind)); - q.push(child); - children_to_process.erase({child->name, child->kind}); - } - } - int local_next_bcast_rank = - children_to_process.empty() ? comm_size : rank; - MPI_Allreduce(&local_next_bcast_rank, &bcast_rank, 1, MPI_INT, - MPI_MIN, MPI_COMM_WORLD); - } while (bcast_rank < comm_size); +void StackNode::print_json(std::ostream& os) const { + std::ios saved_state(nullptr); + saved_state.copyfmt(os); + os << "{\n"; + os << "\"space-time-stack-data\" : [\n"; + print_recursive_json(os, nullptr, total_runtime); + os << '\n'; + os << "]\n}\n"; + os.copyfmt(saved_state); +} + +void StackNode::print_recursive(std::ostream& os, std::string my_indent, + std::string const& child_indent, + double tree_time) const { + auto percent = (total_runtime / tree_time) * 100.0; + + if (percent < output_threshold) return; + if (!name.empty()) { + os << my_indent; + auto imbalance = (max_runtime / avg_runtime - 1.0) * 100.0; + os << std::scientific << std::setprecision(2); + os << avg_runtime << " sec "; + os << std::fixed << std::setprecision(1); + auto percent_kokkos = (total_kokkos_runtime / total_runtime) * 100.0; + + // Sum over kids if we're a region + if (kind == StackKind::REGION) { + double child_runtime = 0.0; + for (auto& child : children) { + child_runtime += child.total_runtime; } + auto remainder = (1.0 - child_runtime / total_runtime) * 100.0; + double kps = total_number_of_kernel_calls / avg_runtime; + os << percent << "% " << percent_kokkos << "% " << imbalance << "% " + << remainder << "% " << std::scientific << std::setprecision(2) << kps + << " " << number_of_calls << " " << name; } else -#else - (void)mpi_usable; -#endif - { - std::queue q; - q.push(this); - while (!q.empty()) { - auto node = q.front(); - q.pop(); - node->max_runtime = node->total_runtime; - node->avg_runtime = node->total_runtime; - for (auto& child : node->children) { - q.push(const_cast(&child)); - } - } - } - } -}; - -struct Allocation { - std::string name; - const void* ptr; - std::uint64_t size; - StackNode* frame; - Allocation(std::string&& name_in, const void* ptr_in, std::uint64_t size_in, - StackNode* frame_in) - : name(std::move(name_in)), ptr(ptr_in), size(size_in), frame(frame_in) {} - bool operator<(Allocation const& other) const { - if (size != other.size) return size > other.size; - return ptr < other.ptr; + os << percent << "% " << percent_kokkos << "% " << imbalance << "% " + << "------ " << number_of_calls << " " << name << " [" + << getStackKindAsStr(kind) << "]"; + + os << '\n'; } -}; - -struct Allocations { - std::uint64_t total_size; - std::set alloc_set; - Allocations() : total_size(0) {} - void allocate(std::string&& name, const void* ptr, std::uint64_t size, - StackNode* frame) { - auto res = alloc_set.emplace(Allocation(std::move(name), ptr, size, frame)); - assert(res.second); - total_size += size; + if (children.empty()) return; + auto by_time = [](StackNode const* a, StackNode const* b) { + if (a->total_runtime != b->total_runtime) { + return a->total_runtime > b->total_runtime; + } + return a->name < b->name; + }; + std::set children_by_time(by_time); + for (auto& child : children) { + children_by_time.insert(&child); } - void deallocate(std::string&& name, const void* ptr, std::uint64_t size, - StackNode* frame) { - auto key = Allocation(std::move(name), ptr, size, frame); - auto it = alloc_set.find(key); - if (it == alloc_set.end()) { - std::stringstream ss; - ss << "WARNING! allocation(\"" << key.name << "\", " << key.ptr << ", " - << key.size << "), deallocated at \"" << frame->get_full_name() - << "\", " - << " was not in the currently allocated set!\n"; - auto s = ss.str(); - std::cerr << s; + auto last = children_by_time.end(); + --last; + for (auto it = children_by_time.begin(); it != children_by_time.end(); ++it) { + std::string grandchild_indent; + if (it == last) { + grandchild_indent = child_indent + " "; } else { - total_size -= it->size; - alloc_set.erase(it); + grandchild_indent = child_indent + "| "; } + auto child = *it; + child->print_recursive(os, child_indent + "|-> ", grandchild_indent, + tree_time); } - void print(std::ostream& os, bool mpi_usable) { - std::string s; +} + +void StackNode::print(std::ostream& os) const { + std::ios saved_state(nullptr); + saved_state.copyfmt(os); + print_recursive(os, "", "", total_runtime); + os << '\n'; + os.copyfmt(saved_state); +} + +void StackNode::reduce_over_mpi(bool mpi_usable) { #if USE_MPI - if (mpi_usable) { - auto max_total_size = total_size; - MPI_Allreduce(MPI_IN_PLACE, &max_total_size, 1, MPI_UINT64_T, MPI_MAX, + if (mpi_usable) { + int rank, comm_size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &comm_size); + std::queue q; + std::set> children_to_process; + q.push(this); + while (!q.empty()) { + auto node = q.front(); + q.pop(); + node->max_runtime = node->total_runtime; + node->avg_runtime = node->total_runtime; + MPI_Allreduce(MPI_IN_PLACE, &(node->total_runtime), 1, MPI_DOUBLE, + MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, &(node->max_runtime), 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); - /* this bit of logic is here to break ties in case two - * or more MPI ranks allocated the same (maximum) amount of - * memory. the one with the lowest MPI rank will print - * its snapshot */ - int rank, size; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - MPI_Comm_size(MPI_COMM_WORLD, &size); - auto min_max_rank = (max_total_size == total_size) ? rank : size; - MPI_Allreduce(MPI_IN_PLACE, &min_max_rank, 1, MPI_INT, MPI_MIN, + MPI_Allreduce(MPI_IN_PLACE, &(node->avg_runtime), 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - assert(min_max_rank < size); - if (rank == min_max_rank) { - std::stringstream ss; - ss << std::fixed << std::setprecision(1); - ss << "MAX MEMORY ALLOCATED: " << double(total_size) / 1024.0 << " kB" - << '\n'; // convert bytes to kB - ss << "MPI RANK WITH MAX MEMORY: " << rank << '\n'; - ss << "ALLOCATIONS AT TIME OF HIGH WATER MARK:\n"; - std::ios saved_state(nullptr); - for (auto& allocation : alloc_set) { - auto percent = double(allocation.size) / double(total_size) * 100.0; - if (percent < 0.1) continue; - std::string full_name = allocation.frame->get_full_name(); - if (full_name.empty()) - full_name = allocation.name; - else - full_name = full_name + "/" + allocation.name; - ss << " " << percent << "% " << full_name << '\n'; - } - ss << '\n'; - s = ss.str(); - } - // a little MPI dance to send the string from min_max_rank to rank 0 - MPI_Request request; - int string_size; - if (rank == 0) { - MPI_Irecv(&string_size, 1, MPI_INT, min_max_rank, 42, MPI_COMM_WORLD, - &request); - } - if (rank == min_max_rank) { - string_size = int(s.size()); - MPI_Send(&string_size, 1, MPI_INT, 0, 42, MPI_COMM_WORLD); - } - if (rank == 0) { - MPI_Wait(&request, MPI_STATUS_IGNORE); - s.resize(size_t(string_size)); - MPI_Irecv(const_cast(s.data()), string_size, MPI_CHAR, - min_max_rank, 42, MPI_COMM_WORLD, &request); - } - if (rank == min_max_rank) { - MPI_Send(const_cast(s.data()), string_size, MPI_CHAR, 0, 42, - MPI_COMM_WORLD); - } - if (rank == 0) { - MPI_Wait(&request, MPI_STATUS_IGNORE); - os << s; + node->avg_runtime /= comm_size; + MPI_Allreduce(MPI_IN_PLACE, &(node->total_kokkos_runtime), 1, MPI_DOUBLE, + MPI_SUM, MPI_COMM_WORLD); + /* Not all children necessarily exist on every rank. To handle this we + will: 1) Build a set of the child node names on each rank. 2) Start + with rank 0, broadcast all of it's child names and add them to the + queue, removing them from the set of names to be processed. + If a child doesn't exist on a rank we add an empty node for it. + 3) Do a check for the lowest rank that has any remaining unprocessed + children and repeat step 2 broadcasting from that rank until we + process all children from all ranks. + */ + children_to_process.clear(); + for (auto& child : node->children) { + children_to_process.emplace(child.name, child.kind); } - } else + + int bcast_rank = 0; + do { + int nchildren_to_process = int(children_to_process.size()); + MPI_Bcast(&nchildren_to_process, 1, MPI_INT, bcast_rank, + MPI_COMM_WORLD); + if (rank == bcast_rank) { + for (auto& child_info : children_to_process) { + std::string child_name = child_info.first; + int kind = static_cast(child_info.second); + int name_len = child_name.length(); + MPI_Bcast(&name_len, 1, MPI_INT, bcast_rank, MPI_COMM_WORLD); + MPI_Bcast(&child_name[0], name_len, MPI_CHAR, bcast_rank, + MPI_COMM_WORLD); + MPI_Bcast(&kind, 1, MPI_INT, bcast_rank, MPI_COMM_WORLD); + auto* child = + node->get_child(std::move(child_name), StackKind(kind)); + q.push(child); + } + children_to_process.clear(); + } else { + for (int i = 0; i < nchildren_to_process; ++i) { + int name_len; + MPI_Bcast(&name_len, 1, MPI_INT, bcast_rank, MPI_COMM_WORLD); + std::string name(size_t(name_len), '?'); + MPI_Bcast(&name[0], name_len, MPI_CHAR, bcast_rank, MPI_COMM_WORLD); + int kind; + MPI_Bcast(&kind, 1, MPI_INT, bcast_rank, MPI_COMM_WORLD); + auto child = node->get_child(std::move(name), StackKind(kind)); + q.push(child); + children_to_process.erase({child->name, child->kind}); + } + } + int local_next_bcast_rank = + children_to_process.empty() ? comm_size : rank; + MPI_Allreduce(&local_next_bcast_rank, &bcast_rank, 1, MPI_INT, MPI_MIN, + MPI_COMM_WORLD); + } while (bcast_rank < comm_size); + } + } else #else - (void)mpi_usable; + (void)mpi_usable; #endif - { + { + std::queue q; + q.push(this); + while (!q.empty()) { + auto node = q.front(); + q.pop(); + node->max_runtime = node->total_runtime; + node->avg_runtime = node->total_runtime; + for (auto& child : node->children) { + q.push(const_cast(&child)); + } + } + } +} + +void Allocations::allocate(std::string&& name, const void* ptr, + std::uint64_t size, StackNode* frame) { + auto res = alloc_set.emplace(Allocation(std::move(name), ptr, size, frame)); + assert(res.second); + total_size += size; +} + +void Allocations::deallocate(std::string&& name, const void* ptr, + std::uint64_t size, StackNode* frame) { + auto key = Allocation(std::move(name), ptr, size, frame); + auto it = alloc_set.find(key); + if (it == alloc_set.end()) { + std::stringstream ss; + ss << "WARNING! allocation(\"" << key.name << "\", " << key.ptr << ", " + << key.size << "), deallocated at \"" << frame->get_full_name() << "\", " + << " was not in the currently allocated set!\n"; + auto s = ss.str(); + std::cerr << s; + } else { + total_size -= it->size; + alloc_set.erase(it); + } +} + +void Allocations::print(std::ostream& os, bool mpi_usable) { + std::string s; +#if USE_MPI + if (mpi_usable) { + auto max_total_size = total_size; + MPI_Allreduce(MPI_IN_PLACE, &max_total_size, 1, MPI_UINT64_T, MPI_MAX, + MPI_COMM_WORLD); + /* this bit of logic is here to break ties in case two + * or more MPI ranks allocated the same (maximum) amount of + * memory. the one with the lowest MPI rank will print + * its snapshot */ + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + auto min_max_rank = (max_total_size == total_size) ? rank : size; + MPI_Allreduce(MPI_IN_PLACE, &min_max_rank, 1, MPI_INT, MPI_MIN, + MPI_COMM_WORLD); + assert(min_max_rank < size); + if (rank == min_max_rank) { std::stringstream ss; ss << std::fixed << std::setprecision(1); ss << "MAX MEMORY ALLOCATED: " << double(total_size) / 1024.0 << " kB" << '\n'; // convert bytes to kB + ss << "MPI RANK WITH MAX MEMORY: " << rank << '\n'; ss << "ALLOCATIONS AT TIME OF HIGH WATER MARK:\n"; std::ios saved_state(nullptr); for (auto& allocation : alloc_set) { @@ -629,235 +526,293 @@ struct Allocations { } ss << '\n'; s = ss.str(); - os << s; } - } -}; - -struct State { - StackNode stack_root; - StackNode* stack_frame; - Allocations current_allocations[NSPACES]; - Allocations hwm_allocations[NSPACES]; - State() : stack_root(nullptr, "", STACK_REGION), stack_frame(&stack_root) { - stack_frame->begin(); - } - ~State() { - bool mpi_usable = false; -#if USE_MPI - int mpi_initialized; - MPI_Initialized(&mpi_initialized); - if (static_cast(mpi_initialized)) mpi_usable = true; -#endif - auto end_time = now(); - if (stack_frame != &stack_root) { - std::cerr << "Program ended before \"" << stack_frame->get_full_name() - << "\" ended\n"; - abort(); + // a little MPI dance to send the string from min_max_rank to rank 0 + MPI_Request request; + int string_size; + if (rank == 0) { + MPI_Irecv(&string_size, 1, MPI_INT, min_max_rank, 42, MPI_COMM_WORLD, + &request); } - stack_frame->end(end_time); - stack_root.adopt(); - stack_root.reduce_over_mpi(mpi_usable); - if (getenv("KOKKOS_PROFILE_EXPORT_JSON")) { -#if USE_MPI - if (mpi_usable) { - int rank; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (rank == 0) { - std::ofstream fout("noname.json"); - stack_root.print_json(fout); - } - } else + if (rank == min_max_rank) { + string_size = int(s.size()); + MPI_Send(&string_size, 1, MPI_INT, 0, 42, MPI_COMM_WORLD); + } + if (rank == 0) { + MPI_Wait(&request, MPI_STATUS_IGNORE); + s.resize(size_t(string_size)); + MPI_Irecv(const_cast(s.data()), string_size, MPI_CHAR, + min_max_rank, 42, MPI_COMM_WORLD, &request); + } + if (rank == min_max_rank) { + MPI_Send(const_cast(s.data()), string_size, MPI_CHAR, 0, 42, + MPI_COMM_WORLD); + } + if (rank == 0) { + MPI_Wait(&request, MPI_STATUS_IGNORE); + os << s; + } + } else +#else + (void)mpi_usable; #endif - { - std::ofstream fout("noname.json"); - stack_root.print_json(fout); - } - return; + { + std::stringstream ss; + ss << std::fixed << std::setprecision(1); + ss << "MAX MEMORY ALLOCATED: " << double(total_size) / 1024.0 << " kB" + << '\n'; // convert bytes to kB + ss << "ALLOCATIONS AT TIME OF HIGH WATER MARK:\n"; + std::ios saved_state(nullptr); + for (auto& allocation : alloc_set) { + auto percent = double(allocation.size) / double(total_size) * 100.0; + if (percent < 0.1) continue; + std::string full_name = allocation.frame->get_full_name(); + if (full_name.empty()) + full_name = allocation.name; + else + full_name = full_name + "/" + allocation.name; + ss << " " << percent << "% " << full_name << '\n'; } + ss << '\n'; + s = ss.str(); + os << s; + } +} - auto inv_stack_root = stack_root.invert(); - inv_stack_root.reduce_over_mpi(mpi_usable); +State::State() + : stack_root(nullptr, "", StackKind::REGION), stack_frame(&stack_root) { + stack_frame->begin(); +} +State::~State() { + bool mpi_usable = false; +#if USE_MPI + int mpi_initialized; + MPI_Initialized(&mpi_initialized); + if (static_cast(mpi_initialized)) mpi_usable = true; +#endif + auto end_time = StackNode::clock_type::now(); + if (stack_frame != &stack_root) { + std::cerr << "Program ended before \"" << stack_frame->get_full_name() + << "\" ended\n"; + abort(); + } + stack_frame->end(end_time); + stack_root.adopt(); + stack_root.reduce_over_mpi(mpi_usable); + if (getenv("KOKKOS_PROFILE_EXPORT_JSON")) { #if USE_MPI if (mpi_usable) { int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); if (rank == 0) { - std::cout << "\nBEGIN KOKKOS PROFILING REPORT:\n"; - std::cout << "TOTAL TIME: " << stack_root.max_runtime << " seconds\n"; - std::cout << "TOP-DOWN TIME TREE:\n"; - std::cout << " [type]\n"; - std::cout << "=================== \n"; - stack_root.print(std::cout); - std::cout << "BOTTOM-UP TIME TREE:\n"; - std::cout << " " - "[type]\n"; - std::cout << "=================== \n"; - inv_stack_root.print(std::cout); - } - for (int space = 0; space < NSPACES; ++space) { - if (rank == 0) { - std::cout << "KOKKOS " << get_space_name(space) << " SPACE:\n"; - std::cout << "=================== \n"; - std::cout.flush(); - } - hwm_allocations[space].print(std::cout, mpi_usable); - } - print_process_hwm(mpi_usable); - if (rank == 0) { - std::cout << "END KOKKOS PROFILING REPORT.\n"; - std::cout.flush(); + std::ofstream fout("noname.json"); + stack_root.print_json(fout); } } else #endif { + std::ofstream fout("noname.json"); + stack_root.print_json(fout); + } + return; + } + + auto inv_stack_root = stack_root.invert(); + inv_stack_root.reduce_over_mpi(mpi_usable); + +#if USE_MPI + if (mpi_usable) { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (rank == 0) { std::cout << "\nBEGIN KOKKOS PROFILING REPORT:\n"; std::cout << "TOTAL TIME: " << stack_root.max_runtime << " seconds\n"; std::cout << "TOP-DOWN TIME TREE:\n"; std::cout << " [type]\n"; - std::cout << "===================\n"; + std::cout << "=================== \n"; stack_root.print(std::cout); std::cout << "BOTTOM-UP TIME TREE:\n"; - std::cout - << " " - " [type]\n"; - std::cout << "===================\n"; + std::cout << " " + "[type]\n"; + std::cout << "=================== \n"; inv_stack_root.print(std::cout); - - for (int space = 0; space < NSPACES; ++space) { - std::cout << "KOKKOS " << get_space_name(space) << " SPACE:\n"; - std::cout << "===================\n"; + } + for (int space = 0; space < NSPACES; ++space) { + if (rank == 0) { + std::cout << "KOKKOS " << get_space_name(static_cast(space)) + << " SPACE:\n" + << "=================== \n"; std::cout.flush(); - hwm_allocations[space].print(std::cout, mpi_usable); } - print_process_hwm(mpi_usable); + hwm_allocations[space].print(std::cout, mpi_usable); + } + print_process_hwm(mpi_usable); + if (rank == 0) { std::cout << "END KOKKOS PROFILING REPORT.\n"; std::cout.flush(); } + } else +#endif + { + std::cout << "\nBEGIN KOKKOS PROFILING REPORT:\n"; + std::cout << "TOTAL TIME: " << stack_root.max_runtime << " seconds\n"; + std::cout << "TOP-DOWN TIME TREE:\n"; + std::cout << " [type]\n"; + std::cout << "===================\n"; + stack_root.print(std::cout); + std::cout << "BOTTOM-UP TIME TREE:\n"; + std::cout + << " " + " [type]\n"; + std::cout << "===================\n"; + inv_stack_root.print(std::cout); + + for (int space = 0; space < NSPACES; ++space) { + std::cout << "KOKKOS " << get_space_name(static_cast(space)) + << " SPACE:\n" + << "===================\n"; + std::cout.flush(); + hwm_allocations[space].print(std::cout, mpi_usable); + } + print_process_hwm(mpi_usable); + std::cout << "END KOKKOS PROFILING REPORT.\n"; + std::cout.flush(); } +} - void begin_frame(const char* name, StackKind kind) { - std::string name_str(demangleNameKokkos(name)); - stack_frame = stack_frame->get_child(std::move(name_str), kind); - stack_frame->begin(); - } - void end_frame(Now end_time) { - stack_frame->end(end_time); - stack_frame = stack_frame->parent; - } - std::uint64_t begin_kernel(const char* name, StackKind kind) { - begin_frame(name, kind); - return reinterpret_cast(stack_frame); +void State::begin_frame(const char* name, StackKind kind) { + std::string name_str(demangleNameKokkos(name)); + stack_frame = stack_frame->get_child(std::move(name_str), kind); + stack_frame->begin(); +} + +void State::end_frame(const StackNode::time_point& end_time) { + stack_frame->end(end_time); + stack_frame = stack_frame->parent; +} + +std::uint64_t State::begin_kernel(const char* name, StackKind kind) { + begin_frame(name, kind); + return reinterpret_cast(stack_frame); +} + +void State::end_kernel(std::uint64_t kernid) { + auto end_time = StackNode::clock_type::now(); + auto expect_node = reinterpret_cast(kernid); + if (expect_node != stack_frame) { + std::cerr << "Expected \"" << stack_frame->get_full_name() + << "\" to end, got different kernel ID\n"; + abort(); } - void end_kernel(std::uint64_t kernid) { - auto end_time = now(); - auto expect_node = reinterpret_cast(kernid); - if (expect_node != stack_frame) { - std::cerr << "Expected \"" << stack_frame->get_full_name() - << "\" to end, got different kernel ID\n"; - abort(); - } - end_frame(end_time); + end_frame(end_time); +} + +void State::push_region(const char* name) { + begin_frame(name, StackKind::REGION); +} + +void State::pop_region() { end_frame(StackNode::clock_type::now()); } + +void State::allocate(Space space, const char* name, const void* ptr, + std::uint64_t size) { + current_allocations[space].allocate(std::string(name), ptr, size, + stack_frame); + if (current_allocations[space].total_size > + hwm_allocations[space].total_size) { + hwm_allocations[space] = current_allocations[space]; } - void push_region(const char* name) { begin_frame(name, STACK_REGION); } - void pop_region() { end_frame(now()); } - void allocate(Space space, const char* name, const void* ptr, - std::uint64_t size) { - current_allocations[space].allocate(std::string(name), ptr, size, +} + +void State::deallocate(Space space, const char* name, const void* ptr, + std::uint64_t size) { + current_allocations[space].deallocate(std::string(name), ptr, size, stack_frame); - if (current_allocations[space].total_size > - hwm_allocations[space].total_size) { - hwm_allocations[space] = current_allocations[space]; - } - } - void deallocate(Space space, const char* name, const void* ptr, - std::uint64_t size) { - current_allocations[space].deallocate(std::string(name), ptr, size, - stack_frame); - } - void begin_deep_copy(Space dst_space, const char* dst_name, const void*, - Space src_space, const char* src_name, const void*, - std::uint64_t) { - std::string frame_name; - frame_name += "\""; - frame_name += dst_name; - frame_name += "\"=\""; - frame_name += src_name; - frame_name += "\" ("; - frame_name += get_space_name(dst_space); - frame_name += "->"; - frame_name += get_space_name(src_space); - frame_name += ")"; - begin_frame(frame_name.c_str(), STACK_COPY); - } - void end_deep_copy() { end_frame(now()); } -}; +} -State* global_state = nullptr; +void State::begin_deep_copy(Space dst_space, const char* dst_name, const void*, + Space src_space, const char* src_name, const void*, + std::uint64_t) { + std::string frame_name; + frame_name += "\""; + frame_name += dst_name; + frame_name += "\"=\""; + frame_name += src_name; + frame_name += "\" ("; + frame_name += get_space_name(dst_space); + frame_name += "->"; + frame_name += get_space_name(src_space); + frame_name += ")"; + begin_frame(frame_name.c_str(), StackKind::COPY); +} + +void State::end_deep_copy() { end_frame(StackNode::clock_type::now()); } + +void State::initialize() { global_state = std::make_unique(); } + +void State::finalize() { global_state.reset(); } + +std::unique_ptr State::global_state = nullptr; void kokkosp_init_library(int /* loadseq */, uint64_t /* interfaceVer */, uint32_t /* ndevinfos */, Kokkos_Profiling_KokkosPDeviceInfo* /* devinfos */) { - global_state = new State(); + State::initialize(); } -void kokkosp_finalize_library() { - delete global_state; - global_state = nullptr; -} +void kokkosp_finalize_library() { State::finalize(); } void kokkosp_begin_parallel_for(const char* name, std::uint32_t devid, std::uint64_t* kernid) { (void)devid; - *kernid = global_state->begin_kernel(name, STACK_FOR); + *kernid = State::get().begin_kernel(name, StackKind::FOR); } void kokkosp_begin_parallel_reduce(const char* name, std::uint32_t devid, std::uint64_t* kernid) { (void)devid; - *kernid = global_state->begin_kernel(name, STACK_REDUCE); + *kernid = State::get().begin_kernel(name, StackKind::REDUCE); } void kokkosp_begin_parallel_scan(const char* name, std::uint32_t devid, std::uint64_t* kernid) { (void)devid; - *kernid = global_state->begin_kernel(name, STACK_SCAN); + *kernid = State::get().begin_kernel(name, StackKind::SCAN); } void kokkosp_end_parallel_for(std::uint64_t kernid) { - global_state->end_kernel(kernid); + State::get().end_kernel(kernid); } void kokkosp_end_parallel_reduce(std::uint64_t kernid) { - global_state->end_kernel(kernid); + State::get().end_kernel(kernid); } void kokkosp_end_parallel_scan(std::uint64_t kernid) { - global_state->end_kernel(kernid); + State::get().end_kernel(kernid); } void kokkosp_push_profile_region(const char* name) { - global_state->push_region(name); + State::get().push_region(name); } -void kokkosp_pop_profile_region() { global_state->pop_region(); } +void kokkosp_pop_profile_region() { State::get().pop_region(); } void kokkosp_allocate_data(SpaceHandle handle, const char* name, const void* ptr, uint64_t size) { auto space = get_space(handle); - global_state->allocate(space, name, ptr, size); + State::get().allocate(space, name, ptr, size); } void kokkosp_deallocate_data(SpaceHandle handle, const char* name, const void* ptr, uint64_t size) { auto space = get_space(handle); - global_state->deallocate(space, name, ptr, size); + State::get().deallocate(space, name, ptr, size); } void kokkosp_begin_deep_copy(SpaceHandle dst_handle, const char* dst_name, @@ -866,11 +821,11 @@ void kokkosp_begin_deep_copy(SpaceHandle dst_handle, const char* dst_name, uint64_t size) { auto dst_space = get_space(dst_handle); auto src_space = get_space(src_handle); - global_state->begin_deep_copy(dst_space, dst_name, dst_ptr, src_space, - src_name, src_ptr, size); + State::get().begin_deep_copy(dst_space, dst_name, dst_ptr, src_space, + src_name, src_ptr, size); } -void kokkosp_end_deep_copy() { global_state->end_deep_copy(); } +void kokkosp_end_deep_copy() { State::get().end_deep_copy(); } Kokkos::Tools::Experimental::EventSet get_event_set() { Kokkos::Tools::Experimental::EventSet my_event_set; diff --git a/profiling/space-time-stack/kp_space_time_stack.hpp b/profiling/space-time-stack/kp_space_time_stack.hpp new file mode 100644 index 000000000..7605d4a63 --- /dev/null +++ b/profiling/space-time-stack/kp_space_time_stack.hpp @@ -0,0 +1,179 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSTOOLS_PROFILING_SPACETIMESTACK_HPP +#define KOKKOSTOOLS_PROFILING_SPACETIMESTACK_HPP + +#include + +#include "impl/Kokkos_Profiling_Interface.hpp" + +namespace KokkosTools::SpaceTimeStack { + +enum Space { + SPACE_HOST = 0, + SPACE_CUDA = 1, + SPACE_HIP = 2, + SPACE_SYCL = 3, + SPACE_OMPT = 4 +}; + +Space get_space(Kokkos::Tools::SpaceHandle const& handle); +const char* get_space_name(const Space space); + +enum { NSPACES = 5 }; + +enum class StackKind { FOR, REDUCE, SCAN, REGION, COPY }; + +const char* getStackKindAsStr(const StackKind kind); + +class StackNode { + public: + using clock_type = std::chrono::steady_clock; + using time_point = typename clock_type::time_point; + + //! Similar to @c std::chrono::seconds, but represented using @c double. + using seconds = std::chrono::duration>; + + public: + StackNode* parent; + std::string name; + StackKind kind; + std::set children; + double total_runtime; + double total_kokkos_runtime; + double max_runtime; + double avg_runtime; + std::int64_t number_of_calls; + std::int64_t total_number_of_kernel_calls; // Counts all kernel calls (but + // not region calls) this node and + // below this node in the tree + time_point start_time; + + public: + StackNode(StackNode* parent_in, std::string&& name_in, StackKind kind_in); + + StackNode* get_child(std::string&& child_name, StackKind child_kind); + + bool operator<(StackNode const& other) const; + + std::string get_full_name() const; + + void begin(); + + void end(time_point const& end_time); + + void adopt(); + + StackNode invert() const; + + void print_recursive_json(std::ostream& os, StackNode const* parent, + double tree_time) const; + + void print_json(std::ostream& os) const; + + void print_recursive(std::ostream& os, std::string my_indent, + std::string const& child_indent, double tree_time) const; + + void print(std::ostream& os) const; + + void reduce_over_mpi(bool mpi_usable); +}; + +struct Allocation { + std::string name; + const void* ptr; + std::uint64_t size; + StackNode* frame; + Allocation(std::string&& name_in, const void* ptr_in, std::uint64_t size_in, + StackNode* frame_in) + : name(std::move(name_in)), ptr(ptr_in), size(size_in), frame(frame_in) {} + bool operator<(Allocation const& other) const { + if (size != other.size) return size > other.size; + return ptr < other.ptr; + } +}; + +struct Allocations { + std::uint64_t total_size; + std::set alloc_set; + + Allocations() : total_size(0) {} + + void allocate(std::string&& name, const void* ptr, std::uint64_t size, + StackNode* frame); + + void deallocate(std::string&& name, const void* ptr, std::uint64_t size, + StackNode* frame); + + void print(std::ostream& os, bool mpi_usable); +}; + +class State { + private: + StackNode stack_root; + StackNode* stack_frame; + Allocations current_allocations[NSPACES]; + Allocations hwm_allocations[NSPACES]; + + public: + State(); + + ~State(); + + const StackNode& getCurrentStackFrame() const { return *stack_frame; } + + const Allocations& getHighWaterMemAllocs(const Space space) const { + return hwm_allocations[space]; + } + + void begin_frame(const char* name, StackKind kind); + + void end_frame(const StackNode::time_point& end_time); + + std::uint64_t begin_kernel(const char* name, StackKind kind); + + void end_kernel(std::uint64_t kernid); + + void push_region(const char* name); + + void pop_region(); + + void allocate(Space space, const char* name, const void* ptr, + std::uint64_t size); + + void deallocate(Space space, const char* name, const void* ptr, + std::uint64_t size); + + void begin_deep_copy(Space dst_space, const char* dst_name, const void*, + Space src_space, const char* src_name, const void*, + std::uint64_t); + + void end_deep_copy(); + + static State& get() { return *global_state; } + + static void initialize(); + + static void finalize(); + + private: + static std::unique_ptr global_state; +}; + +} // namespace KokkosTools::SpaceTimeStack + +#endif // KOKKOSTOOLS_PROFILING_SPACETIMESTACK_HPP diff --git a/tests/space-time-stack/CMakeLists.txt b/tests/space-time-stack/CMakeLists.txt index e16b0448c..64e24d6f1 100644 --- a/tests/space-time-stack/CMakeLists.txt +++ b/tests/space-time-stack/CMakeLists.txt @@ -3,3 +3,9 @@ kp_add_executable_and_test( SOURCE_FILE test_demangling.cpp KOKKOS_TOOLS_LIBS kp_space_time_stack ) + +kp_add_executable_and_test( + TARGET_NAME test_space_time_stack_state + SOURCE_FILE test_State.cpp + KOKKOS_TOOLS_LIBS kp_space_time_stack +) diff --git a/tests/space-time-stack/SpaceTimeStackTestSetup.hpp b/tests/space-time-stack/SpaceTimeStackTestSetup.hpp new file mode 100644 index 000000000..79e072509 --- /dev/null +++ b/tests/space-time-stack/SpaceTimeStackTestSetup.hpp @@ -0,0 +1,23 @@ +//! Test setup for the 'space-time-stack' tool. +class SpaceTimeStackTest : public ::testing::Test { + public: + //! At the beginning of the test suite, try to add the related callbacks. + static void SetUpTestSuite() { + Kokkos::Tools::Experimental::set_callbacks( + KokkosTools::get_event_set("space-time-stack", nullptr)); + Kokkos::initialize(); + } + + static void TearDownTestSuite() { Kokkos::finalize(); } + + /** + * At test setup, finalize first and then initialize to cleanse + * @ref KokkosTools::SpaceTimeStack::State::global_state. + */ + void SetUp() override { + KokkosTools::SpaceTimeStack::State::finalize(); + KokkosTools::SpaceTimeStack::State::initialize(); + } + + void TearDown() override { KokkosTools::SpaceTimeStack::State::finalize(); } +}; diff --git a/tests/space-time-stack/test_State.cpp b/tests/space-time-stack/test_State.cpp new file mode 100644 index 000000000..a8cf2a514 --- /dev/null +++ b/tests/space-time-stack/test_State.cpp @@ -0,0 +1,108 @@ +#include +#include + +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +#include "Kokkos_Core.hpp" + +#include "kp_all.hpp" +#include "../../profiling/space-time-stack/kp_space_time_stack.hpp" + +#include "SpaceTimeStackTestSetup.hpp" + +/** + * @test Check that the @ref KokkosTools::SpaceTimeStack::State works as + * expected with nested pushed regions. + */ +TEST_F(SpaceTimeStackTest, nested_pushed_regions) { + Kokkos::Profiling::pushRegion("level-0"); + Kokkos::Profiling::pushRegion("level-1"); + Kokkos::Profiling::pushRegion("level-2"); + Kokkos::Profiling::pushRegion("level-3"); + + const auto& state = KokkosTools::SpaceTimeStack::State::get(); + + auto check = [&](const std::string_view name, + const std::string_view full_name) { + const auto& current = state.getCurrentStackFrame(); + ASSERT_EQ(current.name, name); + ASSERT_EQ(current.get_full_name(), full_name); + ASSERT_EQ(current.kind, KokkosTools::SpaceTimeStack::StackKind::REGION); + }; + + check("level-3", "level-0/level-1/level-2/level-3"); + Kokkos::Profiling::popRegion(); + + check("level-2", "level-0/level-1/level-2"); + Kokkos::Profiling::popRegion(); + + check("level-1", "level-0/level-1"); + Kokkos::Profiling::popRegion(); + + check("level-0", "level-0"); + Kokkos::Profiling::popRegion(); +} + +template +struct SetElementAsIndex { + view_t view; + + KOKKOS_FUNCTION + void operator()(const T index, Args...) const { view(index) = index; } +}; + +/** + * @test Ensure that the @ref KokkosTools::SpaceTimeStack tools work as + * expected when there are several stack types and allocations. + */ +TEST_F(SpaceTimeStackTest, several_stack_kind) { + using execution_space = Kokkos::DefaultExecutionSpace; + using view_t = Kokkos::View; + using policy_t = Kokkos::RangePolicy; + using index_t = typename policy_t::index_type; + + constexpr size_t size = 1; + + Kokkos::Profiling::pushRegion("testing"); + + view_t data("my data", size); + + Kokkos::parallel_scan( + "initialize my data values", policy_t(0, size), + SetElementAsIndex{data}); + + view_t copy("my copy of data", size); + Kokkos::deep_copy(copy, data); + + const auto& state = KokkosTools::SpaceTimeStack::State::get(); + + const auto& current = state.getCurrentStackFrame(); + + ASSERT_EQ(current.children.size(), 3); + + auto child = current.children.cbegin(); + + ASSERT_EQ(child->name, + "Kokkos::View::initialization [my copy of data] via memset"); + ASSERT_EQ(child->kind, KokkosTools::SpaceTimeStack::StackKind::FOR); + + ++child; + + ASSERT_EQ(child->name, "Kokkos::View::initialization [my data] via memset"); + ASSERT_EQ(child->kind, KokkosTools::SpaceTimeStack::StackKind::FOR); + + ++child; + + ASSERT_EQ(child->name, "initialize my data values"); + ASSERT_EQ(child->kind, KokkosTools::SpaceTimeStack::StackKind::SCAN); + + const std::string memory_space_name(execution_space::memory_space::name()); + const auto space_as_int = KokkosTools::SpaceTimeStack::get_space( + Kokkos::Tools::make_space_handle(memory_space_name.c_str())); + + const auto& hwm_allocs = state.getHighWaterMemAllocs(space_as_int); + ASSERT_EQ(hwm_allocs.alloc_set.size(), 2); + + Kokkos::Profiling::popRegion(); +} diff --git a/tests/space-time-stack/test_demangling.cpp b/tests/space-time-stack/test_demangling.cpp index 27ca95dd7..ec74e6fdb 100644 --- a/tests/space-time-stack/test_demangling.cpp +++ b/tests/space-time-stack/test_demangling.cpp @@ -6,6 +6,11 @@ #include "Kokkos_Core.hpp" +#include "kp_all.hpp" +#include "../../profiling/space-time-stack/kp_space_time_stack.hpp" + +#include "SpaceTimeStackTestSetup.hpp" + struct Tester { struct TagNamed {}; struct TagUnnamed {}; @@ -53,10 +58,7 @@ static const std::vector matchers{ * @test This test checks that the tool effectively uses * the demangling helpers. */ -TEST(SpaceTimeStackTest, demangling) { - //! Initialize @c Kokkos. - Kokkos::initialize(); - +TEST_F(SpaceTimeStackTest, demangling) { //! Redirect output for later analysis. std::cout.flush(); std::ostringstream output; @@ -65,8 +67,9 @@ TEST(SpaceTimeStackTest, demangling) { //! Run tests. @todo Replace this with Google Test. Tester tester(Kokkos::DefaultExecutionSpace{}); - //! Finalize @c Kokkos. - Kokkos::finalize(); + /// Finalizing will call @ref KokkosTools::SpaceTimeStack::State::~State + /// that outputs in @c std::cout. + KokkosTools::SpaceTimeStack::State::finalize(); //! Restore output buffer. std::cout.flush();