Skip to content

Commit

Permalink
Gbaraldi/navi3fix (#318)
Browse files Browse the repository at this point in the history
* Adding Navi3x gate fixes

* Making reproducible runtime more reproducible

* source formatting (clang-format v11) (#319)

Co-authored-by: ApoKalipse-V <ApoKalipse-V@users.noreply.github.com>

* Update lib/common/utility.hpp

- revert

* Update lib/common/environment.{hpp,cpp}

- support get_env for various integral types

* Update lib/rocprofiler-sdk/hsa/queue.cpp

- query ROCPROFILER_GATE_CAPACITY in ctor of active_capacity_gate

* Update tests/apps/reproducible-runtime

- fix help message
- misc float vs. double changes
- update output messages

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: ApoKalipse-V <ApoKalipse-V@users.noreply.github.com>
Co-authored-by: Jonathan R. Madsen <jonathanrmadsen@gmail.com>
  • Loading branch information
4 people authored Jan 2, 2024
1 parent 3eac7ce commit 3d539c1
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 64 deletions.
67 changes: 46 additions & 21 deletions source/lib/common/environment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@
// THE SOFTWARE.

#include "lib/common/environment.hpp"
#include "lib/common/demangle.hpp"

#include <cctype>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
Expand Down Expand Up @@ -50,27 +52,6 @@ get_env(std::string_view env_id, const char* _default)
return get_env(env_id, std::string_view{_default});
}

int
get_env(std::string_view env_id, int _default)
{
if(env_id.empty()) return _default;
char* env_var = ::std::getenv(env_id.data());
if(env_var)
{
try
{
return std::stoi(env_var);
} catch(std::exception& _e)
{
LOG(WARNING) << "[rocprofiler][get_env] Exception thrown converting getenv(\"" << env_id
<< "\") = " << env_var << " to integer :: " << _e.what()
<< ". Using default value of " << _default << "\n";
}
return _default;
}
return _default;
}

bool
get_env(std::string_view env_id, bool _default)
{
Expand Down Expand Up @@ -99,6 +80,50 @@ get_env(std::string_view env_id, bool _default)
}
return _default;
}

template <typename Tp>
Tp
get_env(std::string_view env_id, Tp _default, std::enable_if_t<std::is_integral<Tp>::value, sfinae>)
{
static_assert(!std::is_same<Tp, bool>::value, "unexpected! should be using bool overload");
static_assert(
sizeof(Tp) <= sizeof(uint64_t),
"change use of stol/stoul if instantiating for type larger than a 64-bit integer");

if(env_id.empty()) return _default;
char* env_var = ::std::getenv(env_id.data());
if(env_var)
{
try
{
// use stol/stoul
if constexpr(std::is_signed<Tp>::value)
return static_cast<Tp>(std::stol(env_var));
else
return static_cast<Tp>(std::stoul(env_var));
} catch(std::exception& _e)
{
LOG(ERROR) << "[rocprofiler][get_env] Exception thrown converting getenv(\"" << env_id
<< "\") = " << env_var << " to " << cxx_demangle(typeid(Tp).name())
<< " :: " << _e.what() << ". Using default value of " << _default << "\n";
}
return _default;
}
return _default;
}

#define SPECIALIZE_GET_ENV(TYPE) \
template TYPE get_env<TYPE>( \
std::string_view, TYPE, std::enable_if_t<std::is_integral<TYPE>::value, sfinae>);

SPECIALIZE_GET_ENV(int8_t)
SPECIALIZE_GET_ENV(int16_t)
SPECIALIZE_GET_ENV(int32_t)
SPECIALIZE_GET_ENV(int64_t)
SPECIALIZE_GET_ENV(uint8_t)
SPECIALIZE_GET_ENV(uint16_t)
SPECIALIZE_GET_ENV(uint32_t)
SPECIALIZE_GET_ENV(uint64_t)
} // namespace impl
} // namespace common
} // namespace rocprofiler
9 changes: 6 additions & 3 deletions source/lib/common/environment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,19 @@ namespace common
{
namespace impl
{
struct sfinae
{};

std::string get_env(std::string_view, std::string_view);

std::string
get_env(std::string_view, const char*);

int
get_env(std::string_view, int);

bool
get_env(std::string_view, bool);

template <typename Tp>
Tp get_env(std::string_view, Tp, std::enable_if_t<std::is_integral<Tp>::value, sfinae> = {});
} // namespace impl

template <typename Tp>
Expand Down
3 changes: 2 additions & 1 deletion source/lib/rocprofiler-sdk/hsa/queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ signal_limiter()
{
// Limit the maximun number of HSA signals created.
// There is a hard limit to the maximum that can exist.
static common::active_capacity_gate _gate(4);
static auto _gate =
common::active_capacity_gate{common::get_env<size_t>("ROCPROFILER_GATE_CAPACITY", 4)};
return _gate;
}

Expand Down
85 changes: 46 additions & 39 deletions tests/apps/reproducible-runtime/reproducible-runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <random>
Expand All @@ -41,30 +42,29 @@
{ \
auto _hip_api_print_lk = auto_lock_t{print_lock}; \
fprintf(stderr, \
"%s:%d :: HIP error : %s\n", \
"%s:%d :: HIP error %i : %s\n", \
__FILE__, \
__LINE__, \
static_cast<int>(error_), \
hipGetErrorString(error_)); \
throw std::runtime_error("hip_api_call"); \
} \
}

namespace
{
using auto_lock_t = std::unique_lock<std::mutex>;
auto print_lock = std::mutex{};
double nruntime = 1.0;
size_t nspin = 500000;
size_t nthreads = 2;
size_t nitr = 2;
size_t nsync = 1;
using auto_lock_t = std::unique_lock<std::mutex>;
auto print_lock = std::mutex{};
double nruntime = 500.0; // ms
uint32_t nspin = 1000000;
size_t nthreads = 2;

void
check_hip_error(void);
} // namespace

__global__ void
reproducible_runtime(int64_t nspin);
reproducible_runtime(uint32_t nspin);

void
run(int rank, int tid, hipStream_t stream);
Expand All @@ -79,26 +79,22 @@ main(int argc, char** argv)
if(_arg == "?" || _arg == "-h" || _arg == "--help")
{
fprintf(stderr,
"usage: reproducible-runtime [KERNEL SPIN CYCLES (%zu)] [NUM_THREADS (%zu)] "
"[NUM_ITERATION (%zu)] [SYNC_EVERY_N_ITERATIONS (%zu)]\n",
"usage: reproducible-runtime [KERNEL RUNTIME PER THREAD (default: %f msec)] "
"[SPIN CYCLES PER KERNEL LAUNCH (default: %u)] [NUM_THREADS (default: %zu)]\n",
nruntime,
nspin,
nthreads,
nitr,
nsync);
nthreads);
exit(EXIT_SUCCESS);
}
}

if(argc > 1) nruntime = std::stod(argv[1]);
if(argc > 2) nspin = std::stoll(argv[2]);
if(argc > 3) nthreads = std::stoll(argv[3]);
if(argc > 4) nitr = std::stoll(argv[4]);
if(argc > 5) nsync = std::stoll(argv[5]);

printf("[reproducible-runtime] Kernel spin time: %zu cycles\n", nspin);
printf("[reproducible-runtime] Kernel runtime per thread: %.3f msec\n", nruntime);
printf("[reproducible-runtime] Spin time per kernel: %u cycles\n", nspin);
printf("[reproducible-runtime] Number of threads: %zu\n", nthreads);
printf("[reproducible-runtime] Number of iterations: %zu\n", nitr);
printf("[reproducible-runtime] Syncing every %zu iterations\n", nsync);

// this is a temporary workaround in omnitrace when HIP + MPI is enabled
int ndevice = 0;
Expand Down Expand Up @@ -132,38 +128,49 @@ main(int argc, char** argv)
}

__global__ void
reproducible_runtime(int64_t nspin_v)
reproducible_runtime(uint32_t nspin_v)
{
for(int i = 0; i < nspin_v / 64; i++)
asm volatile("s_sleep 1"); // ~64 cycles
for(uint32_t i = 0; i < nspin_v / 2048; i++)
asm volatile("s_sleep 32"); // ~2048 cycles -> ~1us
uint32_t remainder = nspin_v % 2048;
for(uint32_t i = 0; i < remainder / 64; i++)
asm volatile("s_sleep 1");
}

void
run(int rank, int tid, hipStream_t stream)
{
dim3 grid(4096);
dim3 block(64);
double time = 0.0;
auto t1 = std::chrono::high_resolution_clock::now();
constexpr int min_sa = 8;
constexpr int min_avail_simd = 24;
dim3 grid(min_sa * min_avail_simd);
dim3 block(32);
float time = 0.0f;

hipEvent_t start, stop;
HIP_API_CALL(hipEventCreate(&start));
HIP_API_CALL(hipEventCreate(&stop));
HIP_API_CALL(hipEventRecord(start, stream));

do
{
for(size_t i = 0; i < nitr; ++i)
{
reproducible_runtime<<<grid, block, 0, stream>>>(nspin);
check_hip_error();
if(i % nsync == (nsync - 1)) HIP_API_CALL(hipStreamSynchronize(stream));
}
auto t2 = std::chrono::high_resolution_clock::now();
HIP_API_CALL(hipStreamSynchronize(stream));
time = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count();
} while(time < nruntime);
uint32_t cyclesleft = 2000 * 1000 * (nruntime - static_cast<double>(time));
reproducible_runtime<<<grid, block, 0, stream>>>(std::min<uint32_t>(nspin, cyclesleft));
check_hip_error();
HIP_API_CALL(hipEventRecord(stop, stream));
HIP_API_CALL(hipEventSynchronize(stop));
HIP_API_CALL(hipEventElapsedTime(&time, start, stop));
} while(static_cast<double>(time) < nruntime);

HIP_API_CALL(hipEventDestroy(start));
HIP_API_CALL(hipEventDestroy(stop));

{
auto _msg = std::stringstream{};
_msg << '[' << rank << "][" << tid << "] Runtime of reproducible-runtime is "
<< std::setprecision(2) << std::fixed << time << " ms (" << std::setprecision(3)
<< (time / 1000.0f) << " sec)\n";
auto_lock_t _lk{print_lock};
std::cout << "[" << rank << "][" << tid << "] Runtime of reproducible-runtime is " << time
<< " sec\n"
<< std::flush;
std::cout << _msg.str() << std::flush;
}

HIP_API_CALL(hipStreamSynchronize(stream));
Expand Down

0 comments on commit 3d539c1

Please sign in to comment.