Skip to content

Commit

Permalink
Add support for device counter collection ioctl (#46) (#1222)
Browse files Browse the repository at this point in the history
Add support for device counter collection ioctl

Adds support for the device counter collection IOCTL. This IOCTL
allows for device wide counters to be collected even if the queue
is not intercepted by rocprofiler-sdk (required for system profilers).

A test is also included which checks this behavior by creating a queue
that does not have profiling enabled on it and checks to see if SQ
counters can be read from it. Note: this test will be skipped if the KFD
version does not contain this IOCTL.

Right now the check is "soft" in that if the IOCTL is present and there
is an error with permissions, rocprofiler will continue but will print
an error stating that system wide device profiling and collected counter
values may be degraded. This is primarily to avoid breaking existing
users (like PAPI) who may not need the IOCTL's capability and to give
them time to update.

Co-authored-by: Benjamin Welton <ben@amd.com>
  • Loading branch information
bwelton and Benjamin Welton authored Jan 14, 2025
1 parent 55c25ec commit f5d3fd3
Show file tree
Hide file tree
Showing 12 changed files with 318 additions and 60 deletions.
4 changes: 2 additions & 2 deletions source/include/rocprofiler-sdk/device_counting_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,9 @@ rocprofiler_configure_device_counting_service(rocprofiler_context_id_t context_i
* @param [in] context_id context id
* @param [in] user_data User supplied data, included in records outputted to buffer.
* @param [in] flags Flags to specify how the counter data should be collected (defaults to sync).
* @param [in/out] output_records Output records collected via sampling (output is also written to
* @param [out] output_records Output records collected via sampling (output is also written to
* buffer). Must be allocated by caller.
* @param [in/out] rec_count On entry, this is the maximum number of records rocprof can store in
* @param [in,out] rec_count On entry, this is the maximum number of records rocprof can store in
* output_records. On exit, contains the number of actual records.
* @return ::rocprofiler_status_t
* @retval ::ROCPROFILER_STATUS_ERROR_CONTEXT_INVALID Returned if the context does not exist or
Expand Down
3 changes: 2 additions & 1 deletion source/include/rocprofiler-sdk/fwd.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -108,6 +108,7 @@ typedef enum // NOLINT(performance-enum-size)
///< status code for more information.
ROCPROFILER_STATUS_ERROR_EXCEEDS_HW_LIMIT, ///< Exceeds hardware limits for collection.
ROCPROFILER_STATUS_ERROR_AGENT_ARCH_NOT_SUPPORTED, ///< Agent HW architecture not supported.
ROCPROFILER_STATUS_ERROR_PERMISSION_DENIED, ///< Permission denied.
ROCPROFILER_STATUS_LAST,
} rocprofiler_status_t;

Expand Down
23 changes: 18 additions & 5 deletions source/lib/rocprofiler-sdk/counters/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,25 @@
set(ROCPROFILER_LIB_COUNTERS_SOURCES
metrics.cpp dimensions.cpp evaluate_ast.cpp core.cpp id_decode.cpp
dispatch_handlers.cpp controller.cpp device_counting.cpp)
metrics.cpp
dimensions.cpp
evaluate_ast.cpp
core.cpp
id_decode.cpp
dispatch_handlers.cpp
controller.cpp
device_counting.cpp
ioctl.cpp)
set(ROCPROFILER_LIB_COUNTERS_HEADERS
metrics.hpp dimensions.hpp evaluate_ast.hpp core.hpp id_decode.hpp
dispatch_handlers.hpp controller.hpp device_counting.hpp)
metrics.hpp
dimensions.hpp
evaluate_ast.hpp
core.hpp
id_decode.hpp
dispatch_handlers.hpp
controller.hpp
device_counting.hpp
ioctl.hpp)
target_sources(rocprofiler-sdk-object-library PRIVATE ${ROCPROFILER_LIB_COUNTERS_SOURCES}
${ROCPROFILER_LIB_COUNTERS_HEADERS})

add_subdirectory(xml)
add_subdirectory(parser)
add_subdirectory(yaml)
Expand Down
15 changes: 14 additions & 1 deletion source/lib/rocprofiler-sdk/counters/controller.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -29,6 +29,7 @@

#include "lib/rocprofiler-sdk/buffer.hpp"
#include "lib/rocprofiler-sdk/context/context.hpp"
#include "lib/rocprofiler-sdk/counters/ioctl.hpp"

namespace rocprofiler
{
Expand Down Expand Up @@ -97,6 +98,18 @@ CounterController::configure_agent_collection(rocprofiler_context_id_t context_i
return ROCPROFILER_STATUS_ERROR_INVALID_ARGUMENT;
}

if(counters::counter_collection_has_device_lock())
{
/**
* Note: This should return if the lock fails to acquire in the future. However, this
* is a change in the required permissions for rocprofiler and needs to be communicated
* with partners before strict enforcement. If the required permissions are not obtained,
* those profilers will function as they currently do (without any of the benefits of the
* IOCTL).
*/
counters::counter_collection_device_lock(rocprofiler::agent::get_agent(agent_id), true);
}

ctx.device_counter_collection->agent_data.emplace_back();
ctx.device_counter_collection->agent_data.back().callback_data =
rocprofiler_user_data_t{.ptr = user_data};
Expand Down
3 changes: 2 additions & 1 deletion source/lib/rocprofiler-sdk/counters/controller.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@

// MIT License
//
// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
Expand All @@ -26,6 +26,7 @@
#include "lib/common/synchronized.hpp"
#include "lib/rocprofiler-sdk/aql/packet_construct.hpp"
#include "lib/rocprofiler-sdk/counters/evaluate_ast.hpp"
#include "lib/rocprofiler-sdk/counters/ioctl.hpp"
#include "lib/rocprofiler-sdk/counters/metrics.hpp"

#include <rocprofiler-sdk/agent.h>
Expand Down
120 changes: 120 additions & 0 deletions source/lib/rocprofiler-sdk/counters/ioctl.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#include "lib/rocprofiler-sdk/counters/ioctl.hpp"
#include "lib/rocprofiler-sdk/details/kfd_ioctl.h"
#include "lib/rocprofiler-sdk/pc_sampling/ioctl/ioctl_adapter.hpp"

#include <sys/ioctl.h>
#include <cerrno>

namespace rocprofiler
{
namespace counters
{
bool
counter_collection_has_device_lock()
{
kfd_ioctl_profiler_args args = {};
args.op = KFD_IOC_PROFILER_VERSION;
int ret = ioctl(pc_sampling::ioctl::get_kfd_fd(), AMDKFD_IOC_PROFILER, &args);
if(ret == 0)
{
return true;
}
return false;
}

/**
 * @brief Request the KFD profiler PMC lock for a device.
 *
 * Sends the KFD_IOC_PROFILER_PMC operation through AMDKFD_IOC_PROFILER with
 * `lock = 1` for the GPU identified by `agent->gpu_id`.
 *
 * @param [in] agent      Agent to lock; validated with CHECK.
 * @param [in] all_queues When true, `perfcount_enable` is set to 1 in the request,
 *                        otherwise 0.
 * @return ::rocprofiler_status_t
 * @retval ::ROCPROFILER_STATUS_SUCCESS The ioctl call returned 0.
 * @retval ::ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES The request failed with EBUSY.
 * @retval ::ROCPROFILER_STATUS_ERROR_PERMISSION_DENIED The request failed with EPERM.
 * @retval ::ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI The request failed with EINVAL.
 * @retval ::ROCPROFILER_STATUS_ERROR Any other failure.
 */
rocprofiler_status_t
counter_collection_device_lock(const rocprofiler_agent_t* agent, bool all_queues)
{
    CHECK(agent);
    kfd_ioctl_profiler_args args = {};
    args.op                      = KFD_IOC_PROFILER_PMC;
    args.pmc.gpu_id              = agent->gpu_id;
    args.pmc.lock                = 1;
    args.pmc.perfcount_enable    = all_queues ? 1 : 0;

    const int ret = ioctl(pc_sampling::ioctl::get_kfd_fd(), AMDKFD_IOC_PROFILER, &args);
    if(ret == 0) return ROCPROFILER_STATUS_SUCCESS;

    // ioctl(2) signals failure by returning -1 and storing the reason in errno; it does
    // not return a negated errno. Switching on the return value would make every failure
    // look like -EPERM (EPERM == 1). Capture errno right away, before logging can
    // overwrite it. A return value other than -1 is treated as a negated error code.
    const int err = (ret == -1) ? errno : -ret;

    switch(err)
    {
        case EBUSY:
            ROCP_WARNING << fmt::format(
                "Device {} has a profiler attached to it. PMC Counters may be inaccurate.",
                agent->id.handle);
            return ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES;
        case EPERM:
            ROCP_WARNING << fmt::format(
                "Device {} could not be locked for profiling due to lack of permissions "
                "(capability SYS_PERFMON). PMC Counters may be inaccurate and System Counter "
                "Collection will be degraded.",
                agent->id.handle);
            return ROCPROFILER_STATUS_ERROR_PERMISSION_DENIED;
        case EINVAL:
            ROCP_WARNING << fmt::format(
                "Driver/Kernel version does not support locking device {}. PMC Counters may be "
                "inaccurate and System Counter Collection will be degraded.",
                agent->id.handle);
            return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI;
        default:
            ROCP_WARNING << fmt::format(
                "Failed to lock device {}. PMC Counters may be inaccurate and System Counter "
                "Collection will be degraded.",
                agent->id.handle);
            return ROCPROFILER_STATUS_ERROR;
    }
}

// Not required now but may be useful in the future.
// rocprofiler_status_t
// counter_collection_device_unlock(const rocprofiler_agent_t* agent) {
// CHECK(agent);
// kfd_ioctl_profiler_args args = {};
// args.op = KFD_IOC_PROFILER_PMC;
// args.pmc.gpu_id = agent->gpu_id;
// args.pmc.lock = 0;
// args.pmc.perfcount_enable = 0;

// int ret = ioctl(pc_sampling::ioctl::get_kfd_fd(), AMDKFD_IOC_PROFILER, &args);
// if (ret != 0) {
// switch (ret) {
// case -EBUSY:
// case -EPERM:
// ROCP_WARNING << fmt::format("Could not unlock the device {}", agent->id.handle);
// return ROCPROFILER_STATUS_ERROR;
// case -EINVAL:
// return ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI;
// default:
// ROCP_WARNING << fmt::format("Could not unlock the device {}", agent->id.handle);
// return ROCPROFILER_STATUS_ERROR;
// }
// }

// return ROCPROFILER_STATUS_SUCCESS;
// }
} // namespace counters
} // namespace rocprofiler
37 changes: 37 additions & 0 deletions source/lib/rocprofiler-sdk/counters/ioctl.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once

#include <rocprofiler-sdk/rocprofiler.h>

namespace rocprofiler
{
namespace counters
{
/**
 * @brief Check whether the KFD profiler IOCTL is available.
 *
 * The implementation issues a KFD_IOC_PROFILER_VERSION request and reports whether the
 * ioctl call succeeded.
 *
 * @return true if the request returned 0, false otherwise.
 */
bool
counter_collection_has_device_lock();

/**
 * @brief Request the KFD profiler PMC lock for the device behind @p agent.
 *
 * @param [in] agent      Agent whose `gpu_id` is placed in the lock request. The
 *                        implementation validates it with CHECK.
 * @param [in] all_queues Forwarded to the request's `perfcount_enable` field (1 when true,
 *                        0 when false).
 * @return ::ROCPROFILER_STATUS_SUCCESS when the ioctl call returns 0. On failure a warning
 *         is logged and an error status is returned (one of
 *         ::ROCPROFILER_STATUS_ERROR_OUT_OF_RESOURCES,
 *         ::ROCPROFILER_STATUS_ERROR_PERMISSION_DENIED,
 *         ::ROCPROFILER_STATUS_ERROR_INCOMPATIBLE_ABI or ::ROCPROFILER_STATUS_ERROR).
 */
rocprofiler_status_t
counter_collection_device_lock(const rocprofiler_agent_t* agent, bool all_queues);

} // namespace counters
} // namespace rocprofiler
Loading

0 comments on commit f5d3fd3

Please sign in to comment.