Description
We've recently released Score-P v9.0, having support for the OpenMP target via the OpenMP Tools Interface for the first time.
However, a user quickly noticed an issue, which boils down to a bug in LLVM.
When the OpenMP runtime initializes a connected tool, the following callback will be dispatched:
typedef int (*ompt_initialize_t) (ompt_function_lookup_t lookup, int initial_device_num, ompt_data_t *tool_data);
initial_device_num
is defined as "the value that a call to omp_get_initial_device would return". This itself is defined as being "the value of the device number is the value of omp_initial_device or the value returned by the omp_get_num_devices routine".
Looking at the actual returned values however, this is not the case. Instead, LLVM always reports initial_device_num = 0
. Since LLVM also does not implement ompt_get_num_devices
correctly
a tool cannot safely determine if a device identifier in some target callback is actually the host.
LLVM typically uses -1
for this, but tools should not have to rely on guessing this, as it might or might not change in later versions. This also makes supporting different runtimes more complicated.
To reproduce the issue:
#include <omp.h>
#include <omp-tools.h>
#include <stdlib.h>
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdatomic.h>
/* MAIN */
int main( int argc, char** argv )
{
#pragma omp target
{
printf( "Hello from target region\n" );
}
}
/* OMPT INTERFACE */
#if ( defined( __ppc64__ ) || defined( __powerpc64__ ) || defined( __PPC64__ ) )
#define OMPT_TOOL_CPU_RELAX ( ( void )0 )
#elif ( defined( __x86_64 ) || defined( __x86_64__ ) || defined( __amd64 ) || defined( _M_X64 ) )
#define OMPT_TOOL_CPU_RELAX __asm__ volatile ( "pause" )
#elif ( defined( __aarch64__ ) || defined( __ARM64__ ) || defined( _M_ARM64 ) )
#define OMPT_TOOL_CPU_RELAX __asm__ volatile ( "yield" )
#else
#define OMPT_TOOL_CPU_RELAX ( ( void )0 )
#endif
/* Test-and-test-and-set lock. Mutexes are of type bool */
#define OMPT_TOOL_LOCK( MUTEX ) \
while ( true ) \
{ \
if ( atomic_flag_test_and_set_explicit( &( MUTEX ), memory_order_acquire ) != true ) \
{ \
break; \
} \
OMPT_TOOL_CPU_RELAX; \
}
#define OMPT_TOOL_UNLOCK( MUTEX ) atomic_flag_clear_explicit( &( MUTEX ), memory_order_release );
#define OMPT_TOOL_GUARDED_PRINTF( ... ) \
OMPT_TOOL_LOCK( ompt_tool_printf_mutex ) \
printf( __VA_ARGS__ ); \
OMPT_TOOL_UNLOCK( ompt_tool_printf_mutex )
_Thread_local int32_t ompt_tool_tid = -1; /* thread counter. >= 1 after thread_begin */
atomic_flag ompt_tool_printf_mutex = ATOMIC_FLAG_INIT;
static int
my_initialize_tool( ompt_function_lookup_t lookup,
int initial_device_num,
ompt_data_t* tool_data )
{
OMPT_TOOL_GUARDED_PRINTF( "[%s] tid = %" PRId32 " | initial_device_num %d\n",
__FUNCTION__,
ompt_tool_tid,
initial_device_num );
return 1; /* non-zero indicates success */
}
static void
my_finalize_tool( ompt_data_t* tool_data )
{
OMPT_TOOL_GUARDED_PRINTF( "[%s] tid = %" PRId32 "\n",
__FUNCTION__,
ompt_tool_tid );
}
ompt_start_tool_result_t*
ompt_start_tool( unsigned int omp_version,
const char* runtime_version )
{
setbuf( stdout, NULL );
OMPT_TOOL_GUARDED_PRINTF( "[%s] tid = %" PRId32 " | omp_version %d | runtime_version = \'%s\'\n",
__FUNCTION__,
ompt_tool_tid,
omp_version,
runtime_version );
static ompt_start_tool_result_t tool = { &my_initialize_tool,
&my_finalize_tool,
ompt_data_none };
return &tool;
}
Building the reproducer on a system with at least one GPU (maybe -fopenmp-targets=x86_64
works too), one will see the following result:
$ clang -fopenmp --offload-arch=sm_75 test.c
$ ./a.out
[ompt_start_tool] tid = -1 | omp_version 201611 | runtime_version = 'LLVM OMP version: 5.0.20140926'
[my_initialize_tool] tid = -1 | initial_device_num 0
Hello from target region
[my_finalize_tool] tid = -1
As one can see initial_device_num
is 0
. In my case, it should be either -1
or 1
.