Description
When working with OpenMP teams, users can apply the directive #pragma omp distribute parallel for to distribute work not only across threads but across teams as well. On the OMPT side, we would expect to see two pairs of ompt_callback_work events, each consisting of an ompt_scope_begin and a matching ompt_scope_end. The first pair should use work_type = ompt_work_distribute and is dispatched by the initial thread of every team. For the parallel for, each thread should dispatch a pair with work_type = ompt_work_loop. However, while we get the distribute work type correctly for the teams, the threads do not use ompt_work_loop for ompt_scope_end and instead dispatch that event with ompt_work_distribute.
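To make the expectation concrete, here is the event sequence we would expect per team, written as calls into the work callback from the reproducer below (a sketch of the ordering, not literal runtime output; the elided arguments are the parallel/task data, trip count, and return address):

// Expected OMPT events per team for `distribute parallel for` (sketch):
work_cb(ompt_work_distribute, ompt_scope_begin, ...); // initial thread of the team
work_cb(ompt_work_loop,       ompt_scope_begin, ...); // every thread of the team
work_cb(ompt_work_loop,       ompt_scope_end,   ...); // every thread of the team
work_cb(ompt_work_distribute, ompt_scope_end,   ...); // initial thread of the team

// Observed instead: the loop's ompt_scope_end event arrives with
// work_type == ompt_work_distribute rather than ompt_work_loop.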
Reproducer
The issue can be reproduced with the following code:
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#include <omp.h>
#include <omp-tools.h>

__thread bool distribute_begin_called;
__thread bool distribute_end_called;
__thread bool loop_begin_called;
__thread bool loop_end_called;

void work_cb(ompt_work_t work_type,
             ompt_scope_endpoint_t endpoint,
             ompt_data_t *parallel_data,
             ompt_data_t *task_data,
             uint64_t count,
             const void *codeptr_ra)
{
    if( work_type == ompt_work_distribute )
    {
        if( endpoint == ompt_scope_begin )
        {
            distribute_begin_called = true;
        } else {
            distribute_end_called = true;
        }
    }
    if( work_type == ompt_work_loop )
    {
        if( endpoint == ompt_scope_begin )
        {
            loop_begin_called = true;
        } else {
            loop_end_called = true;
        }
    }
}
static int
my_initialize_tool(ompt_function_lookup_t lookup,
                   int initial_device_num,
                   ompt_data_t *tool_data)
{
    ompt_set_callback_t set_callback =
        (ompt_set_callback_t)lookup("ompt_set_callback");

#define OMPT_CALLBACK(NAME, SIGNATURE, EXPECTED)                                                    \
    {                                                                                               \
        ompt_set_result_t result = set_callback(ompt_callback_##NAME, (ompt_callback_t)&NAME##_cb); \
        assert(result == EXPECTED);                                                                 \
    }

    OMPT_CALLBACK(work, work, ompt_set_always);
#undef OMPT_CALLBACK

    return 1; /* non-zero indicates success */
}
static void
my_finalize_tool(ompt_data_t *tool_data)
{
}

ompt_start_tool_result_t *
ompt_start_tool(unsigned int omp_version,
                const char *runtime_version)
{
    setbuf(stdout, NULL);
    printf("[%s] omp_version %d | runtime_version = '%s'\n",
           __FUNCTION__,
           omp_version,
           runtime_version);

    static ompt_start_tool_result_t tool = {&my_initialize_tool,
                                            &my_finalize_tool,
                                            ompt_data_none};
    return &tool;
}

void
report_summary_distribute()
{
    #pragma omp parallel
    {
        printf("Thread ID = %2d | distribute begin = %5s | distribute end = %5s | %s\n",
               omp_get_thread_num(),
               distribute_begin_called ? "true" : "false",
               distribute_end_called ? "true" : "false",
               distribute_begin_called == distribute_end_called ? "\033[0;32m pass \033[0m" : "\033[0;31m fail \033[0m");
    }
}

void
report_summary_parallel()
{
    #pragma omp parallel
    {
        printf("Thread ID = %2d | loop begin = %5s | loop end = %5s | %s\n",
               omp_get_thread_num(),
               loop_begin_called ? "true" : "false",
               loop_end_called ? "true" : "false",
               loop_begin_called == loop_end_called ? "\033[0;32m pass \033[0m" : "\033[0;31m fail \033[0m");
    }
}
int main( void )
{
    int a[100];

    #pragma omp target teams distribute
    for(int i = 0; i < 100; ++i)
    {
        a[i] = i;
    }

    report_summary_distribute();
    report_summary_parallel();
    printf("-------------------------------\n");

    // Reset
    #pragma omp parallel
    {
        distribute_begin_called = false;
        distribute_end_called = false;
        loop_begin_called = false;
        loop_end_called = false;
    }

    #pragma omp target teams distribute parallel for
    for(int i = 0; i < 100; ++i)
    {
        a[i] = i;
    }

    #pragma omp teams num_teams(2)
    #pragma omp distribute parallel for
    for(int i = 0; i < 100; ++i)
    {
        a[i] = i;
    }

    report_summary_distribute();
    report_summary_parallel();

    return 0;
}
Running the code, we can see the following output:
$ clang --version
clang version 18.0.0 (https://github.com/llvm/llvm-project.git 4b383107fa7585bb5ecd7f03cab7800b33d1585a)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /opt/software/software/LLVM/git/bin
$ clang -fopenmp -fopenmp-targets=x86_64 reproducer.c
$ OMP_NUM_THREADS=4 ./a.out
[ompt_start_tool] omp_version 201611 | runtime_version = 'LLVM OMP version: 5.0.20140926'
Thread ID = 0 | distribute begin = true | distribute end = true | pass
Thread ID = 3 | distribute begin = false | distribute end = false | pass
Thread ID = 2 | distribute begin = false | distribute end = false | pass
Thread ID = 1 | distribute begin = false | distribute end = false | pass
Thread ID = 0 | loop begin = false | loop end = false | pass
Thread ID = 1 | loop begin = false | loop end = false | pass
Thread ID = 3 | loop begin = false | loop end = false | pass
Thread ID = 2 | loop begin = false | loop end = false | pass
-------------------------------
Thread ID = 0 | distribute begin = true | distribute end = true | pass
Thread ID = 3 | distribute begin = false | distribute end = true | fail
Thread ID = 2 | distribute begin = false | distribute end = true | fail
Thread ID = 1 | distribute begin = true | distribute end = true | pass
Thread ID = 0 | loop begin = true | loop end = false | fail
Thread ID = 3 | loop begin = true | loop end = false | fail
Thread ID = 2 | loop begin = true | loop end = false | fail
Thread ID = 1 | loop begin = true | loop end = false | fail
In the second part of the output, we would expect distribute end = false for all threads except Thread ID = 0/1: with num_teams(2) and four threads, threads 0 and 1 presumably serve as the initial threads of the two teams, so only they should dispatch the distribute events. In addition, all threads should report loop end = true. The first part of the output is included to show that the issue only affects combined directives that also use parallel for: a plain distribute does not trigger it. Nesting #pragma omp parallel for inside a distribute directive also works correctly, although that requires two separate loops, as sketched below.