Skip to content

[OMPT] Incorrect work_type for loop in teams distribute parallel for directive #65103

Closed
@Thyre

Description

@Thyre

Description

When working with OpenMP teams, users are able to use the directive #pragma omp distribute parallel for to distribute work not only across threads but teams as well. On the OMPT side, we would expect to see two ompt_callback_work with ompt_scope_begin and ompt_scope_end each.

The first callback should use work_type = ompt_work_distribute and is dispatched by the initial thread of every team. For the parallel for, each thread should dispatch work_type = ompt_work_loop.

However, while we get the distribute work type correctly for the teams, threads do not use ompt_work_loop for ompt_scope_end and instead dispatch the event with ompt_work_distribute.

Reproducer

The issue can be reproduced with the following code:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <omp.h>
#include <omp-tools.h>

__thread bool distribute_begin_called;
__thread bool distribute_end_called;
__thread bool loop_begin_called;
__thread bool loop_end_called;

/*
 * OMPT work callback: records, per thread, whether a distribute or a
 * loop work event was observed for each scope endpoint. The per-thread
 * flags are later compared by the report_summary_* helpers.
 */
void work_cb(ompt_work_t work_type,
             ompt_scope_endpoint_t endpoint,
             ompt_data_t *parallel_data,
             ompt_data_t *task_data,
             uint64_t count,
             const void *codeptr_ra)
{
    const bool is_begin = (endpoint == ompt_scope_begin);

    switch (work_type)
    {
    case ompt_work_distribute:
        if (is_begin)
            distribute_begin_called = true;
        else
            distribute_end_called = true;
        break;
    case ompt_work_loop:
        if (is_begin)
            loop_begin_called = true;
        else
            loop_end_called = true;
        break;
    default:
        /* Other work types are not tracked by this reproducer. */
        break;
    }
}

/*
 * OMPT tool initializer: looks up ompt_set_callback via the runtime's
 * lookup function and registers the work callback.
 *
 * Returns non-zero to indicate successful initialization, as required
 * by the OMPT specification.
 */
static int
my_initialize_tool(ompt_function_lookup_t lookup,
                   int initial_device_num,
                   ompt_data_t *tool_data)
{
    ompt_set_callback_t set_callback =
        (ompt_set_callback_t)lookup("ompt_set_callback");

/* Register <NAME>_cb for ompt_callback_<NAME> and check that the runtime
 * reports the EXPECTED registration result.
 * Fix: the original macro ignored its EXPECTED argument and hard-coded
 * ompt_set_always in the assert; use the argument instead. */
#define OMPT_CALLBACK(NAME, SIGNATURE, EXPECTED)                                                    \
    {                                                                                               \
        ompt_set_result_t result = set_callback(ompt_callback_##NAME, (ompt_callback_t)&NAME##_cb); \
        assert(result == (EXPECTED));                                                               \
    }

    OMPT_CALLBACK(work, work, ompt_set_always);

#undef OMPT_CALLBACK
    return 1; /* non-zero indicates success */
}

/* OMPT tool finalizer: no per-tool state to release, so this is
 * intentionally empty. Present because ompt_start_tool_result_t
 * requires a finalize entry point. */
static void
my_finalize_tool(ompt_data_t *tool_data)
{
    
}

/*
 * OMPT entry point: the OpenMP runtime calls this at startup. Returning
 * a non-NULL result activates the tool with the given initialize and
 * finalize callbacks.
 *
 * stdout is made unbuffered so output from multiple threads is flushed
 * immediately and not lost if the program aborts.
 */
ompt_start_tool_result_t *
ompt_start_tool(unsigned int omp_version,
                const char *runtime_version)
{
    setbuf(stdout, NULL);
    /* Fix: omp_version is unsigned int, so the matching conversion
     * specifier is %u; %d with an unsigned argument is undefined. */
    printf("[%s] omp_version %u | runtime_version = \'%s\'\n",
           __FUNCTION__,
           omp_version,
           runtime_version);
    static ompt_start_tool_result_t tool = {&my_initialize_tool,
                                            &my_finalize_tool,
                                            ompt_data_none};
    return &tool;
}


/* Print, from every thread in a parallel region, whether that thread's
 * distribute begin/end flags agree (pass) or differ (fail). */
void 
report_summary_distribute()
{
    #pragma omp parallel 
    {
        const char *begin_str = distribute_begin_called ? "true" : "false";
        const char *end_str = distribute_end_called ? "true" : "false";
        const char *verdict = (distribute_begin_called == distribute_end_called)
                                  ? "\033[0;32m pass \033[0m"
                                  : "\033[0;31m fail \033[0m";
        printf("Thread ID = %2d | distribute begin = %5s | distribute end = %5s | %s\n", 
               omp_get_thread_num(), begin_str, end_str, verdict);
    }
}


/* Print, from every thread in a parallel region, whether that thread's
 * loop begin/end flags agree (pass) or differ (fail). */
void 
report_summary_parallel()
{
    #pragma omp parallel
    {
        const char *begin_str = loop_begin_called ? "true" : "false";
        const char *end_str = loop_end_called ? "true" : "false";
        const char *verdict = (loop_begin_called == loop_end_called)
                                  ? "\033[0;32m pass \033[0m"
                                  : "\033[0;31m fail \033[0m";
        printf("Thread ID = %2d | loop begin = %5s | loop end = %5s | %s\n", 
               omp_get_thread_num(), begin_str, end_str, verdict);
    }
}

/*
 * Reproducer driver. Runs three directive variants and reports which
 * OMPT work events each thread observed:
 *   1. target teams distribute                 -> expected to behave correctly
 *   2. target teams distribute parallel for    -> triggers the reported bug
 *   3. teams + distribute parallel for (host)  -> also triggers the bug
 * The per-thread flags are reset between the first and second phases.
 */
int main( void )
{
    int a[100];
    
    /* Phase 1: distribute only — both distribute callbacks are expected
     * to fire on the initial thread of each team, and they do. */
    #pragma omp target teams distribute 
    for(int i = 0; i < 100; ++i)
    {
        a[i] = i;
    }
    report_summary_distribute();
    report_summary_parallel();

    printf("-------------------------------\n");

    // Reset
    #pragma omp parallel
    {
	distribute_begin_called = false;
	distribute_end_called = false;
	loop_begin_called = false;
	loop_end_called = false;
    }

    /* Phase 2: combined distribute + parallel for — the loop scope_end
     * is reported with work_type ompt_work_distribute instead of
     * ompt_work_loop (the bug under report). */
    #pragma omp target teams distribute parallel for
    for(int i = 0; i < 100; ++i)
    {
        a[i] = i;
    }
    #pragma omp teams num_teams(2)
    #pragma omp distribute parallel for 
    for(int i = 0; i < 100; ++i)
    {
        a[i] = i;
    }
    report_summary_distribute();
    report_summary_parallel();
 
    return 0;
}

Running the code, we can see the following output:

$ clang --version
clang version 18.0.0 (https://github.com/llvm/llvm-project.git 4b383107fa7585bb5ecd7f03cab7800b33d1585a)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /opt/software/software/LLVM/git/bin
$ clang -fopenmp -fopenmp-targets=x86_64 reproducer.c
$ OMP_NUM_THREADS=4 ./a.out  
[ompt_start_tool] omp_version 201611 | runtime_version = 'LLVM OMP version: 5.0.20140926'
Thread ID =  0 | distribute begin =  true | distribute end =  true |  pass 
Thread ID =  3 | distribute begin = false | distribute end = false |  pass 
Thread ID =  2 | distribute begin = false | distribute end = false |  pass 
Thread ID =  1 | distribute begin = false | distribute end = false |  pass 
Thread ID =  0 | loop begin = false | loop end = false |  pass 
Thread ID =  1 | loop begin = false | loop end = false |  pass 
Thread ID =  3 | loop begin = false | loop end = false |  pass 
Thread ID =  2 | loop begin = false | loop end = false |  pass 
-------------------------------
Thread ID =  0 | distribute begin =  true | distribute end =  true |  pass 
Thread ID =  3 | distribute begin = false | distribute end =  true |  fail 
Thread ID =  2 | distribute begin = false | distribute end =  true |  fail 
Thread ID =  1 | distribute begin =  true | distribute end =  true |  pass 
Thread ID =  0 | loop begin =  true | loop end = false |  fail 
Thread ID =  3 | loop begin =  true | loop end = false |  fail 
Thread ID =  2 | loop begin =  true | loop end = false |  fail 
Thread ID =  1 | loop begin =  true | loop end = false |  fail 

In the second part of the output, we would expect to see distribute end = false for all threads except Thread ID = 0/1. In addition, all threads should have loop end = true. The first part of the output demonstrates that the issue only appears with directives that include parallel for: using distribute alone does not trigger it. Nesting #pragma omp parallel for inside a distribute directive also works correctly, although that requires writing two separate loops.

Metadata

Metadata

Assignees

No one assigned

    Labels

    clang:codegen (IR generation bugs: mangling, exceptions, etc.), clang:openmp (OpenMP related changes to Clang)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions