Skip to content

Commit 73ca116

Browse files
committed
[OpenACC][CIR] Implement 'wait' clause lowering for combined constructs
1 parent 14be7a7 commit 73ca116

File tree

2 files changed

+153
-3
lines changed

2 files changed

+153
-3
lines changed

clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -360,9 +360,11 @@ class OpenACCClauseCIREmitter final
360360
operation.addWaitOperands(builder.getContext(), clause.hasDevNumExpr(),
361361
values, lastDeviceTypeValues);
362362
}
363+
} else if constexpr (isCombinedType<OpTy>) {
364+
applyToComputeOp(clause);
363365
} else {
364366
// TODO: When we've implemented this for everything, switch this to an
365-
// unreachable. Enter data, exit data, update, Combined constructs remain.
367+
// unreachable. Enter data, exit data, update constructs remain.
366368
return clauseNotImplemented(clause);
367369
}
368370
}

clang/test/CIR/CodeGenOpenACC/combined.cpp

Lines changed: 150 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
// RUN: %clang_cc1 -fopenacc -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir %s -o - | FileCheck %s
22

3-
extern "C" void acc_combined(int N) {
4-
// CHECK: cir.func @acc_combined(%[[ARG_N:.*]]: !s32i loc{{.*}}) {
3+
extern "C" void acc_combined(int N, int cond) {
4+
// CHECK: cir.func @acc_combined(%[[ARG_N:.*]]: !s32i loc{{.*}}, %[[ARG_COND:.*]]: !s32i loc{{.*}}) {
55
// CHECK-NEXT: %[[ALLOCA_N:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["N", init]
6+
// CHECK-NEXT: %[[COND:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["cond", init]
67
// CHECK-NEXT: cir.store %[[ARG_N]], %[[ALLOCA_N]] : !s32i, !cir.ptr<!s32i>
8+
// CHECK-NEXT: cir.store %[[ARG_COND]], %[[COND]] : !s32i, !cir.ptr<!s32i>
79

810
#pragma acc parallel loop
911
for(unsigned I = 0; I < N; ++I);
@@ -573,4 +575,150 @@ extern "C" void acc_combined(int N) {
573575
// CHECK-NEXT: } loc
574576
// CHECK: acc.terminator
575577
// CHECK-NEXT: } loc
578+
579+
#pragma acc parallel loop wait
580+
for(unsigned I = 0; I < N; ++I);
581+
// CHECK-NEXT: acc.parallel combined(loop) wait {
582+
// CHECK-NEXT: acc.loop combined(parallel) {
583+
// CHECK: acc.yield
584+
// CHECK-NEXT: } loc
585+
// CHECK-NEXT: acc.yield
586+
// CHECK-NEXT: } loc
587+
588+
#pragma acc serial loop wait device_type(nvidia) wait
589+
for(unsigned I = 0; I < N; ++I);
590+
// CHECK-NEXT: acc.serial combined(loop) wait([#acc.device_type<none>, #acc.device_type<nvidia>]) {
591+
// CHECK-NEXT: acc.loop combined(serial) {
592+
// CHECK: acc.yield
593+
// CHECK-NEXT: } loc
594+
// CHECK-NEXT: acc.yield
595+
// CHECK-NEXT: } loc
596+
597+
#pragma acc kernels loop wait(1) device_type(nvidia) wait
598+
for(unsigned I = 0; I < N; ++I);
599+
// CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
600+
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32
601+
// CHECK-NEXT: acc.kernels combined(loop) wait([#acc.device_type<nvidia>], {%[[ONE_CAST]] : si32}) {
602+
// CHECK-NEXT: acc.loop combined(kernels) {
603+
// CHECK: acc.yield
604+
// CHECK-NEXT: } loc
605+
// CHECK-NEXT: acc.terminator
606+
// CHECK-NEXT: } loc
607+
608+
#pragma acc parallel loop wait device_type(nvidia) wait(1)
609+
for(unsigned I = 0; I < N; ++I);
610+
// CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
611+
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32
612+
// CHECK-NEXT: acc.parallel combined(loop) wait([#acc.device_type<none>], {%[[ONE_CAST]] : si32} [#acc.device_type<nvidia>]) {
613+
// CHECK-NEXT: acc.loop combined(parallel) {
614+
// CHECK: acc.yield
615+
// CHECK-NEXT: } loc
616+
// CHECK-NEXT: acc.yield
617+
// CHECK-NEXT: } loc
618+
619+
#pragma acc serial loop wait(1) device_type(nvidia) wait(1)
620+
for(unsigned I = 0; I < N; ++I);
621+
// CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
622+
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32
623+
// CHECK-NEXT: %[[ONE_LITERAL2:.*]] = cir.const #cir.int<1> : !s32i
624+
// CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL2]] : !s32i to si32
625+
// CHECK-NEXT: acc.serial combined(loop) wait({%[[ONE_CAST]] : si32}, {%[[ONE_CAST2]] : si32} [#acc.device_type<nvidia>]) {
626+
// CHECK-NEXT: acc.loop combined(serial) {
627+
// CHECK: acc.yield
628+
// CHECK-NEXT: } loc
629+
// CHECK-NEXT: acc.yield
630+
// CHECK-NEXT: } loc
631+
632+
#pragma acc kernels loop wait(devnum: cond : 1)
633+
for(unsigned I = 0; I < N; ++I);
634+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
635+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
636+
// CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
637+
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32
638+
// CHECK-NEXT: acc.kernels combined(loop) wait({devnum: %[[CONV_CAST]] : si32, %[[ONE_CAST]] : si32}) {
639+
// CHECK-NEXT: acc.loop combined(kernels) {
640+
// CHECK: acc.yield
641+
// CHECK-NEXT: } loc
642+
// CHECK-NEXT: acc.terminator
643+
// CHECK-NEXT: } loc
644+
645+
#pragma acc parallel loop wait(devnum: cond : 1) device_type(nvidia) wait(devnum: cond : 1)
646+
for(unsigned I = 0; I < N; ++I);
647+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
648+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
649+
// CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
650+
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32
651+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
652+
// CHECK-NEXT: %[[CONV_CAST2:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
653+
// CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
654+
// CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32
655+
// CHECK-NEXT: acc.parallel combined(loop) wait({devnum: %[[CONV_CAST]] : si32, %[[ONE_CAST]] : si32}, {devnum: %[[CONV_CAST2]] : si32, %[[ONE_CAST2]] : si32} [#acc.device_type<nvidia>]) {
656+
// CHECK-NEXT: acc.loop combined(parallel) {
657+
// CHECK: acc.yield
658+
// CHECK-NEXT: } loc
659+
// CHECK-NEXT: acc.yield
660+
// CHECK-NEXT: } loc
661+
662+
#pragma acc serial loop wait(devnum: cond : 1, 2)
663+
for(unsigned I = 0; I < N; ++I);
664+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
665+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
666+
// CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
667+
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32
668+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
669+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
670+
// CHECK-NEXT: acc.serial combined(loop) wait({devnum: %[[CONV_CAST]] : si32, %[[ONE_CAST]] : si32, %[[TWO_CAST]] : si32}) {
671+
// CHECK-NEXT: acc.loop combined(serial) {
672+
// CHECK: acc.yield
673+
// CHECK-NEXT: } loc
674+
// CHECK-NEXT: acc.yield
675+
// CHECK-NEXT: } loc
676+
677+
#pragma acc kernels loop wait(devnum: cond : 1, 2) device_type(nvidia, radeon) wait(devnum: cond : 1, 2)
678+
for(unsigned I = 0; I < N; ++I);
679+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
680+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
681+
// CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
682+
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32
683+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
684+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
685+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
686+
// CHECK-NEXT: %[[CONV_CAST2:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
687+
// CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
688+
// CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32
689+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
690+
// CHECK-NEXT: %[[TWO_CAST2:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
691+
// CHECK-NEXT: acc.kernels combined(loop) wait({devnum: %[[CONV_CAST]] : si32, %[[ONE_CAST]] : si32, %[[TWO_CAST]] : si32}, {devnum: %[[CONV_CAST2]] : si32, %[[ONE_CAST2]] : si32, %[[TWO_CAST2]] : si32} [#acc.device_type<nvidia>], {devnum: %[[CONV_CAST2]] : si32, %[[ONE_CAST2]] : si32, %[[TWO_CAST2]] : si32} [#acc.device_type<radeon>]) {
692+
// CHECK-NEXT: acc.loop combined(kernels) {
693+
// CHECK: acc.yield
694+
// CHECK-NEXT: } loc
695+
// CHECK-NEXT: acc.terminator
696+
// CHECK-NEXT: } loc
697+
698+
#pragma acc parallel loop wait(cond, 1)
699+
for(unsigned I = 0; I < N; ++I);
700+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
701+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
702+
// CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
703+
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32
704+
// CHECK-NEXT: acc.parallel combined(loop) wait({%[[CONV_CAST]] : si32, %[[ONE_CAST]] : si32}) {
705+
// CHECK-NEXT: acc.loop combined(parallel) {
706+
// CHECK: acc.yield
707+
// CHECK-NEXT: } loc
708+
// CHECK-NEXT: acc.yield
709+
// CHECK-NEXT: } loc
710+
711+
#pragma acc serial loop wait(queues: cond, 1) device_type(radeon)
712+
for(unsigned I = 0; I < N; ++I);
713+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
714+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
715+
// CHECK-NEXT: %[[ONE_LITERAL:.*]] = cir.const #cir.int<1> : !s32i
716+
// CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE_LITERAL]] : !s32i to si32
717+
// CHECK-NEXT: acc.serial combined(loop) wait({%[[CONV_CAST]] : si32, %[[ONE_CAST]] : si32}) {
718+
// CHECK-NEXT: acc.loop combined(serial) {
719+
// CHECK: acc.yield
720+
// CHECK-NEXT: } loc
721+
// CHECK-NEXT: acc.yield
722+
// CHECK-NEXT: } loc
723+
576724
}

0 commit comments

Comments
 (0)