Skip to content

[SYCL] Split device images based on accuracy level provided in option #10140

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Jul 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 25 additions & 15 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -513,12 +513,18 @@ static CallInst *CreateBuiltinCallWithAttr(CodeGenFunction &CGF, StringRef Name,
// TODO: Replace AttrList with a single attribute. The call can only have a
// single FPAccuracy attribute.
llvm::AttributeList AttrList;
// "sycl_used_aspects" metadata associated with the call.
llvm::Metadata *AspectMD = nullptr;
// sincos() doesn't return a value, but it still has a type associated with
// it that corresponds to the operand type.
CGF.CGM.getFPAccuracyFuncAttributes(
Name, AttrList, ID,
Name, AttrList, AspectMD, ID,
Name == "sincos" ? Args[0]->getType() : FPBuiltinF->getReturnType());
CI->setAttributes(AttrList);

if (CGF.getLangOpts().SYCLIsDevice && AspectMD)
CI->setMetadata("sycl_used_aspects",
llvm::MDNode::get(CGF.CGM.getLLVMContext(), AspectMD));
return CI;
}

Expand Down Expand Up @@ -22144,21 +22150,22 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall(
// Even if the current function doesn't have a clang builtin, create
// an 'fpbuiltin-max-error' attribute for it; unless it's marked with
// an NoBuiltin attribute.
if (!FD->hasAttr<NoBuiltinAttr>()) {
Name = FD->getName();
FPAccuracyIntrinsicID =
llvm::StringSwitch<unsigned>(Name)
.Case("fadd", llvm::Intrinsic::fpbuiltin_fadd)
.Case("fdiv", llvm::Intrinsic::fpbuiltin_fdiv)
.Case("fmul", llvm::Intrinsic::fpbuiltin_fmul)
.Case("fsub", llvm::Intrinsic::fpbuiltin_fsub)
.Case("frem", llvm::Intrinsic::fpbuiltin_frem)
.Case("sincos", llvm::Intrinsic::fpbuiltin_sincos)
.Case("exp10", llvm::Intrinsic::fpbuiltin_exp10)
.Case("rsqrt", llvm::Intrinsic::fpbuiltin_rsqrt);
} else {
if (FD->hasAttr<NoBuiltinAttr>() ||
!FD->getNameInfo().getName().isIdentifier())
return nullptr;
}

Name = FD->getName();
FPAccuracyIntrinsicID =
llvm::StringSwitch<unsigned>(Name)
.Case("fadd", llvm::Intrinsic::fpbuiltin_fadd)
.Case("fdiv", llvm::Intrinsic::fpbuiltin_fdiv)
.Case("fmul", llvm::Intrinsic::fpbuiltin_fmul)
.Case("fsub", llvm::Intrinsic::fpbuiltin_fsub)
.Case("frem", llvm::Intrinsic::fpbuiltin_frem)
.Case("sincos", llvm::Intrinsic::fpbuiltin_sincos)
.Case("exp10", llvm::Intrinsic::fpbuiltin_exp10)
.Case("rsqrt", llvm::Intrinsic::fpbuiltin_rsqrt)
.Default(0);
} else {
// The function has a clang builtin. Create an attribute for it
// only if it has an fpbuiltin intrinsic.
Expand Down Expand Up @@ -22238,6 +22245,9 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall(
break;
}
}
if (!FPAccuracyIntrinsicID)
return nullptr;

Func = CGM.getIntrinsic(FPAccuracyIntrinsicID, IRArgs[0]->getType());
return CreateBuiltinCallWithAttr(*this, Name, Func, ArrayRef(IRArgs),
FPAccuracyIntrinsicID);
Expand Down
21 changes: 19 additions & 2 deletions clang/lib/CodeGen/CGCall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "CGCXXABI.h"
#include "CGCleanup.h"
#include "CGRecordLayout.h"
#include "CGSYCLRuntime.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "TargetInfo.h"
Expand Down Expand Up @@ -1846,9 +1847,21 @@ static llvm::fp::FPAccuracy convertFPAccuracy(StringRef FPAccuracyStr) {
.Case("cuda", llvm::fp::FPAccuracy::CUDA);
}

static int32_t convertFPAccuracyToAspect(StringRef FPAccuracyStr) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to add an assert here to ensure this function is called with appropriate FPAccuracyStr?

Thanks

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added assert, thanks.

assert(FPAccuracyStr.equals("high") || FPAccuracyStr.equals("medium") ||
FPAccuracyStr.equals("low") || FPAccuracyStr.equals("sycl") ||
FPAccuracyStr.equals("cuda"));
return llvm::StringSwitch<int32_t>(FPAccuracyStr)
.Case("high", SYCLInternalAspect::fp_intrinsic_accuracy_high)
.Case("medium", SYCLInternalAspect::fp_intrinsic_accuracy_medium)
.Case("low", SYCLInternalAspect::fp_intrinsic_accuracy_low)
.Case("sycl", SYCLInternalAspect::fp_intrinsic_accuracy_sycl)
.Case("cuda", SYCLInternalAspect::fp_intrinsic_accuracy_cuda);
}

void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
StringRef Name, llvm::AttrBuilder &FuncAttrs, unsigned ID,
const llvm::Type *FuncType) {
StringRef Name, llvm::AttrBuilder &FuncAttrs, llvm::Metadata *&MD,
unsigned ID, const llvm::Type *FuncType) {
// Priority is given to to the accuracy specific to the function.
// So, if the command line is something like this:
// 'clang -fp-accuracy = high -fp-accuracy = low:[sin]'.
Expand All @@ -1864,6 +1877,8 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
ID, FuncType, convertFPAccuracy(FuncMapIt->second));
assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected");
FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal);
MD = llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
Int32Ty, convertFPAccuracyToAspect(FuncMapIt->second)));
}
}
if (FuncAttrs.attrs().size() == 0)
Expand All @@ -1872,6 +1887,8 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
ID, FuncType, convertFPAccuracy(getLangOpts().FPAccuracyVal));
assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected");
FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal);
MD = llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
Int32Ty, convertFPAccuracyToAspect(getLangOpts().FPAccuracyVal)));
}
}

Expand Down
13 changes: 13 additions & 0 deletions clang/lib/CodeGen/CGSYCLRuntime.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,19 @@ namespace CodeGen {

class CodeGenModule;

// These aspects are internal and used for device image splitting purposes only.
// They are not exposed to the SYCL users through "aspect" enum. That's why
// they are intentionally assigned negative values to filter them out at the
// stage of embedding used aspects as device requirements to the executable.
// We don't pass these internal aspects to the SYCL RT.
enum SYCLInternalAspect : int32_t {
fp_intrinsic_accuracy_high = -1,
fp_intrinsic_accuracy_medium = -2,
fp_intrinsic_accuracy_low = -3,
fp_intrinsic_accuracy_sycl = -4,
fp_intrinsic_accuracy_cuda = -5,
};

class CGSYCLRuntime {
protected:
CodeGenModule &CGM;
Expand Down
3 changes: 2 additions & 1 deletion clang/lib/CodeGen/CodeGenModule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7902,10 +7902,11 @@ void CodeGenModule::moveLazyEmissionStates(CodeGenModule *NewBuilder) {

void CodeGenModule::getFPAccuracyFuncAttributes(StringRef Name,
llvm::AttributeList &AttrList,
llvm::Metadata *&MD,
unsigned ID,
const llvm::Type *FuncType) {
llvm::AttrBuilder FuncAttrs(getLLVMContext());
getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, ID, FuncType);
getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, MD, ID, FuncType);
AttrList = llvm::AttributeList::get(
getLLVMContext(), llvm::AttributeList::FunctionIndex, FuncAttrs);
}
5 changes: 3 additions & 2 deletions clang/lib/CodeGen/CodeGenModule.h
Original file line number Diff line number Diff line change
Expand Up @@ -1594,7 +1594,8 @@ class CodeGenModule : public CodeGenTypeCache {
void moveLazyEmissionStates(CodeGenModule *NewBuilder);

void getFPAccuracyFuncAttributes(StringRef Name,
llvm::AttributeList &AttrList, unsigned ID,
llvm::AttributeList &AttrList,
llvm::Metadata *&MDs, unsigned ID,
const llvm::Type *FuncType);

private:
Expand Down Expand Up @@ -1793,7 +1794,7 @@ class CodeGenModule : public CodeGenTypeCache {

void getDefaultFunctionFPAccuracyAttributes(StringRef Name,
llvm::AttrBuilder &FuncAttrs,
unsigned ID,
llvm::Metadata *&MD, unsigned ID,
const llvm::Type *FuncType);

llvm::Metadata *CreateMetadataIdentifierImpl(QualType T, MetadataTypeMap &Map,
Expand Down
127 changes: 127 additions & 0 deletions clang/test/CodeGenSYCL/fp-accuracy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
// RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -ffp-builtin-accuracy=high:sin,sqrt -ffp-builtin-accuracy=medium:cos -ffp-builtin-accuracy=low:tan -ffp-builtin-accuracy=cuda:exp,acos -ffp-builtin-accuracy=sycl:log,asin -emit-llvm -triple spir64-unknown-unknown %s -o - | FileCheck --check-prefix CHECK-FUNC %s
// RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -ffp-builtin-accuracy=high -emit-llvm -triple spir64-unknown-unknown %s -o - | FileCheck --check-prefix CHECK-TU %s
// RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -ffp-builtin-accuracy=medium -ffp-builtin-accuracy=high:sin,sqrt -ffp-builtin-accuracy=medium:cos -ffp-builtin-accuracy=cuda:exp -ffp-builtin-accuracy=sycl:log -emit-llvm -triple spir64-unknown-unknown %s -o - | FileCheck --check-prefix CHECK-MIX %s

// Tests that sycl_used_aspects metadata is attached to the fpbuiltin call based on -ffp-accuracy option.

#include "sycl.hpp"

extern "C" SYCL_EXTERNAL double sin(double);
extern "C" SYCL_EXTERNAL double cos(double);
extern "C" SYCL_EXTERNAL double tan(double);
extern "C" SYCL_EXTERNAL double log(double);
extern "C" SYCL_EXTERNAL double exp(double);
extern "C" SYCL_EXTERNAL double acos(double);
extern "C" SYCL_EXTERNAL double asin(double);
extern "C" SYCL_EXTERNAL double sqrt(double);

using namespace sycl;

int main() {
const unsigned array_size = 4;
double Value = .5;
queue deviceQueue;
range<1> numOfItems{array_size};

// Kernel0 doesn't use math functions.
deviceQueue.submit([&](handler& cgh) {
cgh.parallel_for<class Kernel0>(numOfItems,
[=](id<1> wiID) {
(void)Value;
});
});

// Kernel1 uses high-accuracy sin.
deviceQueue.submit([&](handler& cgh) {
cgh.parallel_for<class Kernel1>(numOfItems,
[=](id<1> wiID) {
// CHECK-FUNC: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC:[0-9]+]]
// CHECK-TU: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC:[0-9]+]]
// CHECK-MIX: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC:[0-9]+]]
(void)sin(Value);
});
});

deviceQueue.submit([&](handler& cgh) {
cgh.parallel_for<class Kernel2>(numOfItems,
[=](id<1> wiID) {
// CHECK-FUNC: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC:[0-9]+]]
// CHECK-TU: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
// CHECK-MIX: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC:[0-9]+]]
(void)cos(Value);
});
});

// Kernel3 uses low-accuracy tan.
deviceQueue.submit([&](handler& cgh) {
cgh.parallel_for<class Kernel3>(numOfItems,
[=](id<1> wiID) {
// CHECK-FUNC: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[LOW_ACC:[0-9]+]]
// CHECK-TU: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
// CHECK-MIX: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC]]
(void)tan(Value);
});
});

// Kernel4 uses cuda-accuracy exp and sycl-accuracy log.
deviceQueue.submit([&](handler& cgh) {
cgh.parallel_for<class Kernel4>(numOfItems,
[=](id<1> wiID) {
// CHECK-FUNC: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[CUDA_ACC:[0-9]+]]
// CHECK-FUNC: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[SYCL_ACC:[0-9]+]]
// CHECK-TU: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
// CHECK-TU: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
// CHECK-MIX: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[CUDA_ACC:[0-9]+]]
// CHECK-MIX: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[SYCL_ACC:[0-9]+]]
(void)log(exp(Value));
});
});
deviceQueue.wait();

// Kernel5 uses cuda-accuracy acos.
deviceQueue.submit([&](handler& cgh) {
cgh.parallel_for<class Kernel5>(numOfItems,
[=](id<1> wiID) {
// CHECK-FUNC: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[CUDA_ACC]]
// CHECK-TU: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
// CHECK-MIX: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC]]
(void)acos(Value);
});
});

// Kernel6 uses sycl-accuracy asin.
deviceQueue.submit([&](handler& cgh) {
cgh.parallel_for<class Kernel6>(numOfItems,
[=](id<1> wiID) {
// CHECK-FUNC: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[SYCL_ACC]]
// CHECK-TU: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
// CHECK-MIX: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC]]
(void)asin(Value);
});
});

// Kernel7 uses high-accuracy sqrt.
deviceQueue.submit([&](handler& cgh) {
cgh.parallel_for<class Kernel7>(numOfItems,
[=](id<1> wiID) {
// CHECK-FUNC: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
// CHECK-TU: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
// CHECK-MIX: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
(void)sqrt(Value);
});
});
return 0;
}

// CHECK-FUNC: [[HIGH_ACC]] = !{i32 -1}
// CHECK-FUNC: [[MEDIUM_ACC]] = !{i32 -2}
// CHECK-FUNC: [[LOW_ACC]] = !{i32 -3}
// CHECK-FUNC: [[CUDA_ACC]] = !{i32 -5}
// CHECK-FUNC: [[SYCL_ACC]] = !{i32 -4}

// CHECK-TU: [[HIGH_ACC]] = !{i32 -1}

// CHECK-MIX: [[HIGH_ACC]] = !{i32 -1}
// CHECK-MIX: [[MEDIUM_ACC]] = !{i32 -2}
// CHECK-MIX: [[CUDA_ACC]] = !{i32 -5}
// CHECK-MIX: [[SYCL_ACC]] = !{i32 -4}
7 changes: 7 additions & 0 deletions llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,13 @@ AspectsSetTy getAspectsUsedByInstruction(const Instruction &I,
Result.insert(Aspects.begin(), Aspects.end());
}

if (const MDNode *InstApsects = I.getMetadata("sycl_used_aspects")) {
for (const MDOperand &MDOp : InstApsects->operands()) {
const Constant *C = cast<ConstantAsMetadata>(MDOp)->getValue();
Result.insert(cast<ConstantInt>(C)->getSExtValue());
}
}

return Result;
}

Expand Down
66 changes: 66 additions & 0 deletions llvm/test/SYCLLowerIR/PropagateAspectsUsage/call-graph-inst.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
; RUN: opt -passes=sycl-propagate-aspects-usage < %s -S | FileCheck %s
;
; Test checks that the pass is able to propagate information about aspects
; used in the instruction through a call graph
;
; K1 K2
; / \/ \
; F1 F2 F3
;
; F1 doesn't use optional type and doesn't have instruction with attached 'sycl_used_aspects' metadata.
; F2 uses optional A and has instruction with attached 'sycl_used_aspects' metadata.
; F3 uses optional B and has instruction with attached 'sycl_used_aspects' metadata.

%Optional.A = type { i32 }
%Optional.B = type { i32 }

; CHECK: spir_kernel void @kernel1() !sycl_used_aspects ![[#ID1:]]
define spir_kernel void @kernel1() {
call spir_func void @func1()
call spir_func void @func2()
ret void
}

; CHECK: spir_kernel void @kernel2() !sycl_used_aspects ![[#ID2:]]
define spir_kernel void @kernel2() {
call spir_func void @func2()
call spir_func void @func3()
ret void
}

; CHECK: spir_func void @func1() {
define spir_func void @func1() {
%tmp = alloca i32
ret void
}

declare void @llvm.fpbuiltin.f64()

; CHECK: spir_func void @func2() !sycl_used_aspects ![[#ID1]] {
define spir_func void @func2() {
%tmp1 = alloca %Optional.A
call void @llvm.fpbuiltin.f64(), !sycl_used_aspects !3
ret void
}

; CHECK: spir_func void @func3() !sycl_used_aspects ![[#ID3:]] {
define spir_func void @func3() {
%tmp = alloca %Optional.B
call void @llvm.fpbuiltin.f64(), !sycl_used_aspects !4
ret void
}

!sycl_types_that_use_aspects = !{!0, !1}
!0 = !{!"Optional.A", i32 1}
!1 = !{!"Optional.B", i32 2}

!sycl_aspects = !{!2}
!2 = !{!"fp64", i32 6}
!3 = !{i32 -1}
!4 = !{i32 -2}

; CHECK: ![[#ID1]] = !{i32 1, i32 -1}
; CHECK: ![[#ID2]] = !{i32 1, i32 -1, i32 2, i32 -2}
; CHECK: ![[#ID3]] = !{i32 2, i32 -2}


Loading