Skip to content

Commit c89a914

Browse files
authored
[CUDA] Make PTXAS optimisation default to -O3 (#5188)
Previously, the PTX optimization level defaulted to -O0. The `ptxjitcompiler` defaults to -O3, so this change makes the optimization levels of ahead-of-time and JIT ptxas compilation the same.
1 parent 9f2b7bd commit c89a914

File tree

2 files changed

+6
-9
lines changed

2 files changed

+6
-9
lines changed

clang/lib/Driver/ToolChains/Cuda.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -427,9 +427,6 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
427427
CmdArgs.push_back("--return-at-end");
428428
} else if (Arg *A = Args.getLastArg(options::OPT_O_Group)) {
429429
// Map the -O we received to -O{0,1,2,3}.
430-
//
431-
// TODO: Perhaps we should map host -O2 to ptxas -O3. -O3 is ptxas's
432-
// default, so it may correspond more closely to the spirit of clang -O2.
433430

434431
// -O3 seems like the least-bad option when -Osomething is specified to
435432
// clang but it isn't handled below.
@@ -451,9 +448,9 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
451448
}
452449
CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt));
453450
} else {
454-
// If no -O was passed, pass -O0 to ptxas -- no opt flag should correspond
455-
// to no optimizations, but ptxas's default is -O3.
456-
CmdArgs.push_back("-O0");
451+
// If no -O was passed, pass -O3 to ptxas -- this makes ptxas's
452+
// optimization level the same as the ptxjitcompiler.
453+
CmdArgs.push_back("-O3");
457454
}
458455
if (DIKind == DebugDirectivesOnly)
459456
CmdArgs.push_back("-lineinfo");

clang/test/Driver/cuda-external-tools.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,10 @@
4040
// RUN: --no-cuda-noopt-device-debug -O2 -c %s 2>&1 \
4141
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT2 %s
4242

43-
// Regular compile without -O. This should result in us passing -O0 to ptxas.
43+
// Regular compile without -O. This should result in us passing -O3 to ptxas.
4444
// RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \
4545
// RUN: --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
46-
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT0 %s
46+
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT3 %s
4747

4848
// Regular compiles with -Os and -Oz. For lack of a better option, we map
4949
// these to ptxas -O3.
@@ -75,7 +75,7 @@
7575
// Compile with -fintegrated-as. This should still cause us to invoke ptxas.
7676
// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \
7777
// RUN: --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \
78-
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT0 %s
78+
// RUN: | FileCheck -check-prefixes=CHECK,ARCH64,SM35,OPT3 %s
7979
// Check that we still pass -c when generating relocatable device code.
8080
// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -fgpu-rdc -c %s 2>&1 \
8181
// RUN: --offload-arch=sm_35 --cuda-path=%S/Inputs/CUDA/usr/local/cuda \

0 commit comments

Comments
 (0)