Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CUDA] Move CUDA to new driver by default #122312

Merged
merged 2 commits into from
Jan 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1059,6 +1059,12 @@ CUDA Support
- Clang now supports CUDA SDK up to 12.6
- Added support for sm_100
- Added support for `__grid_constant__` attribute.
- CUDA now uses the new offloading driver by default. The new driver supports
device-side LTO, interoperability with OpenMP and other languages, and native ``-fgpu-rdc``
support with static libraries. The old behavior can be returned using the
``--no-offload-new-driver`` flag. The binary format is no longer compatible
with the NVIDIA compiler's RDC-mode support. More information can be found at:
https://clang.llvm.org/docs/OffloadingDesign.html

AIX Support
^^^^^^^^^^^
Expand Down
6 changes: 4 additions & 2 deletions clang/lib/Driver/Driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4334,7 +4334,8 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
Args.hasFlag(options::OPT_foffload_via_llvm,
options::OPT_fno_offload_via_llvm, false) ||
Args.hasFlag(options::OPT_offload_new_driver,
options::OPT_no_offload_new_driver, false);
options::OPT_no_offload_new_driver,
C.isOffloadingHostKind(Action::OFK_Cuda));

// Builder to be used to build offloading actions.
std::unique_ptr<OffloadingActionBuilder> OffloadBuilder =
Expand Down Expand Up @@ -5084,7 +5085,8 @@ Action *Driver::ConstructPhaseAction(
offloadDeviceOnly() ||
(TargetDeviceOffloadKind == Action::OFK_HIP &&
!Args.hasFlag(options::OPT_offload_new_driver,
options::OPT_no_offload_new_driver, false)))
options::OPT_no_offload_new_driver,
C.isOffloadingHostKind(Action::OFK_Cuda))))
? types::TY_LLVM_IR
: types::TY_LLVM_BC;
return C.MakeAction<BackendJobAction>(Input, Output);
Expand Down
9 changes: 6 additions & 3 deletions clang/lib/Driver/ToolChains/Clang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5064,7 +5064,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
JA.isHostOffloading(Action::OFK_SYCL) ||
(JA.isHostOffloading(C.getActiveOffloadKinds()) &&
Args.hasFlag(options::OPT_offload_new_driver,
options::OPT_no_offload_new_driver, false));
options::OPT_no_offload_new_driver,
C.isOffloadingHostKind(Action::OFK_Cuda)));

bool IsRDCMode =
Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false);
Expand Down Expand Up @@ -5418,7 +5419,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
if (IsUsingLTO) {
if (IsDeviceOffloadAction && !JA.isDeviceOffloading(Action::OFK_OpenMP) &&
!Args.hasFlag(options::OPT_offload_new_driver,
options::OPT_no_offload_new_driver, false) &&
options::OPT_no_offload_new_driver,
C.isOffloadingHostKind(Action::OFK_Cuda)) &&
!Triple.isAMDGPU()) {
D.Diag(diag::err_drv_unsupported_opt_for_target)
<< Args.getLastArg(options::OPT_foffload_lto,
Expand Down Expand Up @@ -6895,7 +6897,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
options::OPT_fno_offload_via_llvm, false)) {
CmdArgs.append({"--offload-new-driver", "-foffload-via-llvm"});
} else if (Args.hasFlag(options::OPT_offload_new_driver,
options::OPT_no_offload_new_driver, false)) {
options::OPT_no_offload_new_driver,
C.isOffloadingHostKind(Action::OFK_Cuda))) {
CmdArgs.push_back("--offload-new-driver");
}

Expand Down
2 changes: 1 addition & 1 deletion clang/lib/Driver/ToolChains/Cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,7 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
static bool shouldIncludePTX(const ArgList &Args, StringRef InputArch) {
// The new driver does not include PTX by default to avoid overhead.
bool includePTX = !Args.hasFlag(options::OPT_offload_new_driver,
options::OPT_no_offload_new_driver, false);
options::OPT_no_offload_new_driver, true);
for (Arg *A : Args.filtered(options::OPT_cuda_include_ptx_EQ,
options::OPT_no_cuda_include_ptx_EQ)) {
A->claim();
Expand Down
26 changes: 13 additions & 13 deletions clang/test/Driver/cuda-arch-translation.cu
Original file line number Diff line number Diff line change
Expand Up @@ -68,19 +68,19 @@

// HIP: clang-offload-bundler

// SM20:--image=profile=sm_20{{.*}}--image=profile=compute_20
// SM21:--image=profile=sm_21{{.*}}--image=profile=compute_20
// SM30:--image=profile=sm_30{{.*}}--image=profile=compute_30
// SM32:--image=profile=sm_32{{.*}}--image=profile=compute_32
// SM35:--image=profile=sm_35{{.*}}--image=profile=compute_35
// SM37:--image=profile=sm_37{{.*}}--image=profile=compute_37
// SM50:--image=profile=sm_50{{.*}}--image=profile=compute_50
// SM52:--image=profile=sm_52{{.*}}--image=profile=compute_52
// SM53:--image=profile=sm_53{{.*}}--image=profile=compute_53
// SM60:--image=profile=sm_60{{.*}}--image=profile=compute_60
// SM61:--image=profile=sm_61{{.*}}--image=profile=compute_61
// SM62:--image=profile=sm_62{{.*}}--image=profile=compute_62
// SM70:--image=profile=sm_70{{.*}}--image=profile=compute_70
// SM20:--image=profile=sm_20{{.*}}
// SM21:--image=profile=sm_21{{.*}}
// SM30:--image=profile=sm_30{{.*}}
// SM32:--image=profile=sm_32{{.*}}
// SM35:--image=profile=sm_35{{.*}}
// SM37:--image=profile=sm_37{{.*}}
// SM50:--image=profile=sm_50{{.*}}
// SM52:--image=profile=sm_52{{.*}}
// SM53:--image=profile=sm_53{{.*}}
// SM60:--image=profile=sm_60{{.*}}
// SM61:--image=profile=sm_61{{.*}}
// SM62:--image=profile=sm_62{{.*}}
// SM70:--image=profile=sm_70{{.*}}
// GFX600:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx600
// GFX601:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx601
// GFX602:-targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx602
Expand Down
43 changes: 12 additions & 31 deletions clang/test/Driver/cuda-bindings.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@
// BIN-NOT: cuda-bindings-device-cuda-nvptx64
// BIN: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output:
// BIN-NOT: cuda-bindings-device-cuda-nvptx64
// BIN: # "powerpc64le-ibm-linux-gnu" - "GNU::Linker", inputs:{{.*}}, output: "a.out"
// BIN: # "powerpc64le-ibm-linux-gnu" - "Offload::Linker", inputs:{{.*}}, output: "a.out"

//
// Test single gpu architecture up to the assemble phase.
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings --cuda-gpu-arch=sm_30 %s -S 2>&1 \
// RUN: | FileCheck -check-prefix=ASM %s
// ASM-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-cuda-nvptx64-nvidia-cuda-sm_30.s"
// ASM-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "[[PTX:.+]].s"
// ASM-DAG: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: "cuda-bindings.s"

//
Expand Down Expand Up @@ -61,40 +61,21 @@
// BIN2-NOT: cuda-bindings-device-cuda-nvptx64
// BIN2: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output:
// BIN2-NOT: cuda-bindings-device-cuda-nvptx64
// AOUT: # "powerpc64le-ibm-linux-gnu" - "GNU::Linker", inputs:{{.*}}, output: "a.out"
// TOUT: # "powerpc64le-ibm-linux-gnu" - "GNU::Linker", inputs:{{.*}}, output: "{{.*}}/out"
// AOUT: # "powerpc64le-ibm-linux-gnu" - "Offload::Linker", inputs:{{.*}}, output: "a.out"
// TOUT: # "powerpc64le-ibm-linux-gnu" - "Offload::Linker", inputs:{{.*}}, output: "{{.*}}/out"

// .. same, but with -fsyntax-only
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \
// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
// RUN: | FileCheck -check-prefix=SYN %s
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \
// RUN: --offload-arch=sm_30,sm_35 %s -o %t/out 2>&1 \
// RUN: | FileCheck -check-prefix=SYN %s
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \
// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
// RUN: | FileCheck -check-prefix=SYN %s
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \
// RUN: --offload-arch=sm_30,sm_35 %s -o %t/out 2>&1 \
// RUN: | FileCheck -check-prefix=SYN %s
// SYN-NOT: inputs:
// SYN: # "powerpc64le-ibm-linux-gnu" - "clang", inputs: [{{.*}}], output: (nothing)
// SYN-NEXT: # "nvptx64-nvidia-cuda" - "clang", inputs: [{{.*}}], output: (nothing)
// SYN-NEXT: # "nvptx64-nvidia-cuda" - "clang", inputs: [{{.*}}], output: (nothing)
// SYN-NOT: inputs

// .. and with --offload-new-driver
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \
// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 --offload-new-driver %s 2>&1 \
// RUN: | FileCheck -check-prefix=NDSYN %s
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \
// RUN: --offload-arch=sm_30,sm_35 %s --offload-new-driver -o %t/out 2>&1 \
// RUN: --offload-arch=sm_30,sm_35 %s -o %t/out 2>&1 \
// RUN: | FileCheck -check-prefix=NDSYN %s
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \
// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --offload-new-driver 2>&1 \
// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
// RUN: | FileCheck -check-prefix=NDSYN %s
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \
// RUN: --offload-arch=sm_30,sm_35 %s --offload-new-driver -o %t/out 2>&1 \
// RUN: --offload-arch=sm_30,sm_35 %s -o %t/out 2>&1 \
// RUN: | FileCheck -check-prefix=NDSYN %s
// NDSYN-NOT: inputs:
// NDSYN: # "nvptx64-nvidia-cuda" - "clang", inputs: [{{.*}}], output: (nothing)
Expand All @@ -109,8 +90,8 @@
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings \
// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
// RUN: | FileCheck -check-prefix=ASM2 %s
// ASM2-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-cuda-nvptx64-nvidia-cuda-sm_30.s"
// ASM2-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-cuda-nvptx64-nvidia-cuda-sm_35.s"
// ASM2-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "[[SM30:.+]].s"
// ASM2-DAG: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "[[SM35:.+]].s"
// ASM2-DAG: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: "cuda-bindings.s"

//
Expand All @@ -125,7 +106,7 @@
// RUN: | FileCheck -check-prefix=HBIN %s
// HBIN: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output:
// HBIN-NOT: cuda-bindings-device-cuda-nvptx64
// HBIN: # "powerpc64le-ibm-linux-gnu" - "GNU::Linker", inputs:{{.*}}, output: "a.out"
// HBIN: # "powerpc64le-ibm-linux-gnu" - "Offload::Linker", inputs:{{.*}}, output: "a.out"

//
// Test one or more gpu architecture up to the assemble phase in host-only
Expand Down Expand Up @@ -163,7 +144,7 @@
// Test two gpu architectures with complete compilation in device-only
// compilation mode.
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings \
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings --no-offload-new-driver \
// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \
// RUN: | FileCheck -check-prefix=DBIN2 %s
// DBIN2: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output:
Expand All @@ -177,7 +158,7 @@
// Test two gpu architectures up to the assemble phase in device-only
// compilation mode.
//
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings \
// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings --no-offload-new-driver \
// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \
// RUN: | FileCheck -check-prefix=DASM2 %s
// DASM2: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: "cuda-bindings-cuda-nvptx64-nvidia-cuda-sm_30.s"
Expand Down
30 changes: 15 additions & 15 deletions clang/test/Driver/cuda-options.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@

// Simple compilation case. Compile device-side to PTX assembly and make sure
// we use it on the host side.
// RUN: %clang -### -target x86_64-linux-gnu -c -nogpulib -nogpuinc %s 2>&1 \
// RUN: %clang -### --cuda-include-ptx=all -target x86_64-linux-gnu -c -nogpulib -nogpuinc %s 2>&1 \
// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
// RUN: -check-prefix HOST -check-prefix INCLUDES-DEVICE \
// RUN: -check-prefix NOLINK %s

// Typical compilation + link case.
// RUN: %clang -### -target x86_64-linux-gnu -nogpulib -nogpuinc %s 2>&1 \
// RUN: %clang -### --cuda-include-ptx=all -target x86_64-linux-gnu -nogpulib -nogpuinc %s 2>&1 \
// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
// RUN: -check-prefix HOST -check-prefix INCLUDES-DEVICE \
// RUN: -check-prefix LINK %s
Expand All @@ -33,7 +33,7 @@
// RUN: -check-prefix NOINCLUDES-DEVICE -check-prefix LINK %s

// RUN: %clang -### --target=x86_64-linux-gnu --cuda-compile-host-device \
// RUN: --cuda-host-only -nogpulib -nogpuinc %s 2>&1 \
// RUN: --cuda-host-only --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
// RUN: | FileCheck -check-prefix NODEVICE -check-prefix HOST \
// RUN: -check-prefix NOINCLUDES-DEVICE -check-prefix LINK %s

Expand All @@ -47,28 +47,28 @@
// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
// RUN: -check-prefix NOHOST -check-prefix NOLINK %s

// RUN: %clang -### --target=x86_64-linux-gnu --cuda-host-only \
// RUN: %clang -### --cuda-include-ptx=all --target=x86_64-linux-gnu --cuda-host-only \
// RUN: -nogpulib -nogpuinc --cuda-compile-host-device %s 2>&1 \
// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
// RUN: -check-prefix HOST -check-prefix INCLUDES-DEVICE \
// RUN: -check-prefix LINK %s

// RUN: %clang -### --target=x86_64-linux-gnu --cuda-device-only \
// RUN: %clang -### --cuda-include-ptx=all --target=x86_64-linux-gnu --cuda-device-only \
// RUN: -nogpulib -nogpuinc --cuda-compile-host-device %s 2>&1 \
// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
// RUN: -check-prefix HOST -check-prefix INCLUDES-DEVICE \
// RUN: -check-prefix LINK %s

// Verify that --cuda-gpu-arch option passes the correct GPU architecture to
// device compilation.
// RUN: %clang -### -nogpulib -nogpuinc --target=x86_64-linux-gnu --cuda-gpu-arch=sm_52 -c %s 2>&1 \
// RUN: %clang -### -nogpulib -nogpuinc --cuda-include-ptx=all --target=x86_64-linux-gnu --cuda-gpu-arch=sm_52 -c %s 2>&1 \
// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
// RUN: -check-prefix DEVICE-SM52 -check-prefix HOST \
// RUN: -check-prefix INCLUDES-DEVICE -check-prefix NOLINK %s

// Verify that there is one device-side compilation per --cuda-gpu-arch args
// and that all results are included on the host side.
// RUN: %clang -### --target=x86_64-linux-gnu \
// RUN: %clang -### --cuda-include-ptx=all --target=x86_64-linux-gnu \
// RUN: -nogpulib -nogpuinc --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_52 -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes DEVICE,DEVICE-NOSAVE,DEVICE2 \
// RUN: -check-prefixes DEVICE-SM52,DEVICE2-SM60 \
Expand Down Expand Up @@ -128,9 +128,9 @@
// f) --no-cuda-gpu-arch=all negates all preceding --cuda-gpu-arch=X
// RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only \
// RUN: -nogpulib -nogpuinc --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_52 \
// RUN: --no-cuda-gpu-arch=all \
// RUN: --no-cuda-version-check --no-cuda-gpu-arch=all \
// RUN: --cuda-gpu-arch=sm_70 \
// RUN: -c -nogpulib -nogpuinc %s 2>&1 \
// RUN: -c --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \
// RUN: | FileCheck -check-prefixes NOARCH-SM52,NOARCH-SM60,ARCH-SM70 %s

// g) There's no --cuda-gpu-arch=all
Expand All @@ -141,9 +141,9 @@


// Verify that --[no-]cuda-include-ptx arguments are handled correctly.
// a) by default we're including PTX for all GPUs.
// a) by default we're not including PTX for all GPUs.
// RUN: %clang -### --target=x86_64-linux-gnu -nogpulib -nogpuinc \
// RUN: --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_52 \
// RUN: --cuda-include-ptx=all --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_52 \
// RUN: -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes FATBIN-COMMON,PTX-SM60,PTX-SM52 %s

Expand All @@ -157,12 +157,12 @@
// c) --no-cuda-include-ptx=sm_XX disables PTX inclusion for that GPU only.
// RUN: %clang -### --target=x86_64-linux-gnu -nogpulib -nogpuinc \
// RUN: --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_52 \
// RUN: --no-cuda-include-ptx=sm_60 \
// RUN: --no-cuda-include-ptx=sm_60 --cuda-include-ptx=sm_52 \
// RUN: -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes FATBIN-COMMON,NOPTX-SM60,PTX-SM52 %s
// RUN: %clang -### --target=x86_64-linux-gnu -nogpulib -nogpuinc \
// RUN: --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_52 \
// RUN: --no-cuda-include-ptx=sm_52 \
// RUN: --no-cuda-include-ptx=sm_52 --cuda-include-ptx=sm_60 \
// RUN: -c %s 2>&1 \
// RUN: | FileCheck -check-prefixes FATBIN-COMMON,PTX-SM60,NOPTX-SM52 %s

Expand All @@ -183,8 +183,8 @@
// Verify -flto=thin -fwhole-program-vtables handling. This should result in
// both options being passed to the host compilation, with neither passed to
// the device compilation.
// RUN: %clang -### --target=x86_64-linux-gnu -nogpulib -nogpuinc -c -flto=thin -fwhole-program-vtables %s 2>&1 \
// RUN: | FileCheck -check-prefixes DEVICE,DEVICE-NOSAVE,HOST,INCLUDES-DEVICE,NOLINK,THINLTOWPD %s
// RUN: %clang -### --cuda-include-ptx=sm_60 --target=x86_64-linux-gnu -nogpulib -nogpuinc -c -flto=thin -fwhole-program-vtables %s 2>&1 \
// RUN: | FileCheck -check-prefixes DEVICE,DEVICE-NOSAVE,HOST,NOLINK,THINLTOWPD %s
// THINLTOWPD-NOT: error: invalid argument '-fwhole-program-vtables' only allowed with '-flto'

// ARCH-SM52: "-cc1"{{.*}}"-target-cpu" "sm_52"
Expand Down
4 changes: 0 additions & 4 deletions clang/test/Driver/cuda-output-asm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,9 @@
// SM30-DAG: "-cc1" "-triple" "nvptx64-nvidia-cuda"
// SM30-same: "-target-cpu" "sm_30"

// RUN: not %clang -### -S --target=x86_64-linux-gnu -o foo.s %s 2>&1 \
// RUN: | FileCheck -check-prefix MULTIPLE-OUTPUT-FILES %s
// RUN: not %clang -### -S --target=x86_64-linux-gnu --cuda-device-only \
// RUN: --cuda-gpu-arch=sm_20 --cuda-gpu-arch=sm_30 -o foo.s %s 2>&1 \
// RUN: | FileCheck -check-prefix MULTIPLE-OUTPUT-FILES %s
// RUN: not %clang -### -emit-llvm -c --target=x86_64-linux-gnu -o foo.s %s 2>&1 \
// RUN: | FileCheck -check-prefix MULTIPLE-OUTPUT-FILES %s
// MULTIPLE-OUTPUT-FILES: error: cannot specify -o when generating multiple output files
// Make sure we do not get duplicate diagnostics.
// MULTIPLE-OUTPUT-FILES-NOT: error: cannot specify -o when generating multiple output files
Loading