From 712359dae4c1ef18c69459e7bfc073369f29ff28 Mon Sep 17 00:00:00 2001
From: agozillon <Andrew.Gozillon@amd.com>
Date: Tue, 21 Jan 2025 10:55:39 -0600
Subject: [PATCH 1/7] [Flang] Fix -werror from recent resolve-names.cpp
 changes, left over unused variable

---
 flang/lib/Semantics/resolve-names.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 9857ae61939d7..aee0357333159 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -8369,7 +8369,6 @@ bool DeclarationVisitor::FindAndMarkDeclareTargetSymbol(
         // Search preceding scopes until we find a matching symbol or run out
         // of scopes to search, we skip the current scope as it's already been
         // designated as implicit here.
-        Symbol *symbol = nullptr;
         for (auto *scope = &currScope().parent();; scope = &scope->parent()) {
           if (Symbol * symbol{scope->FindSymbol(name.source)}) {
             if (symbol->test(Symbol::Flag::Subroutine) ||

From b7abc510c515c4df521c84c6f664a138f8cf01e0 Mon Sep 17 00:00:00 2001
From: Danial Klimkin <dklimkin@google.com>
Date: Tue, 21 Jan 2025 18:00:59 +0100
Subject: [PATCH 2/7] [bazel]Fix bazel build past
 e7e3c45bc70904e24e2b3221ac8521e67eb84668 (#123780)

---
 .../llvm-project-overlay/mlir/BUILD.bazel     | 67 +------------------
 1 file changed, 3 insertions(+), 64 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 1ec1c4bfad562..092c2de414e36 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -4316,7 +4316,6 @@ cc_library(
         ":GPUToNVVMTransforms",
         ":GPUToROCDLTransforms",
         ":GPUToSPIRV",
-        ":GPUToVulkanTransforms",
         ":IndexToLLVM",
         ":IndexToSPIRV",
         ":LinalgToStandard",
@@ -6182,28 +6181,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "GPUToVulkanTransforms",
-    srcs = [
-        "lib/Conversion/GPUToVulkan/ConvertGPULaunchFuncToVulkanLaunchFunc.cpp",
-        "lib/Conversion/GPUToVulkan/ConvertLaunchFuncToVulkanCalls.cpp",
-    ],
-    hdrs = ["include/mlir/Conversion/GPUToVulkan/ConvertGPUToVulkanPass.h"],
-    includes = ["include"],
-    deps = [
-        ":ConversionPassIncGen",
-        ":FuncDialect",
-        ":GPUDialect",
-        ":IR",
-        ":LLVMDialect",
-        ":Pass",
-        ":SPIRVDialect",
-        ":SPIRVSerialization",
-        ":Support",
-        "//llvm:Support",
-    ],
-)
-
 cc_library(
     name = "GPUToGPURuntimeTransforms",
     srcs = [
@@ -9777,7 +9754,6 @@ cc_library(
         ":GPUToNVVMTransforms",
         ":GPUToROCDLTransforms",
         ":GPUToSPIRV",
-        ":GPUToVulkanTransforms",
         ":GPUTransformOps",
         ":GPUTransforms",
         ":IR",
@@ -10224,8 +10200,8 @@ cc_binary(
 
 cc_library(
     name = "VulkanRuntime",
-    srcs = ["tools/mlir-vulkan-runner/VulkanRuntime.cpp"],
-    hdrs = ["tools/mlir-vulkan-runner/VulkanRuntime.h"],
+    srcs = ["lib/ExecutionEngine/VulkanRuntime.cpp"],
+    hdrs = ["lib/ExecutionEngine/VulkanRuntime.h"],
     tags = [
         "manual",  # External dependency
     ],
@@ -10238,7 +10214,7 @@ cc_library(
 
 cc_binary(
     name = "libvulkan-runtime-wrappers.so",
-    srcs = ["tools/mlir-vulkan-runner/vulkan-runtime-wrappers.cpp"],
+    srcs = ["lib/ExecutionEngine/VulkanRuntimeWrappers.cpp"],
     linkshared = True,
     linkstatic = False,
     tags = [
@@ -10247,43 +10223,6 @@ cc_binary(
     deps = [":VulkanRuntime"],
 )
 
-cc_binary(
-    name = "mlir-vulkan-runner",
-    srcs = ["tools/mlir-vulkan-runner/mlir-vulkan-runner.cpp"],
-    deps = [
-        ":ArithDialect",
-        ":ArithToLLVM",
-        ":BuiltinToLLVMIRTranslation",
-        ":ControlFlowToLLVM",
-        ":ConvertToSPIRV",
-        ":ExecutionEngineUtils",
-        ":FuncDialect",
-        ":FuncToLLVM",
-        ":FuncToSPIRV",
-        ":GPUDialect",
-        ":GPUToSPIRV",
-        ":GPUToVulkanTransforms",
-        ":GPUTransforms",
-        ":LLVMCommonConversion",
-        ":LLVMDialect",
-        ":LLVMIRTransforms",
-        ":LLVMToLLVMIRTranslation",
-        ":MemRefDialect",
-        ":MemRefToLLVM",
-        ":MemRefTransforms",
-        ":MlirJitRunner",
-        ":Pass",
-        ":ReconcileUnrealizedCasts",
-        ":SCFDialect",
-        ":SPIRVDialect",
-        ":SPIRVTransforms",
-        ":ToLLVMIRTranslation",
-        ":VectorDialect",
-        ":VectorToLLVM",
-        "//llvm:Support",
-    ],
-)
-
 cc_library(
     name = "TableGen",
     srcs = glob(["lib/TableGen/*.cpp"]),

From cf6d79ad6e4ad3a3e58c8b4d50fb08da3efc2918 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas@arm.com>
Date: Tue, 21 Jan 2025 17:02:52 +0000
Subject: [PATCH 3/7] [FMV][GlobalOpt] Add an option for static resolution of
 non-FMV callers. (#123757)

Adds `optimize-non-fmv-callers` (disabled by default) as a short term
workaround to keep the llvm testsuite bots green, until we decide what
is the right solution for the problem which was previously addressed
with https://github.com/llvm/llvm-project/pull/123383.
---
 llvm/lib/Transforms/IPO/GlobalOpt.cpp | 35 ++++++++++++++++++---------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index eb97d8b4a74f3..00c20ad5f3709 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -95,6 +95,21 @@ STATISTIC(NumIFuncsDeleted, "Number of IFuncs removed");
 STATISTIC(NumGlobalArraysPadded,
           "Number of global arrays padded to alignment boundary");
 
+// FIXME:
+// Optimizing non-FMV callers is causing a regression in the llvm test suite,
+// specifically a 'predres' version is unexpectedly trapping on GravitonG4.
+// My explanation is that when the caller in not a versioned function, the
+// compiler exclusively relies on the command line option, or target attribute
+// to deduce whether a feature is available. However, there is no guarantee
+// that in reality the host supports those implied features, which arguably
+// is a user error. This option allows disabling the optimization as a short
+// term workaround to keep the bots green.
+static cl::opt<bool>
+    OptimizeNonFMVCallers("optimize-non-fmv-callers",
+                          cl::desc("Statically resolve calls to versioned "
+                                   "functions from non-versioned callers."),
+                          cl::init(false), cl::Hidden);
+
 static cl::opt<bool>
     EnableColdCCStressTest("enable-coldcc-stress-test",
                            cl::desc("Enable stress test of coldcc by adding "
@@ -2715,6 +2730,9 @@ static bool OptimizeNonTrivialIFuncs(
 
     assert(!Callees.empty() && "Expecting successful collection of versions");
 
+    LLVM_DEBUG(dbgs() << "Statically resolving calls to function "
+                      << Resolver->getName() << "\n");
+
     // Cache the feature mask for each callee.
     for (Function *Callee : Callees) {
       auto [It, Inserted] = FeatureMask.try_emplace(Callee);
@@ -2785,20 +2803,15 @@ static bool OptimizeNonTrivialIFuncs(
       } else {
         // We can't reason much about non-FMV callers. Just pick the highest
         // priority callee if it matches, otherwise bail.
-        // if (I > 0 || !implies(CallerBits, CalleeBits))
-        //
-        // FIXME: This is causing a regression in the llvm test suite,
-        // specifically a 'predres' version is unexpectedly trapping on
-        // GravitonG4. My explanation is that when the caller in not a
-        // versioned function, the compiler exclusively relies on the
-        // command line option, or target attribute to deduce whether a
-        // feature is available. However, there is no guarantee that in
-        // reality the host supports those implied features.
-        continue;
+        if (!OptimizeNonFMVCallers || I > 0 || !implies(CallerBits, CalleeBits))
+          continue;
       }
       auto &Calls = CallSites[Caller];
-      for (CallBase *CS : Calls)
+      for (CallBase *CS : Calls) {
+        LLVM_DEBUG(dbgs() << "Redirecting call " << Caller->getName() << " -> "
+                          << Callee->getName() << "\n");
         CS->setCalledOperand(Callee);
+      }
       Changed = true;
     }
     if (IF.use_empty() ||

From 485c80e1188192a4bb2a8cbddccdca82a6e33b81 Mon Sep 17 00:00:00 2001
From: Akira Hatanaka <ahatanak@gmail.com>
Date: Tue, 21 Jan 2025 09:03:50 -0800
Subject: [PATCH 4/7] [PAC] Ignore noexcept on function type when computing
 discriminator of member function pointers (#109056)

This fixes a bug where a member function pointer signed using a function type with noexcept as the discriminator was being authenticated using a function type without noexcept.

Fixes https://github.com/llvm/llvm-project/issues/106487.
---
 clang/lib/AST/ASTContext.cpp                  | 28 +++++++++
 .../ptrauth-member-function-pointer.cpp       | 63 +++++++++++++++++++
 2 files changed, 91 insertions(+)

diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 155dbcfcaeed3..a4ba9fd055346 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -3503,6 +3503,34 @@ uint16_t ASTContext::getPointerAuthTypeDiscriminator(QualType T) {
     encodeTypeForFunctionPointerAuth(*this, Out, T);
   } else {
     T = T.getUnqualifiedType();
+    // Calls to member function pointers don't need to worry about
+    // language interop or the laxness of the C type compatibility rules.
+    // We just mangle the member pointer type directly, which is
+    // implicitly much stricter about type matching. However, we do
+    // strip any top-level exception specification before this mangling.
+    // C++23 requires calls to work when the function type is convertible
+    // to the pointer type by a function pointer conversion, which can
+    // change the exception specification. This does not technically
+    // require the exception specification to not affect representation,
+    // because the function pointer conversion is still always a direct
+    // value conversion and therefore an opportunity to resign the
+    // pointer. (This is in contrast to e.g. qualification conversions,
+    // which can be applied in nested pointer positions, effectively
+    // requiring qualified and unqualified representations to match.)
+    // However, it is pragmatic to ignore exception specifications
+    // because it allows a certain amount of `noexcept` mismatching
+    // to not become a visible ODR problem. This also leaves some
+    // room for the committee to add laxness to function pointer
+    // conversions in future standards.
+    if (auto *MPT = T->getAs<MemberPointerType>())
+      if (MPT->isMemberFunctionPointer()) {
+        QualType PointeeType = MPT->getPointeeType();
+        if (PointeeType->castAs<FunctionProtoType>()->getExceptionSpecType() !=
+            EST_None) {
+          QualType FT = getFunctionTypeWithExceptionSpec(PointeeType, EST_None);
+          T = getMemberPointerType(FT, MPT->getClass());
+        }
+      }
     std::unique_ptr<MangleContext> MC(createMangleContext());
     MC->mangleCanonicalTypeName(T, Out);
   }
diff --git a/clang/test/CodeGenCXX/ptrauth-member-function-pointer.cpp b/clang/test/CodeGenCXX/ptrauth-member-function-pointer.cpp
index 0a9ac3fa510f5..e9436f11b5106 100644
--- a/clang/test/CodeGenCXX/ptrauth-member-function-pointer.cpp
+++ b/clang/test/CodeGenCXX/ptrauth-member-function-pointer.cpp
@@ -1,10 +1,12 @@
 // RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -o - %s | FileCheck -check-prefixes=CHECK,NODEBUG,DARWIN %s
+// RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++17 -O1 -disable-llvm-passes -o - %s | FileCheck -check-prefixes=CHECK,NODEBUG,DARWIN,CXX17 %s
 // RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -debug-info-kind=limited -o - %s | FileCheck -check-prefixes=CHECK,DARWIN %s
 // RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 1 -o - %s | FileCheck %s -check-prefix=STACK-PROT
 // RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 2 -o - %s | FileCheck %s -check-prefix=STACK-PROT
 // RUN: %clang_cc1 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 3 -o - %s | FileCheck %s -check-prefix=STACK-PROT
 
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -o - %s | FileCheck -check-prefixes=CHECK,NODEBUG,ELF %s
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++17 -O1 -disable-llvm-passes -o - %s | FileCheck -check-prefixes=CHECK,NODEBUG,ELF,CXX17 %s
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -debug-info-kind=limited -o - %s | FileCheck -check-prefixes=CHECK,ELF %s
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 1 -o - %s | FileCheck %s -check-prefix=STACK-PROT
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics -emit-llvm -std=c++11 -O1 -disable-llvm-passes -stack-protector 2 -o - %s | FileCheck %s -check-prefix=STACK-PROT
@@ -20,6 +22,10 @@
 // CHECK: @__const._Z13testArrayInitv.c0 = private unnamed_addr constant %struct.Class0 { { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base011nonvirtual0Ev, i32 0, i64 35591) to i64), i64 0 } }, align 8
 // CHECK: @__const._Z13testArrayInitv.c1 = private unnamed_addr constant %struct.Class0 { { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN5Base08virtual1Ev_vfpthunk_, i32 0, i64 35591) to i64), i64 0 } }, align 8
 
+// CHECK: @_ZN22testNoexceptConversion6mfptr1E = global { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN22testNoexceptConversion1S19nonvirtual_noexceptEv, i32 0, i64 [[TYPEDISC3:.*]]) to i64), i64 0 },
+// CHECK: @_ZN22testNoexceptConversion6mfptr2E = global { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN22testNoexceptConversion1S16virtual_noexceptEv_vfpthunk_, i32 0, i64 [[TYPEDISC3]]) to i64), i64 0 },
+// CHECK: @_ZN22testNoexceptConversion15mfptr3_noexceptE = global { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN22testNoexceptConversion1S19nonvirtual_noexceptEv, i32 0, i64 [[TYPEDISC3]]) to i64), i64 0 },
+
 // CHECK: @_ZTV5Base0 = unnamed_addr constant { [5 x ptr] } { [5 x ptr] [ptr null, ptr @_ZTI5Base0,
 // CHECK-SAME: ptr ptrauth (ptr @_ZN5Base08virtual1Ev, i32 0, i64 55600, ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV5Base0, i32 0, i32 0, i32 2)),
 // CHECK-SAME: ptr ptrauth (ptr @_ZN5Base08virtual3Ev, i32 0, i64 53007, ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV5Base0, i32 0, i32 0, i32 3)),
@@ -77,6 +83,9 @@ struct Derived1 : Base0, Base1 {
 };
 
 typedef void (Base0::*MethodTy0)();
+#if __cplusplus >= 201703L
+typedef void (Base0::*NoExceptMethodTy0)() noexcept;
+#endif
 typedef void (Base0::*VariadicMethodTy0)(int, ...);
 typedef void (Derived0::*MethodTy1)();
 
@@ -293,6 +302,16 @@ void test1(Base0 *a0, MethodTy0 a1) {
   (a0->*a1)();
 }
 
+// CXX17: define{{.*}} void @_Z14test1_noexceptP5Base0MS_DoFvvE(
+// CXX17: %[[V14:.*]] = phi ptr [ %{{.*}}, {{.*}} ], [ %{{.*}}, {{.*}} ]
+// CXX17: %[[V15:.*]] = phi i64 [ 0, {{.*}} ], [ [[TYPEDISC0]], {{.*}} ]
+// CXX17: call void %[[V14]](ptr noundef nonnull align {{[0-9]+}} dereferenceable(8) %{{.*}}) {{.*}}[ "ptrauth"(i32 0, i64 %[[V15]]) ]
+#if __cplusplus >= 201703L
+void test1_noexcept(Base0 *a0, NoExceptMethodTy0 a1) {
+  (a0->*a1)();
+}
+#endif
+
 // CHECK: define{{.*}} void @_Z15testConversion0M5Base0FvvEM8Derived0FvvE([2 x i64] %[[METHOD0_COERCE:.*]], [2 x i64] %[[METHOD1_COERCE:.*]])
 // CHECK: %[[METHOD0:.*]] = alloca { i64, i64 }, align 8
 // CHECK: %[[METHOD1:.*]] = alloca { i64, i64 }, align 8
@@ -438,3 +457,47 @@ void testArrayInit() {
 void testConvertNull() {
   VariadicMethodTy0 t = (VariadicMethodTy0)(MethodTy0{});
 }
+
+namespace testNoexceptConversion {
+
+// CHECK-LABEL: define internal void @__cxx_global_var_init()
+// CHECK: %[[V0:.*]] = load { i64, i64 }, ptr @_ZN22testNoexceptConversion15mfptr0_noexceptE, align 8
+// CHECK: store { i64, i64 } %[[V0]], ptr @_ZN22testNoexceptConversion6mfptr4E, align 8
+
+// CHECK: define {{.*}}void @_ZN22testNoexceptConversion5test0Ev()
+// CHECK: %[[P0:.*]] = alloca { i64, i64 }, align 8
+// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN22testNoexceptConversion1S19nonvirtual_noexceptEv, i32 0, i64 [[TYPEDISC3]]) to i64), i64 0 }, ptr %[[P0]], align 8,
+
+// CHECK: define {{.*}}void @_ZN22testNoexceptConversion5test1Ev()
+// CHECK: %[[P0:.*]] = alloca { i64, i64 }, align 8
+// CHECK: store { i64, i64 } { i64 ptrtoint (ptr ptrauth (ptr @_ZN22testNoexceptConversion1S16virtual_noexceptEv_vfpthunk_, i32 0, i64 [[TYPEDISC3]]) to i64), i64 0 }, ptr %[[P0]], align 8,
+
+// CHECK: define {{.*}}void @_ZN22testNoexceptConversion5test2Ev()
+// CHECK: %[[P0:.*]] = alloca { i64, i64 }, align 8
+// CHECK: %[[V0:.*]] = load { i64, i64 }, ptr @_ZN22testNoexceptConversion15mfptr0_noexceptE, align 8
+// CHECK: store { i64, i64 } %[[V0]], ptr %[[P0]], align 8,
+
+struct S {
+  void nonvirtual_noexcept() noexcept;
+  virtual void virtual_noexcept() noexcept;
+};
+
+void (S::*mfptr0_noexcept)() noexcept;
+void (S::*mfptr1)() = &S::nonvirtual_noexcept;
+void (S::*mfptr2)() = &S::virtual_noexcept;
+void (S::*mfptr3_noexcept)() noexcept = &S::nonvirtual_noexcept;
+void (S::*mfptr4)() = mfptr0_noexcept;
+
+void test0() {
+  void (S::*p0)() = &S::nonvirtual_noexcept;
+}
+
+void test1() {
+  void (S::*p0)() = &S::virtual_noexcept;
+}
+
+void test2() {
+  void (S::*p0)() = mfptr0_noexcept;
+}
+
+}

From e6c9cd9c060c1fa8343398b9556a5a6c0f35d515 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov@fb.com>
Date: Tue, 21 Jan 2025 09:04:49 -0800
Subject: [PATCH 5/7] [BOLT] Drop parsing sample PC when processing LBR perf
 data (#123420)

Remove options to generate autofdo data (unused) and `use-event-pc`
(not beneficial).

Cuts down perf2bolt time for 11GB perf.data by 40s (11:10->10:30).
---
 bolt/include/bolt/Profile/DataAggregator.h |   4 -
 bolt/lib/Profile/DataAggregator.cpp        | 109 ++-------------------
 2 files changed, 9 insertions(+), 104 deletions(-)

diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 320623cfa15af..aa83d7f9b13ab 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -80,7 +80,6 @@ class DataAggregator : public DataReader {
 private:
   struct PerfBranchSample {
     SmallVector<LBREntry, 32> LBR;
-    uint64_t PC;
   };
 
   struct PerfBasicSample {
@@ -334,9 +333,6 @@ class DataAggregator : public DataReader {
   /// Process all branch events.
   void processBranchEvents();
 
-  /// This member function supports generating data for AutoFDO LLVM tools.
-  std::error_code writeAutoFDOData(StringRef OutputFilename);
-
   /// Parse the full output generated by perf script to report non-LBR samples.
   std::error_code parseBasicEvents();
 
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 2b02086e3e0c9..de9ec6c1723d5 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -108,15 +108,6 @@ TimeAggregator("time-aggr",
   cl::ZeroOrMore,
   cl::cat(AggregatorCategory));
 
-static cl::opt<bool>
-    UseEventPC("use-event-pc",
-               cl::desc("use event PC in combination with LBR sampling"),
-               cl::cat(AggregatorCategory));
-
-static cl::opt<bool> WriteAutoFDOData(
-    "autofdo", cl::desc("generate autofdo textual data instead of bolt data"),
-    cl::cat(AggregatorCategory));
-
 } // namespace opts
 
 namespace {
@@ -187,15 +178,13 @@ void DataAggregator::start() {
                       /*Wait = */false);
   } else if (!opts::ITraceAggregation.empty()) {
     std::string ItracePerfScriptArgs = llvm::formatv(
-        "script -F pid,ip,brstack --itrace={0}", opts::ITraceAggregation);
+        "script -F pid,brstack --itrace={0}", opts::ITraceAggregation);
     launchPerfProcess("branch events with itrace", MainEventsPPI,
                       ItracePerfScriptArgs.c_str(),
                       /*Wait = */ false);
   } else {
-    launchPerfProcess("branch events",
-                      MainEventsPPI,
-                      "script -F pid,ip,brstack",
-                      /*Wait = */false);
+    launchPerfProcess("branch events", MainEventsPPI, "script -F pid,brstack",
+                      /*Wait = */ false);
   }
 
   // Note: we launch script for mem events regardless of the option, as the
@@ -381,67 +370,6 @@ void DataAggregator::parsePreAggregated() {
   }
 }
 
-std::error_code DataAggregator::writeAutoFDOData(StringRef OutputFilename) {
-  outs() << "PERF2BOLT: writing data for autofdo tools...\n";
-  NamedRegionTimer T("writeAutoFDO", "Processing branch events", TimerGroupName,
-                     TimerGroupDesc, opts::TimeAggregator);
-
-  std::error_code EC;
-  raw_fd_ostream OutFile(OutputFilename, EC, sys::fs::OpenFlags::OF_None);
-  if (EC)
-    return EC;
-
-  // Format:
-  // number of unique traces
-  // from_1-to_1:count_1
-  // from_2-to_2:count_2
-  // ......
-  // from_n-to_n:count_n
-  // number of unique sample addresses
-  // addr_1:count_1
-  // addr_2:count_2
-  // ......
-  // addr_n:count_n
-  // number of unique LBR entries
-  // src_1->dst_1:count_1
-  // src_2->dst_2:count_2
-  // ......
-  // src_n->dst_n:count_n
-
-  const uint64_t FirstAllocAddress = this->BC->FirstAllocAddress;
-
-  // AutoFDO addresses are relative to the first allocated loadable program
-  // segment
-  auto filterAddress = [&FirstAllocAddress](uint64_t Address) -> uint64_t {
-    if (Address < FirstAllocAddress)
-      return 0;
-    return Address - FirstAllocAddress;
-  };
-
-  OutFile << FallthroughLBRs.size() << "\n";
-  for (const auto &[Trace, Info] : FallthroughLBRs) {
-    OutFile << formatv("{0:x-}-{1:x-}:{2}\n", filterAddress(Trace.From),
-                       filterAddress(Trace.To),
-                       Info.InternCount + Info.ExternCount);
-  }
-
-  OutFile << BasicSamples.size() << "\n";
-  for (const auto [PC, HitCount] : BasicSamples)
-    OutFile << formatv("{0:x-}:{1}\n", filterAddress(PC), HitCount);
-
-  OutFile << BranchLBRs.size() << "\n";
-  for (const auto &[Trace, Info] : BranchLBRs) {
-    OutFile << formatv("{0:x-}->{1:x-}:{2}\n", filterAddress(Trace.From),
-                       filterAddress(Trace.To), Info.TakenCount);
-  }
-
-  outs() << "PERF2BOLT: wrote " << FallthroughLBRs.size() << " unique traces, "
-         << BasicSamples.size() << " sample addresses and " << BranchLBRs.size()
-         << " unique branches to " << OutputFilename << "\n";
-
-  return std::error_code();
-}
-
 void DataAggregator::filterBinaryMMapInfo() {
   if (opts::FilterPID) {
     auto MMapInfoIter = BinaryMMapInfo.find(opts::FilterPID);
@@ -583,15 +511,6 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
       (opts::BasicAggregation && parseBasicEvents()))
     errs() << "PERF2BOLT: failed to parse samples\n";
 
-  // We can finish early if the goal is just to generate data for autofdo
-  if (opts::WriteAutoFDOData) {
-    if (std::error_code EC = writeAutoFDOData(opts::OutputFilename))
-      errs() << "Error writing autofdo data to file: " << EC.message() << "\n";
-
-    deleteTempFiles();
-    exit(0);
-  }
-
   // Special handling for memory events
   if (prepareToParse("mem events", MemEventsPPI, MemEventsErrorCallback))
     return Error::success();
@@ -1158,14 +1077,6 @@ ErrorOr<DataAggregator::PerfBranchSample> DataAggregator::parseBranchSample() {
     return make_error_code(errc::no_such_process);
   }
 
-  while (checkAndConsumeFS()) {
-  }
-
-  ErrorOr<uint64_t> PCRes = parseHexField(FieldSeparator, true);
-  if (std::error_code EC = PCRes.getError())
-    return EC;
-  Res.PC = PCRes.get();
-
   if (checkAndConsumeNewLine())
     return Res;
 
@@ -1472,9 +1383,9 @@ std::error_code DataAggregator::printLBRHeatMap() {
 uint64_t DataAggregator::parseLBRSample(const PerfBranchSample &Sample,
                                         bool NeedsSkylakeFix) {
   uint64_t NumTraces{0};
-  // LBRs are stored in reverse execution order. NextPC refers to the next
-  // recorded executed PC.
-  uint64_t NextPC = opts::UseEventPC ? Sample.PC : 0;
+  // LBRs are stored in reverse execution order. NextLBR refers to the next
+  // executed branch record.
+  const LBREntry *NextLBR = nullptr;
   uint32_t NumEntry = 0;
   for (const LBREntry &LBR : Sample.LBR) {
     ++NumEntry;
@@ -1486,10 +1397,10 @@ uint64_t DataAggregator::parseLBRSample(const PerfBranchSample &Sample,
     // chronological order)
     if (NeedsSkylakeFix && NumEntry <= 2)
       continue;
-    if (NextPC) {
+    if (NextLBR) {
       // Record fall-through trace.
       const uint64_t TraceFrom = LBR.To;
-      const uint64_t TraceTo = NextPC;
+      const uint64_t TraceTo = NextLBR->From;
       const BinaryFunction *TraceBF =
           getBinaryFunctionContainingAddress(TraceFrom);
       if (TraceBF && TraceBF->containsAddress(TraceTo)) {
@@ -1524,7 +1435,7 @@ uint64_t DataAggregator::parseLBRSample(const PerfBranchSample &Sample,
       }
       ++NumTraces;
     }
-    NextPC = LBR.From;
+    NextLBR = &LBR;
 
     uint64_t From = getBinaryFunctionContainingAddress(LBR.From) ? LBR.From : 0;
     uint64_t To = getBinaryFunctionContainingAddress(LBR.To) ? LBR.To : 0;
@@ -1561,8 +1472,6 @@ std::error_code DataAggregator::parseBranchEvents() {
     ++NumSamples;
 
     PerfBranchSample &Sample = SampleRes.get();
-    if (opts::WriteAutoFDOData)
-      ++BasicSamples[Sample.PC];
 
     if (Sample.LBR.empty()) {
       ++NumSamplesNoLBR;

From 5fd563141b60d2a04188f7f1d4b900885fd1aa19 Mon Sep 17 00:00:00 2001
From: jofrn <jofernau@amd.com>
Date: Wed, 18 Dec 2024 03:33:03 -0500
Subject: [PATCH 6/7] IR/Verifier: Allow vector type in atomic load and store

Vector types on atomics are assumed to be invalid by the verifier. However,
this type can be valid if it is lowered by codegen.

commit-id:72529270
---
 llvm/docs/LangRef.rst         |  8 ++++----
 llvm/lib/IR/Verifier.cpp      | 14 ++++++++------
 llvm/test/Assembler/atomic.ll | 19 +++++++++++++++++++
 llvm/test/Verifier/atomics.ll | 15 ++++++++-------
 4 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 8cc9036d1b67f..c4003a7299e18 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -11095,8 +11095,8 @@ If the ``load`` is marked as ``atomic``, it takes an extra :ref:`ordering
 <ordering>` and optional ``syncscope("<target-scope>")`` argument. The
 ``release`` and ``acq_rel`` orderings are not valid on ``load`` instructions.
 Atomic loads produce :ref:`defined <memmodel>` results when they may see
-multiple atomic stores. The type of the pointee must be an integer, pointer, or
-floating-point type whose bit width is a power of two greater than or equal to
+multiple atomic stores. The type of the pointee must be an integer, pointer,
+floating-point, or vector type whose bit width is a power of two greater than or equal to
 eight and less than or equal to a target-specific size limit.  ``align`` must be
 explicitly specified on atomic loads. Note: if the alignment is not greater or
 equal to the size of the `<value>` type, the atomic operation is likely to
@@ -11236,8 +11236,8 @@ If the ``store`` is marked as ``atomic``, it takes an extra :ref:`ordering
 <ordering>` and optional ``syncscope("<target-scope>")`` argument. The
 ``acquire`` and ``acq_rel`` orderings aren't valid on ``store`` instructions.
 Atomic loads produce :ref:`defined <memmodel>` results when they may see
-multiple atomic stores. The type of the pointee must be an integer, pointer, or
-floating-point type whose bit width is a power of two greater than or equal to
+multiple atomic stores. The type of the pointee must be an integer, pointer,
+floating-point, or vector type whose bit width is a power of two greater than or equal to
 eight and less than or equal to a target-specific size limit.  ``align`` must be
 explicitly specified on atomic stores. Note: if the alignment is not greater or
 equal to the size of the `<value>` type, the atomic operation is likely to
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 7b6f7b5aa6171..f9f5d2faafec7 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4263,9 +4263,10 @@ void Verifier::visitLoadInst(LoadInst &LI) {
     Check(LI.getOrdering() != AtomicOrdering::Release &&
               LI.getOrdering() != AtomicOrdering::AcquireRelease,
           "Load cannot have Release ordering", &LI);
-    Check(ElTy->isIntOrPtrTy() || ElTy->isFloatingPointTy(),
-          "atomic load operand must have integer, pointer, or floating point "
-          "type!",
+    Check(ElTy->getScalarType()->isIntOrPtrTy() ||
+              ElTy->getScalarType()->isFloatingPointTy(),
+          "atomic load operand must have integer, pointer, floating point, "
+          "or vector type!",
           ElTy, &LI);
     checkAtomicMemAccessSize(ElTy, &LI);
   } else {
@@ -4289,9 +4290,10 @@ void Verifier::visitStoreInst(StoreInst &SI) {
     Check(SI.getOrdering() != AtomicOrdering::Acquire &&
               SI.getOrdering() != AtomicOrdering::AcquireRelease,
           "Store cannot have Acquire ordering", &SI);
-    Check(ElTy->isIntOrPtrTy() || ElTy->isFloatingPointTy(),
-          "atomic store operand must have integer, pointer, or floating point "
-          "type!",
+    Check(ElTy->getScalarType()->isIntOrPtrTy() ||
+              ElTy->getScalarType()->isFloatingPointTy(),
+          "atomic store operand must have integer, pointer, floating point, "
+          "or vector type!",
           ElTy, &SI);
     checkAtomicMemAccessSize(ElTy, &SI);
   } else {
diff --git a/llvm/test/Assembler/atomic.ll b/llvm/test/Assembler/atomic.ll
index a44dcccc16bef..75f5d0c0741fb 100644
--- a/llvm/test/Assembler/atomic.ll
+++ b/llvm/test/Assembler/atomic.ll
@@ -52,6 +52,25 @@ define void @f(ptr %x) {
   ; CHECK: atomicrmw volatile usub_sat ptr %x, i32 10 syncscope("agent") monotonic
   atomicrmw volatile usub_sat ptr %x, i32 10 syncscope("agent") monotonic
 
+  ; CHECK : load atomic <1 x i32>, ptr %x unordered, align 4
+  load atomic <1 x i32>, ptr %x unordered, align 4
+  ; CHECK : store atomic <1 x i32> splat (i32 3), ptr %x release, align 4
+  store atomic <1 x i32> <i32 3>, ptr %x release, align 4
+  ; CHECK : load atomic <2 x i32>, ptr %x unordered, align 4
+  load atomic <2 x i32>, ptr %x unordered, align 4
+  ; CHECK : store atomic <2 x i32> <i32 3, i32 4>, ptr %x release, align 4
+  store atomic <2 x i32> <i32 3, i32 4>, ptr %x release, align 4
+
+  ; CHECK : load atomic <2 x ptr>, ptr %x unordered, align 4
+  load atomic <2 x ptr>, ptr %x unordered, align 4
+  ; CHECK : store atomic <2 x ptr> zeroinitializer, ptr %x release, align 4
+  store atomic <2 x ptr> zeroinitializer, ptr %x release, align 4
+
+  ; CHECK : load atomic <2 x float>, ptr %x unordered, align 4
+  load atomic <2 x float>, ptr %x unordered, align 4
+  ; CHECK : store atomic <2 x float> <float 3.0, float 4.0>, ptr %x release, align 4
+  store atomic <2 x float> <float 3.0, float 4.0>, ptr %x release, align 4
+
   ; CHECK: fence syncscope("singlethread") release
   fence syncscope("singlethread") release
   ; CHECK: fence seq_cst
diff --git a/llvm/test/Verifier/atomics.ll b/llvm/test/Verifier/atomics.ll
index f835b98b24345..17bf5a0528d73 100644
--- a/llvm/test/Verifier/atomics.ll
+++ b/llvm/test/Verifier/atomics.ll
@@ -1,14 +1,15 @@
 ; RUN: not opt -passes=verify < %s 2>&1 | FileCheck %s
+; CHECK: atomic store operand must have integer, pointer, floating point, or vector type!
+; CHECK: atomic load operand must have integer, pointer, floating point, or vector type!
 
-; CHECK: atomic store operand must have integer, pointer, or floating point type!
-; CHECK: atomic load operand must have integer, pointer, or floating point type!
+%ty = type { i32 };
 
-define void @foo(ptr %P, <1 x i64> %v) {
-  store atomic <1 x i64> %v, ptr %P unordered, align 8
+define void @foo(ptr %P, %ty %v) {
+  store atomic %ty %v, ptr %P unordered, align 8
   ret void
 }
 
-define <1 x i64> @bar(ptr %P) {
-  %v = load atomic <1 x i64>, ptr %P unordered, align 8
-  ret <1 x i64> %v
+define %ty @bar(ptr %P) {
+  %v = load atomic %ty, ptr %P unordered, align 8
+  ret %ty %v
 }

From ff979b4758b6d7b3120bd656726d273414c44170 Mon Sep 17 00:00:00 2001
From: jofrn <jofernau@amd.com>
Date: Wed, 18 Dec 2024 03:37:17 -0500
Subject: [PATCH 7/7] [SelectionDAG] Legalize <1 x T> vector types for atomic
 load

`load atomic <1 x T>` is not valid. This change legalizes
vector types of atomic load via scalarization in SelectionDAG
so that it can, for example, translate from `v1i32` to `i32`.

commit-id:5c36cc8c
---
 llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h |   1 +
 .../SelectionDAG/LegalizeVectorTypes.cpp      |  15 +++
 llvm/test/CodeGen/X86/atomic-load-store.ll    | 121 +++++++++++++++++-
 3 files changed, 135 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index f13f70e66cfaa..893fca233d191 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -863,6 +863,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
   SDValue ScalarizeVecRes_UnaryOpWithExtraInput(SDNode *N);
   SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N);
   SDValue ScalarizeVecRes_LOAD(LoadSDNode *N);
+  SDValue ScalarizeVecRes_ATOMIC_LOAD(AtomicSDNode *N);
   SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N);
   SDValue ScalarizeVecRes_VSELECT(SDNode *N);
   SDValue ScalarizeVecRes_SELECT(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index f39d9ca15496a..980c157fe6f3f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -64,6 +64,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
     R = ScalarizeVecRes_UnaryOpWithExtraInput(N);
     break;
   case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break;
+  case ISD::ATOMIC_LOAD:
+    R = ScalarizeVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N));
+    break;
   case ISD::LOAD:           R = ScalarizeVecRes_LOAD(cast<LoadSDNode>(N));break;
   case ISD::SCALAR_TO_VECTOR:  R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break;
   case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break;
@@ -457,6 +460,18 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) {
   return Op;
 }
 
+SDValue DAGTypeLegalizer::ScalarizeVecRes_ATOMIC_LOAD(AtomicSDNode *N) {
+  SDValue Result = DAG.getAtomic(
+      ISD::ATOMIC_LOAD, SDLoc(N), N->getMemoryVT().getVectorElementType(),
+      N->getValueType(0).getVectorElementType(), N->getChain(), N->getBasePtr(),
+      N->getMemOperand());
+
+  // Legalize the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Result.getValue(1));
+  return Result;
+}
+
 SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) {
   assert(N->isUnindexed() && "Indexed vector load?");
 
diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll
index 5bce4401f7bdb..d23cfb89f9fc8 100644
--- a/llvm/test/CodeGen/X86/atomic-load-store.ll
+++ b/llvm/test/CodeGen/X86/atomic-load-store.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-macosx10.7.0 -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-macosx10.7.0 -verify-machineinstrs -O0 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.7.0 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK3
+; RUN: llc < %s -mtriple=x86_64-apple-macosx10.7.0 -verify-machineinstrs -O0 | FileCheck %s --check-prefixes=CHECK,CHECK0
 
 define void @test1(ptr %ptr, i32 %val1) {
 ; CHECK-LABEL: test1:
@@ -28,3 +28,120 @@ define i32 @test3(ptr %ptr) {
   %val = load atomic i32, ptr %ptr seq_cst, align 4
   ret i32 %val
 }
+
+define <1 x i32> @atomic_vec1_i32(ptr %x) {
+; CHECK-LABEL: atomic_vec1_i32:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movl (%rdi), %eax
+; CHECK-NEXT:    retq
+  %ret = load atomic <1 x i32>, ptr %x acquire, align 4
+  ret <1 x i32> %ret
+}
+
+define <1 x i8> @atomic_vec1_i8(ptr %x) {
+; CHECK3-LABEL: atomic_vec1_i8:
+; CHECK3:       ## %bb.0:
+; CHECK3-NEXT:    movzbl (%rdi), %eax
+; CHECK3-NEXT:    retq
+;
+; CHECK0-LABEL: atomic_vec1_i8:
+; CHECK0:       ## %bb.0:
+; CHECK0-NEXT:    movb (%rdi), %al
+; CHECK0-NEXT:    retq
+  %ret = load atomic <1 x i8>, ptr %x acquire, align 1
+  ret <1 x i8> %ret
+}
+
+define <1 x i16> @atomic_vec1_i16(ptr %x) {
+; CHECK3-LABEL: atomic_vec1_i16:
+; CHECK3:       ## %bb.0:
+; CHECK3-NEXT:    movzwl (%rdi), %eax
+; CHECK3-NEXT:    retq
+;
+; CHECK0-LABEL: atomic_vec1_i16:
+; CHECK0:       ## %bb.0:
+; CHECK0-NEXT:    movw (%rdi), %ax
+; CHECK0-NEXT:    retq
+  %ret = load atomic <1 x i16>, ptr %x acquire, align 2
+  ret <1 x i16> %ret
+}
+
+define <1 x i32> @atomic_vec1_i8_zext(ptr %x) {
+; CHECK3-LABEL: atomic_vec1_i8_zext:
+; CHECK3:       ## %bb.0:
+; CHECK3-NEXT:    movzbl (%rdi), %eax
+; CHECK3-NEXT:    movzbl %al, %eax
+; CHECK3-NEXT:    retq
+;
+; CHECK0-LABEL: atomic_vec1_i8_zext:
+; CHECK0:       ## %bb.0:
+; CHECK0-NEXT:    movb (%rdi), %al
+; CHECK0-NEXT:    movzbl %al, %eax
+; CHECK0-NEXT:    retq
+  %ret = load atomic <1 x i8>, ptr %x acquire, align 1
+  %zret = zext <1 x i8> %ret to <1 x i32>
+  ret <1 x i32> %zret
+}
+
+define <1 x i64> @atomic_vec1_i16_sext(ptr %x) {
+; CHECK3-LABEL: atomic_vec1_i16_sext:
+; CHECK3:       ## %bb.0:
+; CHECK3-NEXT:    movzwl (%rdi), %eax
+; CHECK3-NEXT:    movswq %ax, %rax
+; CHECK3-NEXT:    retq
+;
+; CHECK0-LABEL: atomic_vec1_i16_sext:
+; CHECK0:       ## %bb.0:
+; CHECK0-NEXT:    movw (%rdi), %ax
+; CHECK0-NEXT:    movswq %ax, %rax
+; CHECK0-NEXT:    retq
+  %ret = load atomic <1 x i16>, ptr %x acquire, align 2
+  %sret = sext <1 x i16> %ret to <1 x i64>
+  ret <1 x i64> %sret
+}
+
+define <1 x ptr addrspace(270)> @atomic_vec1_ptr270(ptr %x) {
+; CHECK-LABEL: atomic_vec1_ptr270:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movl (%rdi), %eax
+; CHECK-NEXT:    retq
+  %ret = load atomic <1 x ptr addrspace(270)>, ptr %x acquire, align 4
+  ret <1 x ptr addrspace(270)> %ret
+}
+
+define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) {
+; CHECK3-LABEL: atomic_vec1_bfloat:
+; CHECK3:       ## %bb.0:
+; CHECK3-NEXT:    movzwl (%rdi), %eax
+; CHECK3-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK3-NEXT:    retq
+;
+; CHECK0-LABEL: atomic_vec1_bfloat:
+; CHECK0:       ## %bb.0:
+; CHECK0-NEXT:    movw (%rdi), %cx
+; CHECK0-NEXT:    ## implicit-def: $eax
+; CHECK0-NEXT:    movw %cx, %ax
+; CHECK0-NEXT:    ## implicit-def: $xmm0
+; CHECK0-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK0-NEXT:    retq
+  %ret = load atomic <1 x bfloat>, ptr %x acquire, align 2
+  ret <1 x bfloat> %ret
+}
+
+define <1 x ptr> @atomic_vec1_ptr_align(ptr %x) nounwind {
+; CHECK-LABEL: atomic_vec1_ptr_align:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    retq
+  %ret = load atomic <1 x ptr>, ptr %x acquire, align 8
+  ret <1 x ptr> %ret
+}
+
+define <1 x i64> @atomic_vec1_i64_align(ptr %x) nounwind {
+; CHECK-LABEL: atomic_vec1_i64_align:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    retq
+  %ret = load atomic <1 x i64>, ptr %x acquire, align 8
+  ret <1 x i64> %ret
+}