From ddb75ca6062c9312896647c206f8b96574bef11d Mon Sep 17 00:00:00 2001
From: Chaitanya
Date: Wed, 24 Jul 2024 12:19:57 +0530
Subject: [PATCH] [AMDGPU] Utilities to asan instrument memory instructions.
 (#98863)

This change adds the utilities required to ASan-instrument memory
instructions. In the "amdgpu-sw-lower-lds" pass (#87265), lowering LDS
accesses to global memory creates new global-memory instructions, which
need to be ASan-instrumented.
---
 .../Utils/AMDGPUAsanInstrumentation.cpp       | 332 ++++++++++++++++++
 .../AMDGPU/Utils/AMDGPUAsanInstrumentation.h  |  60 ++++
 llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt   |   1 +
 3 files changed, 393 insertions(+)
 create mode 100644 llvm/lib/Target/AMDGPU/Utils/AMDGPUAsanInstrumentation.cpp
 create mode 100644 llvm/lib/Target/AMDGPU/Utils/AMDGPUAsanInstrumentation.h

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsanInstrumentation.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsanInstrumentation.cpp
new file mode 100644
index 00000000000000..593fca5bc3ed68
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsanInstrumentation.cpp
@@ -0,0 +1,332 @@
+//===AMDGPUAsanInstrumentation.cpp - ASAN related helper functions===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-------------------------------------------------------------===//
+
+#include "AMDGPUAsanInstrumentation.h"
+
+#define DEBUG_TYPE "amdgpu-asan-instrumentation"
+
+using namespace llvm;
+
+namespace llvm {
+namespace AMDGPU {
+
+static uint64_t getRedzoneSizeForScale(int AsanScale) {
+  // Redzone used for stack and globals is at least 32 bytes.
+  // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively.
+  return std::max(32U, 1U << AsanScale);
+}
+
+static uint64_t getMinRedzoneSizeForGlobal(int AsanScale) {
+  return getRedzoneSizeForScale(AsanScale);
+}
+
+uint64_t getRedzoneSizeForGlobal(int AsanScale, uint64_t SizeInBytes) {
+  constexpr uint64_t kMaxRZ = 1 << 18;
+  const uint64_t MinRZ = getMinRedzoneSizeForGlobal(AsanScale);
+
+  uint64_t RZ = 0;
+  if (SizeInBytes <= MinRZ / 2) {
+    // Reduce redzone size for small size objects, e.g. int, char[1]. MinRZ is
+    // at least 32 bytes, optimize when SizeInBytes is less than or equal to
+    // half of MinRZ.
+    RZ = MinRZ - SizeInBytes;
+  } else {
+    // Calculate RZ, where MinRZ <= RZ <= MaxRZ, and RZ ~ 1/4 * SizeInBytes.
+    RZ = std::clamp((SizeInBytes / MinRZ / 4) * MinRZ, MinRZ, kMaxRZ);
+
+    // Round up to multiple of MinRZ.
+    if (SizeInBytes % MinRZ)
+      RZ += MinRZ - (SizeInBytes % MinRZ);
+  }
+
+  assert((RZ + SizeInBytes) % MinRZ == 0);
+
+  return RZ;
+}
+
+static size_t TypeStoreSizeToSizeIndex(uint32_t TypeSize) {
+  size_t Res = llvm::countr_zero(TypeSize / 8);
+  return Res;
+}
+
+static Instruction *genAMDGPUReportBlock(Module &M, IRBuilder<> &IRB,
+                                         Value *Cond, bool Recover) {
+  Value *ReportCond = Cond;
+  if (!Recover) {
+    auto *Ballot =
+        IRB.CreateIntrinsic(Intrinsic::amdgcn_ballot, IRB.getInt64Ty(), {Cond});
+    ReportCond = IRB.CreateIsNotNull(Ballot);
+  }
+
+  auto *Trm = SplitBlockAndInsertIfThen(
+      ReportCond, &*IRB.GetInsertPoint(), false,
+      MDBuilder(M.getContext()).createUnlikelyBranchWeights());
+  Trm->getParent()->setName("asan.report");
+
+  if (Recover)
+    return Trm;
+
+  Trm = SplitBlockAndInsertIfThen(Cond, Trm, false);
+  IRB.SetInsertPoint(Trm);
+  return IRB.CreateIntrinsic(Intrinsic::amdgcn_unreachable, {}, {});
+}
+
+static Value *createSlowPathCmp(Module &M, IRBuilder<> &IRB, Type *IntptrTy,
+                                Value *AddrLong, Value *ShadowValue,
+                                uint32_t TypeStoreSize, int AsanScale) {
+  uint64_t Granularity = static_cast<uint64_t>(1) << AsanScale;
+  // Addr & (Granularity - 1)
+  Value *LastAccessedByte =
+      IRB.CreateAnd(AddrLong, ConstantInt::get(IntptrTy, Granularity - 1));
+  // (Addr & (Granularity - 1)) + size - 1
+  if (TypeStoreSize / 8 > 1)
+    LastAccessedByte = IRB.CreateAdd(
+        LastAccessedByte, ConstantInt::get(IntptrTy, TypeStoreSize / 8 - 1));
+  // (uint8_t) ((Addr & (Granularity-1)) + size - 1)
+  LastAccessedByte =
+      IRB.CreateIntCast(LastAccessedByte, ShadowValue->getType(), false);
+  // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue
+  return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue);
+}
+
+static Instruction *generateCrashCode(Module &M, IRBuilder<> &IRB,
+                                      Type *IntptrTy, Instruction *InsertBefore,
+                                      Value *Addr, bool IsWrite,
+                                      size_t AccessSizeIndex,
+                                      Value *SizeArgument, bool Recover) {
+  IRB.SetInsertPoint(InsertBefore);
+  CallInst *Call = nullptr;
+  SmallString<128> kAsanReportErrorTemplate{"__asan_report_"};
+  SmallString<64> TypeStr{IsWrite ? "store" : "load"};
+  SmallString<64> EndingStr{Recover ? "_noabort" : ""};
"_noabort" : ""}; + + SmallString<128> AsanErrorCallbackSizedString; + raw_svector_ostream AsanErrorCallbackSizedOS(AsanErrorCallbackSizedString); + AsanErrorCallbackSizedOS << kAsanReportErrorTemplate << TypeStr << "_n" + << EndingStr; + + SmallVector Args2 = {IntptrTy, IntptrTy}; + AttributeList AL2; + FunctionCallee AsanErrorCallbackSized = M.getOrInsertFunction( + AsanErrorCallbackSizedOS.str(), + FunctionType::get(IRB.getVoidTy(), Args2, false), AL2); + SmallVector Args1{1, IntptrTy}; + AttributeList AL1; + + SmallString<128> AsanErrorCallbackString; + raw_svector_ostream AsanErrorCallbackOS(AsanErrorCallbackString); + AsanErrorCallbackOS << kAsanReportErrorTemplate << TypeStr + << (1ULL << AccessSizeIndex) << EndingStr; + + FunctionCallee AsanErrorCallback = M.getOrInsertFunction( + AsanErrorCallbackOS.str(), + FunctionType::get(IRB.getVoidTy(), Args1, false), AL1); + if (SizeArgument) { + Call = IRB.CreateCall(AsanErrorCallbackSized, {Addr, SizeArgument}); + } else { + Call = IRB.CreateCall(AsanErrorCallback, Addr); + } + + Call->setCannotMerge(); + return Call; +} + +static Value *memToShadow(Module &M, IRBuilder<> &IRB, Type *IntptrTy, + Value *Shadow, int AsanScale, uint32_t AsanOffset) { + // Shadow >> scale + Shadow = IRB.CreateLShr(Shadow, AsanScale); + if (AsanOffset == 0) + return Shadow; + // (Shadow >> scale) | offset + Value *ShadowBase = ConstantInt::get(IntptrTy, AsanOffset); + return IRB.CreateAdd(Shadow, ShadowBase); +} + +void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns, + Instruction *InsertBefore, Value *Addr, + MaybeAlign Alignment, uint32_t TypeStoreSize, + bool IsWrite, Value *SizeArgument, bool UseCalls, + bool Recover, int AsanScale, int AsanOffset) { + Type *AddrTy = Addr->getType(); + Type *IntptrTy = M.getDataLayout().getIntPtrType( + M.getContext(), AddrTy->getPointerAddressSpace()); + IRB.SetInsertPoint(InsertBefore); + size_t AccessSizeIndex = TypeStoreSizeToSizeIndex(TypeStoreSize); + Type *ShadowTy = IntegerType::get(M.getContext(), + std::max(8U, TypeStoreSize >> AsanScale)); + Type *ShadowPtrTy = PointerType::get(ShadowTy, 0); + Value *AddrLong = IRB.CreatePtrToInt(Addr, IntptrTy); + Value *ShadowPtr = + memToShadow(M, IRB, IntptrTy, AddrLong, AsanScale, AsanOffset); + const uint64_t ShadowAlign = + std::max(Alignment.valueOrOne().value() >> AsanScale, 1); + Value *ShadowValue = IRB.CreateAlignedLoad( + ShadowTy, IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy), Align(ShadowAlign)); + Value *Cmp = IRB.CreateIsNotNull(ShadowValue); + auto *Cmp2 = createSlowPathCmp(M, IRB, IntptrTy, AddrLong, ShadowValue, + TypeStoreSize, AsanScale); + Cmp = IRB.CreateAnd(Cmp, Cmp2); + Instruction *CrashTerm = genAMDGPUReportBlock(M, IRB, Cmp, Recover); + Instruction *Crash = + generateCrashCode(M, IRB, IntptrTy, CrashTerm, AddrLong, IsWrite, + AccessSizeIndex, SizeArgument, Recover); + Crash->setDebugLoc(OrigIns->getDebugLoc()); + return; +} + +void getInterestingMemoryOperands( + Module &M, Instruction *I, + SmallVectorImpl &Interesting) { + const DataLayout &DL = M.getDataLayout(); + if (LoadInst *LI = dyn_cast(I)) { + Interesting.emplace_back(I, LI->getPointerOperandIndex(), false, + LI->getType(), LI->getAlign()); + } else if (StoreInst *SI = dyn_cast(I)) { + Interesting.emplace_back(I, SI->getPointerOperandIndex(), true, + SI->getValueOperand()->getType(), SI->getAlign()); + } else if (AtomicRMWInst *RMW = dyn_cast(I)) { + Interesting.emplace_back(I, RMW->getPointerOperandIndex(), true, + RMW->getValOperand()->getType(), std::nullopt); + } else 
+    Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true,
+                             XCHG->getCompareOperand()->getType(),
+                             std::nullopt);
+  } else if (auto CI = dyn_cast<IntrinsicInst>(I)) {
+    switch (CI->getIntrinsicID()) {
+    case Intrinsic::masked_load:
+    case Intrinsic::masked_store:
+    case Intrinsic::masked_gather:
+    case Intrinsic::masked_scatter: {
+      bool IsWrite = CI->getType()->isVoidTy();
+      // Masked store has an initial operand for the value.
+      unsigned OpOffset = IsWrite ? 1 : 0;
+      Type *Ty = IsWrite ? CI->getArgOperand(0)->getType() : CI->getType();
+      MaybeAlign Alignment = Align(1);
+      // Otherwise no alignment guarantees. We probably got Undef.
+      if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
+        Alignment = Op->getMaybeAlignValue();
+      Value *Mask = CI->getOperand(2 + OpOffset);
+      Interesting.emplace_back(I, OpOffset, IsWrite, Ty, Alignment, Mask);
+      break;
+    }
+    case Intrinsic::masked_expandload:
+    case Intrinsic::masked_compressstore: {
+      bool IsWrite = CI->getIntrinsicID() == Intrinsic::masked_compressstore;
+      unsigned OpOffset = IsWrite ? 1 : 0;
+      auto BasePtr = CI->getOperand(OpOffset);
+      MaybeAlign Alignment = BasePtr->getPointerAlignment(DL);
+      Type *Ty = IsWrite ? CI->getArgOperand(0)->getType() : CI->getType();
+      IRBuilder<> IB(I);
+      Value *Mask = CI->getOperand(1 + OpOffset);
+      Type *IntptrTy = M.getDataLayout().getIntPtrType(
+          M.getContext(), BasePtr->getType()->getPointerAddressSpace());
+      // Use the popcount of Mask as the effective vector length.
+      Type *ExtTy = VectorType::get(IntptrTy, cast<VectorType>(Ty));
+      Value *ExtMask = IB.CreateZExt(Mask, ExtTy);
+      Value *EVL = IB.CreateAddReduce(ExtMask);
+      Value *TrueMask = ConstantInt::get(Mask->getType(), 1);
+      Interesting.emplace_back(I, OpOffset, IsWrite, Ty, Alignment, TrueMask,
+                               EVL);
+      break;
+    }
+    case Intrinsic::vp_load:
+    case Intrinsic::vp_store:
+    case Intrinsic::experimental_vp_strided_load:
+    case Intrinsic::experimental_vp_strided_store: {
+      auto *VPI = cast<VPIntrinsic>(CI);
+      unsigned IID = CI->getIntrinsicID();
+      bool IsWrite = CI->getType()->isVoidTy();
+      unsigned PtrOpNo = *VPI->getMemoryPointerParamPos(IID);
+      Type *Ty = IsWrite ? CI->getArgOperand(0)->getType() : CI->getType();
+      MaybeAlign Alignment = VPI->getOperand(PtrOpNo)->getPointerAlignment(DL);
+      Value *Stride = nullptr;
+      if (IID == Intrinsic::experimental_vp_strided_store ||
+          IID == Intrinsic::experimental_vp_strided_load) {
+        Stride = VPI->getOperand(PtrOpNo + 1);
+        // Use the pointer alignment as the element alignment if the stride is
+        // a multiple of the pointer alignment. Otherwise, the element
+        // alignment should be Align(1).
+        unsigned PointerAlign = Alignment.valueOrOne().value();
+        if (!isa<ConstantInt>(Stride) ||
+            cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
+          Alignment = Align(1);
+      }
+      Interesting.emplace_back(I, PtrOpNo, IsWrite, Ty, Alignment,
+                               VPI->getMaskParam(), VPI->getVectorLengthParam(),
+                               Stride);
+      break;
+    }
+    case Intrinsic::vp_gather:
+    case Intrinsic::vp_scatter: {
+      auto *VPI = cast<VPIntrinsic>(CI);
+      unsigned IID = CI->getIntrinsicID();
+      bool IsWrite = IID == Intrinsic::vp_scatter;
+      unsigned PtrOpNo = *VPI->getMemoryPointerParamPos(IID);
+      Type *Ty = IsWrite ? CI->getArgOperand(0)->getType() : CI->getType();
+      MaybeAlign Alignment = VPI->getPointerAlignment();
+      Interesting.emplace_back(I, PtrOpNo, IsWrite, Ty, Alignment,
+                               VPI->getMaskParam(),
+                               VPI->getVectorLengthParam());
+      break;
+    }
+    case Intrinsic::amdgcn_raw_buffer_load:
+    case Intrinsic::amdgcn_raw_ptr_buffer_load:
+    case Intrinsic::amdgcn_raw_buffer_load_format:
+    case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
+    case Intrinsic::amdgcn_raw_tbuffer_load:
+    case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
+    case Intrinsic::amdgcn_struct_buffer_load:
+    case Intrinsic::amdgcn_struct_ptr_buffer_load:
+    case Intrinsic::amdgcn_struct_buffer_load_format:
+    case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
+    case Intrinsic::amdgcn_struct_tbuffer_load:
+    case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
+    case Intrinsic::amdgcn_s_buffer_load:
+    case Intrinsic::amdgcn_global_load_tr_b64:
+    case Intrinsic::amdgcn_global_load_tr_b128: {
+      unsigned PtrOpNo = 0;
+      bool IsWrite = false;
+      Type *Ty = CI->getType();
+      Value *Ptr = CI->getArgOperand(PtrOpNo);
+      MaybeAlign Alignment = Ptr->getPointerAlignment(DL);
+      Interesting.emplace_back(I, PtrOpNo, IsWrite, Ty, Alignment);
+      break;
+    }
+    case Intrinsic::amdgcn_raw_tbuffer_store:
+    case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
+    case Intrinsic::amdgcn_raw_buffer_store:
+    case Intrinsic::amdgcn_raw_ptr_buffer_store:
+    case Intrinsic::amdgcn_raw_buffer_store_format:
+    case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
+    case Intrinsic::amdgcn_struct_buffer_store:
+    case Intrinsic::amdgcn_struct_ptr_buffer_store:
+    case Intrinsic::amdgcn_struct_buffer_store_format:
+    case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
+    case Intrinsic::amdgcn_struct_tbuffer_store:
+    case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
+      unsigned PtrOpNo = 1;
+      bool IsWrite = true;
+      Value *Ptr = CI->getArgOperand(PtrOpNo);
+      Type *Ty = Ptr->getType();
+      MaybeAlign Alignment = Ptr->getPointerAlignment(DL);
+      Interesting.emplace_back(I, PtrOpNo, IsWrite, Ty, Alignment);
+      break;
+    }
+    default:
+      for (unsigned ArgNo = 0; ArgNo < CI->arg_size(); ArgNo++) {
+        if (Type *Ty = CI->getParamByRefType(ArgNo)) {
+          Interesting.emplace_back(I, ArgNo, false, Ty, Align(1));
+        } else if (Type *Ty = CI->getParamByValType(ArgNo)) {
+          Interesting.emplace_back(I, ArgNo, false, Ty, Align(1));
+        }
+      }
+    }
+  }
+}
+} // end namespace AMDGPU
+} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsanInstrumentation.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsanInstrumentation.h
new file mode 100644
index 00000000000000..b438004aa6ce79
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsanInstrumentation.h
@@ -0,0 +1,60 @@
+//===AMDGPUAsanInstrumentation.h - ASAN helper functions -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPU_ASAN_INSTRUMENTATION_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPU_ASAN_INSTRUMENTATION_H
+
+#include "AMDGPU.h"
+#include "AMDGPUBaseInfo.h"
+#include "AMDGPUMemoryUtils.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/OptimizedStructLayout.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation/AddressSanitizer.h"
+#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+namespace llvm {
+namespace AMDGPU {
+
+/// Given SizeInBytes of the Value to be instrumented,
+/// returns the redzone size corresponding to it.
+uint64_t getRedzoneSizeForGlobal(int Scale, uint64_t SizeInBytes);
+
+/// Instrument the memory operand Addr.
+/// Generates report blocks that catch the addressing errors.
+void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns,
+                       Instruction *InsertBefore, Value *Addr,
+                       MaybeAlign Alignment, uint32_t TypeStoreSize,
+                       bool IsWrite, Value *SizeArgument, bool UseCalls,
+                       bool Recover, int Scale, int Offset);
+
+/// Get all the memory operands from the instruction
+/// that need to be instrumented.
+void getInterestingMemoryOperands(
+    Module &M, Instruction *I,
+    SmallVectorImpl<InterestingMemoryOperand> &Interesting);
+
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPU_ASAN_INSTRUMENTATION_H
diff --git a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
index 09b8da9f5dd48a..4d69fb67db8602 100644
--- a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_llvm_component_library(LLVMAMDGPUUtils
+  AMDGPUAsanInstrumentation.cpp
   AMDGPUAsmUtils.cpp
   AMDGPUBaseInfo.cpp
   AMDGPUDelayedMCExpr.cpp
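Usage note (not part of the patch): the sketch below shows how a caller such as the "amdgpu-sw-lower-lds" pass might drive these two helpers, collecting the interesting memory operands of every instruction first and then instrumenting each recorded access. The function name instrumentFunction and the way AsanScale/AsanOffset are supplied are illustrative assumptions; the real pass derives the shadow scale and offset from the ASan mapping it targets.

// Minimal sketch, assuming the caller provides the ASan shadow scale/offset.
#include "AMDGPUAsanInstrumentation.h"
#include "llvm/IR/InstIterator.h"

using namespace llvm;

static void instrumentFunction(Module &M, Function &F, int AsanScale,
                               int AsanOffset) {
  // Collect first so that instructions inserted by the instrumentation are
  // not themselves revisited.
  SmallVector<InterestingMemoryOperand, 16> Operands;
  for (Instruction &I : instructions(F))
    AMDGPU::getInterestingMemoryOperands(M, &I, Operands);

  for (InterestingMemoryOperand &O : Operands) {
    Instruction *Insn = O.getInsn();
    IRBuilder<> IRB(Insn);
    // Emit the shadow-memory check and report block in front of the access.
    AMDGPU::instrumentAddress(M, IRB, Insn, Insn, O.getPtr(), O.Alignment,
                              O.TypeStoreSize.getFixedValue(), O.IsWrite,
                              /*SizeArgument=*/nullptr, /*UseCalls=*/false,
                              /*Recover=*/true, AsanScale, AsanOffset);
  }
}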