[nvptx] Rewritten condition of unrolled loop causes out-of-bounds access on nvptx64-nvidia-cuda

Given the following input LLVM IR:
```
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"

; Reproducer kernel. Each invocation accumulates a strided sum of floats loaded
; from %1, passes the sum through five @__pgi_shfl_xorf2 calls (adding each
; result back in), and stores the final value into %0 at index ctaid.x.
;   %0 - destination pointer; one float is stored at element index ctaid.x
;   %1 - source pointer; floats are loaded at element index (loop index - 1)
;   %2 - i32 bound that enters the loop trip-count computation
; @__pgi_shfl_xorf2 is only declared in this module; presumably a warp
; shuffle-xor helper -- TODO confirm against its definition.
define ptx_kernel void @_QMmPpartialsumshflshflr4(ptr %0, ptr %1, i32 %2) {
  ; Stack slots: %5 is the running float sum, %6 holds each shuffle result,
  ; %7 the stride, %8 the starting index, %9 the current loop index, and %10 a
  ; copy of %2. %4 is never read or written below.
  %4 = alloca i32, i64 1, align 4
  %5 = alloca float, i64 1, align 4
  %6 = alloca float, i64 1, align 4
  %7 = alloca i32, i64 1, align 4
  %8 = alloca i32, i64 1, align 4
  %9 = alloca i32, i64 1, align 4
  %10 = alloca i32, i64 1, align 4
  ; Read the special registers; %12 = tid.x + 1 and %14 = ctaid.x + 1.
  %11 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  %12 = add i32 %11, 1
  %13 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  %14 = add i32 %13, 1
  %15 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
  %16 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
  ; Spill %2, reload it, and clamp its sign-extended value to >= 0 (%20).
  ; %20 only feeds %46 in the loop body, which is itself unused.
  store i32 %2, ptr %10, align 4
  %17 = load i32, ptr %10, align 4
  %18 = sext i32 %17 to i64
  %19 = icmp sgt i64 %18, 0
  %20 = select i1 %19, i64 %18, i64 0
  ; Stride = nctaid.x * ntid.x (i32 multiply without wrap flags).
  %21 = mul i32 %15, %16
  store i32 %21, ptr %7, align 4
  ; Zero the accumulator.
  store float 0.000000e+00, ptr %5, align 4
  ; Starting index = ctaid.x * ntid.x + (tid.x + 1).
  %22 = mul i32 %13, %16
  %23 = add i32 %22, %12
  store i32 %23, ptr %8, align 4
  ; Trip count %33 = (%2 - start + stride) sdiv stride, computed in i64 from
  ; the sign-extended i32 values. %30 is the start index truncated back to i32.
  %24 = load i32, ptr %8, align 4
  %25 = sext i32 %24 to i64
  %26 = load i32, ptr %10, align 4
  %27 = sext i32 %26 to i64
  %28 = load i32, ptr %7, align 4
  %29 = sext i32 %28 to i64
  %30 = trunc i64 %25 to i32
  %31 = sub i64 %27, %25
  %32 = add i64 %31, %29
  %33 = sdiv i64 %32, %29
  br label %34

34:                                               ; preds = %38, %3
  ; Loop header: %35 is the i32 loop index, %36 the remaining iteration count.
  ; The body runs while the remaining count is strictly positive.
  %35 = phi i32 [ %52, %38 ], [ %30, %3 ]
  %36 = phi i64 [ %53, %38 ], [ %33, %3 ]
  %37 = icmp sgt i64 %36, 0
  br i1 %37, label %38, label %54

38:                                               ; preds = %34
  ; Loop body: publish the index to %9, then load %1[index - 1] and add it to
  ; the accumulator in %5.
  store i32 %35, ptr %9, align 4
  %39 = load float, ptr %5, align 4
  %40 = load i32, ptr %9, align 4
  %41 = sext i32 %40 to i64
  %42 = sub nsw i64 %41, 1
  %43 = mul nsw i64 %42, 1
  %44 = mul nsw i64 %43, 1
  %45 = add nsw i64 %44, 0
  %46 = mul nsw i64 1, %20
  %47 = getelementptr float, ptr %1, i64 %45
  %48 = load float, ptr %47, align 4
  %49 = fadd contract float %39, %48
  store float %49, ptr %5, align 4
  ; Advance: index += stride (i32 add marked nsw), remaining count -= 1.
  ; The nsw add also executes on the final iteration, before the exit test.
  %50 = trunc i64 %29 to i32
  %51 = load i32, ptr %9, align 4
  %52 = add nsw i32 %51, %50
  %53 = sub i64 %36, 1
  br label %34

54:                                               ; preds = %34
  ; Loop exit: record the final index, then run five rounds of
  ; "call @__pgi_shfl_xorf2(sum, k); sum += result" for k = 1, 2, 4, 8, 16.
  store i32 %35, ptr %9, align 4
  %55 = load float, ptr %5, align 4
  %56 = call contract float @__pgi_shfl_xorf2(float %55, i32 1)
  store float %56, ptr %6, align 4
  %57 = load float, ptr %5, align 4
  %58 = load float, ptr %6, align 4
  %59 = fadd contract float %57, %58
  store float %59, ptr %5, align 4
  %60 = load float, ptr %5, align 4
  %61 = call contract float @__pgi_shfl_xorf2(float %60, i32 2)
  store float %61, ptr %6, align 4
  %62 = load float, ptr %5, align 4
  %63 = load float, ptr %6, align 4
  %64 = fadd contract float %62, %63
  store float %64, ptr %5, align 4
  %65 = load float, ptr %5, align 4
  %66 = call contract float @__pgi_shfl_xorf2(float %65, i32 4)
  store float %66, ptr %6, align 4
  %67 = load float, ptr %5, align 4
  %68 = load float, ptr %6, align 4
  %69 = fadd contract float %67, %68
  store float %69, ptr %5, align 4
  %70 = load float, ptr %5, align 4
  %71 = call contract float @__pgi_shfl_xorf2(float %70, i32 8)
  store float %71, ptr %6, align 4
  %72 = load float, ptr %5, align 4
  %73 = load float, ptr %6, align 4
  %74 = fadd contract float %72, %73
  store float %74, ptr %5, align 4
  %75 = load float, ptr %5, align 4
  %76 = call contract float @__pgi_shfl_xorf2(float %75, i32 16)
  store float %76, ptr %6, align 4
  %77 = load float, ptr %5, align 4
  %78 = load float, ptr %6, align 4
  %79 = fadd contract float %77, %78
  store float %79, ptr %5, align 4
  ; Store the combined sum to %0[(ctaid.x + 1) - 1], i.e. %0[ctaid.x].
  %80 = load float, ptr %5, align 4
  %81 = sext i32 %14 to i64
  %82 = sub nsw i64 %81, 1
  %83 = mul nsw i64 %82, 1
  %84 = mul nsw i64 %83, 1
  %85 = add nsw i64 %84, 0
  %86 = getelementptr float, ptr %0, i64 %85
  store float %80, ptr %86, align 4
  ret void
}

declare float @__pgi_shfl_xorf2(float, i32)

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #0

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #0

attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

!llvm.module.flags = !{!0}

!0 = !{i32 2, !"Debug Info Version", i32 3}
```

Invoking `opt` for `nvptx64-nvidia-cuda` at `-O2` or higher unrolls the loop in `_QMmPpartialsumshflshflr4`:
```
opt -mtriple=nvptx64-nvidia-cuda  -O2 -S testcase.ll  -o -
```

The problem is that the resulting code tries to access data out of bounds:

```
; ModuleID = 'dummy.1.ll'
source_filename = "LLVMDialectModule"
target datalayout = "e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

; Output of `opt -O2` for the reproducer kernel. The stack slots are gone and
; the summation loop has been split into a remainder ("prol") loop followed by
; a main loop unrolled by a factor of four.
; NOTE(review): the report states that this output performs an out-of-bounds
; access but does not say which load is affected -- confirm before relying on
; any particular reading of the code below.
define ptx_kernel void @_QMmPpartialsumshflshflr4(ptr writeonly captures(none) %0, ptr readonly captures(none) %1, i32 %2) local_unnamed_addr {
  ; %5 = tid.x + 1, %10 = stride (nctaid.x * ntid.x),
  ; %12 = starting index (tid.x + 1 + ctaid.x * ntid.x).
  %4 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  %5 = add nuw nsw i32 %4, 1
  %6 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  %7 = tail call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
  %8 = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
  %9 = sext i32 %2 to i64
  %10 = mul i32 %7, %8
  %11 = mul i32 %6, %8
  %12 = add i32 %5, %11
  ; Trip count %17 = (stride - start + %2) sdiv stride, in i64.
  %13 = sext i32 %12 to i64
  %14 = sext i32 %10 to i64
  %15 = sub nsw i64 %14, %13
  %16 = add nsw i64 %15, %9
  %17 = sdiv i64 %16, %14
  ; Base pointer moved back by 4 bytes (one float), so the loads below index
  ; it with the loop index directly instead of (index - 1).
  %invariant.gep = getelementptr i8, ptr %1, i64 -4
  ; Skip all loop code when the trip count is not positive.
  %18 = icmp sgt i64 %17, 0
  br i1 %18, label %.lr.ph.preheader, label %._crit_edge

.lr.ph.preheader:                                 ; preds = %3
  ; %xtraiter = trip count & 3: the number of iterations handled one at a time
  ; by the remainder loop. When it is zero the remainder loop is bypassed.
  %xtraiter = and i64 %17, 3
  %lcmp.mod.not = icmp eq i64 %xtraiter, 0
  br i1 %lcmp.mod.not, label %.lr.ph.prol.loopexit, label %.lr.ph.prol

.lr.ph.prol:                                      ; preds = %.lr.ph.preheader, %.lr.ph.prol
  ; Remainder loop: %19 is the remaining count, %20 the i32 index, %.023.prol
  ; the running sum, %prol.iter the number of remainder iterations done so far.
  %19 = phi i64 [ %25, %.lr.ph.prol ], [ %17, %.lr.ph.preheader ]
  %20 = phi i32 [ %24, %.lr.ph.prol ], [ %12, %.lr.ph.preheader ]
  %.023.prol = phi float [ %23, %.lr.ph.prol ], [ 0.000000e+00, %.lr.ph.preheader ]
  %prol.iter = phi i64 [ %prol.iter.next, %.lr.ph.prol ], [ 0, %.lr.ph.preheader ]
  %21 = sext i32 %20 to i64
  %gep.prol = getelementptr float, ptr %invariant.gep, i64 %21
  %22 = load float, ptr %gep.prol, align 4
  %23 = fadd contract float %.023.prol, %22
  %24 = add nsw i32 %20, %10
  %25 = add nsw i64 %19, -1
  ; Exit once %xtraiter iterations have run. The !llvm.loop !1 metadata marks
  ; this loop with llvm.loop.unroll.disable.
  %prol.iter.next = add i64 %prol.iter, 1
  %prol.iter.cmp.not = icmp eq i64 %prol.iter.next, %xtraiter
  br i1 %prol.iter.cmp.not, label %.lr.ph.prol.loopexit, label %.lr.ph.prol, !llvm.loop !1

.lr.ph.prol.loopexit:                             ; preds = %.lr.ph.prol, %.lr.ph.preheader
  ; Merge the state after (or instead of) the remainder loop: remaining count,
  ; index and running sum. %.lcssa.unr is the value forwarded to the exit block
  ; and is poison on the edge that bypassed the remainder loop.
  %.unr = phi i64 [ %17, %.lr.ph.preheader ], [ %25, %.lr.ph.prol ]
  %.unr24 = phi i32 [ %12, %.lr.ph.preheader ], [ %24, %.lr.ph.prol ]
  %.023.unr = phi float [ 0.000000e+00, %.lr.ph.preheader ], [ %23, %.lr.ph.prol ]
  %.lcssa.unr = phi float [ poison, %.lr.ph.preheader ], [ %23, %.lr.ph.prol ]
  ; The unrolled loop is entered only when the original trip count %17 is at
  ; least 4 (unsigned compare); otherwise control goes straight to the exit.
  %26 = icmp ult i64 %17, 4
  br i1 %26, label %._crit_edge, label %.lr.ph.preheader.new

.lr.ph.preheader.new:                             ; preds = %.lr.ph.prol.loopexit
  ; Hoisted i32 multiples of the stride: 2x, 3x and 4x. These adds carry no
  ; nsw/nuw flags.
  %invariant.op = add i32 %10, %10
  %invariant.op29 = add i32 %invariant.op, %10
  %invariant.op31 = add i32 %invariant.op29, %10
  br label %.lr.ph

.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader.new
  ; Main loop, unrolled by four: %27 is the remaining count, %28 the i32 index
  ; of the first of the four elements, %.023 the running sum.
  %27 = phi i64 [ %.unr, %.lr.ph.preheader.new ], [ %42, %.lr.ph ]
  %28 = phi i32 [ %.unr24, %.lr.ph.preheader.new ], [ %.reass32, %.lr.ph ]
  %.023 = phi float [ %.023.unr, %.lr.ph.preheader.new ], [ %41, %.lr.ph ]
  ; Element 0: index.
  %29 = sext i32 %28 to i64
  %gep = getelementptr float, ptr %invariant.gep, i64 %29
  %30 = load float, ptr %gep, align 4
  %31 = fadd contract float %.023, %30
  ; Element 1: index + stride (this add keeps nsw).
  %32 = add nsw i32 %28, %10
  %33 = sext i32 %32 to i64
  %gep.1 = getelementptr float, ptr %invariant.gep, i64 %33
  %34 = load float, ptr %gep.1, align 4
  %35 = fadd contract float %31, %34
  ; Element 2: index + 2 * stride (plain i32 add, no wrap flags).
  %.reass = add i32 %28, %invariant.op
  %36 = sext i32 %.reass to i64
  %gep.2 = getelementptr float, ptr %invariant.gep, i64 %36
  %37 = load float, ptr %gep.2, align 4
  %38 = fadd contract float %35, %37
  ; Element 3: index + 3 * stride (plain i32 add, no wrap flags).
  %.reass30 = add i32 %28, %invariant.op29
  %39 = sext i32 %.reass30 to i64
  %gep.3 = getelementptr float, ptr %invariant.gep, i64 %39
  %40 = load float, ptr %gep.3, align 4
  %41 = fadd contract float %38, %40
  ; Next index = index + 4 * stride; remaining count -= 4. The back-edge test
  ; compares the pre-decrement count %27 against 4.
  %.reass32 = add i32 %28, %invariant.op31
  %42 = add nsw i64 %27, -4
  %43 = icmp sgt i64 %27, 4
  br i1 %43, label %.lr.ph, label %._crit_edge

._crit_edge:                                      ; preds = %.lr.ph.prol.loopexit, %.lr.ph, %3
  ; Final sum: 0.0 when no iteration ran, the remainder-loop result when the
  ; unrolled loop was skipped, or the unrolled loop's result.
  %.0.lcssa = phi float [ 0.000000e+00, %3 ], [ %.lcssa.unr, %.lr.ph.prol.loopexit ], [ %41, %.lr.ph ]
  ; Five rounds of "call @__pgi_shfl_xorf2(sum, k); sum += result" for
  ; k = 1, 2, 4, 8, 16.
  %44 = tail call contract float @__pgi_shfl_xorf2(float %.0.lcssa, i32 1)
  %45 = fadd contract float %.0.lcssa, %44
  %46 = tail call contract float @__pgi_shfl_xorf2(float %45, i32 2)
  %47 = fadd contract float %45, %46
  %48 = tail call contract float @__pgi_shfl_xorf2(float %47, i32 4)
  %49 = fadd contract float %47, %48
  %50 = tail call contract float @__pgi_shfl_xorf2(float %49, i32 8)
  %51 = fadd contract float %49, %50
  %52 = tail call contract float @__pgi_shfl_xorf2(float %51, i32 16)
  %53 = fadd contract float %51, %52
  ; Store the result to %0[ctaid.x].
  %54 = zext nneg i32 %6 to i64
  %55 = getelementptr float, ptr %0, i64 %54
  store float %53, ptr %55, align 4
  ret void
}

declare float @__pgi_shfl_xorf2(float, i32) local_unnamed_addr

; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0

; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #0

; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare noundef range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #0

attributes #0 = { mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) }

!llvm.module.flags = !{!0}

!0 = !{i32 2, !"Debug Info Version", i32 3}
!1 = distinct !{!1, !2}
!2 = !{!"llvm.loop.unroll.disable"}
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[nvptx] Unrolled loop condition rewritten does out of bounds nvptx64-nvidia-cuda #142699

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

[nvptx] Unrolled loop condition rewritten does out of bounds nvptx64-nvidia-cuda #142699

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions