Skip to content

Memory stores aren't vectorised in a for loop unless explicit at-inbounds is used #70

Closed
@giordano

Description

@giordano

Follow up from #68 (comment):

julia> code_llvm((FixedSizeVector{Float64,Memory{Float64}},)) do v
           for idx in eachindex(v)
               v[idx] = idx
           end
       end
LLVM IR
; Function Signature: var"#23"(FixedSizeArrays.FixedSizeArray{Float64, 1, Memory{Float64}})
;  @ REPL[16]:2 within `#23`
define void @"julia_#23_3098"(ptr nocapture noundef nonnull readonly align 8 dereferenceable(16) %"v::FixedSizeArray", ptr nocapture readonly %.roots.v) #0 {
top:
; ┌ @ abstractarray.jl:321 within `eachindex`
; │┌ @ abstractarray.jl:137 within `axes1`
; ││┌ @ abstractarray.jl:98 within `axes`
; │││┌ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:64 within `size`
; ││││┌ @ Base.jl:49 within `getproperty`
       %0 = getelementptr inbounds i8, ptr %"v::FixedSizeArray", i64 8
; └└└└└
; ┌ @ range.jl:911 within `iterate`
; │┌ @ range.jl:688 within `isempty`
; ││┌ @ operators.jl:425 within `>`
; │││┌ @ int.jl:83 within `<`
      %.unbox = load i64, ptr %0, align 8
      %1 = icmp slt i64 %.unbox, 1
; └└└└
  br i1 %1, label %L48, label %L13.preheader16

L13.preheader16:                                  ; preds = %top
  %memoryref_mem = load ptr, ptr %.roots.v, align 8
  %memory_data_ptr = getelementptr inbounds { i64, ptr }, ptr %memoryref_mem, i64 0, i32 1
;  @ REPL[16]:3 within `#23`
; ┌ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:240
; │┌ @ boot.jl:544 within `memoryref`
    %memoryref_data.pre = load ptr, ptr %memory_data_ptr, align 8
; │└
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:59 within `setindex!`
   br label %L29

L29:                                              ; preds = %L29, %L13.preheader16
   %value_phi3 = phi i64 [ %4, %L29 ], [ 1, %L13.preheader16 ]
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:239
; │┌ @ number.jl:7 within `convert`
; ││┌ @ float.jl:239 within `Float64`
     %2 = sitofp i64 %value_phi3 to double
; │└└
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:240
   %memoryref_offset = shl i64 %value_phi3, 3
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:241
   %3 = getelementptr i8, ptr %memoryref_data.pre, i64 %memoryref_offset
   %memoryref_data6 = getelementptr i8, ptr %3, i64 -8
   store double %2, ptr %memoryref_data6, align 8
; └
;  @ REPL[16]:4 within `#23`
; ┌ @ range.jl:915 within `iterate`
   %4 = add nuw i64 %value_phi3, 1
; └
  %5 = icmp ult i64 %value_phi3, %.unbox
  br i1 %5, label %L29, label %L48

L48:                                              ; preds = %L29, %top
  ret void
}

Compare with

julia> code_llvm((FixedSizeVector{Float64,Memory{Float64}},)) do v
           for idx in eachindex(v)
               @inbounds v[idx] = idx
           end
       end
LLVM IR
; Function Signature: var"#26"(FixedSizeArrays.FixedSizeArray{Float64, 1, Memory{Float64}})
;  @ REPL[17]:2 within `#26`
define void @"julia_#26_3146"(ptr nocapture noundef nonnull readonly align 8 dereferenceable(16) %"v::FixedSizeArray", ptr nocapture readonly %.roots.v) #0 {
top:
; ┌ @ abstractarray.jl:321 within `eachindex`
; │┌ @ abstractarray.jl:137 within `axes1`
; ││┌ @ abstractarray.jl:98 within `axes`
; │││┌ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:64 within `size`
; ││││┌ @ Base.jl:49 within `getproperty`
       %0 = getelementptr inbounds i8, ptr %"v::FixedSizeArray", i64 8
; └└└└└
; ┌ @ range.jl:911 within `iterate`
; │┌ @ range.jl:688 within `isempty`
; ││┌ @ operators.jl:425 within `>`
; │││┌ @ int.jl:83 within `<`
      %.unbox = load i64, ptr %0, align 8
      %1 = icmp slt i64 %.unbox, 1
; └└└└
  br i1 %1, label %L48, label %L13.preheader

L13.preheader:                                    ; preds = %top
  %memoryref_mem = load ptr, ptr %.roots.v, align 8
  %memory_data_ptr = getelementptr inbounds { i64, ptr }, ptr %memoryref_mem, i64 0, i32 1
  %memoryref_data = load ptr, ptr %memory_data_ptr, align 8
;  @ REPL[17]:4 within `#26`
  %invariant.gep = getelementptr i8, ptr %memoryref_data, i64 -8
  %min.iters.check = icmp ult i64 %.unbox, 8
  br i1 %min.iters.check, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %L13.preheader
  %n.vec = and i64 %.unbox, 9223372036854775800
  %ind.end = or disjoint i64 %n.vec, 1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <2 x i64> [ <i64 1, i64 2>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %step.add = add <2 x i64> %vec.ind, <i64 2, i64 2>
  %step.add13 = add <2 x i64> %vec.ind, <i64 4, i64 4>
  %step.add14 = add <2 x i64> %vec.ind, <i64 6, i64 6>
;  @ REPL[17]:3 within `#26`
; ┌ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:239
; │┌ @ number.jl:7 within `convert`
; ││┌ @ float.jl:239 within `Float64`
     %2 = sitofp <2 x i64> %vec.ind to <2 x double>
     %3 = sitofp <2 x i64> %step.add to <2 x double>
     %4 = sitofp <2 x i64> %step.add13 to <2 x double>
     %5 = sitofp <2 x i64> %step.add14 to <2 x double>
; │└└
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:240
   %offset.idx = shl i64 %index, 3
   %6 = or disjoint i64 %offset.idx, 8
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:241
   %7 = getelementptr i8, ptr %invariant.gep, i64 %6
   %8 = getelementptr double, ptr %7, i64 2
   %9 = getelementptr double, ptr %7, i64 4
   %10 = getelementptr double, ptr %7, i64 6
   store <2 x double> %2, ptr %7, align 8
   store <2 x double> %3, ptr %8, align 8
   store <2 x double> %4, ptr %9, align 8
   store <2 x double> %5, ptr %10, align 8
   %index.next = add nuw i64 %index, 8
   %vec.ind.next = add <2 x i64> %vec.ind, <i64 8, i64 8>
   %11 = icmp eq i64 %index.next, %n.vec
   br i1 %11, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
; └
;  @ REPL[17]:4 within `#26`
  %cmp.n = icmp eq i64 %.unbox, %n.vec
  br i1 %cmp.n, label %L48, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block, %L13.preheader
  %bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L13.preheader ]
  br label %L13

L13:                                              ; preds = %L13, %scalar.ph
  %value_phi3 = phi i64 [ %13, %L13 ], [ %bc.resume.val, %scalar.ph ]
;  @ REPL[17]:3 within `#26`
; ┌ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:239
; │┌ @ number.jl:7 within `convert`
; ││┌ @ float.jl:239 within `Float64`
     %12 = sitofp i64 %value_phi3 to double
; │└└
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:240
   %memoryref_offset = shl i64 %value_phi3, 3
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:60 within `setindex!` @ genericmemory.jl:241
   %gep = getelementptr i8, ptr %invariant.gep, i64 %memoryref_offset
   store double %12, ptr %gep, align 8
; └
;  @ REPL[17]:4 within `#26`
; ┌ @ range.jl:915 within `iterate`
; │┌ @ promotion.jl:639 within `==`
    %.not.not = icmp eq i64 %value_phi3, %.unbox
; │└
   %13 = add nuw i64 %value_phi3, 1
; └
  br i1 %.not.not, label %L48, label %L13

L48:                                              ; preds = %L13, %middle.block, %top
  ret void
}

I believe this is entirely due to the fact we're using Memory instead of MemoryRef, and MemoryRef is somehow better optimised for stores (it doesn't need to call memoryrefnew(mem)). As a proof of concept (don't take this code seriously!), with this patch:

diff --git a/src/FixedSizeArrays.jl b/src/FixedSizeArrays.jl
index 6aeac57..3fd1506 100644
--- a/src/FixedSizeArrays.jl
+++ b/src/FixedSizeArrays.jl
@@ -13,9 +13,11 @@ struct Internal end
 
 struct FixedSizeArray{T,N,Mem<:GenericMemory{<:Any,T}} <: DenseArray{T,N}
     mem::Mem
+    ref::MemoryRef{T}
     size::NTuple{N,Int}
     function FixedSizeArray{T,N,M}(::Internal, mem::M, size::NTuple{N,Int}) where {T,N,M<:GenericMemory{<:Any,T}}
-        new{T,N,M}(mem, size)
+        ref = Base.memoryref(mem)
+        new{T,N,M}(mem, ref, size)
     end
 end
 
@@ -57,7 +59,7 @@ Base.IndexStyle(::Type{<:FixedSizeArray}) = IndexLinear()
 Base.@propagate_inbounds Base.getindex(A::FixedSizeArray, i::Int) = A.mem[i]
 Base.@propagate_inbounds Base.@assume_effects :noub_if_noinbounds function Base.setindex!(A::FixedSizeArray{T}, x, i::Int) where {T}
     @boundscheck checkbounds(A, i)
-    @inbounds A.mem[i] = x
+    Base.memoryrefset!(Base.memoryrefnew(A.ref, i, false), x isa T ? x : convert(T,x)::T, :not_atomic, false)
     return A
 end

I get

julia> code_llvm((FixedSizeVector{Float64,Memory{Float64}},)) do v
           for idx in eachindex(v)
               v[idx] = 1.0
           end
       end
; Function Signature: var"#2"(FixedSizeArrays.FixedSizeArray{Float64, 1, Memory{Float64}})
;  @ REPL[2]:2 within `#2`
define void @"julia_#2_2000"(ptr nocapture noundef nonnull readonly align 8 dereferenceable(32) %"v::FixedSizeArray", ptr nocapture readonly %.roots.v) #0 {
top:
; ┌ @ abstractarray.jl:321 within `eachindex`
; │┌ @ abstractarray.jl:137 within `axes1`
; ││┌ @ abstractarray.jl:98 within `axes`
; │││┌ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:66 within `size`
; ││││┌ @ Base.jl:49 within `getproperty`
       %0 = getelementptr inbounds i8, ptr %"v::FixedSizeArray", i64 24
; └└└└└
; ┌ @ range.jl:911 within `iterate`
; │┌ @ range.jl:688 within `isempty`
; ││┌ @ operators.jl:425 within `>`
; │││┌ @ int.jl:83 within `<`
      %.unbox = load i64, ptr %0, align 8
      %1 = icmp slt i64 %.unbox, 1
; └└└└
  br i1 %1, label %L45, label %L13.preheader14

L13.preheader14:                                  ; preds = %top
  %2 = getelementptr inbounds i8, ptr %"v::FixedSizeArray", i64 8
  %memoryref_data = load ptr, ptr %2, align 8
;  @ REPL[2]:3 within `#2`
; ┌ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:61 within `setindex!`
; │┌ @ abstractarray.jl:699 within `checkbounds`
    %invariant.gep = getelementptr i8, ptr %memoryref_data, i64 -8
    %min.iters.check = icmp ult i64 %.unbox, 8
    br i1 %min.iters.check, label %scalar.ph, label %vector.ph

vector.ph:                                        ; preds = %L13.preheader14
    %n.vec = and i64 %.unbox, 9223372036854775800
    %ind.end = or disjoint i64 %n.vec, 1
    br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; │└
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:62 within `setindex!`
   %offset.idx = shl i64 %index, 3
   %3 = or disjoint i64 %offset.idx, 8
   %4 = getelementptr i8, ptr %invariant.gep, i64 %3
   %5 = getelementptr i64, ptr %4, i64 2
   %6 = getelementptr i64, ptr %4, i64 4
   %7 = getelementptr i64, ptr %4, i64 6
   store <2 x i64> <i64 4607182418800017408, i64 4607182418800017408>, ptr %4, align 8
   store <2 x i64> <i64 4607182418800017408, i64 4607182418800017408>, ptr %5, align 8
   store <2 x i64> <i64 4607182418800017408, i64 4607182418800017408>, ptr %6, align 8
   store <2 x i64> <i64 4607182418800017408, i64 4607182418800017408>, ptr %7, align 8
   %index.next = add nuw i64 %index, 8
   %8 = icmp eq i64 %index.next, %n.vec
   br i1 %8, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
; └
;  @ REPL[2]:4 within `#2`
  %cmp.n = icmp eq i64 %.unbox, %n.vec
  br i1 %cmp.n, label %L45, label %scalar.ph

scalar.ph:                                        ; preds = %middle.block, %L13.preheader14
  %bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L13.preheader14 ]
;  @ REPL[2]:3 within `#2`
; ┌ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:61 within `setindex!`
; │┌ @ abstractarray.jl:699 within `checkbounds`
    br label %L30

L30:                                              ; preds = %L30, %scalar.ph
    %value_phi3 = phi i64 [ %9, %L30 ], [ %bc.resume.val, %scalar.ph ]
; │└
; │ @ /Users/mose/.julia/dev/FixedSizeArrays/src/FixedSizeArrays.jl:62 within `setindex!`
   %memoryref_offset = shl i64 %value_phi3, 3
   %gep = getelementptr i8, ptr %invariant.gep, i64 %memoryref_offset
   store i64 4607182418800017408, ptr %gep, align 8
; └
;  @ REPL[2]:4 within `#2`
; ┌ @ range.jl:915 within `iterate`
   %9 = add nuw i64 %value_phi3, 1
; └
  %10 = icmp ult i64 %value_phi3, %.unbox
  br i1 %10, label %L30, label %L45

L45:                                              ; preds = %L30, %middle.block, %top
  ret void
}

which is pretty much what you'd get for Vectors. It looks like using Memory instead of MemoryRef adds an extra layer of indirection, no idea where to go from here. CC: @oscardssmith who may have opinions about this (I already checked, JuliaLang/julia#55913 doesn't change anything here)

Metadata

Metadata

Assignees

No one assigned

    Labels

    performanceMust go fasterupstreamIssue related to an upstream package or Julia itself

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions