Open
Description
Reported by @matthias314 on discourse, sum
ming a FixedSizeVector
seems to be slightly slower than a Vector
:
julia> for n in (0, 2^0, 2^5, 2^10, 2^15, 2^20)
@info "n = $(n)"
@eval @btime sum(v) setup=(Random.seed!(1); v=rand!(Vector{Float64}(undef, $(n))))
@eval @btime sum(v) setup=(Random.seed!(1); v=rand!(FixedSizeVector{Float64}(undef, $(n))))
end
[ Info: n = 0
2.416 ns (0 allocations: 0 bytes)
1.166 ns (0 allocations: 0 bytes)
[ Info: n = 1
2.125 ns (0 allocations: 0 bytes)
2.708 ns (0 allocations: 0 bytes)
[ Info: n = 32
9.593 ns (0 allocations: 0 bytes)
9.885 ns (0 allocations: 0 bytes)
[ Info: n = 1024
101.183 ns (0 allocations: 0 bytes)
103.459 ns (0 allocations: 0 bytes)
[ Info: n = 32768
3.318 μs (0 allocations: 0 bytes)
3.427 μs (0 allocations: 0 bytes)
[ Info: n = 1048576
117.416 μs (0 allocations: 0 bytes)
117.666 μs (0 allocations: 0 bytes)
The difference isn't dramatic, but mostly systematic. But what really puzzles me is that I can't tell why. The LLVM IR looks nearly identical for the two containers, except for the fact FixedSizeVector
needs to construct the MemoryRef
, which is something which came up already in #70.
julia> code_llvm(Base.mapreduce_impl, (typeof(identity), typeof(Base.add_sum), Vector{Float64}, Int, Int, Int); )
LLVM IR for Vector
; Function Signature: mapreduce_impl(typeof(Base.identity), typeof(Base.add_sum), Array{Float64, 1}, Int64, Int64, Int64)
; @ reduce.jl:251 within `mapreduce_impl`
; Function Attrs: noinline
define double @julia_mapreduce_impl_7334(ptr noundef nonnull align 8 dereferenceable(24) %"A::Array", i64 signext %"ifirst::Int64", i64 signext %"ilast::Int64", i64 signext %"blksize::Int64") local_unnamed_addr #0 {
top:
; @ reduce.jl:253 within `mapreduce_impl`
; ┌ @ promotion.jl:637 within `==`
%.not = icmp eq i64 %"ifirst::Int64", %"ilast::Int64"
; └
br i1 %.not, label %L17, label %L22
L17: ; preds = %top
; @ reduce.jl:254 within `mapreduce_impl`
; ┌ @ essentials.jl:953 within `getindex`
%memoryref_data = load ptr, ptr %"A::Array", align 8
%0 = getelementptr double, ptr %memoryref_data, i64 %"ifirst::Int64"
%memoryref_data1 = getelementptr i8, ptr %0, i64 -8
%1 = load double, ptr %memoryref_data1, align 8
; └
; @ reduce.jl:255 within `mapreduce_impl`
br label %common.ret
common.ret: ; preds = %L126, %L83, %middle.block, %L39, %L17
%common.ret.op = phi double [ %1, %L17 ], [ %32, %L126 ], [ %6, %L39 ], [ %23, %middle.block ], [ %26, %L83 ]
; @ reduce.jl within `mapreduce_impl`
ret double %common.ret.op
L22: ; preds = %top
; @ reduce.jl:256 within `mapreduce_impl`
; ┌ @ int.jl:86 within `-`
%2 = sub i64 %"ilast::Int64", %"ifirst::Int64"
; └
; ┌ @ int.jl:83 within `<`
%.not49 = icmp slt i64 %2, %"blksize::Int64"
; └
br i1 %.not49, label %L39, label %L126
L39: ; preds = %L22
; @ reduce.jl:258 within `mapreduce_impl`
; ┌ @ essentials.jl:953 within `getindex`
%memoryref_data6 = load ptr, ptr %"A::Array", align 8
%3 = getelementptr double, ptr %memoryref_data6, i64 %"ifirst::Int64"
%memoryref_data13 = getelementptr i8, ptr %3, i64 -8
%4 = load double, ptr %memoryref_data13, align 8
; └
; @ reduce.jl:259 within `mapreduce_impl`
; ┌ @ essentials.jl:953 within `getindex`
%5 = load double, ptr %3, align 8
; └
; @ reduce.jl:260 within `mapreduce_impl`
; ┌ @ reduce.jl:19 within `add_sum`
; │┌ @ float.jl:492 within `+`
%6 = fadd double %4, %5
; └└
; @ reduce.jl:261 within `mapreduce_impl`
; ┌ @ simdloop.jl:69 within `macro expansion`
; │┌ @ int.jl:87 within `+`
%7 = add i64 %"ifirst::Int64", 2
; │└
; │┌ @ range.jl:5 within `Colon`
; ││┌ @ range.jl:415 within `UnitRange`
; │││┌ @ range.jl:426 within `unitrange_last`
; ││││┌ @ operators.jl:472 within `>=`
; │││││┌ @ int.jl:522 within `<=`
%.not50 = icmp sgt i64 %7, %"ilast::Int64"
; ││││└└
%8 = add i64 %"ifirst::Int64", 1
%value_phi = select i1 %.not50, i64 %8, i64 %"ilast::Int64"
; │└└└
; │ @ simdloop.jl:71 within `macro expansion`
; │┌ @ simdloop.jl:51 within `simd_inner_length`
; ││┌ @ range.jl:783 within `length`
; │││┌ @ int.jl:86 within `-`
%9 = sub i64 %value_phi, %7
%.not5152 = icmp ult i64 %9, 9223372036854775807
; │└└└
; │ @ simdloop.jl:72 within `macro expansion`
br i1 %.not5152, label %L83.lr.ph, label %common.ret
L83.lr.ph: ; preds = %L39
%10 = getelementptr double, ptr %memoryref_data6, i64 %7
; │ @ simdloop.jl:75 within `macro expansion`
%invariant.gep = getelementptr i8, ptr %10, i64 -8
%11 = xor i64 %"ifirst::Int64", -1
%12 = add i64 %value_phi, %11
%min.iters.check = icmp ult i64 %12, 8
br i1 %min.iters.check, label %L83, label %vector.ph
vector.ph: ; preds = %L83.lr.ph
%n.vec = and i64 %12, -8
%13 = insertelement <2 x double> <double poison, double -0.000000e+00>, double %6, i64 0
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
; │ @ simdloop.jl:76 within `macro expansion`
; │┌ @ simdloop.jl:54 within `simd_index`
; ││┌ @ int.jl:87 within `+`
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <2 x double> [ %13, %vector.ph ], [ %18, %vector.body ]
%vec.phi55 = phi <2 x double> [ splat (double -0.000000e+00), %vector.ph ], [ %19, %vector.body ]
%vec.phi56 = phi <2 x double> [ splat (double -0.000000e+00), %vector.ph ], [ %20, %vector.body ]
%vec.phi57 = phi <2 x double> [ splat (double -0.000000e+00), %vector.ph ], [ %21, %vector.body ]
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ reduce.jl:262
; │┌ @ essentials.jl:953 within `getindex`
%14 = getelementptr double, ptr %invariant.gep, i64 %index
%15 = getelementptr i8, ptr %14, i64 16
%16 = getelementptr i8, ptr %14, i64 32
%17 = getelementptr i8, ptr %14, i64 48
%wide.load = load <2 x double>, ptr %14, align 8
%wide.load58 = load <2 x double>, ptr %15, align 8
%wide.load59 = load <2 x double>, ptr %16, align 8
%wide.load60 = load <2 x double>, ptr %17, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ reduce.jl:263
; │┌ @ reduce.jl:19 within `add_sum`
; ││┌ @ float.jl:492 within `+`
%18 = fadd reassoc contract <2 x double> %vec.phi, %wide.load
%19 = fadd reassoc contract <2 x double> %vec.phi55, %wide.load58
%20 = fadd reassoc contract <2 x double> %vec.phi56, %wide.load59
%21 = fadd reassoc contract <2 x double> %vec.phi57, %wide.load60
; │└└
; │ @ simdloop.jl:76 within `macro expansion`
; │┌ @ simdloop.jl:54 within `simd_index`
; ││┌ @ int.jl:87 within `+`
%index.next = add nuw i64 %index, 8
%22 = icmp eq i64 %index.next, %n.vec
br i1 %22, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
; │└└
; │ @ simdloop.jl:75 within `macro expansion`
%bin.rdx = fadd reassoc contract <2 x double> %19, %18
%bin.rdx61 = fadd reassoc contract <2 x double> %20, %bin.rdx
%bin.rdx62 = fadd reassoc contract <2 x double> %21, %bin.rdx61
%23 = call reassoc contract double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %bin.rdx62)
%cmp.n = icmp eq i64 %12, %n.vec
br i1 %cmp.n, label %common.ret, label %L83
L83: ; preds = %L83, %middle.block, %L83.lr.ph
%value_phi2754 = phi i64 [ %24, %L83 ], [ 0, %L83.lr.ph ], [ %n.vec, %middle.block ]
%value_phi2653 = phi double [ %26, %L83 ], [ %6, %L83.lr.ph ], [ %23, %middle.block ]
; │ @ simdloop.jl:76 within `macro expansion`
; │┌ @ simdloop.jl:54 within `simd_index`
; ││┌ @ int.jl:87 within `+`
%24 = add nuw nsw i64 %value_phi2754, 1
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ reduce.jl:262
; │┌ @ essentials.jl:953 within `getindex`
%gep = getelementptr double, ptr %invariant.gep, i64 %value_phi2754
%25 = load double, ptr %gep, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ reduce.jl:263
; │┌ @ reduce.jl:19 within `add_sum`
; ││┌ @ float.jl:492 within `+`
%26 = fadd reassoc contract double %value_phi2653, %25
; │└└
; │ @ simdloop.jl:75 within `macro expansion`
; │┌ @ int.jl:83 within `<`
%exitcond.not = icmp eq i64 %value_phi2754, %9
; │└
br i1 %exitcond.not, label %common.ret, label %L83
L126: ; preds = %L22
; └
; @ reduce.jl:268 within `mapreduce_impl`
; ┌ @ int.jl:542 within `>>` @ int.jl:535
%27 = ashr i64 %2, 1
; └
; ┌ @ int.jl:87 within `+`
%28 = add i64 %27, %"ifirst::Int64"
; └
; @ reduce.jl:269 within `mapreduce_impl`
%29 = call double @julia_mapreduce_impl_7334(ptr %"A::Array", i64 signext %"ifirst::Int64", i64 signext %28, i64 signext %"blksize::Int64")
; @ reduce.jl:270 within `mapreduce_impl`
; ┌ @ int.jl:87 within `+`
%30 = add i64 %28, 1
; └
%31 = call double @julia_mapreduce_impl_7334(ptr %"A::Array", i64 signext %30, i64 signext %"ilast::Int64", i64 signext %"blksize::Int64")
; @ reduce.jl:271 within `mapreduce_impl`
; ┌ @ reduce.jl:19 within `add_sum`
; │┌ @ float.jl:492 within `+`
%32 = fadd double %29, %31
; └└
br label %common.ret
}
julia> code_llvm(Base.mapreduce_impl, (typeof(identity), typeof(Base.add_sum), FixedSizeVectorDefault{Float64}, Int, Int, Int); )
LLVM IR for FixedSizeVectorDefault
; Function Signature: mapreduce_impl(typeof(Base.identity), typeof(Base.add_sum), FixedSizeArrays.FixedSizeArray{Float64, 1, Memory{Float64}}, Int64, Int64, Int64)
; @ reduce.jl:251 within `mapreduce_impl`
; Function Attrs: noinline
define double @julia_mapreduce_impl_7339(ptr nocapture noundef nonnull readonly align 8 dereferenceable(16) %"A::FixedSizeArray", ptr nocapture readonly %.roots.A, i64 signext %"ifirst::Int64", i64 signext %"ilast::Int64", i64 signext %"blksize::Int64") local_unnamed_addr #0 {
top:
%gcframe1 = alloca [4 x ptr], align 16
call void @llvm.memset.p0.i64(ptr align 16 %gcframe1, i8 0, i64 32, i1 true)
%0 = getelementptr inbounds nuw i8, ptr %gcframe1, i64 24
%1 = getelementptr inbounds nuw i8, ptr %gcframe1, i64 16
%pgcstack = call ptr inttoptr (i64 4377493260 to ptr)(i64 4377493296) #12
store i64 8, ptr %gcframe1, align 8
%task.gcstack = load ptr, ptr %pgcstack, align 8
%frame.prev = getelementptr inbounds nuw i8, ptr %gcframe1, i64 8
store ptr %task.gcstack, ptr %frame.prev, align 8
store ptr %gcframe1, ptr %pgcstack, align 8
%memoryref_mem = load ptr, ptr %.roots.A, align 8
; @ reduce.jl:253 within `mapreduce_impl`
; ┌ @ promotion.jl:637 within `==`
%.not = icmp eq i64 %"ifirst::Int64", %"ilast::Int64"
; └
br i1 %.not, label %L19, label %L26
L19: ; preds = %top
; @ reduce.jl:254 within `mapreduce_impl`
; ┌ @ /Users/mose/.julia/packages/FixedSizeArrays/22QFl/src/FixedSizeArray.jl:167 within `getindex` @ essentials.jl:386
%memory_data_ptr = getelementptr inbounds nuw i8, ptr %memoryref_mem, i64 8
%memoryref_data = load ptr, ptr %memory_data_ptr, align 8
%2 = getelementptr double, ptr %memoryref_data, i64 %"ifirst::Int64"
%memoryref_data2 = getelementptr i8, ptr %2, i64 -8
%3 = load double, ptr %memoryref_data2, align 8
; └
; @ reduce.jl:255 within `mapreduce_impl`
br label %common.ret
common.ret: ; preds = %L142, %L95, %middle.block, %L45, %L19
%common.ret.op = phi double [ %3, %L19 ], [ %34, %L142 ], [ %8, %L45 ], [ %25, %middle.block ], [ %28, %L95 ]
%frame.prev96 = load ptr, ptr %frame.prev, align 8
store ptr %frame.prev96, ptr %pgcstack, align 8
; @ reduce.jl within `mapreduce_impl`
ret double %common.ret.op
L26: ; preds = %top
; @ reduce.jl:256 within `mapreduce_impl`
; ┌ @ int.jl:86 within `-`
%4 = sub i64 %"ilast::Int64", %"ifirst::Int64"
; └
; ┌ @ int.jl:83 within `<`
%.not72 = icmp slt i64 %4, %"blksize::Int64"
; └
br i1 %.not72, label %L45, label %L142
L45: ; preds = %L26
; @ reduce.jl:258 within `mapreduce_impl`
; ┌ @ /Users/mose/.julia/packages/FixedSizeArrays/22QFl/src/FixedSizeArray.jl:167 within `getindex` @ essentials.jl:386
%memory_data_ptr5 = getelementptr inbounds nuw i8, ptr %memoryref_mem, i64 8
%memoryref_data7 = load ptr, ptr %memory_data_ptr5, align 8
%5 = getelementptr double, ptr %memoryref_data7, i64 %"ifirst::Int64"
%memoryref_data12 = getelementptr i8, ptr %5, i64 -8
%6 = load double, ptr %memoryref_data12, align 8
; └
; @ reduce.jl:259 within `mapreduce_impl`
; ┌ @ /Users/mose/.julia/packages/FixedSizeArrays/22QFl/src/FixedSizeArray.jl:167 within `getindex` @ essentials.jl:386
%7 = load double, ptr %5, align 8
; └
; @ reduce.jl:260 within `mapreduce_impl`
; ┌ @ reduce.jl:19 within `add_sum`
; │┌ @ float.jl:492 within `+`
%8 = fadd double %6, %7
; └└
; @ reduce.jl:261 within `mapreduce_impl`
; ┌ @ simdloop.jl:69 within `macro expansion`
; │┌ @ int.jl:87 within `+`
%9 = add i64 %"ifirst::Int64", 2
; │└
; │┌ @ range.jl:5 within `Colon`
; ││┌ @ range.jl:415 within `UnitRange`
; │││┌ @ range.jl:426 within `unitrange_last`
; ││││┌ @ operators.jl:472 within `>=`
; │││││┌ @ int.jl:522 within `<=`
%.not73 = icmp sgt i64 %9, %"ilast::Int64"
; ││││└└
%10 = add i64 %"ifirst::Int64", 1
%value_phi = select i1 %.not73, i64 %10, i64 %"ilast::Int64"
; │└└└
; │ @ simdloop.jl:71 within `macro expansion`
; │┌ @ simdloop.jl:51 within `simd_inner_length`
; ││┌ @ range.jl:783 within `length`
; │││┌ @ int.jl:86 within `-`
%11 = sub i64 %value_phi, %9
%.not7475 = icmp ult i64 %11, 9223372036854775807
; │└└└
; │ @ simdloop.jl:72 within `macro expansion`
br i1 %.not7475, label %L95.lr.ph, label %common.ret
L95.lr.ph: ; preds = %L45
%12 = getelementptr double, ptr %memoryref_data7, i64 %9
; │ @ simdloop.jl:75 within `macro expansion`
%invariant.gep = getelementptr i8, ptr %12, i64 -8
%13 = xor i64 %"ifirst::Int64", -1
%14 = add i64 %value_phi, %13
%min.iters.check = icmp ult i64 %14, 8
br i1 %min.iters.check, label %L95, label %vector.ph
vector.ph: ; preds = %L95.lr.ph
%n.vec = and i64 %14, -8
%15 = insertelement <2 x double> <double poison, double -0.000000e+00>, double %8, i64 0
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
; │ @ simdloop.jl:76 within `macro expansion`
; │┌ @ simdloop.jl:54 within `simd_index`
; ││┌ @ int.jl:87 within `+`
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%vec.phi = phi <2 x double> [ %15, %vector.ph ], [ %20, %vector.body ]
%vec.phi78 = phi <2 x double> [ splat (double -0.000000e+00), %vector.ph ], [ %21, %vector.body ]
%vec.phi79 = phi <2 x double> [ splat (double -0.000000e+00), %vector.ph ], [ %22, %vector.body ]
%vec.phi80 = phi <2 x double> [ splat (double -0.000000e+00), %vector.ph ], [ %23, %vector.body ]
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ reduce.jl:262
; │┌ @ /Users/mose/.julia/packages/FixedSizeArrays/22QFl/src/FixedSizeArray.jl:167 within `getindex` @ essentials.jl:386
%16 = getelementptr double, ptr %invariant.gep, i64 %index
%17 = getelementptr i8, ptr %16, i64 16
%18 = getelementptr i8, ptr %16, i64 32
%19 = getelementptr i8, ptr %16, i64 48
%wide.load = load <2 x double>, ptr %16, align 8
%wide.load81 = load <2 x double>, ptr %17, align 8
%wide.load82 = load <2 x double>, ptr %18, align 8
%wide.load83 = load <2 x double>, ptr %19, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ reduce.jl:263
; │┌ @ reduce.jl:19 within `add_sum`
; ││┌ @ float.jl:492 within `+`
%20 = fadd reassoc contract <2 x double> %vec.phi, %wide.load
%21 = fadd reassoc contract <2 x double> %vec.phi78, %wide.load81
%22 = fadd reassoc contract <2 x double> %vec.phi79, %wide.load82
%23 = fadd reassoc contract <2 x double> %vec.phi80, %wide.load83
; │└└
; │ @ simdloop.jl:76 within `macro expansion`
; │┌ @ simdloop.jl:54 within `simd_index`
; ││┌ @ int.jl:87 within `+`
%index.next = add nuw i64 %index, 8
%24 = icmp eq i64 %index.next, %n.vec
br i1 %24, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
; │└└
; │ @ simdloop.jl:75 within `macro expansion`
%bin.rdx = fadd reassoc contract <2 x double> %21, %20
%bin.rdx84 = fadd reassoc contract <2 x double> %22, %bin.rdx
%bin.rdx85 = fadd reassoc contract <2 x double> %23, %bin.rdx84
%25 = call reassoc contract double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> %bin.rdx85)
%cmp.n = icmp eq i64 %14, %n.vec
br i1 %cmp.n, label %common.ret, label %L95
L95: ; preds = %L95, %middle.block, %L95.lr.ph
%value_phi2377 = phi i64 [ %26, %L95 ], [ 0, %L95.lr.ph ], [ %n.vec, %middle.block ]
%value_phi2276 = phi double [ %28, %L95 ], [ %8, %L95.lr.ph ], [ %25, %middle.block ]
; │ @ simdloop.jl:76 within `macro expansion`
; │┌ @ simdloop.jl:54 within `simd_index`
; ││┌ @ int.jl:87 within `+`
%26 = add nuw nsw i64 %value_phi2377, 1
; │└└
; │ @ simdloop.jl:77 within `macro expansion` @ reduce.jl:262
; │┌ @ /Users/mose/.julia/packages/FixedSizeArrays/22QFl/src/FixedSizeArray.jl:167 within `getindex` @ essentials.jl:386
%gep = getelementptr double, ptr %invariant.gep, i64 %value_phi2377
%27 = load double, ptr %gep, align 8
; │└
; │ @ simdloop.jl:77 within `macro expansion` @ reduce.jl:263
; │┌ @ reduce.jl:19 within `add_sum`
; ││┌ @ float.jl:492 within `+`
%28 = fadd reassoc contract double %value_phi2276, %27
; │└└
; │ @ simdloop.jl:75 within `macro expansion`
; │┌ @ int.jl:83 within `<`
%exitcond.not = icmp eq i64 %value_phi2377, %11
; │└
br i1 %exitcond.not, label %common.ret, label %L95
L142: ; preds = %L26
; └
; @ reduce.jl:268 within `mapreduce_impl`
; ┌ @ int.jl:542 within `>>` @ int.jl:535
%29 = ashr i64 %4, 1
; └
; ┌ @ int.jl:87 within `+`
%30 = add i64 %29, %"ifirst::Int64"
; └
; @ reduce.jl:269 within `mapreduce_impl`
store ptr %memoryref_mem, ptr %0, align 8
%31 = call double @julia_mapreduce_impl_7339(ptr nocapture readonly %"A::FixedSizeArray", ptr nocapture nonnull readonly %0, i64 signext %"ifirst::Int64", i64 signext %30, i64 signext %"blksize::Int64")
; @ reduce.jl:270 within `mapreduce_impl`
; ┌ @ int.jl:87 within `+`
%32 = add i64 %30, 1
; └
store ptr %memoryref_mem, ptr %1, align 8
%33 = call double @julia_mapreduce_impl_7339(ptr nocapture readonly %"A::FixedSizeArray", ptr nocapture nonnull readonly %1, i64 signext %32, i64 signext %"ilast::Int64", i64 signext %"blksize::Int64")
; @ reduce.jl:271 within `mapreduce_impl`
; ┌ @ reduce.jl:19 within `add_sum`
; │┌ @ float.jl:492 within `+`
%34 = fadd double %31, %33
; └└
br label %common.ret
}
Base.mapreduce_impl
is a recursive function to do pairwaise summation, so even a small overhead at the beginning of the generated code for the function scales with the size of the input data.