Description
julia> a = rand(100,100); b = similar(a); av = view(a, axes(a)...); bv = view(b, axes(b)...); bv2 = view(b, UnitRange.(axes(b))...);
julia> @btime copyto!($bv, $av); # fast, indices are Base.OneTos
2.333 μs (0 allocations: 0 bytes)
julia> @btime copyto!($bv2, $av); # slow, indices are UnitRanges
24.093 μs (0 allocations: 0 bytes)
This performance difference appears to arise from a lack of vectorization in indexing. In the first case, the output of
@code_llvm Base.copyto_unaliased!(IndexCartesian(), bv, IndexCartesian(), av);
contains
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; └└
; @ abstractarray.jl:1105 within `copyto_unaliased!`
; ┌ @ abstractarray.jl:1335 within `getindex`
; │┌ @ abstractarray.jl:1381 within `_getindex`
; ││┌ @ subarray.jl:317 within `getindex` @ array.jl:924 @ essentials.jl:892
%26 = add i64 %index, %20
%27 = getelementptr inbounds double, ptr %12, i64 %26
%wide.load = load <4 x double>, ptr %27, align 8
%28 = getelementptr inbounds double, ptr %27, i64 4
%wide.load1001 = load <4 x double>, ptr %28, align 8
%29 = getelementptr inbounds double, ptr %27, i64 8
%wide.load1002 = load <4 x double>, ptr %29, align 8
%30 = getelementptr inbounds double, ptr %27, i64 12
%wide.load1003 = load <4 x double>, ptr %30, align 8
; └└└
; ┌ @ abstractarray.jl:1436 within `setindex!`
; │┌ @ abstractarray.jl:1466 within `_setindex!`
; ││┌ @ subarray.jl:369 within `setindex!` @ array.jl:988
%31 = add i64 %index, %21
%32 = getelementptr inbounds double, ptr %13, i64 %31
store <4 x double> %wide.load, ptr %32, align 8
%33 = getelementptr inbounds double, ptr %32, i64 4
store <4 x double> %wide.load1001, ptr %33, align 8
%34 = getelementptr inbounds double, ptr %32, i64 8
store <4 x double> %wide.load1002, ptr %34, align 8
%35 = getelementptr inbounds double, ptr %32, i64 12
store <4 x double> %wide.load1003, ptr %35, align 8
%index.next = add nuw i64 %index, 16
%36 = icmp eq i64 %index.next, %n.vec
br i1 %36, label %middle.block, label %vector.body
whereas
@code_llvm Base.copyto_unaliased!(IndexCartesian(), bv2, IndexCartesian(), av);
contains
; @ abstractarray.jl:1105 within `copyto_unaliased!`
; ┌ @ abstractarray.jl:1335 within `getindex`
; │┌ @ abstractarray.jl:1381 within `_getindex`
; ││┌ @ subarray.jl:317 within `getindex` @ array.jl:924
; │││┌ @ abstractarray.jl:1370 within `_to_linear_index`
; ││││┌ @ abstractarray.jl:3080 within `_sub2ind` @ abstractarray.jl:3096
; │││││┌ @ abstractarray.jl:3112 within `_sub2ind_recurse` @ abstractarray.jl:3112
; ││││││┌ @ abstractarray.jl:3119 within `offsetin`
; │││││││┌ @ int.jl:86 within `-`
%19 = add i64 %value_phi71, -1
; ││││││└└
; ││││││┌ @ int.jl:88 within `*`
%20 = mul i64 %"src::SubArray.parent190.size209.sroa.0.0.copyload", %19
; ││││││└
; ││││││┌ @ int.jl:87 within `+`
%21 = add i64 %value_phi70, -1
; │││└└└└
; │││ @ subarray.jl:317 within `getindex` @ array.jl:924 @ essentials.jl:892
%22 = add i64 %21, %20
%23 = getelementptr inbounds double, ptr %16, i64 %22
%24 = load double, ptr %23, align 8
; └└└
; ┌ @ abstractarray.jl:1436 within `setindex!`
; │┌ @ abstractarray.jl:1466 within `_setindex!`
; ││┌ @ subarray.jl:369 within `setindex!`
; │││┌ @ subarray.jl:295 within `reindex` @ subarray.jl:295
; ││││┌ @ views.jl:150 within `maybeview`
; │││││┌ @ array.jl:3067 within `getindex`
; ││││││┌ @ range.jl:938 within `_getindex`
; │││││││┌ @ int.jl:87 within `+`
%25 = add i64 %value_phi71, -2
; │││└└└└└
; │││ @ subarray.jl:369 within `setindex!` @ array.jl:988
; │││┌ @ abstractarray.jl:1370 within `_to_linear_index`
; ││││┌ @ abstractarray.jl:3080 within `_sub2ind` @ abstractarray.jl:3096
; │││││┌ @ abstractarray.jl:3112 within `_sub2ind_recurse` @ abstractarray.jl:3112
; ││││││┌ @ abstractarray.jl:3119 within `offsetin`
; │││││││┌ @ int.jl:86 within `-`
%26 = add i64 %25, %"dest::SubArray.indices_ptr[2]_ptr.start_ptr.unbox"
; ││││││└└
; ││││││┌ @ int.jl:88 within `*`
%27 = mul i64 %"dest::SubArray.parent.size298.sroa.0.0.copyload", %26
; │││└└└└
; │││ @ subarray.jl:369 within `setindex!`
; │││┌ @ subarray.jl:295 within `reindex`
; ││││┌ @ views.jl:150 within `maybeview`
; │││││┌ @ array.jl:3067 within `getindex`
; ││││││┌ @ range.jl:938 within `_getindex`
; │││││││┌ @ int.jl:87 within `+`
%28 = add i64 %value_phi70, -2
; │││└└└└└
; │││ @ subarray.jl:369 within `setindex!` @ array.jl:988
; │││┌ @ abstractarray.jl:1370 within `_to_linear_index`
; ││││┌ @ abstractarray.jl:3080 within `_sub2ind` @ abstractarray.jl:3096
; │││││┌ @ abstractarray.jl:3112 within `_sub2ind_recurse` @ abstractarray.jl:3112
; ││││││┌ @ int.jl:87 within `+`
%29 = add i64 %28, %"dest::SubArray.indices_ptr[1]_ptr.start_ptr.unbox"
; │││└└└└
%30 = add i64 %29, %27
%31 = getelementptr inbounds double, ptr %17, i64 %30
store double %24, ptr %31, align 8
; └└└
In particular, if I add a @simd
declaration, this appears to improve performance considerably:
julia> function mycopyto!(dest::AbstractArray, src::AbstractArray)
iterdest = eachindex(dest)
itersrc = eachindex(src)
if iterdest == itersrc
@inbounds @simd for I in iterdest
if isassigned(src, I)
dest[I] = src[I]
else
Base._unsetindex!(dest, I)
end
end
else
@invoke copyto!(dest::AbstractArray, src::AbstractArray)
end
return dest
end
mycopyto! (generic function with 1 method)
julia> @btime mycopyto!($bv2, $av);
2.276 μs (0 allocations: 0 bytes)
Versioninfo:
julia> versioninfo()
Julia Version 1.12.0-DEV.50
Commit 8425b0ea03* (2024-02-21 20:33 UTC)
Platform Info:
OS: Linux (x86_64-linux-gnu)
CPU: 8 × 11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz
WORD_SIZE: 64
LLVM: libLLVM-16.0.6 (ORCJIT, tigerlake)
Threads: 1 default, 0 interactive, 1 GC (on 8 virtual cores)
Environment:
LD_LIBRARY_PATH = :/usr/lib/x86_64-linux-gnu/gtk-3.0/modules
JULIA_EDITOR = subl