Skip to content

copyto! does not vectorize for views with UnitRange indices #53430

@jishnub

Description

@jishnub
julia> a = rand(100,100); b = similar(a); av = view(a, axes(a)...); bv = view(b, axes(b)...); bv2 = view(b, UnitRange.(axes(b))...);

julia> @btime copyto!($bv, $av); # fast, indices are Base.OneTos
  2.333 μs (0 allocations: 0 bytes)

julia> @btime copyto!($bv2, $av); # slow, indices are UnitRanges
  24.093 μs (0 allocations: 0 bytes)

This performance difference appears to arise from a lack of vectorization in indexing. In the first case, the output of

@code_llvm Base.copyto_unaliased!(IndexCartesian(), bv, IndexCartesian(), av);

contains

vector.body:                                      ; preds = %vector.body, %vector.ph
    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; └└
;  @ abstractarray.jl:1105 within `copyto_unaliased!`
; ┌ @ abstractarray.jl:1335 within `getindex`
; │┌ @ abstractarray.jl:1381 within `_getindex`
; ││┌ @ subarray.jl:317 within `getindex` @ array.jl:924 @ essentials.jl:892
     %26 = add i64 %index, %20
     %27 = getelementptr inbounds double, ptr %12, i64 %26
     %wide.load = load <4 x double>, ptr %27, align 8
     %28 = getelementptr inbounds double, ptr %27, i64 4
     %wide.load1001 = load <4 x double>, ptr %28, align 8
     %29 = getelementptr inbounds double, ptr %27, i64 8
     %wide.load1002 = load <4 x double>, ptr %29, align 8
     %30 = getelementptr inbounds double, ptr %27, i64 12
     %wide.load1003 = load <4 x double>, ptr %30, align 8
; └└└
; ┌ @ abstractarray.jl:1436 within `setindex!`
; │┌ @ abstractarray.jl:1466 within `_setindex!`
; ││┌ @ subarray.jl:369 within `setindex!` @ array.jl:988
     %31 = add i64 %index, %21
     %32 = getelementptr inbounds double, ptr %13, i64 %31
     store <4 x double> %wide.load, ptr %32, align 8
     %33 = getelementptr inbounds double, ptr %32, i64 4
     store <4 x double> %wide.load1001, ptr %33, align 8
     %34 = getelementptr inbounds double, ptr %32, i64 8
     store <4 x double> %wide.load1002, ptr %34, align 8
     %35 = getelementptr inbounds double, ptr %32, i64 12
     store <4 x double> %wide.load1003, ptr %35, align 8
     %index.next = add nuw i64 %index, 16
     %36 = icmp eq i64 %index.next, %n.vec
     br i1 %36, label %middle.block, label %vector.body

whereas

@code_llvm Base.copyto_unaliased!(IndexCartesian(), bv2, IndexCartesian(), av);

contains

;  @ abstractarray.jl:1105 within `copyto_unaliased!`
; ┌ @ abstractarray.jl:1335 within `getindex`
; │┌ @ abstractarray.jl:1381 within `_getindex`
; ││┌ @ subarray.jl:317 within `getindex` @ array.jl:924
; │││┌ @ abstractarray.jl:1370 within `_to_linear_index`
; ││││┌ @ abstractarray.jl:3080 within `_sub2ind` @ abstractarray.jl:3096
; │││││┌ @ abstractarray.jl:3112 within `_sub2ind_recurse` @ abstractarray.jl:3112
; ││││││┌ @ abstractarray.jl:3119 within `offsetin`
; │││││││┌ @ int.jl:86 within `-`
          %19 = add i64 %value_phi71, -1
; ││││││└└
; ││││││┌ @ int.jl:88 within `*`
         %20 = mul i64 %"src::SubArray.parent190.size209.sroa.0.0.copyload", %19
; ││││││└
; ││││││┌ @ int.jl:87 within `+`
         %21 = add i64 %value_phi70, -1
; │││└└└└
; │││ @ subarray.jl:317 within `getindex` @ array.jl:924 @ essentials.jl:892
     %22 = add i64 %21, %20
     %23 = getelementptr inbounds double, ptr %16, i64 %22
     %24 = load double, ptr %23, align 8
; └└└
; ┌ @ abstractarray.jl:1436 within `setindex!`
; │┌ @ abstractarray.jl:1466 within `_setindex!`
; ││┌ @ subarray.jl:369 within `setindex!`
; │││┌ @ subarray.jl:295 within `reindex` @ subarray.jl:295
; ││││┌ @ views.jl:150 within `maybeview`
; │││││┌ @ array.jl:3067 within `getindex`
; ││││││┌ @ range.jl:938 within `_getindex`
; │││││││┌ @ int.jl:87 within `+`
          %25 = add i64 %value_phi71, -2
; │││└└└└└
; │││ @ subarray.jl:369 within `setindex!` @ array.jl:988
; │││┌ @ abstractarray.jl:1370 within `_to_linear_index`
; ││││┌ @ abstractarray.jl:3080 within `_sub2ind` @ abstractarray.jl:3096
; │││││┌ @ abstractarray.jl:3112 within `_sub2ind_recurse` @ abstractarray.jl:3112
; ││││││┌ @ abstractarray.jl:3119 within `offsetin`
; │││││││┌ @ int.jl:86 within `-`
          %26 = add i64 %25, %"dest::SubArray.indices_ptr[2]_ptr.start_ptr.unbox"
; ││││││└└
; ││││││┌ @ int.jl:88 within `*`
         %27 = mul i64 %"dest::SubArray.parent.size298.sroa.0.0.copyload", %26
; │││└└└└
; │││ @ subarray.jl:369 within `setindex!`
; │││┌ @ subarray.jl:295 within `reindex`
; ││││┌ @ views.jl:150 within `maybeview`
; │││││┌ @ array.jl:3067 within `getindex`
; ││││││┌ @ range.jl:938 within `_getindex`
; │││││││┌ @ int.jl:87 within `+`
          %28 = add i64 %value_phi70, -2
; │││└└└└└
; │││ @ subarray.jl:369 within `setindex!` @ array.jl:988
; │││┌ @ abstractarray.jl:1370 within `_to_linear_index`
; ││││┌ @ abstractarray.jl:3080 within `_sub2ind` @ abstractarray.jl:3096
; │││││┌ @ abstractarray.jl:3112 within `_sub2ind_recurse` @ abstractarray.jl:3112
; ││││││┌ @ int.jl:87 within `+`
         %29 = add i64 %28, %"dest::SubArray.indices_ptr[1]_ptr.start_ptr.unbox"
; │││└└└└
     %30 = add i64 %29, %27
     %31 = getelementptr inbounds double, ptr %17, i64 %30
     store double %24, ptr %31, align 8
; └└└

In particular, if I add a @simd declaration, this appears to improve performance considerably:

julia> function mycopyto!(dest::AbstractArray, src::AbstractArray)
           iterdest = eachindex(dest)
           itersrc = eachindex(src)
           if iterdest == itersrc
               @inbounds @simd for I in iterdest
                   if isassigned(src, I)
                       dest[I] = src[I]
                   else
                       Base._unsetindex!(dest, I)
                   end
               end
           else
               @invoke copyto!(dest::AbstractArray, src::AbstractArray)
           end
           return dest
       end
mycopyto! (generic function with 1 method)

julia> @btime mycopyto!($bv2, $av);
  2.276 μs (0 allocations: 0 bytes)

Versioninfo:

julia> versioninfo()
Julia Version 1.12.0-DEV.50
Commit 8425b0ea03* (2024-02-21 20:33 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 8 × 11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz
  WORD_SIZE: 64
  LLVM: libLLVM-16.0.6 (ORCJIT, tigerlake)
Threads: 1 default, 0 interactive, 1 GC (on 8 virtual cores)
Environment:
  LD_LIBRARY_PATH = :/usr/lib/x86_64-linux-gnu/gtk-3.0/modules
  JULIA_EDITOR = subl

Metadata

Metadata

Assignees

No one assigned

    Labels

    arrays[a, r, r, a, y, s]performanceMust go fasterregressionRegression in behavior compared to a previous versionregression 1.11Regression in the 1.11 release

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions