Skip to content

Commit a671a3b

Browse files
authored
make renaming perform copy in transform and transform! (#2721)
1 parent ff0d799 commit a671a3b

File tree

4 files changed

+76
-13
lines changed

4 files changed

+76
-13
lines changed

NEWS.md

+5
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,11 @@
1111
* `mapcols!` makes sure not to create columns being `AbstractRange` consistently
1212
with other methods that add columns to a `DataFrame`
1313
([#2594](https://github.com/JuliaData/DataFrames.jl/pull/2594))
14+
* `transform` and `transform!` always copy columns when a column renaming transformation
15+
is passed. If similar issues are identified after 1.0 release (i.e. that a
16+
copy of data is not made in scenarios where it normally should be made), these
17+
will be considered bugs and fixed as non-breaking changes
18+
([#2721](https://github.com/JuliaData/DataFrames.jl/pull/2721))
1419

1520
## New functionalities
1621

src/abstractdataframe/selection.jl

+34-11
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,8 @@ const TRANSFORMATION_COMMON_RULES =
138138
transformations that return the same object without copying may create
139139
column aliases even if `copycols=true`. An example of such a situation is
140140
`select!(df, :a, :a => :b, :a => identity => :c)`.
141+
As a special case in `transform` and `transform!` column renaming always
142+
copies columns to avoid storing aliased columns in the target data frame.
141143
142144
If `df` is a `SubDataFrame` and `copycols=true` then a `DataFrame` is
143145
returned and the same copying rules apply as for a `DataFrame` input: this
@@ -270,14 +272,14 @@ function normalize_selection(idx::AbstractIndex,
270272
allunique(combine_target_col) || throw(ArgumentError("target column names must be unique"))
271273
end
272274

273-
if wanttable
274-
combine_src = AsTable(c)
275-
else
275+
if wanttable
276+
combine_src = AsTable(c)
277+
else
276278
combine_src = (length(c) == 1 ? only(c) : c)
277279
end
278280

279281
combine_func = first(last(sel))
280-
282+
281283
return combine_src => combine_func => combine_target_col
282284
end
283285

@@ -338,9 +340,9 @@ function normalize_selection(idx::AbstractIndex,
338340
end
339341
end
340342

341-
if wanttable
342-
combine_src = AsTable(c)
343-
else
343+
if wanttable
344+
combine_src = AsTable(c)
345+
else
344346
combine_src = (length(c) == 1 ? only(c) : c)
345347
end
346348

@@ -656,8 +658,17 @@ $TRANSFORMATION_COMMON_RULES
656658
657659
See [`select`](@ref) for examples.
658660
"""
659-
transform!(df::DataFrame, @nospecialize(args...); renamecols::Bool=true) =
660-
select!(df, :, args..., renamecols=renamecols)
661+
function transform!(df::DataFrame, @nospecialize(args...); renamecols::Bool=true)
662+
idx = index(df)
663+
newargs = Any[if sel isa Pair{<:ColumnIndex, Symbol}
664+
idx[first(sel)] => copy => last(sel)
665+
elseif sel isa Pair{<:ColumnIndex, <:AbstractString}
666+
idx[first(sel)] => copy => Symbol(last(sel))
667+
else
668+
sel
669+
end for sel in args]
670+
return select!(df, :, newargs..., renamecols=renamecols)
671+
end
661672

662673
function transform!(@nospecialize(arg::Base.Callable), df::AbstractDataFrame; renamecols::Bool=true)
663674
if arg isa Colon
@@ -978,8 +989,20 @@ ERROR: ArgumentError: column :x in returned data frame is not equal to grouping
978989
979990
See [`select`](@ref) for more examples.
980991
"""
981-
transform(df::AbstractDataFrame, @nospecialize(args...); copycols::Bool=true, renamecols::Bool=true) =
982-
select(df, :, args..., copycols=copycols, renamecols=renamecols)
992+
function transform(df::AbstractDataFrame, @nospecialize(args...); copycols::Bool=true, renamecols::Bool=true)
993+
idx = index(df)
994+
# plain renaming pairs are rewritten to use `copy`, so the source column
995+
# is copied exactly once, even if copycols=true
996+
# (copycols=true makes a copy only if the column was not copied previously)
997+
newargs = Any[if sel isa Pair{<:ColumnIndex, Symbol}
998+
idx[first(sel)] => copy => last(sel)
999+
elseif sel isa Pair{<:ColumnIndex, <:AbstractString}
1000+
idx[first(sel)] => copy => Symbol(last(sel))
1001+
else
1002+
sel
1003+
end for sel in args]
1004+
return select(df, :, newargs..., copycols=copycols, renamecols=renamecols)
1005+
end
9831006

9841007
function transform(@nospecialize(arg::Base.Callable), df::AbstractDataFrame; renamecols::Bool=true)
9851008
if arg isa Colon

src/other/precompile.jl

+7
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,9 @@ function precompile(all=false)
134134
end
135135
Base.precompile(Tuple{typeof(row_group_slots),Tuple{RepeatedVector{Union{Missing, Int}}, RepeatedVector{Union{Missing, Int}}, RepeatedVector{Union{Missing, String}}, Vector{Union{Missing, Int}}},Val{false},Vector{Int},Bool,Bool})
136136
Base.precompile(Tuple{typeof(combine),Union{Function, Type},GroupedDataFrame{DataFrame}})
137+
Base.precompile(Tuple{typeof(select_transform!),Base.RefValue{Any},DataFrame,DataFrame,Set{Symbol},Bool,Base.RefValue{Bool}})
138+
Base.precompile(Tuple{typeof(transform!),DataFrame,Any})
139+
Base.precompile(Tuple{typeof(transform),DataFrame,Any})
137140
else
138141
Base.precompile(Tuple{typeof(_sortperm),DataFrame,Base.Sort.MergeSortAlg,DFPerm{Vector{Ordering}, Tuple{Vector{Int}, Vector{Int}, Vector{Int}}}})
139142
Base.precompile(Tuple{typeof(flatten),DataFrame,All{Tuple{}}})
@@ -1030,6 +1033,10 @@ function precompile(all=false)
10301033
end
10311034
Base.precompile(Tuple{typeof(row_group_slots),Tuple{Vector{Float64}},Val{false},Vector{Int},Bool,Bool})
10321035
Base.precompile(Tuple{typeof(transform),DataFrame,Any})
1036+
Base.precompile(Tuple{typeof(transform!),DataFrame,Any})
1037+
Base.precompile(Tuple{typeof(select_transform!),Base.RefValue{Any},DataFrame,DataFrame,Set{Symbol},Bool,Base.RefValue{Bool}})
1038+
Base.precompile(Tuple{Core.kwftype(typeof(select)),NamedTuple{(:copycols, :renamecols), Tuple{Bool, Bool}},typeof(select),DataFrame,Any,Any})
1039+
Base.precompile(Tuple{Core.kwftype(typeof(select!)),NamedTuple{(:renamecols,), Tuple{Bool}},typeof(select!),DataFrame,Any,Any})
10331040
Base.precompile(Tuple{typeof(push!),DataFrame,Dict{Symbol, Any}})
10341041
Base.precompile(Tuple{typeof(_transformation_helper),DataFrame,Base.OneTo{Int},Base.RefValue{Any}})
10351042
Base.precompile(Tuple{typeof(_semijoin_sorted),OnCol{Tuple{Vector{Union{Missing, Int}}, Vector{Union{Missing, Int}}, Vector{Union{Missing, Int}}}, 3},OnCol{Tuple{PooledVector{Union{Missing, Int}, UInt32, Vector{UInt32}}, PooledVector{Union{Missing, Int}, UInt32, Vector{UInt32}}, PooledVector{Union{Missing, Int}, UInt32, Vector{UInt32}}}, 3},BitVector})

test/select.jl

+30-2
Original file line numberDiff line numberDiff line change
@@ -1080,13 +1080,17 @@ end
10801080

10811081
df2 = transform(df, [:x1, :x2] => +, :x2 => :x3, copycols=false)
10821082
@test df2 == select(df, :, [:x1, :x2] => +, :x2 => :x3)
1083-
@test df.x2 === df2.x2 === df2.x3
1083+
@test df.x2 == df2.x2 == df2.x3
1084+
@test df.x2 === df2.x2
1085+
@test df.x2 !== df2.x3
10841086
@test_throws ArgumentError transform(view(df, :, :), [:x1, :x2] => +, :x2 => :x3, copycols=false)
10851087

10861088
x2 = df.x2
10871089
transform!(df, [:x1, :x2] => +, :x2 => :x3)
10881090
@test df == df2
1089-
@test x2 === df.x2 === df.x3
1091+
@test x2 == df.x2 == df.x3
1092+
@test x2 === df.x2
1093+
@test x2 !== df.x3
10901094

10911095
@test transform(df) == df
10921096
df2 = transform(df, copycols=false)
@@ -1620,4 +1624,28 @@ end
16201624
[:a] => sum => ["new"], false) == (1 => (sum => [:new]))
16211625
end
16221626

1627+
@testset "copying in transform when renaming" begin
1628+
for oldcol in (:a, "a", 1), newcol in (:b, "b")
1629+
df = DataFrame(a=1)
1630+
df2 = transform(df, oldcol => newcol)
1631+
@test df2.b == df2.a == df.a
1632+
@test df2.b !== df2.a
1633+
@test df2.b !== df.a
1634+
@test df2.a !== df.a
1635+
1636+
df2 = transform(df, oldcol => newcol, copycols=false)
1637+
@test df2.b == df2.a == df.a
1638+
@test df2.b !== df2.a
1639+
@test df2.b !== df.a
1640+
@test df2.a === df.a
1641+
1642+
a = df.a
1643+
transform!(df, oldcol => newcol)
1644+
@test df.b == df.a == a
1645+
@test df.b !== df.a
1646+
@test df.b !== a
1647+
@test df.a === a
1648+
end
1649+
end
1650+
16231651
end # module

0 commit comments

Comments
 (0)