Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix depwarns of getindex #1585

Merged
merged 14 commits into from
Nov 8, 2018
4 changes: 3 additions & 1 deletion src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1035,7 +1035,9 @@ function _vcat(dfs::AbstractVector{<:AbstractDataFrame})
length(header) == 0 && return DataFrame()
cols = Vector{AbstractVector}(undef, length(header))
for (i, name) in enumerate(header)
data = [df[name] for df in dfs]
# TODO: replace with commented out code after getindex deprecation
# data = [df[name] for df in dfs]
data = [df isa DataFrame ? df[name] : df[:, name] for df in dfs]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it be better to always use view here (even if that's temporary)? Same for completecases and maybe other places.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could do this, but this would be breaking.
That is: currently df[name] when df is a SubDataFrame produces a copy and so I used df[:, name] to still make a copy temporarily (it will be a view after getindex deprecation).

Do you think making a view now would be better?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But here it's an implementation detail which shouldn't have any user-visible consequences, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah - right. I mixed it up with hcat! (this PR is really long). I will push a PR changing copy to view in places where it is not user visible.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have pushed a PR implementing what you have requested. Actually it is a good way to test if something will not break in the future. The only limitation we have is with vcat with which you have started (I have noted it in the code).

lens = map(length, data)
T = mapreduce(eltype, promote_type, data)
cols[i] = Tables.allocatecolumn(T, sum(lens))
Expand Down
246 changes: 246 additions & 0 deletions src/deprecated.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1387,3 +1387,249 @@ import Base: delete!, insert!, merge!

import Base: setindex!
@deprecate setindex!(df::DataFrame, x::Nothing, col_ind::Int) deletecols!(df, col_ind)

# TODO: START: Deprecations to be removed after getindex deprecation period finishes

function Base.iterate(itr::Cols{SubDataFrame}, st=1)
st > length(itr.df) && return nothing
# after deprecation we will return a view, now we materialize a vector
return (itr.df[:, st], st + 1)
end

# after deprecation we will return a view, now we materialize a vector
Base.get(df::SubDataFrame, key::Any, default::Any) = haskey(df, key) ? df[:, key] : default

function completecases(df::SubDataFrame)
res = trues(size(df, 1))
for i in 1:size(df, 2)
_nonmissing!(res, df[:, i])
end
res
end

function completecases(df::SubDataFrame, col::Union{Integer, Symbol})
res = trues(size(df, 1))
_nonmissing!(res, df[:, col])
res
end

completecases(df::SubDataFrame, cols::AbstractVector) =
completecases(df[:, cols])

nonunique(df::SubDataFrame, cols::Union{Real, Symbol}) = nonunique(df[:, [cols]])
nonunique(df::SubDataFrame, cols::Any) = nonunique(df[:, cols])

function Base.hash(df::SubDataFrame, h::UInt)
h += hashdf_seed
h += hash(size(df))
for i in 1:size(df, 2)
h = hash(df[:, i], h)
end
return h
end

function Base.iterate(itr::DFColumnIterator{<:SubDataFrame}, j=1)
j > size(itr.df, 2) && return nothing
return ((_names(itr.df)[j], itr.df[:, j]), j + 1)
end

Base.getindex(itr::DFColumnIterator{<:SubDataFrame}, j) = itr.df[:, j]

function showrowindices(io::IO,
df::SubDataFrame,
rowindices::AbstractVector{Int},
maxwidths::Vector{Int},
leftcol::Int,
rightcol::Int) # -> Void
rowmaxwidth = maxwidths[end]

for i in rowindices
# Print row ID
@printf io "│ %d" i
padding = rowmaxwidth - ndigits(i)
for _ in 1:padding
write(io, ' ')
end
print(io, " │ ")
# Print DataFrame entry
for j in leftcol:rightcol
strlen = 0
if isassigned(df[:, j], i)
s = df[i, j]
strlen = ourstrwidth(s)
if ismissing(s)
printstyled(io, s, color=:light_black)
elseif s === nothing
strlen = 0
else
ourshowcompact(io, s)
end
else
strlen = ourstrwidth(Base.undef_ref_str)
ourshowcompact(io, Base.undef_ref_str)
end
padding = maxwidths[j] - strlen
for _ in 1:padding
write(io, ' ')
end
if j == rightcol
if i == rowindices[end]
print(io, " │")
else
print(io, " │\n")
end
else
print(io, " │ ")
end
end
end
return
end

function showrows(io::IO,
df::SubDataFrame,
rowindices1::AbstractVector{Int},
rowindices2::AbstractVector{Int},
maxwidths::Vector{Int},
splitcols::Bool = false,
allcols::Bool = false,
rowlabel::Symbol = :Row,
displaysummary::Bool = true) # -> Void
ncols = size(df, 2)

if isempty(rowindices1)
if displaysummary
println(io, summary(df))
end
return
end

rowmaxwidth = maxwidths[ncols + 1]
chunkbounds = getchunkbounds(maxwidths, splitcols, displaysize(io)[2])
nchunks = allcols ? length(chunkbounds) - 1 : min(length(chunkbounds) - 1, 1)

header = displaysummary ? summary(df) : ""
if !allcols && length(chunkbounds) > 2
header *= ". Omitted printing of $(chunkbounds[end] - chunkbounds[2]) columns"
end
println(io, header)

for chunkindex in 1:nchunks
leftcol = chunkbounds[chunkindex] + 1
rightcol = chunkbounds[chunkindex + 1]

# Print column names
@printf io "│ %s" rowlabel
padding = rowmaxwidth - ourstrwidth(rowlabel)
for itr in 1:padding
write(io, ' ')
end
print(io, " │ ")
for j in leftcol:rightcol
s = _names(df)[j]
ourshowcompact(io, s)
padding = maxwidths[j] - ourstrwidth(s)
for itr in 1:padding
write(io, ' ')
end
if j == rightcol
print(io, " │\n")
else
print(io, " │ ")
end
end

# Print column types
print(io, "│ ")
padding = rowmaxwidth
for itr in 1:padding
write(io, ' ')
end
print(io, " │ ")
for j in leftcol:rightcol
s = compacttype(eltype(df[:, j]), maxwidths[j])
printstyled(io, s, color=:light_black)
padding = maxwidths[j] - ourstrwidth(s)
for itr in 1:padding
write(io, ' ')
end
if j == rightcol
print(io, " │\n")
else
print(io, " │ ")
end
end

# Print table bounding line
write(io, '├')
for itr in 1:(rowmaxwidth + 2)
write(io, '─')
end
write(io, '┼')
for j in leftcol:rightcol
for itr in 1:(maxwidths[j] + 2)
write(io, '─')
end
if j < rightcol
write(io, '┼')
else
write(io, '┤')
end
end
write(io, '\n')

# Print main table body, potentially in two abbreviated sections
showrowindices(io,
df,
rowindices1,
maxwidths,
leftcol,
rightcol)

if !isempty(rowindices2)
print(io, "\n⋮\n")
showrowindices(io,
df,
rowindices2,
maxwidths,
leftcol,
rightcol)
end

# Print newlines to separate chunks
if chunkindex < nchunks
print(io, "\n\n")
end
end

return
end

function hcat!(df1::DataFrame, df2::SubDataFrame; makeunique::Bool=false)
u = add_names(index(df1), index(df2), makeunique=makeunique)
for i in 1:length(u)
# after deprecation period of getindex we will store a view here
df1[u[i]] = df2[:, i]
end
return df1
end

function Base.append!(df1::DataFrame, df2::SubDataFrame)
_names(df1) == _names(df2) || error("Column names do not match")
nrows, ncols = size(df1)
try
for j in 1:ncols
append!(df1[j], df2[:, j])
end
catch err
# Undo changes in case of error
for j in 1:ncols
resize!(df1[j], nrows)
end
rethrow(err)
end
return df1
end


# TODO: END: Deprecations to be removed after getindex deprecation period finishes
3 changes: 2 additions & 1 deletion src/groupeddataframe/grouping.jl
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ combine(map(d -> mean(skipmissing(d[:c])), gd))
"""
function groupby(df::AbstractDataFrame, cols::Vector;
sort::Bool = false, skipmissing::Bool = false)
sdf = df[cols]
# TODO: decide if we want this or retain `sdf` as a SubDataFrame after deprecation
sdf = df isa SubDataFrame ? df[:, cols] : df[cols]
df_groups = group_rows(sdf, skipmissing)
# sort the groups
if sort
Expand Down
2 changes: 1 addition & 1 deletion src/subdataframe/subdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ ncol(sdf::SubDataFrame) = length(index(sdf))

function Base.getindex(sdf::SubDataFrame, colind::ColumnIndex)
Base.depwarn("`sdf[colind]` will create a view of `parent(sdf)[colind]` in the future." *
" Use sdf[:, [colind]]` to get a `DataFrame`.", :getindex)
" Use sdf[:, colind]` to get a freshly allocated vector.", :getindex)
return parent(sdf)[rows(sdf), colind]
end

Expand Down
12 changes: 9 additions & 3 deletions test/cat.jl
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,10 @@ module TestCat
df = DataFrame(Int, 4, 3)

# Assignment of rows
df[1, :] = df[1, :]
# TODO: re-enable after getindex deprecation period
# this has to make sure that DataFrameRow can be on RHS
# df[1, :] = df[1, :]
df[1, :] = df[1:1, :]
df[1:2, :] = df[1:2, :]
df[[true,false,false,true], :] = df[2:3, :]

Expand All @@ -108,7 +111,10 @@ module TestCat
df[:x3] = 2

# assignment of subtables
df[1, 1:2] = df[2, 2:3]
# TODO: re-enable after getindex deprecation period
# this has to make sure that DataFrameRow can be on RHS
# df[1, 1:2] = df[2, 2:3]
df[1, 1:2] = df[2:2, 2:3]
df[1:2, 1:2] = df[2:3, 2:3]
df[[true,false,false,true], 2:3] = df[1:2,1:2]

Expand Down Expand Up @@ -278,7 +284,7 @@ module TestCat
err = @test_throws ArgumentError vcat(df1, df2, df3, df4, df1, df2, df3, df4, df1, df2, df3, df4)
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, 5 and 9, column(s) B are missing from argument(s) 2, 6 and 10, and column(s) F are missing from argument(s) 3, 7 and 11"
end
x = view(DataFrame(A = Vector{Union{Missing, Int}}(1:3)), 2)
x = view(DataFrame(A = Vector{Union{Missing, Int}}(1:3)), 2:2, :)
y = DataFrame(A = 4:5)
@test vcat(x, y) == DataFrame(A = [2, 4, 5])
end
41 changes: 24 additions & 17 deletions test/data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ module TestData
@test df6[2, :C] == "two"
@test df6[:B] ≅ [1, 2, missing, 4]
@test size(df6[[2,3]], 2) == 2
@test size(df6[2,:], 1) == 1
# TODO: reenable after getinex deprecation period
# @test size(df6[2,:], 1) == 1
@test size(df6[2:2,:], 1) == 1
@test size(df6[[1, 3], [1, 3]]) == (2, 2)
@test size(df6[1:2, 1:2]) == (2, 2)
@test size(head(df6,2)) == (2, 3)
Expand Down Expand Up @@ -75,16 +77,20 @@ module TestData

#test_group("constructors")
# single index is rows
sdf6a = view(df6, 1)
sdf6b = view(df6, 2:3)
sdf6c = view(df6, [true, false, true, false])
sdf6a = view(df6, 1:1, :)
sdf6b = view(df6, 2:3, :)
sdf6c = view(df6, [true, false, true, false], :)
@test size(sdf6a) == (1,3)
sdf6d = view(df6, [1,3], :B)
sdf6d = view(df6, [1,3], [:B])
@test size(sdf6d) == (2,1)
sdf6e = view(df6, 0x01)
sdf6e = view(df6, [0x01], :)
@test size(sdf6e) == (1,3)
sdf6f = view(df6, UInt64[1, 2])
sdf6f = view(df6, UInt64[1, 2], :)
@test size(sdf6f) == (2,3)
# TODO: add those tests after getindex deprecation period
# sdf6g = view(df6, [1,3], :B)
# sdf6h = view(df6, 1, :B)
# sdf6i = view(df6, 1, [:B])

#test_group("ref")
@test sdf6a[1,2] == 4
Expand All @@ -104,18 +110,18 @@ module TestData
#test_group("groupby")
gd = groupby(df7, :d1)
@test length(gd) == 2
@test gd[1][:d2] ≅ d2[d1 .== 1]
@test gd[2][:d2] ≅ d2[d1 .== 2]
@test gd[1][:d3] == d3[d1 .== 1]
@test gd[2][:d3] == d3[d1 .== 2]
@test gd[1][:, :d2] ≅ d2[d1 .== 1]
@test gd[2][:, :d2] ≅ d2[d1 .== 2]
@test gd[1][:, :d3] == d3[d1 .== 1]
@test gd[2][:, :d3] == d3[d1 .== 2]

g1 = groupby(df7, [:d1, :d2])
g2 = groupby(df7, [:d2, :d1])
@test g1[1][:d3] == g2[1][:d3]
@test g1[1][:, :d3] == g2[1][:, :d3]

res = 0.0
for x in g1
res += sum(x[:d1])
res += sum(x[:, :d1])
end
@test res == sum(df7[:d1])

Expand Down Expand Up @@ -216,10 +222,11 @@ module TestData
d1s = stackdf(d1, [:a, :b])
@test d1s[1][[1,24]] == [:a, :b]
@test d1s[2][[1,24]] == [1, 4]
@test_broken d1s[1][true]
@test_broken d1s[1][1.0]
@test_broken d1s[2][true]
@test_broken d1s[2][1.0]
# TODO: enable those tests after deprecation period of getindex
# @test d1s[1][true]
# @test d1s[1][1.0]
# @test d1s[2][true]
# @test d1s[2][1.0]

# Those tests check indexing RepeatedVector/StackedVector by a vector
d1s[1][trues(24)] == d1s[1]
Expand Down
Loading