Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add rownumber #2356

Merged
merged 7 commits into from
Aug 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/src/lib/functions.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ length
ncol
ndims
nrow
rownumber
show
size
```
Expand Down
1 change: 1 addition & 0 deletions src/DataFrames.jl
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ export AbstractDataFrame,
rename,
repeat!,
rightjoin,
rownumber,
select!,
select,
semijoin,
Expand Down
10 changes: 1 addition & 9 deletions src/abstractdataframe/iteration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -83,15 +83,7 @@ eachrow(df::AbstractDataFrame) = DataFrameRows{typeof(df), typeof(index(df))}(df
# `DataFrameRows` uses linear indexing.
function Base.IndexStyle(::Type{<:DataFrameRows})
    return Base.IndexLinear()
end

# One-dimensional size: the number of rows of the wrapped data frame.
function Base.size(itr::DataFrameRows)
    nrows = size(parent(itr), 1)
    return (nrows,)
end

# Return the `i`-th row of the wrapped data frame as a `DataFrameRow`.
#
# The work is delegated to row indexing on the parent (`parent(itr)[i, :]`) so
# that rows are created by one code path for every kind of parent, instead of
# calling the `DataFrameRow` inner constructor here and having to replicate the
# `rownumber` bookkeeping for each parent type.
Base.@propagate_inbounds Base.getindex(itr::DataFrameRows, i::Int) = parent(itr)[i, :]

# separate methods are needed due to dispatch ambiguity
Base.getproperty(itr::DataFrameRows, col_ind::Symbol) =
Expand Down
120 changes: 110 additions & 10 deletions src/dataframerow/dataframerow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,19 +74,25 @@ julia> Vector(df[1, :])
```
"""
struct DataFrameRow{D<:AbstractDataFrame,S<:AbstractIndex}
# although we allow D to be AbstractDataFrame to support extensions
# in DataFrames.jl it will always be a DataFrame unless an inner constructor
# is used. In this way we have a fast access to the data frame that
# actually stores the data that DataFrameRow refers to
df::D
# presumably the index describing the columns of `df` visible through this row
# -- TODO confirm against `index(::DataFrameRow)`
colindex::S
# NOTE(review): both `row` and `dfrow` are listed here, and the later definition
# of the `row` accessor reads `:dfrow`; `row` looks like a leftover from before
# the rename -- confirm that only one of the two fields is meant to exist.
row::Int
dfrow::Int # row number in df
rownumber::Int # row number in the direct source AbstractDataFrame from which DataFrameRow was created

# NOTE(review): this form passes only three values to `new`, so no value is
# supplied for `rownumber`; it looks like the pre-`rownumber` constructor --
# confirm whether it should still be available.
@inline DataFrameRow(df::D, colindex::S, row::Union{Signed, Unsigned}) where
{D<:AbstractDataFrame,S<:AbstractIndex} = new{D,S}(df, colindex, row)
# Inner constructor taking both the row number in `df` (`row`) and the row
# number in the direct source data frame (`rownumber`).
@inline DataFrameRow(df::D, colindex::S, row::Union{Signed, Unsigned},
rownumber::Union{Signed, Unsigned}) where
{D<:AbstractDataFrame,S<:AbstractIndex} = new{D,S}(df, colindex, row, rownumber)
end

# Create a `DataFrameRow` referring to row `row` of `df`, restricted to the
# columns selected by `cols`.
#
# Throws `BoundsError` when `row` is not a valid row index of `df`; the check
# sits in a `@boundscheck` block, so callers may elide it with `@inbounds`.
Base.@propagate_inbounds function DataFrameRow(df::DataFrame, row::Integer, cols)
    @boundscheck if !checkindex(Bool, axes(df, 1), row)
        throw(BoundsError(df, (row, cols)))
    end
    # `df` is both the storage and the direct source of this row, so the row
    # number in `df` and the `rownumber` are the same value.
    return DataFrameRow(df, SubIndex(index(df), cols), row, row)
end

Base.@propagate_inbounds DataFrameRow(df::DataFrame, row::Bool, cols) =
Expand All @@ -101,7 +107,7 @@ Base.@propagate_inbounds function DataFrameRow(sdf::SubDataFrame, row::Integer,
else
colindex = SubIndex(index(parent(sdf)), parentcols(index(sdf), cols))
end
@inbounds DataFrameRow(parent(sdf), colindex, rows(sdf)[row])
@inbounds DataFrameRow(parent(sdf), colindex, rows(sdf)[row], row)
end

Base.@propagate_inbounds DataFrameRow(df::SubDataFrame, row::Bool, cols) =
Expand All @@ -110,7 +116,83 @@ Base.@propagate_inbounds DataFrameRow(df::SubDataFrame, row::Bool, cols) =
Base.@propagate_inbounds DataFrameRow(df::AbstractDataFrame, row::Integer) =
DataFrameRow(df, row, :)

# Row number of `r` in `parent(r)`, i.e. in the data frame that stores the data.
# `getfield` is used because property access on a `DataFrameRow` is reserved
# for column lookup.
row(r::DataFrameRow) = getfield(r, :dfrow)

"""
rownumber(dfr::DataFrameRow)

Return the row number of `dfr` in the `AbstractDataFrame` that it was created from.

Note that this may differ from the first element of the tuple returned by
`parentindices`. The latter gives the row number in `parent(dfr)`, which is
the source `DataFrame` that stores the data `dfr` gives access to.

# Examples
```julia
julia> df = DataFrame(reshape(1:12, 3, 4))
3×4 DataFrame
│ Row │ x1 │ x2 │ x3 │ x4 │
│ │ Int64 │ Int64 │ Int64 │ Int64 │
├─────┼───────┼───────┼───────┼───────┤
│ 1 │ 1 │ 4 │ 7 │ 10 │
│ 2 │ 2 │ 5 │ 8 │ 11 │
│ 3 │ 3 │ 6 │ 9 │ 12 │

julia> dfr = df[2, :]
DataFrameRow
│ Row │ x1 │ x2 │ x3 │ x4 │
│ │ Int64 │ Int64 │ Int64 │ Int64 │
├─────┼───────┼───────┼───────┼───────┤
│ 2 │ 2 │ 5 │ 8 │ 11 │

julia> rownumber(dfr)
2

julia> parentindices(dfr)
(2, Base.OneTo(4))

julia> parent(dfr)
3×4 DataFrame
│ Row │ x1 │ x2 │ x3 │ x4 │
│ │ Int64 │ Int64 │ Int64 │ Int64 │
├─────┼───────┼───────┼───────┼───────┤
│ 1 │ 1 │ 4 │ 7 │ 10 │
│ 2 │ 2 │ 5 │ 8 │ 11 │
│ 3 │ 3 │ 6 │ 9 │ 12 │

julia> dfv = @view df[2:3, 1:3]
2×3 SubDataFrame
│ Row │ x1 │ x2 │ x3 │
│ │ Int64 │ Int64 │ Int64 │
├─────┼───────┼───────┼───────┤
│ 1 │ 2 │ 5 │ 8 │
│ 2 │ 3 │ 6 │ 9 │

julia> dfrv = dfv[2, :]
DataFrameRow
│ Row │ x1 │ x2 │ x3 │
│ │ Int64 │ Int64 │ Int64 │
├─────┼───────┼───────┼───────┤
│ 3 │ 3 │ 6 │ 9 │

julia> rownumber(dfrv)
2

julia> parentindices(dfrv)
(3, 1:3)

julia> parent(dfrv)
3×4 DataFrame
│ Row │ x1 │ x2 │ x3 │ x4 │
│ │ Int64 │ Int64 │ Int64 │ Int64 │
├─────┼───────┼───────┼───────┼───────┤
│ 1 │ 1 │ 4 │ 7 │ 10 │
│ 2 │ 2 │ 5 │ 8 │ 11 │
│ 3 │ 3 │ 6 │ 9 │ 12 │
```
"""
function rownumber(r::DataFrameRow)
    # read the stored field directly with `getfield`
    return getfield(r, :rownumber)
end

# The data frame stored in the `df` field of `r`.
function Base.parent(r::DataFrameRow)
    return getfield(r, :df)
end

# Location of `r` inside `parent(r)`: its row number and its columns there.
function Base.parentindices(r::DataFrameRow)
    parentrow = row(r)
    cols = parentcols(index(r))
    return (parentrow, cols)
end

Expand All @@ -129,8 +211,17 @@ Base.@propagate_inbounds Base.getindex(df::AbstractDataFrame, rowind::Integer, :
DataFrameRow(df, rowind, :)
Base.@propagate_inbounds Base.getindex(r::DataFrameRow, idx::ColumnIndex) =
parent(r)[row(r), parentcols(index(r), idx)]
# Return a `DataFrameRow` over the subset of columns of `r` selected by `idxs`.
# The result refers to the same parent row as `r` and keeps `rownumber(r)`.
Base.@propagate_inbounds function Base.getindex(r::DataFrameRow, idxs::MultiColumnIndex)
    # A temporary DataFrameRow is created only to compute the SubIndex in
    # parent(r); its rownumber is not the right one for the result, so the
    # final object is rebuilt with the rownumber copied from r.
    # The Julia compiler should be able to optimize out this indirection, and
    # in this way the code that computes the correct SubIndex is not duplicated.
    dfr_tmp = DataFrameRow(parent(r), row(r), parentcols(index(r), idxs))
    return DataFrameRow(parent(dfr_tmp), index(dfr_tmp), row(r), rownumber(r))
end

# Selecting all columns with `:` returns `r` itself.
Base.@propagate_inbounds function Base.getindex(r::DataFrameRow, ::Colon)
    return r
end

for T in (:AbstractVector, :Regex, :Not, :Between, :All, :Colon)
Expand Down Expand Up @@ -213,8 +304,17 @@ Base.propertynames(r::DataFrameRow, private::Bool=false) = copy(_names(r))

# View of the single cell of `r` in column `col`, taken on the column obtained
# from the parent data frame with `!` indexing.
function Base.view(r::DataFrameRow, col::ColumnIndex)
    parentcol = parentcols(index(r), col)
    column = parent(r)[!, parentcol]
    return view(column, row(r))
end
# Return a `DataFrameRow` over the subset of columns of `r` selected by `cols`.
# The result refers to the same parent row as `r` and keeps `rownumber(r)`.
function Base.view(r::DataFrameRow, cols::MultiColumnIndex)
    # A temporary DataFrameRow is created only to compute the SubIndex in
    # parent(r); its rownumber is not the right one for the result, so the
    # final object is rebuilt with the rownumber copied from r.
    # The Julia compiler should be able to optimize out this indirection, and
    # in this way the code that computes the correct SubIndex is not duplicated.
    dfr_tmp = DataFrameRow(parent(r), row(r), parentcols(index(r), cols))
    return DataFrameRow(parent(dfr_tmp), index(dfr_tmp), row(r), rownumber(r))
end

# A view of all columns of `r` is `r` itself.
function Base.view(r::DataFrameRow, ::Colon)
    return r
end

"""
Expand Down
56 changes: 56 additions & 0 deletions test/dataframerow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -521,4 +521,60 @@ end
@test df[1, :A] == 101
end

@testset "rownumber" begin
    # rows taken directly from a DataFrame: rownumber equals the parent row
    df = DataFrame(reshape(1:12, 3, 4))
    dfr = df[2, :]
    @test rownumber(dfr) == 2
    @test parentindices(dfr) == (2, 1:4)
    @test parent(dfr) === df

    dfr = @view df[2, :]
    @test rownumber(dfr) == 2
    @test parentindices(dfr) == (2, 1:4)
    @test parent(dfr) === df

    # column selection on a DataFrameRow must keep rownumber
    dfr = dfr[1:2]
    @test rownumber(dfr) == 2
    @test parentindices(dfr) == (2, 1:2)
    @test parent(dfr) === df

    dfr = @view dfr[1:2]
    @test rownumber(dfr) == 2
    @test parentindices(dfr) == (2, 1:2)
    @test parent(dfr) === df

    for (i, r) in enumerate(eachrow(df))
        @test rownumber(r) == i
        @test parentindices(r) == (i, 1:4)
        @test parent(r) === df
    end

    # rows taken from a SubDataFrame: rownumber is the position in the view,
    # while parentindices refers to the underlying DataFrame
    dfv = @view df[2:3, 1:3]
    dfrv = dfv[2, :]
    @test rownumber(dfrv) == 2
    @test parentindices(dfrv) == (3, 1:3)
    # identity, not just equal contents: the parent must be `df` itself
    @test parent(dfrv) === df

    dfrv = @view dfv[2, :]
    @test rownumber(dfrv) == 2
    @test parentindices(dfrv) == (3, 1:3)
    @test parent(dfrv) === df

    dfrv = dfrv[1:2]
    @test rownumber(dfrv) == 2
    @test parentindices(dfrv) == (3, 1:2)
    @test parent(dfrv) === df

    dfrv = @view dfrv[1:2]
    @test rownumber(dfrv) == 2
    @test parentindices(dfrv) == (3, 1:2)
    @test parent(dfrv) === df

    for (i, r) in enumerate(eachrow(dfv))
        @test rownumber(r) == i
        @test parentindices(r) == (i + 1, 1:3)
        @test parent(r) === df
    end
end

end # module