forked from JuliaData/DataFrames.jl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsubdataframe.jl
151 lines (117 loc) · 4.58 KB
/
subdataframe.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
##############################################################################
##
## We use SubDataFrame's to maintain a reference to a subset of a DataFrame
## without making copies.
##
##############################################################################
struct SubDataFrame{T <: AbstractVector{Int}} <: AbstractDataFrame
    parent::DataFrame  # the data frame whose rows are referenced (not copied)
    rows::T            # maps from subdf row indexes to parent row indexes

    # Inner constructor: checks that every entry of `rows` lies within
    # `1:size(parent, 1)` before storing the two fields as given.
    # Throws a `BoundsError` naming `parent` and the offending extreme index
    # when `rows` reaches outside that range. An empty `rows` is always valid.
    function SubDataFrame{T}(parent::DataFrame, rows::T) where {T <: AbstractVector{Int}}
        if !isempty(rows)
            # Only the smallest and largest entries need to be inspected.
            lo, hi = extrema(rows)
            if lo < 1
                throw(BoundsError(parent, lo))
            elseif hi > size(parent, 1)
                throw(BoundsError(parent, hi))
            end
        end
        return new(parent, rows)
    end
end
"""
A view of row subsets of an AbstractDataFrame
A `SubDataFrame` is meant to be constructed with `view`. A
SubDataFrame is used frequently in split/apply sorts of operations.
```julia
view(d::AbstractDataFrame, rows)
```
### Arguments
* `d` : an AbstractDataFrame
* `rows` : any indexing type for rows, typically an Int,
AbstractVector{Int}, AbstractVector{Bool}, or a Range
### Notes
A `SubDataFrame` is an AbstractDataFrame, so most DataFrame functions
should work on it. Such methods include `describe`, `dump`, `nrow`,
`size`, `by`, `stack`, and `join`. Indexing is just like a DataFrame;
copies are returned.
To subset along columns, use standard column indexing as that creates
a view to the columns by default. To subset along rows and columns,
use column-based indexing with `view`.
### Examples
```julia
df = DataFrame(a = repeat([1, 2, 3, 4], outer=[2]),
b = repeat([2, 1], outer=[4]),
c = randn(8))
sdf1 = view(df, 1:6)
sdf2 = view(df, df[:a] .> 1)
sdf3 = view(df[[1,3]], df[:a] .> 1) # row and column subsetting
sdf4 = groupby(df, :a)[1] # indexing a GroupedDataFrame returns a SubDataFrame
sdf5 = view(sdf1, 1:3)
sdf1[:,[:a,:b]]
```
"""
SubDataFrame
# Outer constructor for row selectors that are already an `AbstractVector{Int}`:
# picks the type parameter from `rows` and forwards to the inner constructor.
SubDataFrame(parent::DataFrame, rows::T) where {T <: AbstractVector{Int}} =
    SubDataFrame{T}(parent, rows)
# `:` as the row selector is expanded to the range `1:nrow(parent)`.
SubDataFrame(parent::DataFrame, rows::Colon) = SubDataFrame(parent, 1:nrow(parent))
# A single row number is converted to `Int` and wrapped in a one-element vector.
function SubDataFrame(parent::DataFrame, row::Integer)
    single = [Int(row)]
    return SubDataFrame(parent, single)
end
# Row selectors with a non-`Int` integer element type are converted to a
# `Vector{Int}` before constructing the view.
function SubDataFrame(parent::DataFrame, rows::AbstractVector{<:Integer})
    introws = convert(Vector{Int}, rows)
    return SubDataFrame(parent, introws)
end
# Boolean mask selector: the rows kept are the positions where `rows` is `true`.
# The mask must have exactly one entry per row of `parent`; a mask of any other
# length throws a `BoundsError` instead of being silently accepted.
function SubDataFrame(parent::DataFrame, rows::AbstractVector{Bool})
    if length(rows) != nrow(parent)
        throw(BoundsError(parent, rows))
    end
    return SubDataFrame(parent, findall(rows))
end
# View of a view: `rowinds` index into `sdf.rows`, and the selected entries are
# used as row numbers of the underlying parent, so the result refers to
# `sdf.parent` directly rather than to `sdf`.
function SubDataFrame(sdf::SubDataFrame, rowinds::Union{T, AbstractVector{T}}) where {T <: Integer}
    parentrows = sdf.rows[rowinds]
    return SubDataFrame(sdf.parent, parentrows)
end
# Selecting all rows of an existing SubDataFrame returns that same object.
SubDataFrame(sdf::SubDataFrame, rowinds::Colon) = sdf
# Row indices whose element type admits `missing`: reject any actual missing
# value, then convert to a vector of the non-missing element type
# (`Missings.T(T)`) and build the SubDataFrame from that.
function Base.view(adf::AbstractDataFrame, rowinds::AbstractVector{T}) where {T >: Missing}
    if any(ismissing, rowinds)
        throw(MissingException("missing values are not allowed in indices"))
    end
    cleaned = convert(Vector{Missings.T(T)}, rowinds)
    return SubDataFrame(adf, cleaned)
end
# Generic row view: hands `rowinds` to whichever `SubDataFrame` method accepts it.
Base.view(adf::AbstractDataFrame, rowinds::Any) = SubDataFrame(adf, rowinds)
# Row-and-column view: `adf` is first indexed by `colinds`, and the
# SubDataFrame over `rowinds` is built on the result of that indexing.
function Base.view(adf::AbstractDataFrame, rowinds::Any, colinds::Union{Colon, AbstractVector})
    selected = adf[colinds]
    return SubDataFrame(selected, rowinds)
end
# Single-column variant: the column index is wrapped in a one-element vector
# before indexing `adf`, then the SubDataFrame over `rowinds` is built on the
# result.
function Base.view(adf::AbstractDataFrame, rowinds::Any, colind::ColumnIndex)
    selected = adf[[colind]]
    return SubDataFrame(selected, rowinds)
end
##############################################################################
##
## AbstractDataFrame interface
##
##############################################################################
# A SubDataFrame has no index of its own; it reports the index of its parent.
function index(sdf::SubDataFrame)
    return index(sdf.parent)
end
# TODO: Remove these
# Number of rows in the view: the length of `sdf.rows`, or 0 when the view has
# no columns.
function nrow(sdf::SubDataFrame)
    ncol(sdf) > 0 || return 0
    return length(sdf.rows)::Int
end
# Number of columns: the length of the view's index.
function ncol(sdf::SubDataFrame)
    return length(index(sdf))
end
# Column indexing: reads `colinds` from the parent, restricted to the rows this
# view refers to.
Base.getindex(sdf::SubDataFrame, colinds::Any) = sdf.parent[sdf.rows, colinds]
# Row-and-column indexing: `rowinds` are positions within the view, so they are
# translated through `sdf.rows` into parent row numbers before indexing the
# parent.
function Base.getindex(sdf::SubDataFrame, rowinds::Any, colinds::Any)
    parentrows = sdf.rows[rowinds]
    return sdf.parent[parentrows, colinds]
end
# Column assignment: writes `val` into the parent at the view's rows and the
# given columns, then returns the view itself.
function Base.setindex!(sdf::SubDataFrame, val::Any, colinds::Any)
    Base.setindex!(sdf.parent, val, sdf.rows, colinds)
    return sdf
end
# Row-and-column assignment: `rowinds` are translated through `sdf.rows` into
# parent row numbers, `val` is written into the parent there, and the view
# itself is returned.
function Base.setindex!(sdf::SubDataFrame, val::Any, rowinds::Any, colinds::Any)
    parentrows = sdf.rows[rowinds]
    Base.setindex!(sdf.parent, val, parentrows, colinds)
    return sdf
end
##############################################################################
##
## Miscellaneous
##
##############################################################################
# `map` over a SubDataFrame applies `f` once, to the whole view.
# TODO: deprecate
function Base.map(f::Function, sdf::SubDataFrame)
    return f(sdf)
end
# Applies `without` to the parent with `c`, then takes a view of the result
# over the same rows as `sdf`.
function without(sdf::SubDataFrame, c)
    reduced = without(sdf.parent, c)
    return view(reduced, sdf.rows)
end