Unify eachcol and columns functions #1590

Merged: 39 commits, merged Nov 14, 2018
Changes shown below are from 18 of the 39 commits.
Commits
4f9c61a  unify eachcol and columns (bkamins, Nov 8, 2018)
65a2d1e  clean up deprecated code (bkamins, Nov 8, 2018)
c9b8e88  add paren (bkamins, Nov 8, 2018)
ebc6835  add rawcolumns and fix DataFrameStream (bkamins, Nov 8, 2018)
97af6cc  make DFColumnIterator and DFRowIterator subtypes of AbstractArray (bkamins, Nov 8, 2018)
a470817  avoid using map! on columns (bkamins, Nov 8, 2018)
17b28a0  change eltypes (bkamins, Nov 9, 2018)
c3663bf  test fixes (bkamins, Nov 9, 2018)
5a2e30c  cleanup accessor methods (bkamins, Nov 9, 2018)
4da925c  fix typo (bkamins, Nov 9, 2018)
1cd3d3c  fix another typo (bkamins, Nov 9, 2018)
0fd91aa  further fix getindex of iterators (bkamins, Nov 9, 2018)
7f10bba  fix test (bkamins, Nov 9, 2018)
717f47b  qualify depwarn (bkamins, Nov 9, 2018)
22f2b90  final fixes (bkamins, Nov 9, 2018)
32f2fb2  change broadcasting to map in tests (bkamins, Nov 10, 2018)
4bd4f3c  further Julia 0.7 fixes (bkamins, Nov 10, 2018)
f8a4dab  further Julia 0.7 fixes (bkamins, Nov 10, 2018)
f584ffb  Wording (nalimilan, Nov 10, 2018)
9bda15d  Update src/abstractdataframe/iteration.jl (nalimilan, Nov 11, 2018)
a317e77  Update src/abstractdataframe/iteration.jl (nalimilan, Nov 11, 2018)
a776b64  go for AbstractVector subtyping (bkamins, Nov 11, 2018)
c882551  fix tests (bkamins, Nov 11, 2018)
4536033  fix subtyping (bkamins, Nov 11, 2018)
91edc19  documentation cleanup (bkamins, Nov 11, 2018)
f57af37  Update docs/src/lib/types.md (nalimilan, Nov 11, 2018)
4566656  apply review comments (bkamins, Nov 11, 2018)
ceb697c  Merge branch 'df_col_iteration' of https://github.com/bkamins/DataFra… (bkamins, Nov 11, 2018)
82aa836  revert test to a more terse form (bkamins, Nov 11, 2018)
52e9010  improve deprecation period (bkamins, Nov 11, 2018)
e42b8d3  fix typos (bkamins, Nov 11, 2018)
b63ccba  Update src/abstractdataframe/iteration.jl (nalimilan, Nov 12, 2018)
4576df6  Update src/abstractdataframe/iteration.jl (nalimilan, Nov 12, 2018)
213106f  fixes after a code review (bkamins, Nov 12, 2018)
d9c8a30  small fixes (bkamins, Nov 12, 2018)
e1cb969  fix collect signature (bkamins, Nov 12, 2018)
912f1b4  Merge branch 'master' into df_col_iteration (bkamins, Nov 13, 2018)
2363ef5  add mapcols tests (bkamins, Nov 13, 2018)
6d1eb90  allow @inbounds and re-enable some tests (bkamins, Nov 13, 2018)
1 change: 1 addition & 0 deletions docs/src/lib/functions.md
@@ -29,6 +29,7 @@ meltdf

```@docs
allowmissing!
columns
completecases
describe
disallowmissing!
16 changes: 13 additions & 3 deletions docs/src/lib/types.md
@@ -33,9 +33,19 @@ and reflects changes done to the parent after the creation of the view.
Typically objects of the `DataFrameRow` type are encountered when returned by the `eachrow` function.
In the future accessing a single row of a data frame via `getindex` or `view` will return a `DataFrameRow`.

Additionally the `eachrow` and `eachcol` functions return values of the `DFRowIterator` and `DFColumnIterator` types respectively.
Those types are not exported and should not be constructed directly.
They respectively serve as iterators over rows and columns of an `AbstractDataFrame`.
Additionally, the `eachrow` function returns a value of the `DFRowIterator` type, which
serves as an iterator over the rows of an `AbstractDataFrame`, returning `DataFrameRow` objects.

Similarly, the `eachcol` and `columns` functions return a value of the `DFColumnIterator` type, which
serves as an iterator over the columns of an `AbstractDataFrame`.
The difference between the return values of `eachcol` and `columns` is the following:

* The `eachcol` function returns a value of the `DFColumnIterator{<:AbstractDataFrame, true}` type, which is an
  iterator returning a tuple containing the column name and the column value.
* The `columns` function returns a value of the `DFColumnIterator{<:AbstractDataFrame, false}` type, which is an
  iterator returning the column value only.

The `DFRowIterator` and `DFColumnIterator` types are not exported and should not be constructed directly.
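
For example, a minimal usage sketch of the difference, assuming the iteration behavior described above:

```julia
using DataFrames

df = DataFrame(a = 1:2, b = 3:4)

# `eachcol` yields (column name, column vector) tuples:
for (name, col) in eachcol(df)
    println(name, " => ", sum(col))
end

# `columns` yields the column vectors only:
for col in columns(df)
    println(sum(col))
end
```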

## Types specification

1 change: 1 addition & 0 deletions src/DataFrames.jl
@@ -27,6 +27,7 @@ export AbstractDataFrame,
aggregate,
by,
categorical!,
columns,
colwise,
combine,
completecases,
23 changes: 4 additions & 19 deletions src/abstractdataframe/abstractdataframe.jl
@@ -81,22 +81,6 @@ abstract type AbstractDataFrame end
##
##############################################################################

struct Cols{T <: AbstractDataFrame} <: AbstractVector{AbstractVector}
df::T
end
function Base.iterate(itr::Cols, st=1)
st > length(itr.df) && return nothing
return (itr.df[st], st + 1)
end
Base.length(itr::Cols) = length(itr.df)
Base.size(itr::Cols, ix) = ix==1 ? length(itr) : throw(ArgumentError("Incorrect dimension"))
Base.size(itr::Cols) = (length(itr.df),)
Base.IndexStyle(::Type{<:Cols}) = IndexLinear()
Base.getindex(itr::Cols, inds...) = getindex(itr.df, inds...)

# N.B. where stored as a vector, 'columns(x) = x.vector' is a bit cheaper
columns(df::T) where {T <: AbstractDataFrame} = Cols{T}(df)

Base.names(df::AbstractDataFrame) = names(index(df))
_names(df::AbstractDataFrame) = _names(index(df))

@@ -218,7 +202,7 @@ eltypes(df)
```

"""
eltypes(df::AbstractDataFrame) = map!(eltype, Vector{Type}(undef, size(df,2)), columns(df))
eltypes(df::AbstractDataFrame) = [eltype(col) for col in columns(df)]

Base.size(df::AbstractDataFrame) = (nrow(df), ncol(df))
function Base.size(df::AbstractDataFrame, i::Integer)
@@ -1097,7 +1081,8 @@ julia> repeat(df, inner = 2, outer = 3)
```
"""
Base.repeat(df::AbstractDataFrame; inner::Integer = 1, outer::Integer = 1) =
map(x -> repeat(x, inner = inner, outer = outer), eachcol(df))
DataFrame(map(x -> repeat(x, inner = inner, outer = outer), columns(df)),
names(df))

"""
repeat(df::AbstractDataFrame, count::Integer)
@@ -1127,7 +1112,7 @@ julia> repeat(df, 2)
```
"""
Base.repeat(df::AbstractDataFrame, count::Integer) =
map(x -> repeat(x, count), eachcol(df))
DataFrame(map(x -> repeat(x, count), columns(df)), names(df))

##############################################################################
##
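As an aside, a minimal sketch of the pattern the updated `repeat` methods above rely on, assuming `columns(df)` iterates plain column vectors as introduced in this PR:

```julia
using DataFrames

df = DataFrame(a = 1:2, b = ["x", "y"])

# Map over the raw column vectors, then rebuild a DataFrame with the original names.
rep_cols = map(col -> repeat(col, inner = 2, outer = 3), columns(df))
df_rep = DataFrame(rep_cols, names(df))
```

The explicit `DataFrame(..., names(df))` wrapping is needed because, unlike the old `map(f, eachcol(df))` behavior, mapping over `columns(df)` yields an ordinary vector of columns rather than a data frame.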
15 changes: 7 additions & 8 deletions src/abstractdataframe/io.jl
@@ -213,11 +213,11 @@ struct DataFrameStream{T}
columns::T
header::Vector{String}
end
DataFrameStream(df::DataFrame) = DataFrameStream(Tuple(columns(df)), string.(names(df)))
DataFrameStream(df::DataFrame) = DataFrameStream(Tuple(rawcolumns(df)), string.(names(df)))

# DataFrame Data.Source implementation
Data.schema(df::DataFrame) =
Data.Schema(Type[eltype(A) for A in columns(df)], string.(names(df)), size(df, 1))
Data.Schema(Type[eltype(A) for A in rawcolumns(df)], string.(names(df)), size(df, 1))

Data.isdone(source::DataFrame, row, col, rows, cols) = row > rows || col > cols
function Data.isdone(source::DataFrame, row, col)
@@ -276,26 +276,25 @@ function DataFrame(sch::Data.Schema{R}, ::Type{S}=Data.Field,
# to the # of rows in the source
newsize = ifelse(S == Data.Column || !R, 0,
ifelse(append, sinkrows + sch.rows, sch.rows))
foreach(col->resize!(col, newsize), columns(sink))
foreach(col->resize!(col, newsize), rawcolumns(sink))
sch.rows = newsize
end
# take care of a possible reference from source by adding to WeakRefStringArrays
if !isempty(reference)
foreach(col-> col isa WeakRefStringArray && push!(col.data, reference),
sink.columns)
rawcolumns(sink))
end
sink = DataFrameStream(sink)
DataFrameStream(sink)
else
# allocating a fresh DataFrame Sink; append is irrelevant
# for Data.Column or unknown # of rows in Data.Field, we only ever append!,
# so just allocate empty columns
rows = ifelse(S == Data.Column, 0, ifelse(!R, 0, sch.rows))
names = Data.header(sch)
sink = DataFrameStream(
Tuple(allocate(types[i], rows, reference) for i = 1:length(types)), names)
sch.rows = rows
DataFrameStream(Tuple(allocate(types[i], rows, reference)
for i = 1:length(types)), names)
end
return sink
end

DataFrame(sink, sch::Data.Schema, ::Type{S}, append::Bool;
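The io.jl changes above swap `columns(df)` for `rawcolumns(df)` in internal code (commit ebc6835 adds `rawcolumns`). A plausible reading, sketched under the assumption that a `DataFrame` stores its columns in a `columns` field (as the `sink.columns` access above suggests); the helper name and definition below are illustrative, not the PR's actual code:

```julia
using DataFrames

# Assumption: internal code wants the underlying vector of column vectors so it
# can mutate columns directly (e.g. resize!), bypassing the iterator types.
rawcolumns_sketch(df::DataFrame) = getfield(df, :columns)
```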
101 changes: 74 additions & 27 deletions src/abstractdataframe/iteration.jl
@@ -15,7 +15,7 @@ with each row represented as a `DataFrameRow`.

A value of this type is returned by the [`eachrow`](@link) function.
"""
struct DFRowIterator{T <: AbstractDataFrame}
struct DFRowIterator{T<:AbstractDataFrame}
df::T
end

@@ -27,25 +27,32 @@ with each row represented as a `DataFrameRow`.
"""
eachrow(df::AbstractDataFrame) = DFRowIterator(df)

Base.size(itr::DFRowIterator) = (size(itr.df, 1), )
Base.size(itr::DFRowIterator, ix) =
ix == 1 ? length(itr) : throw(ArgumentError("Incorrect dimension"))
Base.length(itr::DFRowIterator) = size(itr.df, 1)
Base.firstindex(itr::DFRowIterator) = 1
Base.lastindex(itr::DFRowIterator) = length(itr)

function Base.iterate(itr::DFRowIterator, i=1)
i > size(itr.df, 1) && return nothing
return (DataFrameRow(itr.df, i), i + 1)
end

Base.eltype(::DFRowIterator{T}) where {T} = DataFrameRow{T}
Base.size(itr::DFRowIterator) = (size(itr.df, 1), )
Base.length(itr::DFRowIterator) = size(itr.df, 1)
Base.getindex(itr::DFRowIterator, i) = DataFrameRow(itr.df, i)

# Iteration by columns
"""
DFColumnIterator{<:AbstractDataFrame}
DFColumnIterator{<:AbstractDataFrame, C}

Iterator over columns of an `AbstractDataFrame`.
Each returned value is a tuple consisting of column name and column vector.

A value of this type is returned by the [`eachcol`](@link) function.
If `C` is `true` (a value returned by the [`eachcol`](@link) function)
then each returned value is a tuple consisting of column name and column vector.
If `C` is `false` (a value returned by the [`columns`](@link) function)
then each returned value is a column vector.
"""
struct DFColumnIterator{T <: AbstractDataFrame}
struct DFColumnIterator{T<:AbstractDataFrame, C}
Review discussion on this struct definition:

nalimilan (reviewer):

Instead of C being a Boolean, it could be either AbstractVector or Tuple{Symbol,AbstractVector}. That way you could do <: AbstractArray{C} here, and you wouldn't need to define eltype and such below.

BTW, it's a good occasion to move from a tuple to a pair. That shouldn't affect most uses but it's more natural here.

nalimilan:

That said, if we keep the special behavior of map which returns a DataFrame, maybe we'd better not make this iterator an AbstractVector. Same for the row iterator if we want to do something similar in the future.

bkamins (author):

@nalimilan Thank you for the comments. They touch on things I was not sure about myself.

Actually the crucial design issue is whether to make them AbstractVector subtypes or not. This is tempting, and then they should carry the eltype in the signature (actually, interestingly, this is the only way it is possible to implement it AFAIK) and the design would be simplified. If we went for this I was thinking about renaming the types to DFRowVector and DFColumnVector to better convey what we produce.

But we have this special map behavior that produces a DataFrame from eachcol. If we went for AbstractVector subtyping I would remove this functionality. If we feel that we like its functionality I would prefer to create a new specialized method for it, e.g. one with the signature colmap(::Union{Type, Function}, ::AbstractDataFrame), rather than overload map with a method that breaks its intuitive contract.

Also, I do not see much sense in doing the same for eachrow (we would not know how to transform the result into a DataFrame later).

In summary, having thought about it, I would:

* make both types AbstractVector subtypes
* deprecate map for eachcol
* add a colmap function that does what map for eachcol now does

When we agree whether this is the way to go (or decide something else) I will fix this PR.

nalimilan (Nov 11, 2018):

OK, so we need to choose between these two solutions:

1. Define map(f, columns(df)) to return a data frame. We could add map(f, rows(df)) later if we want and that would be consistent. People have to use comprehensions to get a vector instead.
2. Keep the standard behavior of map(f, columns(df)), i.e. have it return a vector. This is basically colwise, but with a more standard name (and we could deprecate it; the broadcasting form f.(columns(df)) is even more compact). We need another way to apply a function to each column of a data frame and get a data frame back: maybe mapcols(f, df), and mapcols!(f, df) for the in-place variant (the naming would be consistent with other column-wise functions, cf. "Review row vs. column orientation of API" #1514). Then map(f, df) would apply a function to each row (this is what JuliaDB supports).

> Also, I do not see much sense in doing the same for eachrow (we would not know how to transform the result into a DataFrame later).

That would be quite simple AFAICT: require the user-provided function to return a named tuple, and fill a data frame with that like we now do in map(::Function, ::GroupedDataFrame).

Cc: @piever to discuss the consistency with JuliaDB.

piever:

For me to understand: there are ColumnIterator and RowIterator types, and the decision is whether one should overload map on them to return a DataFrame rather than a vector of columns or a vector of rows?

In JuliaDB map acts by default on the rows and returns a table. We also have a map_rows which basically would correspond to map_rows(f, iter...) = DataFrame(f(i) for i in iter...). It is useful when f returns NamedTuples but iter... are not a DataFrame (they could be vectors or iterators), for example map_rows((x, y) -> (s = x+y, d = x-y), rand(100), rand(100)). Could this be added to DataFrames as well? In which case I think mapcols / map_rows is a strange pair. What could be a better pair of names?

For consistency with JuliaDB, option 2 seems much better (also, to keep map and filter consistent). I thought there was a colwise or similar in JuliaDB but apparently I'm misremembering. It looks like currently one has to do:

julia> t = table((a = rand(100), b = randn(100)));

julia> f(v) = partialsort(v, 1:10)
f (generic function with 1 method)

julia> table(map(f, columns(t)))
Table with 10 rows, 2 columns:
a           b
────────────────────
0.00392291  -2.52979
0.0221383   -2.01489
0.029867    -1.92054
0.0462468   -1.89799
0.0720458   -1.5951
0.0824684   -1.31414
0.105048    -1.25503
0.108803    -1.19262
0.1204      -1.13395
0.136053    -1.13125

This I think is not so bad, but maybe it'd be more awkward in DataFrames because DataFrame(map(f, columns(t))) would lose the column names?

nalimilan:

I don't really see the advantage of providing a map_rows function if DataFrame(f(i) for i in iter...) does the same thing TBH. And using a generator like that sounds more Julian (it's like creating dicts).

> For consistency with JuliaDB, option 2 seems much better (also, to keep map and filter consistent).

OK. That's good since that's the approach @bkamins has implemented. ;-)

But the example you show goes a bit against this statement AFAICT: map(f, columns(t)) returns a NamedTuple in JuliaDB, not a vector (which is why names are lost). That's logical since columns(t) gives a NamedTuple, but that would be different from data frames. That said, it's not a big deal. Maybe JuliaDB could support mapcols just for consistency and convenience?

piever (Nov 12, 2018):

Yep, maybe JuliaDB could deprecate map_rows as "unnecessary". The easiest is probably to add a mapcols(f, t; select = All()) method to JuliaDB. As I mentioned above, I'm afraid this creates a bit of cacophony on the JuliaDB side (mapcols and map_rows have different underscore styles and are not related in the obvious way). The cleanest is probably to deprecate (or at least not export) map_rows and rename setcol, popcol, pushcol... to setcols, popcols, pushcols (as they accept multiple columns at once) and add mapcols to the collections. But this PR shouldn't worry about that, as all the changes needed to make things consistent should come from JuliaDB.

nalimilan:

Makes sense. It's good to anticipate what API we both want to converge to in the long term.
df::T
end

@@ -55,10 +62,6 @@ end
Return a `DFColumnIterator` that iterates an `AbstractDataFrame` column by column.
Iteration returns a tuple consisting of column name and column vector.

`DFColumnIterator` has a custom implementation of the `map` function which
returns a `DataFrame` and assumes that a function argument passed do
the `map` function accepts takes only a column vector.

**Examples**

```jldoctest
@@ -72,29 +75,73 @@ julia> df = DataFrame(x=1:4, y=11:14)
│ 3 │ 3 │ 13 │
│ 4 │ 4 │ 14 │

julia> map(sum, eachcol(df))
1×2 DataFrame
julia> collect(eachcol(df))
2-element Array{Tuple{Symbol,Any},1}:
(:x, [1, 2, 3, 4])
(:y, [11, 12, 13, 14])
```
"""
eachcol(df::T) where T<: AbstractDataFrame = DFColumnIterator{T, true}(df)

"""
columns(df::AbstractDataFrame)

Return a `DFColumnIterator` that iterates an `AbstractDataFrame` column by column.
Iteration returns a column vector.
bkamins marked this conversation as resolved.
Show resolved Hide resolved

**Examples**

```jldoctest
julia> df = DataFrame(x=1:4, y=11:14)
4×2 DataFrame
│ Row │ x │ y │
│ │ Int64 │ Int64 │
├─────┼───────┼───────┤
│ 1 │ 1 │ 11 │
│ 2 │ 2 │ 12 │
│ 3 │ 3 │ 13 │
│ 4 │ 4 │ 14 │

julia> collect(columns(df))
2-element Array{AbstractArray{T,1} where T,1}:
[1, 2, 3, 4]
[11, 12, 13, 14]
```
"""
eachcol(df::AbstractDataFrame) = DFColumnIterator(df)
columns(df::T) where T<: AbstractDataFrame = DFColumnIterator{T, false}(df)

function Base.iterate(itr::DFColumnIterator, j=1)
Base.length(itr::DFColumnIterator) = size(itr.df, 2)
Base.size(itr::DFColumnIterator) = (size(itr.df, 2),)
Base.size(itr::DFColumnIterator, ix) =
ix == 1 ? length(itr) : throw(ArgumentError("Incorrect dimension"))
Base.firstindex(itr::DFColumnIterator) = 1
Base.lastindex(itr::DFColumnIterator) = length(itr)

function Base.iterate(itr::DFColumnIterator{<:AbstractDataFrame,true}, j=1)
j > size(itr.df, 2) && return nothing
return ((_names(itr.df)[j], itr.df[j]), j + 1)
end
Base.eltype(::DFColumnIterator) = Tuple{Symbol, AbstractVector}
Base.size(itr::DFColumnIterator) = (size(itr.df, 2), )
Base.length(itr::DFColumnIterator) = size(itr.df, 2)
Base.getindex(itr::DFColumnIterator, j) = itr.df[j]
function Base.map(f::Union{Function,Type}, dfci::DFColumnIterator)
# note: `f` must return a consistent length
res = DataFrame()
for (n, v) in eachcol(dfci.df)
res[n] = f(v)

Base.eltype(::DFColumnIterator{<:AbstractDataFrame,true}) =
Tuple{Symbol, AbstractVector}

function Base.getindex(itr::DFColumnIterator{<:AbstractDataFrame,true}, j)
# TODO: change to the way getindex for false is defined below after deprecation
if !(j isa Integer) || j isa Bool
Base.depwarn("calling getindex on DFColumnIterator{<:AbstractDataFrame,true} " *
"object will only accept integer indexing and will return " *
"a tuple of column name and column value in the future.", :getindex)
end
res
itr.df[j]
end

function Base.iterate(itr::DFColumnIterator{<:AbstractDataFrame,false}, j=1)
j > size(itr.df, 2) && return nothing
return (itr.df[j], j + 1)
end

Base.eltype(::DFColumnIterator{<:AbstractDataFrame,false}) = AbstractVector
Base.getindex(itr::DFColumnIterator{<:AbstractDataFrame,false}, j::Integer) =
itr.df[j]
Base.getindex(itr::DFColumnIterator{<:AbstractDataFrame,false}, j::Bool) =
throw(ArgumentError("invalid index $j of type Bool"))
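
Per the commit list above ("add mapcols tests") and the review discussion, this PR moves toward a mapcols function instead of the special map method removed in this file. A minimal sketch of the idea, modeled on that removed map method; the helper name and details are hypothetical and need not match the merged implementation:

```julia
using DataFrames

# Hypothetical sketch: apply `f` to every column and collect the results into a
# new DataFrame, keeping the column names; `f` must return values of a
# consistent length. The actual `mapcols` merged into DataFrames may differ.
function mapcols_sketch(f, df::AbstractDataFrame)
    res = DataFrame()
    for (n, v) in eachcol(df)   # iterate (column name, column vector) tuples
        res[n] = f(v)
    end
    return res
end

# Example: mapcols_sketch(sum, df) returns a one-row data frame of column sums.
```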