Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "CSV"
uuid = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
authors = ["Jacob Quinn <quinn.jacobd@gmail.com>"]
version = "0.6.2"
version = "0.7.0"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Expand Down
5 changes: 5 additions & 0 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ CSV.jl is built to be a fast and flexible pure-Julia library for handling delimi
Depth = 3
```

## Getting Started

CSV.jl provides a number of utilities for working with delimited files. `CSV.File` provides a way to read files into columns of data, detecting column types.
`CSV.Rows` provides a row iterator for looping over rows in a file. Inputs to either should be filenames as `String`s, or byte vectors (`AbstractVector{UInt8}`). To read other `IO` inputs, just call `read(io)` and pass the bytes directly to `CSV.File` or `CSV.Rows`.

## Key Functions
```@docs
CSV.File
Expand Down
4 changes: 2 additions & 2 deletions src/CSV.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ using Parsers
using Tables
# PooledArrays.jl is used for materializing pooled columns
using PooledArrays
# WeakRefStrings allows for more efficient materializing of string columns via StringVector
using WeakRefStrings
# SentinelArrays.jl allows efficient conversion from Vector{Union{T, Missing}} to Vector{T}
# it also provides the MissingVector and ChainedVector array types
using SentinelArrays

using CategoricalArrays, DataFrames
Expand Down
16 changes: 10 additions & 6 deletions src/detection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -322,13 +322,17 @@ function findrowstarts!(buf, len, options::Parsers.Options{ignorerepeated}, rang
break
end
end
# now we read the next row and see if we get the right # of columns
for _ = 1:ncols
_, code, _, _, tlen = Parsers.xparse(String, buf, pos, len, options)
pos += tlen
pos > len && break
# now we read the next 5 rows and see if we get the right # of columns
correct = true
for j = 1:5
for _ = 1:ncols
_, code, _, _, tlen = Parsers.xparse(String, buf, pos, len, options)
pos += tlen
pos > len && break
end
correct &= Parsers.newline(code)
end
if Parsers.newline(code)
if correct
# boom, we read a whole row and got correct # of columns
break
end
Expand Down
448 changes: 223 additions & 225 deletions src/file.jl

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion src/header.jl
Original file line number Diff line number Diff line change
Expand Up @@ -228,8 +228,9 @@ getdf(x::AbstractDict{Int}, nm, i) = haskey(x, i) ? x[i] : nothing
end
end
end
# generate a customtypes Tuple{...} we'll need to generate code for during parsing
customtypes = Tuple{(nonstandardtype(T) for T in types if nonstandardtype(T) !== Union{})...}
# set any unselected columns to typecode USER | MISSING
# figure out if we'll drop any columns while parsing
todrop = Int[]
if select !== nothing && drop !== nothing
error("`select` and `drop` keywords were both provided; only one or the other is allowed")
Expand Down
66 changes: 33 additions & 33 deletions src/rows.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ struct Rows{transpose, O, O2, IO, T, V}
customtypes::T
positions::Vector{Int64}
reusebuffer::Bool
tapes::Vector{AbstractVector} # for parsing, allocated once and used for each iteration
columns::Vector{AbstractVector} # for parsing, allocated once and used for each iteration
values::Vector{V} # once values are parsed, put in values; allocated on each iteration if reusebuffer=false
lookup::Dict{Symbol, Int}
end
Expand Down Expand Up @@ -140,7 +140,7 @@ function Rows(source;
kw...)

h = Header(source, header, normalizenames, datarow, skipto, footerskip, limit, transpose, comment, use_mmap, ignoreemptylines, false, select, drop, missingstrings, missingstring, delim, ignorerepeated, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, type, types, typemap, categorical, pool, lazystrings, strict, silencewarnings, debug, parsingdebug, true)
tapes = allocate(1, h.cols, h.types, h.flags)
columns = allocate(1, h.cols, h.types, h.flags)
values = all(x->x == Union{String, Missing}, h.types) && lazystrings ? Vector{PosLen}(undef, h.cols) : Vector{Any}(undef, h.cols)
finaltypes = copy(h.types)
columnmap = [i for i = 1:h.cols]
Expand All @@ -167,7 +167,7 @@ function Rows(source;
h.customtypes,
h.positions,
reusebuffer,
tapes,
columns,
values,
lookup,
)
Expand All @@ -182,66 +182,66 @@ Base.IteratorSize(::Type{<:Rows}) = Base.SizeUnknown()
const EMPTY_TYPEMAP = Dict{Type, Type}()
const EMPTY_REFS = RefPool[]

@inline function setcustom!(::Type{T}, values, tapes, i) where {T}
@inline function setcustom!(::Type{T}, values, columns, i) where {T}
if @generated
block = Expr(:block)
push!(block.args, quote
error("CSV.jl code-generation error, unexpected column type: $(typeof(tape))")
error("CSV.jl code-generation error, unexpected column type: $(typeof(column))")
end)
for i = 1:fieldcount(T)
vec = fieldtype(T, i)
pushfirst!(block.args, quote
if tape isa $(fieldtype(vec, 1))
@inbounds values[i] = tape[1]
if column isa $(fieldtype(vec, 1))
@inbounds values[i] = column[1]
return
end
end)
end
pushfirst!(block.args, quote
@inbounds tape = tapes[col]
@inbounds column = columns[col]
end)
pushfirst!(block.args, Expr(:meta, :inline))
# @show block
return block
else
# println("generated function failed")
@inbounds tape = tapes[i]
@inbounds values[i] = tape[1]
@inbounds column = columns[i]
@inbounds values[i] = column[1]
return
end
end

@inline function Base.iterate(r::Rows{transpose, O, O2, IO, T, V}, (pos, len, row)=(r.datapos, r.len, 1)) where {transpose, O, O2, IO, T, V}
(pos > len || row > r.limit) && return nothing
pos > len && return nothing
pos = parserow(1, Val(transpose), r.cols, EMPTY_TYPEMAP, r.tapes, r.datapos, r.buf, pos, len, r.positions, 0.0, EMPTY_REFS, 1, r.datarow + row - 2, r.types, r.flags, false, r.options, r.coloptions, T)
pos = parserow(1, Val(transpose), r.cols, EMPTY_TYPEMAP, r.columns, r.datapos, r.buf, pos, len, r.positions, 0.0, EMPTY_REFS, 1, r.datarow + row - 2, r.types, r.flags, false, r.options, r.coloptions, T)
cols = r.cols
values = r.reusebuffer ? r.values : Vector{V}(undef, cols)
tapes = r.tapes
columns = r.columns
for i = 1:cols
@inbounds tape = tapes[i]
if tape isa Vector{PosLen}
@inbounds values[i] = tape[1]
elseif tape isa SVec{Int64}
@inbounds values[i] = tape[1]
elseif tape isa SVec{Float64}
@inbounds values[i] = tape[1]
elseif tape isa SVec2{String}
@inbounds values[i] = tape[1]
elseif tape isa SVec{Date}
@inbounds values[i] = tape[1]
elseif tape isa SVec{DateTime}
@inbounds values[i] = tape[1]
elseif tape isa SVec{Time}
@inbounds values[i] = tape[1]
elseif tape isa Vector{Union{Missing, Bool}}
@inbounds values[i] = tape[1]
elseif tape isa Vector{UInt32}
@inbounds values[i] = tape[1]
@inbounds column = columns[i]
if column isa Vector{PosLen}
@inbounds values[i] = column[1]
elseif column isa SVec{Int64}
@inbounds values[i] = column[1]
elseif column isa SVec{Float64}
@inbounds values[i] = column[1]
elseif column isa SVec2{String}
@inbounds values[i] = column[1]
elseif column isa SVec{Date}
@inbounds values[i] = column[1]
elseif column isa SVec{DateTime}
@inbounds values[i] = column[1]
elseif column isa SVec{Time}
@inbounds values[i] = column[1]
elseif column isa Vector{Union{Missing, Bool}}
@inbounds values[i] = column[1]
elseif column isa Vector{UInt32}
@inbounds values[i] = column[1]
elseif T !== Tuple{}
setcustom!(T, values, tapes, i)
setcustom!(T, values, columns, i)
else
error("bad array type: $(typeof(tape))")
error("bad array type: $(typeof(column))")
end
end
return Row2{O, O2, V}(r.names, r.finaltypes, r.columnmap, r.types, r.lookup, values, r.buf, r.e, r.options, r.coloptions), (pos, len, row + 1)
Expand Down
46 changes: 33 additions & 13 deletions src/utils.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
export PooledString

# NOTE(review): PooledString subtypes AbstractString but defines no string-interface
# methods in this block — presumably used only as a type-level signal for the `pool`
# machinery and never instantiated as actual string data; confirm elsewhere in the file.
"""
    PooledString

A singleton type that can be used for signaling that a column of a csv file should be pooled,
with the output array type being a `PooledArray`.
"""
struct PooledString <: AbstractString end

# PointerString is an internal-only type for efficiently tracking string data + length
Expand Down Expand Up @@ -32,7 +38,7 @@ end

Base.String(x::PointerString) = _unsafe_string(x.ptr, x.len)

# column bit flags
# column bit flags; useful so we don't have to pass a bunch of arguments/state around manually

# whether the user provided the type or not
const USER = 0b00000001
Expand All @@ -46,15 +52,18 @@ anymissing(flag) = flag & ANYMISSING > 0
# flag bit 3: set once a column's type has been detected during parsing
const TYPEDETECTED = 0b00000100

"Return `true` when the TYPEDETECTED bit is set in `flag`."
typedetected(flag) = (flag & TYPEDETECTED) != 0x00

# flag bit 4: marks a column the user asked to exclude via the `select`/`drop` keyword arguments
const WILLDROP = 0b00001000

"Return `true` when `flag` has the WILLDROP bit set."
willdrop(flag) = (flag & WILLDROP) != 0x00

# flag bit 5: whether string columns should be materialized lazily (LazyStringVectors);
# this is not a per-column setting, but it is carried on the column bit flags for convenience
const LAZYSTRINGS = 0b00010000

"Return `true` when the LAZYSTRINGS bit is set in `flag`."
lazystrings(flag) = (flag & LAZYSTRINGS) != 0x00

# Build the initial flag byte for a column whose user-provided type is `T`.
# `T === Union{}` means the user gave no type: only the LAZYSTRINGS bit may be set.
# Otherwise the column is USER-provided and TYPEDETECTED, plus ANYMISSING when `T`
# already includes `Missing`.
function flag(T, lazystrings)
    bits = T === Union{} ? 0x00 : ((USER | TYPEDETECTED) | (hasmissingtype(T) ? ANYMISSING : 0x00))
    return lazystrings ? (bits | LAZYSTRINGS) : bits
end

# Our own custom bit on a Parsers.ReturnCode (an Int16) signaling that a column
# must be promoted to String during parsing.
const PROMOTE_TO_STRING = 0b0100000000000000 % Int16

"Return `true` when the PROMOTE_TO_STRING bit is set in the parser return `code`."
promote_to_string(code) = (code & PROMOTE_TO_STRING) != Int16(0)

Expand All @@ -78,7 +87,8 @@ hasmissingtype(T) = T === Missing || T !== Core.Compiler.typesubtract(T, Missing
end
end

# bit patterns for missing value, int value, escaped string, position and len in tape parsing
## lazy strings
# bit patterns for missing value, int value, escaped string, position and len in lazy string parsing
const PosLen = UInt64

# primitive type PosLen 64 end
Expand Down Expand Up @@ -147,13 +157,17 @@ end
return s, st[2]
end

# column array allocating
## column array allocating
# we don't want to use SentinelVector for small integer types due to the higher risk of
# sentinel value collision, so we just use Vector{Union{T, Missing}} and convert to Vector{T} if no missings were found
const SmallIntegers = Union{Int8, UInt8, Int16, UInt16, Int32, UInt32}

# Allocate one column vector per column for a whole-file parse.
# A column typed String (or Union{String, Missing}) whose LAZYSTRINGS flag is set is
# allocated as PosLen position/length codes instead of eagerly-materialized strings.
function allocate(rowsguess, ncols, types, flags)
    columns = Vector{AbstractVector}(undef, ncols)
    for i = 1:ncols
        T = types[i]
        stringlike = T === String || T === Union{String, Missing}
        alloctype = (lazystrings(flags[i]) && stringlike) ? PosLen : T
        columns[i] = allocate(alloctype, rowsguess)
    end
    return columns
end

# Columns with no known type (Union{}) or an all-missing type are backed by MissingVector,
# an efficient all-missing array representation from the SentinelArrays.jl package.
allocate(::Type{Union{}}, len) = MissingVector(len)
allocate(::Type{Missing}, len) = MissingVector(len)
function allocate(::Type{PosLen}, len)
Expand All @@ -174,6 +188,7 @@ allocate(::Type{Union{Missing, T}}, len) where {T <: SmallIntegers} = Vector{Uni
allocate(T, len) = SentinelVector{nonmissingtype(T)}(undef, len)

# Resize column `A` to hold `len` elements; the generic fallback simply delegates to
# `Base.resize!` and returns the (mutated) array.
function reallocate!(A, len)
    return resize!(A, len)
end
# when reallocating, we just need to make sure the missing bit is set for lazy string PosLen
function reallocate!(A::Vector{PosLen}, len)
oldlen = length(A)
resize!(A, len)
Expand All @@ -185,6 +200,7 @@ const SVec{T} = SentinelVector{T, T, Missing, Vector{T}}
const SVec2{T} = SentinelVector{T, typeof(undef), Missing, Vector{T}}

# Compute the type remaining after subtracting `S` from the (union) type `T`.
# NOTE(review): relies on the internal Core.Compiler.typesubtract API, which carries no
# stability guarantee across Julia versions — confirm against the supported Julia range.
ts(T, S) = Core.Compiler.typesubtract(T, S)
# when users pass non-standard types, we need to keep track of them in a Tuple{...} to generate efficient custom parsing kernel codes
function nonstandardtype(T)
S = ts(ts(ts(ts(ts(ts(ts(ts(ts(T, Int64), Float64), String), PooledString), Bool), Date), DateTime), Time), Missing)
if S === Union{}
Expand All @@ -201,6 +217,7 @@ end
# Skip a UTF-8 byte-order mark (0xef 0xbb 0xbf) if one starts at position `pos` in `buf`.
#
# Returns the position of the first byte after the BOM, or `pos` unchanged when no BOM is
# present. The bounds guard is relative to `pos` (not just the total buffer length) so a
# `pos` within three bytes of the end of the buffer cannot trigger a BoundsError.
# (one-liner originally suggested from ScottPJones)
consumeBOM(buf, pos) = (pos + 2 <= length(buf) && buf[pos] == 0xef && buf[pos + 1] == 0xbb && buf[pos + 2] == 0xbf) ? pos + 3 : pos

# whatever input is given, turn it into an AbstractVector{UInt8} we can parse with
function getsource(x)
if x isa AbstractVector{UInt8}
return x, 1, length(x)
Expand All @@ -211,8 +228,14 @@ function getsource(x)
buf = Base.read(x)
return buf, 1, length(buf)
else
buf = Mmap.mmap(string(x))
return buf, 1, length(buf)
try
buf = Mmap.mmap(string(x))
return buf, 1, length(buf)
catch e
# if we can't mmap, try just `read`ing the whole thing into a byte vector
buf = read(x)
return buf, 1, length(buf)
end
end
end

Expand Down Expand Up @@ -358,6 +381,9 @@ function detect(buf, pos, len, options)
return nothing
end

# A ReversedBuf wraps a byte vector and indexes it backwards (index 1 is the last byte);
# used for the `footerskip` keyword argument, which starts at the bottom of the file
# and skips lines backwards.
struct ReversedBuf <: AbstractVector{UInt8}
    buf::Vector{UInt8}
end
Expand All @@ -366,15 +392,9 @@ Base.size(a::ReversedBuf) = size(a.buf)
Base.IndexStyle(::Type{ReversedBuf}) = Base.IndexLinear()
# reversed indexing: element `i` of the ReversedBuf is byte `end + 1 - i` of the wrapped buffer
Base.getindex(a::ReversedBuf, i::Int) = a.buf[end + 1 - i]

# Reset element `i` of `A` back to #undef via the runtime's jl_arrayunset (0-based index).
# `row` and `x` are only referenced by the commented-out debug print below.
# NOTE(review): jl_arrayunset is a Julia-internal runtime function with no public API
# guarantee — confirm it exists on all supported Julia versions.
function unset!(A::Vector, i::Int, row, x)
    ccall(:jl_arrayunset, Cvoid, (Array, Csize_t), A, i - 1)
    # println("deleting col = $i on thread = $(Threads.threadid()), row = $row, id = $x")
    return
end

# Thin wrappers over C memcpy/memset; `doff`/`soff` are 1-based offsets added to the raw
# destination/source pointers. Callers are responsible for pointer validity and GC rooting.
memcpy!(d, doff, s, soff, n) = ccall(:memcpy, Cvoid, (Ptr{UInt8}, Ptr{UInt8}, Int), d + doff - 1, s + soff - 1, n)
memset!(ptr, value, num) = ccall(:memset, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), ptr, value, num)

# a RefPool holds our refs as a Dict, along with a lastref field which is incremented when a new ref is found while parsing pooled columns
mutable struct RefPool
refs::Dict{Union{String, Missing}, UInt32}
lastref::UInt32
Expand Down
40 changes: 40 additions & 0 deletions test/basics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -433,4 +433,44 @@ f = CSV.File(GzipDecompressorStream(open(joinpath(dir, "randoms.csv.gz"))); type
@test f.first isa AbstractVector{CSVString}
@test f.wage isa AbstractVector{Union{Missing, Dec64}}

f = CSV.File(joinpath(dir, "promotions.csv"); lazystrings=true)
@test eltype.(f.columns) == [Float64, Union{Missing, Int64}, Union{Missing, Float64}, String, Union{Missing, String}, String, String, Union{Missing, Int64}]
@test f.int_string isa CSV.LazyStringVector

f = CSV.File(joinpath(dir, "promotions.csv"); limit=7500, threaded=true)
@test length(f) == 7500

f = CSV.File(IOBuffer("1,2\r\n3,4\r\n\r\n5,6\r\n"); header=["col1", "col2"], ignoreemptylines=true)
@test f.col1 == [1, 3, 5]

f = CSV.File(joinpath(dir, "escape_row_starts.csv"); tasks=2)
@test length(f) == 10000
@test eltype(f.col1) == String
@test eltype(f.col2) == Int64

f = CSV.File(IOBuffer("col1\nhey\nthere\nsailor"); lazystrings=true)
@test f.col1 isa CSV.LazyStringVector
@test Tables.columnnames(f) == [:col1]
@test propertynames(f) == [:col1]
@test CSV.getname(f) == "<Base.GenericIOBuffer{Array{UInt8,1}}>"
@test CSV.getcols(f) == 1
@test Base.IndexStyle(f) == Base.IndexLinear()
@test f.col1 === Tables.getcolumn(f, 1)
@test columntable(f) == columntable(collect(f))
show(f)

f = CSV.File(joinpath(dir, "big_types.csv"); lazystrings=true, pool=false)
@test eltype(f.time) == Dates.Time
@test eltype(f.bool) == Bool
@test f.lazy isa CSV.LazyStringVector
@test eltype(f.lazy) == String
@test eltype(f.lazy_missing) == Union{String, Missing}

r = CSV.Rows(joinpath(dir, "big_types.csv"); lazystrings=false, types=[Dates.Time, Bool, String, Union{String, Missing}])
row = first(r)
@test row.time == Dates.Time(12)
@test row.bool
@test row.lazy == "hey"
@test row.lazy_missing === missing

end
Loading