Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "CSV"
uuid = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
authors = ["Jacob Quinn <quinn.jacobd@gmail.com>"]
version = "0.6.2"
version = "0.7.0"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Expand Down
5 changes: 5 additions & 0 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ CSV.jl is built to be a fast and flexible pure-Julia library for handling delimi
Depth = 3
```

## Getting Started

CSV.jl provides a number of utilities for working with delimited files. `CSV.File` provides a way to read files into columns of data, detecting column types.
`CSV.Rows` provides a row iterator for looping over rows in a file. Inputs to either should be filenames as `String`s, or byte vectors (`AbstractVector{UInt8}`). To read other `IO` inputs, just call `read(io)` and pass the bytes directly to `CSV.File` or `CSV.Rows`.

## Key Functions
```@docs
CSV.File
Expand Down
4 changes: 2 additions & 2 deletions src/CSV.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ using Parsers
using Tables
# PooledArrays.jl is used for materializing pooled columns
using PooledArrays
# WeakRefStrings allows for more efficient materializing of string columns via StringVector
using WeakRefStrings
# SentinelArrays.jl allows efficient conversion from Vector{Union{T, Missing}} to Vector{T}
# it also provides the MissingVector and ChainedVector array types
using SentinelArrays

using CategoricalArrays, DataFrames
Expand Down
16 changes: 10 additions & 6 deletions src/detection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -322,13 +322,17 @@ function findrowstarts!(buf, len, options::Parsers.Options{ignorerepeated}, rang
break
end
end
# now we read the next row and see if we get the right # of columns
for _ = 1:ncols
_, code, _, _, tlen = Parsers.xparse(String, buf, pos, len, options)
pos += tlen
pos > len && break
# now we read the next 5 rows and see if we get the right # of columns
correct = true
for j = 1:5
for _ = 1:ncols
_, code, _, _, tlen = Parsers.xparse(String, buf, pos, len, options)
pos += tlen
pos > len && break
end
correct &= Parsers.newline(code)
end
if Parsers.newline(code)
if correct
# boom, we read a whole row and got correct # of columns
break
end
Expand Down
448 changes: 223 additions & 225 deletions src/file.jl

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion src/header.jl
Original file line number Diff line number Diff line change
Expand Up @@ -228,8 +228,9 @@ getdf(x::AbstractDict{Int}, nm, i) = haskey(x, i) ? x[i] : nothing
end
end
end
# generate a customtypes Tuple{...} we'll need to generate code for during parsing
customtypes = Tuple{(nonstandardtype(T) for T in types if nonstandardtype(T) !== Union{})...}
# set any unselected columns to typecode USER | MISSING
# figure out if we'll drop any columns while parsing
todrop = Int[]
if select !== nothing && drop !== nothing
error("`select` and `drop` keywords were both provided; only one or the other is allowed")
Expand Down
66 changes: 33 additions & 33 deletions src/rows.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ struct Rows{transpose, O, O2, IO, T, V}
customtypes::T
positions::Vector{Int64}
reusebuffer::Bool
tapes::Vector{AbstractVector} # for parsing, allocated once and used for each iteration
columns::Vector{AbstractVector} # for parsing, allocated once and used for each iteration
values::Vector{V} # once values are parsed, put in values; allocated on each iteration if reusebuffer=false
lookup::Dict{Symbol, Int}
end
Expand Down Expand Up @@ -140,7 +140,7 @@ function Rows(source;
kw...)

h = Header(source, header, normalizenames, datarow, skipto, footerskip, limit, transpose, comment, use_mmap, ignoreemptylines, false, select, drop, missingstrings, missingstring, delim, ignorerepeated, quotechar, openquotechar, closequotechar, escapechar, dateformat, dateformats, decimal, truestrings, falsestrings, type, types, typemap, categorical, pool, lazystrings, strict, silencewarnings, debug, parsingdebug, true)
tapes = allocate(1, h.cols, h.types, h.flags)
columns = allocate(1, h.cols, h.types, h.flags)
values = all(x->x == Union{String, Missing}, h.types) && lazystrings ? Vector{PosLen}(undef, h.cols) : Vector{Any}(undef, h.cols)
finaltypes = copy(h.types)
columnmap = [i for i = 1:h.cols]
Expand All @@ -167,7 +167,7 @@ function Rows(source;
h.customtypes,
h.positions,
reusebuffer,
tapes,
columns,
values,
lookup,
)
Expand All @@ -182,66 +182,66 @@ Base.IteratorSize(::Type{<:Rows}) = Base.SizeUnknown()
const EMPTY_TYPEMAP = Dict{Type, Type}()
const EMPTY_REFS = RefPool[]

@inline function setcustom!(::Type{T}, values, tapes, i) where {T}
@inline function setcustom!(::Type{T}, values, columns, i) where {T}
if @generated
block = Expr(:block)
push!(block.args, quote
error("CSV.jl code-generation error, unexpected column type: $(typeof(tape))")
error("CSV.jl code-generation error, unexpected column type: $(typeof(column))")
end)
for i = 1:fieldcount(T)
vec = fieldtype(T, i)
pushfirst!(block.args, quote
if tape isa $(fieldtype(vec, 1))
@inbounds values[i] = tape[1]
if column isa $(fieldtype(vec, 1))
@inbounds values[i] = column[1]
return
end
end)
end
pushfirst!(block.args, quote
@inbounds tape = tapes[col]
@inbounds column = columns[col]
end)
pushfirst!(block.args, Expr(:meta, :inline))
# @show block
return block
else
# println("generated function failed")
@inbounds tape = tapes[i]
@inbounds values[i] = tape[1]
@inbounds column = columns[i]
@inbounds values[i] = column[1]
return
end
end

@inline function Base.iterate(r::Rows{transpose, O, O2, IO, T, V}, (pos, len, row)=(r.datapos, r.len, 1)) where {transpose, O, O2, IO, T, V}
(pos > len || row > r.limit) && return nothing
pos > len && return nothing
pos = parserow(1, Val(transpose), r.cols, EMPTY_TYPEMAP, r.tapes, r.datapos, r.buf, pos, len, r.positions, 0.0, EMPTY_REFS, 1, r.datarow + row - 2, r.types, r.flags, false, r.options, r.coloptions, T)
pos = parserow(1, Val(transpose), r.cols, EMPTY_TYPEMAP, r.columns, r.datapos, r.buf, pos, len, r.positions, 0.0, EMPTY_REFS, 1, r.datarow + row - 2, r.types, r.flags, false, r.options, r.coloptions, T)
cols = r.cols
values = r.reusebuffer ? r.values : Vector{V}(undef, cols)
tapes = r.tapes
columns = r.columns
for i = 1:cols
@inbounds tape = tapes[i]
if tape isa Vector{PosLen}
@inbounds values[i] = tape[1]
elseif tape isa SVec{Int64}
@inbounds values[i] = tape[1]
elseif tape isa SVec{Float64}
@inbounds values[i] = tape[1]
elseif tape isa SVec2{String}
@inbounds values[i] = tape[1]
elseif tape isa SVec{Date}
@inbounds values[i] = tape[1]
elseif tape isa SVec{DateTime}
@inbounds values[i] = tape[1]
elseif tape isa SVec{Time}
@inbounds values[i] = tape[1]
elseif tape isa Vector{Union{Missing, Bool}}
@inbounds values[i] = tape[1]
elseif tape isa Vector{UInt32}
@inbounds values[i] = tape[1]
@inbounds column = columns[i]
if column isa Vector{PosLen}
@inbounds values[i] = column[1]
elseif column isa SVec{Int64}
@inbounds values[i] = column[1]
elseif column isa SVec{Float64}
@inbounds values[i] = column[1]
elseif column isa SVec2{String}
@inbounds values[i] = column[1]
elseif column isa SVec{Date}
@inbounds values[i] = column[1]
elseif column isa SVec{DateTime}
@inbounds values[i] = column[1]
elseif column isa SVec{Time}
@inbounds values[i] = column[1]
elseif column isa Vector{Union{Missing, Bool}}
@inbounds values[i] = column[1]
elseif column isa Vector{UInt32}
@inbounds values[i] = column[1]
elseif T !== Tuple{}
setcustom!(T, values, tapes, i)
setcustom!(T, values, columns, i)
else
error("bad array type: $(typeof(tape))")
error("bad array type: $(typeof(column))")
end
end
return Row2{O, O2, V}(r.names, r.finaltypes, r.columnmap, r.types, r.lookup, values, r.buf, r.e, r.options, r.coloptions), (pos, len, row + 1)
Expand Down
46 changes: 33 additions & 13 deletions src/utils.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
export PooledString

# NOTE(review): PooledString subtypes AbstractString but defines no string-interface
# methods in this block — presumably used only as a type-level signal for the `pool`
# machinery and never instantiated as actual string data; confirm elsewhere in the file.
"""
    PooledString

A singleton type that can be used for signaling that a column of a csv file should be pooled,
with the output array type being a `PooledArray`.
"""
struct PooledString <: AbstractString end

# PointerString is an internal-only type for efficiently tracking string data + length
Expand Down Expand Up @@ -32,7 +38,7 @@ end

Base.String(x::PointerString) = _unsafe_string(x.ptr, x.len)

# column bit flags
# column bit flags; useful so we don't have to pass a bunch of arguments/state around manually

# whether the user provided the type or not
const USER = 0b00000001
Expand All @@ -46,15 +52,18 @@ anymissing(flag) = flag & ANYMISSING > 0
# flag bit 3: set once a column's type has been detected during parsing
const TYPEDETECTED = 0b00000100

"Return `true` when the TYPEDETECTED bit is set in `flag`."
typedetected(flag) = (flag & TYPEDETECTED) != 0x00

# flag bit 4: marks a column the user asked to exclude via the `select`/`drop` keyword arguments
const WILLDROP = 0b00001000

"Return `true` when `flag` has the WILLDROP bit set."
willdrop(flag) = (flag & WILLDROP) != 0x00

# flag bit 5: whether string columns should be materialized lazily (LazyStringVectors);
# this is not a per-column setting, but it is carried on the column bit flags for convenience
const LAZYSTRINGS = 0b00010000

"Return `true` when the LAZYSTRINGS bit is set in `flag`."
lazystrings(flag) = (flag & LAZYSTRINGS) != 0x00

# Build the initial flag byte for a column whose user-provided type is `T`.
# `T === Union{}` means the user gave no type: only the LAZYSTRINGS bit may be set.
# Otherwise the column is USER-provided and TYPEDETECTED, plus ANYMISSING when `T`
# already includes `Missing`.
function flag(T, lazystrings)
    bits = T === Union{} ? 0x00 : ((USER | TYPEDETECTED) | (hasmissingtype(T) ? ANYMISSING : 0x00))
    return lazystrings ? (bits | LAZYSTRINGS) : bits
end

# Our own custom bit on a Parsers.ReturnCode (an Int16) signaling that a column
# must be promoted to String during parsing.
const PROMOTE_TO_STRING = 0b0100000000000000 % Int16

"Return `true` when the PROMOTE_TO_STRING bit is set in the parser return `code`."
promote_to_string(code) = (code & PROMOTE_TO_STRING) != Int16(0)

Expand All @@ -78,7 +87,8 @@ hasmissingtype(T) = T === Missing || T !== Core.Compiler.typesubtract(T, Missing
end
end

# bit patterns for missing value, int value, escaped string, position and len in tape parsing
## lazy strings
# bit patterns for missing value, int value, escaped string, position and len in lazy string parsing
const PosLen = UInt64

# primitive type PosLen 64 end
Expand Down Expand Up @@ -147,13 +157,17 @@ end
return s, st[2]
end

# column array allocating
## column array allocating
# we don't want to use SentinelVector for small integer types due to the higher risk of
# sentinel value collision, so we just use Vector{Union{T, Missing}} and convert to Vector{T} if no missings were found
const SmallIntegers = Union{Int8, UInt8, Int16, UInt16, Int32, UInt32}

# Allocate one column vector per column for a whole-file parse.
# A column typed String (or Union{String, Missing}) whose LAZYSTRINGS flag is set is
# allocated as PosLen position/length codes instead of eagerly-materialized strings.
function allocate(rowsguess, ncols, types, flags)
    columns = Vector{AbstractVector}(undef, ncols)
    for i = 1:ncols
        T = types[i]
        stringlike = T === String || T === Union{String, Missing}
        alloctype = (lazystrings(flags[i]) && stringlike) ? PosLen : T
        columns[i] = allocate(alloctype, rowsguess)
    end
    return columns
end

# Columns with no known type (Union{}) or an all-missing type are backed by MissingVector,
# an efficient all-missing array representation from the SentinelArrays.jl package.
allocate(::Type{Union{}}, len) = MissingVector(len)
allocate(::Type{Missing}, len) = MissingVector(len)
function allocate(::Type{PosLen}, len)
Expand All @@ -174,6 +188,7 @@ allocate(::Type{Union{Missing, T}}, len) where {T <: SmallIntegers} = Vector{Uni
allocate(T, len) = SentinelVector{nonmissingtype(T)}(undef, len)

# Resize column `A` to hold `len` elements; the generic fallback simply delegates to
# `Base.resize!` and returns the (mutated) array.
function reallocate!(A, len)
    return resize!(A, len)
end
# when reallocating, we just need to make sure the missing bit is set for lazy string PosLen
function reallocate!(A::Vector{PosLen}, len)
oldlen = length(A)
resize!(A, len)
Expand All @@ -185,6 +200,7 @@ const SVec{T} = SentinelVector{T, T, Missing, Vector{T}}
const SVec2{T} = SentinelVector{T, typeof(undef), Missing, Vector{T}}

# Compute the type remaining after subtracting `S` from the (union) type `T`.
# NOTE(review): relies on the internal Core.Compiler.typesubtract API, which carries no
# stability guarantee across Julia versions — confirm against the supported Julia range.
ts(T, S) = Core.Compiler.typesubtract(T, S)
# when users pass non-standard types, we need to keep track of them in a Tuple{...} to generate efficient custom parsing kernel codes
function nonstandardtype(T)
S = ts(ts(ts(ts(ts(ts(ts(ts(ts(T, Int64), Float64), String), PooledString), Bool), Date), DateTime), Time), Missing)
if S === Union{}
Expand All @@ -201,6 +217,7 @@ end
# Skip a UTF-8 byte-order mark (0xef 0xbb 0xbf) if one starts at position `pos` in `buf`.
#
# Returns the position of the first byte after the BOM, or `pos` unchanged when no BOM is
# present. The bounds guard is relative to `pos` (not just the total buffer length) so a
# `pos` within three bytes of the end of the buffer cannot trigger a BoundsError.
# (one-liner originally suggested from ScottPJones)
consumeBOM(buf, pos) = (pos + 2 <= length(buf) && buf[pos] == 0xef && buf[pos + 1] == 0xbb && buf[pos + 2] == 0xbf) ? pos + 3 : pos

# whatever input is given, turn it into an AbstractVector{UInt8} we can parse with
function getsource(x)
if x isa AbstractVector{UInt8}
return x, 1, length(x)
Expand All @@ -211,8 +228,14 @@ function getsource(x)
buf = Base.read(x)
return buf, 1, length(buf)
else
buf = Mmap.mmap(string(x))
return buf, 1, length(buf)
try
buf = Mmap.mmap(string(x))
return buf, 1, length(buf)
catch e
# if we can't mmap, try just `read`ing the whole thing into a byte vector
buf = read(x)
return buf, 1, length(buf)
end
end
end

Expand Down Expand Up @@ -358,6 +381,9 @@ function detect(buf, pos, len, options)
return nothing
end

# A ReversedBuf wraps a byte vector and indexes it backwards (index 1 is the last byte);
# used for the `footerskip` keyword argument, which starts at the bottom of the file
# and skips lines backwards.
struct ReversedBuf <: AbstractVector{UInt8}
    buf::Vector{UInt8}
end
Expand All @@ -366,15 +392,9 @@ Base.size(a::ReversedBuf) = size(a.buf)
Base.IndexStyle(::Type{ReversedBuf}) = Base.IndexLinear()
# reversed indexing: element `i` of the ReversedBuf is byte `end + 1 - i` of the wrapped buffer
Base.getindex(a::ReversedBuf, i::Int) = a.buf[end + 1 - i]

# Reset element `i` of `A` back to #undef via the runtime's jl_arrayunset (0-based index).
# `row` and `x` are only referenced by the commented-out debug print below.
# NOTE(review): jl_arrayunset is a Julia-internal runtime function with no public API
# guarantee — confirm it exists on all supported Julia versions.
function unset!(A::Vector, i::Int, row, x)
    ccall(:jl_arrayunset, Cvoid, (Array, Csize_t), A, i - 1)
    # println("deleting col = $i on thread = $(Threads.threadid()), row = $row, id = $x")
    return
end

# Thin wrappers over C memcpy/memset; `doff`/`soff` are 1-based offsets added to the raw
# destination/source pointers. Callers are responsible for pointer validity and GC rooting.
memcpy!(d, doff, s, soff, n) = ccall(:memcpy, Cvoid, (Ptr{UInt8}, Ptr{UInt8}, Int), d + doff - 1, s + soff - 1, n)
memset!(ptr, value, num) = ccall(:memset, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), ptr, value, num)

# a RefPool holds our refs as a Dict, along with a lastref field which is incremented when a new ref is found while parsing pooled columns
mutable struct RefPool
refs::Dict{Union{String, Missing}, UInt32}
lastref::UInt32
Expand Down
40 changes: 40 additions & 0 deletions test/basics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -433,4 +433,44 @@ f = CSV.File(GzipDecompressorStream(open(joinpath(dir, "randoms.csv.gz"))); type
@test f.first isa AbstractVector{CSVString}
@test f.wage isa AbstractVector{Union{Missing, Dec64}}

f = CSV.File(joinpath(dir, "promotions.csv"); lazystrings=true)
@test eltype.(f.columns) == [Float64, Union{Missing, Int64}, Union{Missing, Float64}, String, Union{Missing, String}, String, String, Union{Missing, Int64}]
@test f.int_string isa CSV.LazyStringVector

f = CSV.File(joinpath(dir, "promotions.csv"); limit=7500, threaded=true)
@test length(f) == 7500

f = CSV.File(IOBuffer("1,2\r\n3,4\r\n\r\n5,6\r\n"); header=["col1", "col2"], ignoreemptylines=true)
@test f.col1 == [1, 3, 5]

f = CSV.File(joinpath(dir, "escape_row_starts.csv"); tasks=2)
@test length(f) == 10000
@test eltype(f.col1) == String
@test eltype(f.col2) == Int64

f = CSV.File(IOBuffer("col1\nhey\nthere\nsailor"); lazystrings=true)
@test f.col1 isa CSV.LazyStringVector
@test Tables.columnnames(f) == [:col1]
@test propertynames(f) == [:col1]
@test CSV.getname(f) == "<Base.GenericIOBuffer{Array{UInt8,1}}>"
@test CSV.getcols(f) == 1
@test Base.IndexStyle(f) == Base.IndexLinear()
@test f.col1 === Tables.getcolumn(f, 1)
@test columntable(f) == columntable(collect(f))
show(f)

f = CSV.File(joinpath(dir, "big_types.csv"); lazystrings=true, pool=false)
@test eltype(f.time) == Dates.Time
@test eltype(f.bool) == Bool
@test f.lazy isa CSV.LazyStringVector
@test eltype(f.lazy) == String
@test eltype(f.lazy_missing) == Union{String, Missing}

r = CSV.Rows(joinpath(dir, "big_types.csv"); lazystrings=false, types=[Dates.Time, Bool, String, Union{String, Missing}])
row = first(r)
@test row.time == Dates.Time(12)
@test row.bool
@test row.lazy == "hey"
@test row.lazy_missing === missing

end
Loading