Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

When delim=groupmark=x, treat x as delim unless input is quoted #182

Merged
merged 6 commits into from
Nov 9, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 54 additions & 7 deletions src/Parsers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ end
* `ignoreemptylines=false`: after parsing a value, if a newline is detected, another immediately following newline will be checked for and consumed
* `stripwhitespace=nothing`: if true, leading and trailing whitespace is stripped from string fields, note that for *quoted* strings however, whitespace is preserved within quotes (but ignored before/after quote characters). To also strip *within* quotes, see `stripquoted`
* `stripquoted=false`: if true, whitespace is also stripped within quoted strings. If true, `stripwhitespace` is also set to true.
* `groupmark=nothing`: optionally specify a single-byte character denoting the number grouping mark, this allows parsing of numbers that have, e.g., thousand separators (`1,000.00`).
* `groupmark=nothing`: optionally specify a single-byte character denoting the number grouping mark, this allows parsing of numbers that have, e.g., thousand separators (`1 000.00`). When the `groupmark` is ambiguous with the `delim`, the user must quote the number if it contains group marks.
* `rounding=RoundNearest`: optionally specify a rounding mode to use when parsing. No rounding means the result will be marked with `INEXACT` code if the value is not exactly representable in the target type.
"""
struct Options
Expand Down Expand Up @@ -141,12 +141,49 @@ function Base.getproperty(x::Options, nm::Symbol)
end
end

const OPTIONS = Options(Flags(false, false, false, false, false, false, false, false, false), UInt8('.'),
Token(UInt8('"')), Token(UInt8('"')), UInt8('"'), Token[], Token(""), Token(""),
nothing, nothing, nothing, nothing, nothing)
const XOPTIONS = Options(Flags(false, false, false, false, true, true, true, false, false), UInt8('.'),
Token(UInt8('"')), Token(UInt8('"')), UInt8('"'), Token[], Token(UInt8(',')), Token(""),
nothing, nothing, nothing, nothing, nothing)
# Build the `Options` used for single-value (i.e. non-delimited) parsing. This is what
# Parsers.parse and Parsers.tryparse rely on via Parsers.xparse2.
#
# Every `Options` field can be overridden by keyword; anything left out keeps the default
# listed in the signature below (note the empty `delim` token).
function _get_default_options(;
    flags::Flags = Flags(false, false, false, false, false, false, false, false, false),
    decimal::UInt8 = UInt8('.'),
    oq::Token = Token(UInt8('"')),
    cq::Token = Token(UInt8('"')),
    e::UInt8 = UInt8('"'),
    sentinel::Vector{Token} = Token[],
    delim::Token = Token(""),
    cmt::Token = Token(""),
    trues::Union{Nothing, Vector{String}} = nothing,
    falses::Union{Nothing, Vector{String}} = nothing,
    dateformat::Union{Nothing, Format} = nothing,
    groupmark::Union{Nothing, UInt8} = nothing,
    rounding::Union{Nothing, RoundingMode} = nothing,
)
    # Positional order must match the `Options` constructor.
    return Options(
        flags, decimal, oq, cq, e, sentinel, delim, cmt,
        trues, falses, dateformat, groupmark, rounding,
    )
end

# Build the `Options` used for delimited parsing; this is what Parsers.xparse uses.
#
# Every `Options` field can be overridden by keyword; anything left out keeps the default
# listed in the signature below (note the `,` delimiter token).
function _get_default_xoptions(;
    flags::Flags = Flags(false, false, false, false, true, true, true, false, false),
    decimal::UInt8 = UInt8('.'),
    oq::Token = Token(UInt8('"')),
    cq::Token = Token(UInt8('"')),
    e::UInt8 = UInt8('"'),
    sentinel::Vector{Token} = Token[],
    delim::Token = Token(UInt8(',')),
    cmt::Token = Token(""),
    trues::Union{Nothing, Vector{String}} = nothing,
    falses::Union{Nothing, Vector{String}} = nothing,
    dateformat::Union{Nothing, Format} = nothing,
    groupmark::Union{Nothing, UInt8} = nothing,
    rounding::Union{Nothing, RoundingMode} = nothing,
)
    # Positional order must match the `Options` constructor.
    return Options(
        flags, decimal, oq, cq, e, sentinel, delim, cmt,
        trues, falses, dateformat, groupmark, rounding,
    )
end

# Default options for single-value parsing: used by Parsers.parse, Parsers.tryparse and
# Parsers.xparse2. Built once from `_get_default_options()` with no overrides.
const OPTIONS = _get_default_options()
# Default options for delimited parsing: used by Parsers.xparse. Built once from
# `_get_default_xoptions()` with no overrides.
const XOPTIONS = _get_default_xoptions()

# Sort `x` in place so that elements with the largest `sizeof` come first, and return `x`.
function prepare!(x::Vector)
    return sort!(x; by=sizeof, rev=true)
end
# Return `true` when the character `c` is an ASCII character, `false` otherwise.
function asciival(c::Char)
    return isascii(c)
end
Expand Down Expand Up @@ -446,6 +483,16 @@ function checkdelim!(source::AbstractVector{UInt8}, pos, len, options::Options)
return pos
end

# Decide whether the configured `groupmark` byte should be honoured while parsing the
# current numeric field.
#
# Arguments:
# * `opts`: the parsing options; only `groupmark` and `delim` are consulted.
# * `code`: the return code accumulated so far; only its `QUOTED` bit is consulted.
#
# Returns `false` when no groupmark is configured. Otherwise returns `true` when the field
# is quoted, or when the groupmark differs from the delimiter. For an unquoted field whose
# groupmark equals the delimiter the byte is treated as a delimiter, so `false` is returned.
@inline function _has_groupmark(opts::Options, code::ReturnCode)
    groupmark = opts.groupmark
    groupmark === nothing && return false
    # A quoted field may contain the delimiter byte, so groupmarks are always honoured.
    (code & QUOTED) != 0 && return true
    # `groupmark` is a raw `UInt8` whereas `delim` is wrapped in a `Token`; comparing the
    # two directly would be unequal for every delimiter, so the ambiguous case would never
    # be detected. Compare against the token's payload instead.
    # NOTE(review): assumes `Token` exposes its payload as the field `token` -- confirm.
    return groupmark != opts.delim.token
end

include("ints.jl")
include("floats.jl")
include("strings.jl")
Expand Down
2 changes: 1 addition & 1 deletion src/floats.jl
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ rettype(::Type{T}) where {T} = T === Number ? Nothing : T
@inline function parsedigits(conf::AbstractConf{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos, overflow_invalid::Bool=false, ndigits::Int=0, f::F=nothing) where {T, IntType, F}
x = zero(T)
anydigits = false
has_groupmark = options.groupmark !== nothing
has_groupmark = _has_groupmark(options, code)
groupmark0 = something(options.groupmark, 0xff) - UInt8('0')

# we already previously checked if `b` was decimal or a digit, so don't need to check explicitly again
Expand Down
2 changes: 1 addition & 1 deletion src/ints.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ overflowval(::Type{T}) where {T <: Integer} = div(typemax(T) - T(9), T(10))
@inline function typeparser(::AbstractConf{T}, source, pos, len, b, code, pl, opts) where {T <: Integer}
x = zero(T)
neg = false
has_groupmark = opts.groupmark !== nothing
has_groupmark = _has_groupmark(opts, code)
groupmark0 = something(opts.groupmark, 0xff) - UInt8('0')
# start actual int parsing
neg = b == UInt8('-')
Expand Down
42 changes: 21 additions & 21 deletions test/floats.jl
Original file line number Diff line number Diff line change
Expand Up @@ -369,42 +369,42 @@ end
@test Parsers.tryparse(Float64, "0e+") === nothing

@testset "groupmark" begin
@test Parsers.xparse(Float64, "100,000,000.99"; groupmark=',').val == 100_000_000.99
@test Parsers.xparse(Float64, "100,000,000"; groupmark=',').val == 100_000_000.0
@test Parsers.xparse(Float64, "1,0,0,0,0,0,0,0,0.99"; groupmark=',').val == 100_000_000.99
# xparse2 is used for parsing inputs with a single value in them,
# so when delims==groupmarks, we assume what we see are groupmarks
@test let case = "1,0,0,0,0,0,0,0,099e-2"; Parsers.xparse2(Float32, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(','))).val ≈ 100_000_000.99 end
@test let case = "100,000,00099e-2"; Parsers.xparse2(Float32, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(','))).val ≈ 100_000_000.99 end
@test let case = "100,000,000.99"; Parsers.xparse2(Float32, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(','))).val ≈ 100_000_000.99 end
@test let case = "100,000,000"; Parsers.xparse2(Float32, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(','))).val ≈ 100_000_000 end
@test let case = "1 0 0 0 0 0 0 0 099e-2"; Parsers.xparse2(Float32, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(' '))).val ≈ 100_000_000.99 end
@test let case = "100 000 00099e-2"; Parsers.xparse2(Float32, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(' '))).val ≈ 100_000_000.99 end
@test let case = "100 000 000.99"; Parsers.xparse2(Float32, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(' '))).val ≈ 100_000_000.99 end
@test let case = "100 000 000"; Parsers.xparse2(Float32, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(' '))).val ≈ 100_000_000 end

@test let case = "1,0,0,0,0,0,0,0,099e-2"; Parsers.xparse2(Float64, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(','))).val ≈ 100_000_000.99 end
@test let case = "100,000,00099e-2"; Parsers.xparse2(Float64, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(','))).val ≈ 100_000_000.99 end
@test let case = "100,000,000.99"; Parsers.xparse2(Float64, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(','))).val ≈ 100_000_000.99 end
@test let case = "100,000,000"; Parsers.xparse2(Float64, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(','))).val ≈ 100_000_000 end
@test let case = "1 0 0 0 0 0 0 0 099e-2"; Parsers.xparse2(Float64, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(' '))).val ≈ 100_000_000.99 end
@test let case = "100 000 00099e-2"; Parsers.xparse2(Float64, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(' '))).val ≈ 100_000_000.99 end
@test let case = "100 000 000.99"; Parsers.xparse2(Float64, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(' '))).val ≈ 100_000_000.99 end
@test let case = "100 000 000"; Parsers.xparse2(Float64, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(' '))).val ≈ 100_000_000 end


@test Parsers.xparse(Float64, "1 0 0 0 0 0 0 0 0.99"; groupmark=' ').val == 100_000_000.99
@test Parsers.xparse(Float64, "100000000.99"; groupmark=',').val == 100_000_000.99
@test Parsers.xparse(Float64, "100000000.99,aaa"; groupmark=',') == Parsers.Result{Float64}(OK | DELIMITED, 13, 1.0000000099e8)
@test Parsers.xparse(Float64, "\"100,000,000.99\",100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(13), 17, 1.0000000099e8)
@test Parsers.xparse(Float64, "100,000,000.99,100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(9), 15, 1.0000000099e8)
@test Parsers.xparse(Float64, "\"100,000,000\",100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(13), 14, 1.0e8)
res = Parsers.xparse(Float64, "100,000,000,aaa"; groupmark=',')
@test res.code == EOF | INVALID | INVALID_DELIMITER
@test res.tlen == 15

@test Parsers.xparse(Float32, "100,000,000.99"; groupmark=',').val ≈ 100_000_000.99
@test Parsers.xparse(Float32, "100,000,000"; groupmark=',').val ≈ 100_000_000.0
@test Parsers.xparse(Float32, "1,0,0,0,0,0,0,0,0.99"; groupmark=',').val ≈ 100_000_000.99
@test Parsers.xparse(Float32, "1 0 0 0 0 0 0 0 0.99"; groupmark=' ').val ≈ 100_000_000.99
@test Parsers.xparse(Float32, "100000000.99"; groupmark=',').val ≈ 100_000_000.99
res = Parsers.xparse(Float32, "100000000.99,aaa"; groupmark=',')
@test res.code == OK | DELIMITED
@test res.tlen == 13
@test res.val ≈ 100_000_000.99
res = Parsers.xparse(Float32, "100,000,000,aaa"; groupmark=',')
@test res.code == EOF | INVALID | INVALID_DELIMITER
@test res.tlen == 15

@test Parsers.xparse(Float64, "100,000,00099e-2"; groupmark=',').val == 100_000_000.99
@test Parsers.xparse(Float64, "1,0,0,0,0,0,0,0,099e-2"; groupmark=',').val == 100_000_000.99
@test Parsers.xparse(Float64, "1 0 0 0 0 0 0 0 099e-2"; groupmark=' ').val == 100_000_000.99
@test Parsers.xparse(Float64, "10000000099e-2"; groupmark=',').val == 100_000_000.99
@test Parsers.xparse(Float64, "10000000099e-2,aaa"; groupmark=',') == Parsers.Result{Float64}(OK | DELIMITED, 15, 1.0000000099e8)
@test Parsers.xparse(Float64, "\"10000000099e-2\",100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(13), 17, 1.0000000099e8)
@test Parsers.xparse(Float64, "10000000099e-2,100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(9), 15, 1.0000000099e8)

@test Parsers.xparse(Float32, "100,000,00099e-2"; groupmark=',').val ≈ 100_000_000.99
@test Parsers.xparse(Float32, "1,0,0,0,0,0,0,0,099e-2"; groupmark=',').val ≈ 100_000_000.99
@test Parsers.xparse(Float32, "1 0 0 0 0 0 0 0 099e-2"; groupmark=' ').val ≈ 100_000_000.99
@test Parsers.xparse(Float32, "10000000099e-2"; groupmark=',').val ≈ 100_000_000.99
res = Parsers.xparse(Float32, "10000000099e-2,aaa"; groupmark=',')
Expand Down
96 changes: 76 additions & 20 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -347,27 +347,31 @@ end # @testset "Core Parsers.xparse"
@testset "ints" begin

@testset "groupmark" begin
@test Parsers.xparse(Int64, "100,000,000"; groupmark=',').val == 100_000_000
@test Parsers.xparse(Int64, "1,0,0,0,0,0,0,0,0"; groupmark=',').val == 100_000_000
# xparse2 is used for parsing inputs with a single value in them,
# so when delims==groupmarks, we assume what we see are groupmarks
@test let case = "100,000,000"; Parsers.xparse2(Int64, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(','))).val == 100_000_000 end
@test let case = "1,0,0,0,0,0,0,0,0"; Parsers.xparse2(Int64, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(','))).val == 100_000_000 end
@test let case = "9,2,2,3,3,7,2,0,3,6,8,5,4,7,7,5,8,0,7"; Parsers.xparse2(Int64, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(','))).val == 9223372036854775807 end
@test let case = "100 000 000"; Parsers.xparse2(Int64, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(' '))).val == 100_000_000 end
@test let case = "1 0 0 0 0 0 0 0 0"; Parsers.xparse2(Int64, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(' '))).val == 100_000_000 end
@test let case = "9 2 2 3 3 7 2 0 3 6 8 5 4 7 7 5 8 0 7"; Parsers.xparse2(Int64, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(' '))).val == 9223372036854775807 end

# Int32 results are exact integers: compare with `==` against the integer value. (Comparing
# with `≈` against 100_000_000.99 only passed because of isapprox's default tolerance.)
@test let case = "100,000,000"; Parsers.xparse2(Int32, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(','))).val == 100_000_000 end
@test let case = "1,0,0,0,0,0,0,0,0"; Parsers.xparse2(Int32, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(','))).val == 100_000_000 end
@test let case = "2,1,4,7,4,8,3,6,4,7"; Parsers.xparse2(Int32, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(','))).val == 2147483647 end
@test let case = "100 000 000"; Parsers.xparse2(Int32, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(' '))).val == 100_000_000 end
@test let case = "1 0 0 0 0 0 0 0 0"; Parsers.xparse2(Int32, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(' '))).val == 100_000_000 end
@test let case = "2 1 4 7 4 8 3 6 4 7"; Parsers.xparse2(Int32, case, 1, length(case), Parsers._get_default_options(groupmark=UInt8(' '))).val == 2147483647 end

@test Parsers.xparse(Int64, "100000000"; groupmark=',').val == 100_000_000
@test Parsers.xparse(Int64, "9223372036854775807"; groupmark=',').val == 9223372036854775807
@test Parsers.xparse(Int64, "9,2,2,3,3,7,2,0,3,6,8,5,4,7,7,5,8,0,7"; groupmark=',').val == 9223372036854775807
@test Parsers.xparse(Int64, "9 2 2 3 3 7 2 0 3 6 8 5 4 7 7 5 8 0 7"; groupmark=' ').val == 9223372036854775807
@test Parsers.xparse(Int64, "\"100,000,000\",100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Int64}(Int16(13), 14, 100_000_000)
res = Parsers.xparse(Int64, "100,000,000,aaa"; groupmark=',')
@test res.code == EOF | INVALID | INVALID_DELIMITER
@test res.tlen == 15

@test Parsers.xparse(Int32, "100,000,000"; groupmark=',').val == 100_000_000
@test Parsers.xparse(Int32, "1,0,0,0,0,0,0,0,0"; groupmark=',').val == 100_000_000
@test Parsers.xparse(Int32, "100000000"; groupmark=',').val == 100_000_000
@test Parsers.xparse(Int32, "2147483647"; groupmark=',').val == 2147483647
@test Parsers.xparse(Int32, "2,1,4,7,4,8,3,6,4,7"; groupmark=',').val == 2147483647
@test Parsers.xparse(Int32, "2 1 4 7 4 8 3 6 4 7"; groupmark=' ').val == 2147483647
@test Parsers.xparse(Int32, "\"100,000,000\",100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Int32}(Int16(13), 14, 100_000_000)
res = Parsers.xparse(Int32, "100,000,000,aaa"; groupmark=',')
@test res.code == EOF | INVALID | INVALID_DELIMITER
@test res.tlen == 15

@test_throws ArgumentError Parsers.xparse(Int64, "42"; groupmark=',', quoted=false, delim=',')
@test_throws ArgumentError Parsers.xparse(Int64, "42"; groupmark=',', quoted=false, delim=UInt8(','))
Expand All @@ -381,15 +385,67 @@ end # @testset "Core Parsers.xparse"
@test_throws ArgumentError Parsers.xparse(Int64, "42"; groupmark='"', closequotechar='"')
@test_throws ArgumentError Parsers.xparse(Int64, "42"; groupmark='"', closequotechar=UInt8('"'))


for g in (',',' ')
xopts_with_groupmark = Parsers._get_default_xoptions(groupmark=UInt8(g))
# Groupmark tests for ints
for (input, expected_vals) in [
("1000,0000,2000,3000", (1000,0,2000,3000,)),
("\"1000\",\"0000\",\"2000\",\"3000\"", (1000,0,2000,3000,)),
("\"1$(g)0$(g)0$(g)0\",0000,\"2$(g)0$(g)0$(g)0\",3000", (1000,0,2000,3000,)),
("1000,\"0$(g)0$(g)0$(g)0\",2000,\"3$(g)0$(g)0$(g)0\"", (1000,0,2000,3000,)),
]
pos = 1
len = length(input)
local res
for expected in expected_vals
res = Parsers.xparse(Int, input, pos, len, xopts_with_groupmark)
@test res.val == expected
@test Parsers.ok(res.code)
pos += res.tlen
end
@test Parsers.ok(res.code)
@test Parsers.eof(res.code)
end

# Groupmark tests for floats
for (input, expected_vals) in [
("1000,0000,2000,3000", (1000.0,0.0,2000.0,3000.0,)),
("\"1000\",\"0000\",\"2000\",\"3000\"", (1000.0,0.0,2000.0,3000.0,)),
("\"1$(g)0$(g)0$(g)0\",0000,\"2$(g)0$(g)0$(g)0\",3000", (1000.0,0.0,2000.0,3000.0,)),
("1000,\"0$(g)0$(g)0$(g)0\",2000,\"3$(g)0$(g)0$(g)0\"", (1000.0,0.0,2000.0,3000.0,)),
("1000.00,0000.00,2000.00,3000.00", (1000.0,0.0,2000.0,3000.0,)),
("\"1000.00\",\"0000.00\",\"2000.00\",\"3000.00\"", (1000.0,0.0,2000.0,3000.0,)),
("\"1$(g)0$(g)0$(g)0.00\",0000.00,\"2$(g)0$(g)0$(g)0.00\",3000.00", (1000.0,0.0,2000.0,3000.0,)),
("1000,\"0$(g)0$(g)0$(g)0.00\",2000.00,\"3$(g)0$(g)0$(g)0.00\"", (1000.0,0.0,2000.0,3000.0,)),
("1000.00e0,0000.00e0,2000.00e0,3000.00e0", (1000.0,0.0,2000.0,3000.0,)),
("\"1000.00e0\",\"0000.00e0\",\"2000.00e0\",\"3000.00e0\"", (1000.0,0.0,2000.0,3000.0,)),
("\"1$(g)0$(g)0$(g)0.00e0\",0000.00e0,\"2$(g)0$(g)0$(g)0.00e0\",3000.00e0", (1000.0,0.0,2000.0,3000.0,)),
("1000,\"0$(g)0$(g)0$(g)0.00e0\",2000.00e0,\"3$(g)0$(g)0$(g)0.00e0\"", (1000.0,0.0,2000.0,3000.0,)),
("1000e0,0000e0,2000e0,3000e0", (1000.0,0.0,2000.0,3000.0,)),
("\"1000e0\",\"0000e0\",\"2000e0\",\"3000e0\"", (1000.0,0.0,2000.0,3000.0,)),
("\"1$(g)0$(g)0$(g)0e0\",0000e0,\"2$(g)0$(g)0$(g)0e0\",3000e0", (1000.0,0.0,2000.0,3000.0,)),
("1000,\"0$(g)0$(g)0$(g)0e0\",2000e0,\"3$(g)0$(g)0$(g)0e0\"", (1000.0,0.0,2000.0,3000.0,)),
]
pos = 1
len = length(input)
local res
for expected in expected_vals
res = Parsers.xparse(Float64, input, pos, len, xopts_with_groupmark)
@test res.val == expected
@test Parsers.ok(res.code)
pos += res.tlen
end
@test Parsers.ok(res.code)
@test Parsers.eof(res.code)
end
end


# #168
res = Parsers.xparse(Int, "1,729"; groupmark=',')
@test res.code == (OK | EOF)
@test res.tlen == 5
@test res.val == 1729
res = Parsers.xparse(Int, "1,729"; groupmark=UInt8(','))
@test res.code == (OK | EOF)
@test res.tlen == 5
@test res.val == 1729
res = Parsers.parse(Int, "1,729", Parsers._get_default_options(groupmark=UInt8(',')))
@test res == 1729
nickrobinson251 marked this conversation as resolved.
Show resolved Hide resolved

@test_throws ArgumentError Parsers.xparse(Int, "3.14", groupmark='.', decimal=UInt8('.'), quoted=false)
@test_throws ArgumentError Parsers.xparse(Int, "3.14", groupmark=UInt8('.'), decimal='.', quoted=false)
@test_throws ArgumentError Parsers.xparse(Int, "3.14", groupmark=UInt8('.'), decimal=UInt8('.'), quoted=false)
Expand Down
Loading