From 7d08b055041005441d656cf87f6bb2ac1a6eae7d Mon Sep 17 00:00:00 2001 From: chainsawriot Date: Mon, 10 Jun 2024 14:31:20 +0200 Subject: [PATCH] Implement `guess_max` fix #33 (#34) * Implementation of `guess_max` * Update doc --- DESCRIPTION | 2 +- R/cpp11.R | 4 ++-- R/parser.R | 18 +++++++++----- R/type_convert.R | 5 ++-- README.Rmd | 18 ++++++++++++-- README.md | 22 ++++++++++++++++-- man/parse_guess.Rd | 3 +++ man/type_convert.Rd | 3 +++ misc/benchmark.md | 44 ++++++++++++++++++++++++++++++----- misc/benchmark.qmd | 8 +++++++ src/CollectorGuess.cpp | 28 ++++++++++++++-------- src/cpp11.cpp | 8 +++---- tests/testthat/test-parsing.R | 15 ++++++++++++ 13 files changed, 143 insertions(+), 35 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2bb97b1..29486ec 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: minty Title: Minimal Type Guesser -Version: 0.0.2 +Version: 0.0.3 Authors@R: c( person("Chung-hong", "Chan", role = c("aut", "cre"), email = "chainsawtiney@gmail.com", comment = c(ORCID = "0000-0002-6232-7530")), person("Hadley", "Wickham", , "hadley@posit.co", role = "aut", comment = "author of the ported code from readr"), diff --git a/R/cpp11.R b/R/cpp11.R index 5628954..a9958c9 100644 --- a/R/cpp11.R +++ b/R/cpp11.R @@ -1,7 +1,7 @@ # Generated by cpp11: do not edit by hand -collectorGuess <- function(input, locale_, guessInteger) { - .Call(`_minty_collectorGuess`, input, locale_, guessInteger) +collectorGuess <- function(input, locale_, guessInteger, guess_max) { + .Call(`_minty_collectorGuess`, input, locale_, guessInteger, guess_max) } parse_vector_ <- function(x, collectorSpec, locale_, na, trim_ws) { diff --git a/R/parser.R b/R/parser.R index ce2ef3a..50a0c91 100644 --- a/R/parser.R +++ b/R/parser.R @@ -185,6 +185,7 @@ col_number <- function() { #' @inheritParams parse_atomic #' @param guess_integer If `TRUE`, guess integer types for whole numbers, if #' `FALSE` guess numeric type for all numbers. +#' @param guess_max Maximum number of data rows to use for guessing column types. `NA`: uses all data. #' @family parsers #' @return a parsed vector #' @export @@ -201,8 +202,8 @@ col_number <- function() { #' #' # ISO 8601 date times #' parse_guess(c("2010-10-10")) -parse_guess <- function(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE, guess_integer = FALSE, .return_problems = FALSE) { - parse_vector(x, guess_parser(x, locale, guess_integer = guess_integer, na = na), na = na, locale = locale, trim_ws = trim_ws, +parse_guess <- function(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE, guess_integer = FALSE, guess_max = NA, .return_problems = FALSE) { + parse_vector(x, guess_parser(x, locale, guess_integer = guess_integer, na = na, guess_max = guess_max), na = na, locale = locale, trim_ws = trim_ws, .return_problems = .return_problems) } @@ -212,12 +213,17 @@ col_guess <- function() { collector("guess") } -guess_parser <- function(x, locale = default_locale(), guess_integer = FALSE, na = c("", "NA")) { +guess_parser <- function(x, locale = default_locale(), guess_integer = FALSE, na = c("", "NA"), guess_max = 1000) { x[x %in% na] <- NA_character_ - stopifnot(is.locale(locale)) - - collectorGuess(x, locale, guessInteger = guess_integer) + if (is.na(guess_max)) { + guess_max <- NA_integer_ + } + stopifnot(is.numeric(guess_max)) + if (abs(guess_max) == Inf || is.nan(guess_max) || guess_max < 1 || is.na(guess_max)) { + guess_max <- length(x) + } + collectorGuess(x, locale, guessInteger = guess_integer, as.integer(guess_max)) } #' Parse factors diff --git a/R/type_convert.R b/R/type_convert.R index 3537a21..69a5b29 100644 --- a/R/type_convert.R +++ b/R/type_convert.R @@ -27,7 +27,7 @@ #' df <- data.frame(x = c("NA", "10"), stringsAsFactors = FALSE) #' str(type_convert(df)) type_convert <- function(df, col_types = NULL, na = c("", "NA"), trim_ws = TRUE, - locale = default_locale(), guess_integer = FALSE, + locale = default_locale(), guess_integer = FALSE, guess_max = NA, verbose = FALSE) { stopifnot(is.data.frame(df)) is_character <- vapply(df, is.character, logical(1)) @@ -45,7 +45,8 @@ type_convert <- function(df, col_types = NULL, na = c("", "NA"), trim_ws = TRUE, guess_parser, locale = locale, na = na, - guess_integer = guess_integer + guess_integer = guess_integer, + guess_max = guess_max ) specs <- col_spec_standardise( diff --git a/README.Rmd b/README.Rmd index 01a6ef8..5a5ed24 100644 --- a/README.Rmd +++ b/README.Rmd @@ -144,11 +144,25 @@ Some features from `vroom` have been ported to `minty`, but not `readr`. ```{r} ## tidyverse/readr#1526 -minty::type_convert(data.frame(a=c("NaN", "Inf", "-INF"))) |> str() +minty::type_convert(data.frame(a = c("NaN", "Inf", "-INF"))) |> str() ``` ```{r} -readr::type_convert(data.frame(a=c("NaN", "Inf", "-INF"))) |> str() +readr::type_convert(data.frame(a = c("NaN", "Inf", "-INF"))) |> str() +``` + +`guess_max` is available for `parse_guess()` and `type_convert()`, default to `NA` (same as `readr`). + +```{r} +minty::parse_guess(c("1", "2", "drei")) +``` + +```{r} +minty::parse_guess(c("1", "2", "drei"), guess_max = 2) +``` + +```{r} +readr::parse_guess(c("1", "2", "drei")) ``` ## Similar packages diff --git a/README.md b/README.md index 56b17a0..bfc8e19 100644 --- a/README.md +++ b/README.md @@ -212,13 +212,13 @@ Some features from `vroom` have been ported to `minty`, but not `readr`. ``` r ## tidyverse/readr#1526 -minty::type_convert(data.frame(a=c("NaN", "Inf", "-INF"))) |> str() +minty::type_convert(data.frame(a = c("NaN", "Inf", "-INF"))) |> str() #> 'data.frame': 3 obs. of 1 variable: #> $ a: num NaN Inf -Inf ``` ``` r -readr::type_convert(data.frame(a=c("NaN", "Inf", "-INF"))) |> str() +readr::type_convert(data.frame(a = c("NaN", "Inf", "-INF"))) |> str() #> #> ── Column specification ──────────────────────────────────────────────────────── #> cols( @@ -228,6 +228,24 @@ readr::type_convert(data.frame(a=c("NaN", "Inf", "-INF"))) |> str() #> $ a: chr "NaN" "Inf" "-INF" ``` +`guess_max` is available for `parse_guess()` and `type_convert()`, +default to `NA` (same as `readr`). + +``` r +minty::parse_guess(c("1", "2", "drei")) +#> [1] "1" "2" "drei" +``` + +``` r +minty::parse_guess(c("1", "2", "drei"), guess_max = 2) +#> [1] 1 2 NA +``` + +``` r +readr::parse_guess(c("1", "2", "drei")) +#> [1] "1" "2" "drei" +``` + ## Similar packages For parsing ambiguous date(time) diff --git a/man/parse_guess.Rd b/man/parse_guess.Rd index 0d5e225..a8d761a 100644 --- a/man/parse_guess.Rd +++ b/man/parse_guess.Rd @@ -11,6 +11,7 @@ parse_guess( locale = default_locale(), trim_ws = TRUE, guess_integer = FALSE, + guess_max = NA, .return_problems = FALSE ) @@ -34,6 +35,8 @@ each field before parsing it?} \item{guess_integer}{If \code{TRUE}, guess integer types for whole numbers, if \code{FALSE} guess numeric type for all numbers.} +\item{guess_max}{Maximum number of data rows to use for guessing column types. \code{NA}: uses all data.} + \item{.return_problems}{Whether to hide the \code{problems} tibble from the output} } \value{ diff --git a/man/type_convert.Rd b/man/type_convert.Rd index eeac72b..881691b 100644 --- a/man/type_convert.Rd +++ b/man/type_convert.Rd @@ -11,6 +11,7 @@ type_convert( trim_ws = TRUE, locale = default_locale(), guess_integer = FALSE, + guess_max = NA, verbose = FALSE ) } @@ -37,6 +38,8 @@ names.} \item{guess_integer}{If \code{TRUE}, guess integer types for whole numbers, if \code{FALSE} guess numeric type for all numbers.} +\item{guess_max}{Maximum number of data rows to use for guessing column types. \code{NA}: uses all data.} + \item{verbose}{whether to print messages} } \value{ diff --git a/misc/benchmark.md b/misc/benchmark.md index 8b0c757..204fcdb 100644 --- a/misc/benchmark.md +++ b/misc/benchmark.md @@ -7,7 +7,7 @@ suppressPackageStartupMessages(library(readr)) Sys.time() ``` - [1] "2024-06-10 09:57:34 CEST" + [1] "2024-06-10 13:52:45 CEST" Under 200 rows, simple @@ -19,7 +19,7 @@ bench::mark(minty::type_convert(iris_chr), iterations = 10) # A tibble: 1 × 6 expression min median `itr/sec` mem_alloc `gc/sec` - 1 minty::type_convert(iris_chr) 368µs 399µs 2383. 695KB 0 + 1 minty::type_convert(iris_chr) 387µs 410µs 2377. 702KB 0 ``` r bench::mark(suppressMessages(readr::type_convert(iris_chr)), iterations = 10) @@ -28,7 +28,7 @@ bench::mark(suppressMessages(readr::type_convert(iris_chr)), iterations = 10) # A tibble: 1 × 6 expression min median `itr/sec` mem_alloc `gc/sec` - 1 suppressMessages(readr::type_conve… 2.11ms 2.15ms 376. 1.81MB 0 + 1 suppressMessages(readr::type_conve… 2.15ms 2.21ms 365. 1.81MB 0 Many rows @@ -43,7 +43,7 @@ bench::mark(x <- minty::type_convert(flights_chr, guess_integer = TRUE), iterati # A tibble: 1 × 6 expression min median `itr/sec` mem_alloc `gc/sec` - 1 x <- minty::type_convert(flights_ch… 991ms 1.05s 0.961 189MB 17.5 + 1 x <- minty::type_convert(flights_ch… 1.02s 1.06s 0.940 189MB 17.1 ``` r bench::mark(y <- suppressMessages(readr::type_convert(flights_chr, guess_integer = TRUE)), iterations = 5) @@ -55,7 +55,39 @@ bench::mark(y <- suppressMessages(readr::type_convert(flights_chr, guess_integer # A tibble: 1 × 6 expression min median `itr/sec` mem_alloc `gc/sec` - 1 y <- suppressMessages(readr::type_c… 991ms 1.05s 0.970 153MB 17.5 + 1 y <- suppressMessages(readr::type_c… 1.01s 1.04s 0.954 153MB 17.2 + +``` r +all.equal(x, y) +``` + + [1] TRUE + +Many row, guess_max + +``` r +bench::mark(x <- minty::type_convert(flights_chr, guess_integer = TRUE, guess_max = 500), iterations = 5) +``` + + Warning: Some expressions had a GC in every iteration; so filtering is + disabled. + + # A tibble: 1 × 6 + expression min median `itr/sec` mem_alloc `gc/sec` + + 1 x <- minty::type_convert(flights_ch… 529ms 535ms 1.84 153MB 19.9 + +``` r +bench::mark(y <- suppressMessages(readr::type_convert(flights_chr, guess_integer = TRUE)), iterations = 5) +``` + + Warning: Some expressions had a GC in every iteration; so filtering is + disabled. + + # A tibble: 1 × 6 + expression min median `itr/sec` mem_alloc `gc/sec` + + 1 y <- suppressMessages(readr::type_c… 1.02s 1.02s 0.959 153MB 17.5 ``` r all.equal(x, y) @@ -90,7 +122,7 @@ sessionInfo() [1] stats graphics grDevices utils datasets methods base other attached packages: - [1] readr_2.1.5 minty_0.0.2 + [1] readr_2.1.5 minty_0.0.3 loaded via a namespace (and not attached): [1] crayon_1.5.2 vctrs_0.6.5 cli_3.6.2 knitr_1.46 diff --git a/misc/benchmark.qmd b/misc/benchmark.qmd index 9e6e309..54b7be8 100644 --- a/misc/benchmark.qmd +++ b/misc/benchmark.qmd @@ -26,6 +26,14 @@ bench::mark(y <- suppressMessages(readr::type_convert(flights_chr, guess_integer all.equal(x, y) ``` +Many row, guess_max + +```{r} +bench::mark(x <- minty::type_convert(flights_chr, guess_integer = TRUE, guess_max = 500), iterations = 5) +bench::mark(y <- suppressMessages(readr::type_convert(flights_chr, guess_integer = TRUE)), iterations = 5) +all.equal(x, y) +``` + ```{r} sessionInfo() ``` diff --git a/src/CollectorGuess.cpp b/src/CollectorGuess.cpp index 0051caa..3fe4109 100644 --- a/src/CollectorGuess.cpp +++ b/src/CollectorGuess.cpp @@ -11,8 +11,15 @@ typedef bool (*canParseFun)(const std::string&, LocaleInfo* pLocale); bool canParse( - const cpp11::strings& x, const canParseFun& canParse, LocaleInfo* pLocale) { + const cpp11::strings& x, const canParseFun& canParseF, LocaleInfo* pLocale, unsigned int guess_max) { + unsigned int n = 0; for (const auto & i : x) { + n++; + //Rprintf("%u\n", n); + //Rprintf(i); + if (n > guess_max) { + break; + } if (i == NA_STRING) { continue; } @@ -21,7 +28,7 @@ bool canParse( continue; } - if (!canParse(std::string(i), pLocale)) { + if (!canParseF(std::string(i), pLocale)) { return false; } } @@ -122,7 +129,8 @@ static bool isDateTime(const std::string& x, LocaleInfo* pLocale) { [[cpp11::register]] std::string collectorGuess( const cpp11::strings& input, const cpp11::list& locale_, - bool guessInteger) { + bool guessInteger, + unsigned int guess_max) { LocaleInfo locale(static_cast(locale_)); if (input.size() == 0) { @@ -134,25 +142,25 @@ static bool isDateTime(const std::string& x, LocaleInfo* pLocale) { } // Work from strictest to most flexible - if (canParse(input, isLogical, &locale)) { + if (canParse(input, isLogical, &locale, guess_max)) { return "logical"; } - if (guessInteger && canParse(input, isInteger, &locale)) { + if (guessInteger && canParse(input, isInteger, &locale, guess_max)) { return "integer"; } - if (canParse(input, isDouble, &locale)) { + if (canParse(input, isDouble, &locale, guess_max)) { return "double"; } - if (canParse(input, isNumber, &locale)) { + if (canParse(input, isNumber, &locale, guess_max)) { return "number"; } - if (canParse(input, isTime, &locale)) { + if (canParse(input, isTime, &locale, guess_max)) { return "time"; } - if (canParse(input, isDate, &locale)) { + if (canParse(input, isDate, &locale, guess_max)) { return "date"; } - if (canParse(input, isDateTime, &locale)) { + if (canParse(input, isDateTime, &locale, guess_max)) { return "datetime"; } diff --git a/src/cpp11.cpp b/src/cpp11.cpp index 714bae2..dfd113a 100644 --- a/src/cpp11.cpp +++ b/src/cpp11.cpp @@ -6,10 +6,10 @@ #include // CollectorGuess.cpp -std::string collectorGuess(const cpp11::strings& input, const cpp11::list& locale_, bool guessInteger); -extern "C" SEXP _minty_collectorGuess(SEXP input, SEXP locale_, SEXP guessInteger) { +std::string collectorGuess(const cpp11::strings& input, const cpp11::list& locale_, bool guessInteger, unsigned int guess_max); +extern "C" SEXP _minty_collectorGuess(SEXP input, SEXP locale_, SEXP guessInteger, SEXP guess_max) { BEGIN_CPP11 - return cpp11::as_sexp(collectorGuess(cpp11::as_cpp>(input), cpp11::as_cpp>(locale_), cpp11::as_cpp>(guessInteger))); + return cpp11::as_sexp(collectorGuess(cpp11::as_cpp>(input), cpp11::as_cpp>(locale_), cpp11::as_cpp>(guessInteger), cpp11::as_cpp>(guess_max))); END_CPP11 } // parse.cpp @@ -36,7 +36,7 @@ extern "C" SEXP _minty_r_is_string_cpp11(SEXP x) { extern "C" { static const R_CallMethodDef CallEntries[] = { - {"_minty_collectorGuess", (DL_FUNC) &_minty_collectorGuess, 3}, + {"_minty_collectorGuess", (DL_FUNC) &_minty_collectorGuess, 4}, {"_minty_parse_vector_", (DL_FUNC) &_minty_parse_vector_, 5}, {"_minty_r_is_string_cpp11", (DL_FUNC) &_minty_r_is_string_cpp11, 1}, {"_minty_type_convert_col", (DL_FUNC) &_minty_type_convert_col, 6}, diff --git a/tests/testthat/test-parsing.R b/tests/testthat/test-parsing.R index e81d258..086f402 100644 --- a/tests/testthat/test-parsing.R +++ b/tests/testthat/test-parsing.R @@ -1,3 +1,18 @@ test_that("trimmed before NA detection", { expect_equal(parse_logical(c(" TRUE ", "FALSE", " NA ")), c(TRUE, FALSE, NA)) }) + +test_that("parse_guess() guess_max", { + ## weird input + expected_output <- c(1, 2, 3) + expect_error(parse_guess(c("1", "2", "3"), guess_max = "123")) + expect_equal(parse_guess(c("1", "2", "3"), guess_max = NA), expected_output) + expect_equal(parse_guess(c("1", "2", "3"), guess_max = Inf), expected_output) + expect_equal(parse_guess(c("1", "2", "3"), guess_max = -Inf), expected_output) + expect_equal(parse_guess(c("1", "2", "3"), guess_max = 3.14), expected_output) + expect_equal(parse_guess(c("1", "2", "3"), guess_max = -3.14), expected_output) + expect_equal(parse_guess(c("1", "2", "3"), guess_max = 0), expected_output) + ## Off by one? + expect_equal(class(parse_guess(c("1", "2", "abc"), guess_max = 2)), "numeric") + expect_equal(class(parse_guess(c("1", "2", "abc"), guess_max = 3)), "character") +})