Skip to content

Commit

Permalink
Merge branch 'master' into 4889
Browse files Browse the repository at this point in the history
  • Loading branch information
OfekShilon authored May 29, 2021
2 parents d935887 + 8fc2066 commit 6d71dba
Show file tree
Hide file tree
Showing 58 changed files with 3,202 additions and 1,003 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
^.*\.Rproj$
^\.Rproj\.user$
^\.idea$
^\.libs$

^.*\.dll$

Expand Down
13 changes: 12 additions & 1 deletion .dev/CRAN_Release.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -157,11 +157,18 @@ grep -Enr "^[^#]*(?:\[|==|>|<|>=|<=|,|\(|\+)\s*[-]?[0-9]+[^0-9L:.e]" R | grep -E
grep -Enr "\bifelse" R

# No system.time in main tests.Rraw. Timings should be in benchmark.Rraw
grep -n "system[.]time" ./inst/tests/tests.Rraw
grep -Fn "system.time" ./inst/tests/*.Rraw | grep -Fv "benchmark.Rraw" | grep -Fv "this system.time usage ok"

# No tryCatch in *.Rraw -- tryCatch should be handled only in test() itself to avoid silently missed warnings/errors/output
grep -Fn "tryCatch" ./inst/tests/*.Rraw

# All % in *.Rd should be escaped otherwise text gets silently chopped
grep -n "[^\]%" ./man/*.Rd

# if (a & b) is either invalid or inefficient (ditto for replace & with |);
# if(any(a [&|] b)) is appropriate b/c of collapsing the logical vector to scalar
grep -nr "^[^#]*if[^&#]*[^&#\"][&][^&]" R | grep -Ev "if\s*[(](?:any|all)"

# seal leak potential where two unprotected API calls are passed to the same
# function call, usually involving install() or mkChar()
# Greppable thanks to single lines and wide screens
Expand Down Expand Up @@ -196,6 +203,10 @@ grep allocVector *.c | grep -v PROTECT | grep -v SET_VECTOR_ELT | grep -v setAtt
grep coerceVector *.c | grep -v PROTECT | grep -v SET_VECTOR_ELT | grep -v setAttrib | grep -v return
grep asCharacter *.c | grep -v PROTECT | grep -v SET_VECTOR_ELT | grep -v setAttrib | grep -v return

# Enforce local scope for loop index (`for (int i=0; ...)` instead of `int i; for (i=0; ...)`)
# exceptions are tagged with #loop_counter_not_local_scope_ok
grep -En "for\s*[(]\s*[a-zA-Z0-9_]+\s*=" src/*.c | grep -Fv "#loop_counter_not_local_scope_ok"

cd ..
R
cc(test=TRUE, clean=TRUE, CC="gcc-10") # to compile with -pedandic -Wall, latest gcc as CRAN: https://cran.r-project.org/web/checks/check_flavors.html
Expand Down
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
* -text
*.Rraw linguist-language=R

18 changes: 8 additions & 10 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ variables:
TZ: "UTC" ## to avoid 'Failed to create bus connection' from timedatectl via Sys.timezone() on Docker with R 3.4.
## Setting TZ for all GLCI jobs to isolate them from timezone. We could have a new GLCI job to test under
## a non-UTC timezone, although, that's what we do routinely in dev.
R_REL_VERSION: "4.0"
R_REL_VERSION: "4.1"
R_DEVEL_VERSION: "4.2"
R_OLDREL_VERSION: "3.6"
R_OLDREL_VERSION: "4.0"

stages:
- dependencies
Expand Down Expand Up @@ -96,16 +96,14 @@ build: ## build data.table sources as tar.gz archive
- mkdir.exe -p cran/bin/windows/contrib/$R_VERSION; mv.exe $(ls.exe -1t data.table_*.zip | head.exe -n 1) cran/bin/windows/contrib/$R_VERSION

.test-install-r-rel-win: &install-r-rel-win
- curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/old/4.0.3/R-4.0.3-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait
- curl.exe -s -o ../R-rel.exe https://cloud.r-project.org/bin/windows/base/old/4.1.0/R-4.1.0-win.exe; Start-Process -FilePath ..\R-rel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait
.test-install-r-devel-win: &install-r-devel-win
- curl.exe -s -o ../R-devel.exe https://cloud.r-project.org/bin/windows/base/R-devel-win.exe; Start-Process -FilePath ..\R-devel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait
.test-install-r-oldrel-win: &install-r-oldrel-win
- curl.exe -s -o ../R-oldrel.exe https://cloud.r-project.org/bin/windows/base/old/3.6.3/R-3.6.3-win.exe; Start-Process -FilePath ..\R-oldrel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait
- curl.exe -s -o ../R-oldrel.exe https://cloud.r-project.org/bin/windows/base/old/4.0.5/R-4.0.5-win.exe; Start-Process -FilePath ..\R-oldrel.exe -ArgumentList "/VERYSILENT /DIR=C:\R" -NoNewWindow -Wait

.test-install-rtools-win: &install-rtools-win
- curl.exe -s -o ../rtools.exe https://cloud.r-project.org/bin/windows/Rtools/rtools40-x86_64.exe; Start-Process -FilePath ..\rtools.exe -ArgumentList "/VERYSILENT /DIR=C:\rtools40" -NoNewWindow -Wait
.test-install-rtools35-win: &install-rtools35-win
- curl.exe -s -o ../Rtools35.exe https://cloud.r-project.org/bin/windows/Rtools/Rtools35.exe; Start-Process -FilePath ..\Rtools35.exe -ArgumentList "/VERYSILENT /DIR=C:\Rtools" -NoNewWindow -Wait

.test-template: &test
stage: test
Expand Down Expand Up @@ -191,7 +189,7 @@ test-rel-cran-lin: ## R-release on Linux, extra NOTEs check and build pdf manual
variables:
_R_CHECK_CRAN_INCOMING_: "TRUE" ## stricter --as-cran checks should run in dev pipelines continuously (not sure what they are though)
_R_CHECK_CRAN_INCOMING_REMOTE_: "FALSE" ## Other than no URL checking (takes many minutes) or 'Days since last update 0' NOTEs needed, #3284
_R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## effective from R 4.1.0, then 00check.log can be checked for "OK" rather than "2 NOTEs"
_R_CHECK_CRAN_INCOMING_TARBALL_THRESHOLD_: "7500000" ## effective from R 4.1.0
before_script:
- *install-deps
- *cp-src
Expand All @@ -205,7 +203,7 @@ test-rel-cran-lin: ## R-release on Linux, extra NOTEs check and build pdf manual
- R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1)
- *rm-src
- >-
Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 2 NOTEs")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 2 NOTEs"), " (size of tarball) but ", shQuote(l)) else q("no")'
Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); if (!identical(l, "Status: 1 NOTE")) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote("Status: 1 NOTE"), " (installed package size) but ", shQuote(l)) else q("no")'
test-dev-cran-lin: ## R-devel on Linux, --enable-strict-barrier --disable-long-double, check for new notes and compilation warnings, thus allow_failure
<<: *test-lin
Expand Down Expand Up @@ -285,8 +283,8 @@ test-old-win: ## R-oldrel on Windows
R_VERSION: "$R_OLDREL_VERSION"
before_script:
- *install-r-oldrel-win
- *install-rtools35-win
- $ENV:PATH = "C:\R\bin;C:\Rtools\bin;$ENV:PATH"
- *install-rtools-win
- $ENV:PATH = "C:\R\bin;C:\rtools40\usr\bin;$ENV:PATH"
- *install-deps-win
- *cp-src-win
- rm.exe -r bus
Expand Down
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ Authors@R: c(
person("Dirk","Eddelbuettel", role="ctb"),
person("Ben","Schwen", role="ctb"),
person("Tony","Fischetti", role="ctb"),
person("Ofek","Shilon", role="ctb"))
person("Ofek","Shilon", role="ctb"),
person("Vadim","Khotilovich", role="ctb"))
Depends: R (>= 3.1.0)
Imports: methods
Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ export(nafill)
export(setnafill)
export(.Last.updated)
export(fcoalesce)
export(substitute2)

S3method("[", data.table)
S3method("[<-", data.table)
Expand Down
56 changes: 52 additions & 4 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
Item 2 has 0 rows but longest item has 3; filled with NA
```

5. `%like%` on factors with a large number of levels is now faster, [#4748](https://github.com/Rdatatable/data.table/issues/4748). The example in the PR shows 2.37s reduced to 0.86s on a factor lengh 100 million containing 1 million unique 10-character strings. Thanks to @statquant for reporting, and @shrektan for implementing.
5. `%like%` on factors with a large number of levels is now faster, [#4748](https://github.com/Rdatatable/data.table/issues/4748). The example in the PR shows 2.37s reduced to 0.86s on a factor length 100 million containing 1 million unique 10-character strings. Thanks to @statquant for reporting, and @shrektan for implementing.

6. `keyby=` now accepts `TRUE`/`FALSE` together with `by=`, [#4307](https://github.com/Rdatatable/data.table/issues/4307). The primary motivation is benchmarking where `by=` vs `keyby=` is varied across a set of queries. Thanks to Jan Gorecki for the request and the PR.

Expand All @@ -56,7 +56,42 @@

8. `melt()` now supports `NA` entries when specifying a list of `measure.vars`, which translate into runs of missing values in the output. Useful for melting wide data with some missing columns, [#4027](https://github.com/Rdatatable/data.table/issues/4027). Thanks to @vspinu for reporting, and @tdhock for implementing.

9. `melt()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`, [#3396](https://github.com/Rdatatable/data.table/issues/3396) [#2575](https://github.com/Rdatatable/data.table/issues/2575) [#2551](https://github.com/Rdatatable/data.table/issues/2551). It should be a `data.table` with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New function `measure()` which uses either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that has several distinct pieces of information encoded in each column name. See new `?measure` and new section in reshape vignette. Thanks to Matthias Gomolka, Ananda Mahto, Hugh Parsonage for reporting, and to @tdhock for implementing.
9. `melt()` now supports multiple output variable columns via the `variable_table` attribute of `measure.vars`, [#3396](https://github.com/Rdatatable/data.table/issues/3396) [#2575](https://github.com/Rdatatable/data.table/issues/2575) [#2551](https://github.com/Rdatatable/data.table/issues/2551), [#4998](https://github.com/Rdatatable/data.table/issues/4998). It should be a `data.table` with one row that describes each element of the `measure.vars` vector(s). These data/columns are copied to the output instead of the usual variable column. This is backwards compatible since the previous behavior (one output variable column) is used when there is no `variable_table`. New functions `measure()` and `measurev()` which use either a separator or a regex to create a `measure.vars` list/vector with `variable_table` attribute; useful for melting data that has several distinct pieces of information encoded in each column name. See new `?measure` and new section in reshape vignette. Thanks to Matthias Gomolka, Ananda Mahto, Hugh Parsonage, Mark Fairbanks for reporting, and to @tdhock for implementing.

10. A new interface for _programming on data.table_ has been added, closing [#2655](https://github.com/Rdatatable/data.table/issues/2655) and many other linked issues. It is built using base R's `substitute`-like interface via a new `env` argument to `[.data.table`. For details see the new vignette *programming on data.table*, and the new `?substitute2` manual page. Thanks to numerous users for filing requests, and Jan Gorecki for implementing.

```R
DT = data.table(x = 1:5, y = 5:1)

# parameters
in_col_name = "x"
fun = "sum"
fun_arg1 = "na.rm"
fun_arg1val = TRUE
out_col_name = "sum_x"

# parameterized query
#DT[, .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val))]

# desired query
DT[, .(sum_x = sum(x, na.rm=TRUE))]

# new interface
DT[, .(out_col_name = fun(in_col_name, fun_arg1=fun_arg1val)),
env = list(
in_col_name = "x",
fun = "sum",
fun_arg1 = "na.rm",
fun_arg1val = TRUE,
out_col_name = "sum_x"
)]
```

11. `DT[, if (...) .(a=1L) else .(a=1L, b=2L), by=group]` now returns a 1-column result with warning `j may not evaluate to the same number of columns for each group`, rather than error `'names' attribute [2] must be the same length as the vector`, [#4274](https://github.com/Rdatatable/data.table/issues/4274). Thanks to @robitalec for reporting, and Michael Chirico for the PR.

12. Typo checking in `i` available since 1.11.4 is extended to work in non-English sessions, [#4989](https://github.com/Rdatatable/data.table/issues/4989). Thanks to Michael Chirico for the PR.

13. `fifelse()` now coerces logical `NA` to other types and the `na` argument supports vectorized input, [#4277](https://github.com/Rdatatable/data.table/issues/4277) [#4286](https://github.com/Rdatatable/data.table/issues/4286) [#4287](https://github.com/Rdatatable/data.table/issues/4287). Thanks to @michaelchirico and @shrektan for reporting, and @shrektan for implementing.

## BUG FIXES

Expand Down Expand Up @@ -88,6 +123,16 @@
14. `by=...get()...` could fail with `object not found`, [#4873](https://github.com/Rdatatable/data.table/issues/4873) [#4981](https://github.com/Rdatatable/data.table/issues/4981). Thanks to @sindribaldur for reporting, and @OfekShilon for fixing.
15. `print(x, col.names='none')` now removes the column names as intended for wide `data.table`s whose column names don't fit on a single line, [#4270](https://github.com/Rdatatable/data.table/issues/4270). Thanks to @tdhock for the report, and Michael Chirico for fixing.

16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) at the beginning of a group and return the smallest non-blank instead of blank, [#4848](https://github.com/Rdatatable/data.table/issues/4848). Thanks to Vadim Khotilovich for reporting and for the PR fixing it.

17. Assigning a wrong-length or non-list vector to a list column could segfault, [#4166](https://github.com/Rdatatable/data.table/issues/4166) [#4667](https://github.com/Rdatatable/data.table/issues/4667) [#4678](https://github.com/Rdatatable/data.table/issues/4678) [#4729](https://github.com/Rdatatable/data.table/issues/4729). Thanks to @fklirono, Kun Ren, @kevinvzandvoort and @peterlittlejohn for reporting, and to Václav Tlapák for the PR.

18. `as.data.table()` on `xts` objects containing a column named `x` would return an `index` of type plain `integer` rather than `POSIXct`, [#4897](https://github.com/Rdatatable/data.table/issues/4897). Thanks to Emil Sjørup for reporting, and Jan Gorecki for the PR.

19. A fix to `as.Date(c("", ...))` in R 4.0.3, [17909](https://bugs.r-project.org/bugzilla3/show_bug.cgi?id=17909), has been backported to `data.table::as.IDate()` so that it too now returns `NA` for the first item when it is blank, even in older versions of R back to 3.1.0, rather than the incorrect error `character string is not in a standard unambiguous format`, [#4676](https://github.com/Rdatatable/data.table/issues/4676). Thanks to Arun Srinivasan for reporting, and Michael Chirico both for the `data.table` PR and for submitting the patch to R that was accepted and included in R 4.0.3.

## NOTES

1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.<type>()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example :
Expand All @@ -106,6 +151,10 @@

4. `cube(DT, by="a")` now gives a more helpful error that `j` is missing, [#4282](https://github.com/Rdatatable/data.table/pull/4282).

5. v1.13.0 (July 2020) fixed a segfault/corruption/error (depending on version of R and circumstances) in `dcast()` when `fun.aggregate` returned `NA` (type `logical`) in an otherwise `character` result, [#2394](https://github.com/Rdatatable/data.table/issues/2394). This fix was the result of other internal rework and there was no news item at the time. A new test to cover this case has now been added. Thanks Vadim Khotilovich for reporting, and Michael Chirico for investigating, pinpointing when the fix occurred and adding the test.

6. `DT[subset]` where `DT[(subset)]` or `DT[subset==TRUE]` was intended; i.e., subsetting by a logical column whose name conflicts with an existing function, now gives a friendlier error message, [#5014](https://github.com/Rdatatable/data.table/issues/5014). Thanks @michaelchirico for the suggestion and PR, and @ColeMiller1 for helping with the fix.


# data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021)

Expand Down Expand Up @@ -313,7 +362,6 @@ has a better chance of working on Mac.
11. `copy()` now overallocates deeply nested lists of `data.table`s, [#4205](https://github.com/Rdatatable/data.table/issues/4205). Thanks to @d-sci for reporting and the PR.
12. `rbindlist` no longer errors when coercing complex vectors to character vectors, [#4202](https://github.com/Rdatatable/data.table/issues/4202). Thanks to @sritchie73 for reporting and the PR.
13. A relatively rare case of segfault when combining non-equi joins with `by=.EACHI` is now fixed, closes [#4388](https://github.com/Rdatatable/data.table/issues/4388).
14. Selecting key columns could incur a large speed penalty, [#4498](https://github.com/Rdatatable/data.table/issues/4498). Thanks to @Jesper on Stack Overflow for the report.
Expand All @@ -328,7 +376,7 @@ has a better chance of working on Mac.
19. Matrices resulting from logical operators or comparisons on `data.table`s, e.g. in `dta == dtb`, can no longer have their colnames changed by reference later, [#4323](https://github.com/Rdatatable/data.table/issues/4323). Thanks to @eyherabh for reporting and @tlapak for the PR.
20. The environment variable `R_DATATABLE_NUM_THREADS` was being limited by `R_DATATABLE_NUM_PROCS_PERCENT` (by default 50%), [#4514](https://github.com/Rdatatable/data.table/issues/4514). It is now consistent with `setDTthreads()` and only limited by the full number of logical CPUs. For example, on a machine with 8 logical CPUs, `R_DATATABLE_NUM_THREADS=6` now results in 6 threads rather than 4 (50% of 8).
20. The environment variable `R_DATATABLE_NUM_THREADS` was being limited by `R_DATATABLE_NUM_PROCS_PERCENT` (by default 50%), [#4514](https://github.com/Rdatatable/data.table/issues/4514). It is now consistent with `setDTthreads()` and only limited by the full number of logical CPUs. For example, on a machine with 8 logical CPUs, `R_DATATABLE_NUM_THREADS=6` now results in 6 threads rather than 4 (50% of 8).r
## NOTES
Expand Down
4 changes: 4 additions & 0 deletions R/IDateTime.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ as.IDate = function(x, ...) UseMethod("as.IDate")

as.IDate.default = function(x, ..., tz = attr(x, "tzone", exact=TRUE)) {
if (is.null(tz)) tz = "UTC"
if (is.character(x)) {
# backport of similar patch to base::as.Date.character in R 4.0.3, #4676
is.na(x) = !nzchar(x)
}
as.IDate(as.Date(x, tz = tz, ...))
}

Expand Down
2 changes: 1 addition & 1 deletion R/as.data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va
dnx
} else dnx
val = rev(val)
if (is.null(names(val)) || all(!nzchar(names(val))))
if (is.null(names(val)) || !any(nzchar(names(val))))
setattr(val, 'names', paste0("V", rev(seq_along(val))))
if (value.name %chin% names(val))
stop("Argument 'value.name' should not overlap with column names in result: ", brackify(rev(names(val))))
Expand Down
Loading

0 comments on commit 6d71dba

Please sign in to comment.