-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsim.R
62 lines (61 loc) · 2.34 KB
/
sim.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#' Simulate the levels and their sizes in a high-cardinality feature
#'
#' @param nlevels Number of levels to generate.
#' @param seed Random seed.
#'
#' @return A data frame of postal codes and sizes.
#'
#' @export sim_postcode_levels
#'
#' @importFrom stats rlnorm
#'
#' @note The code is derived from the example described in the "rare levels"
#' vignette in the \code{vtreat} package.
#'
#' @examples
#' df_levels <- sim_postcode_levels(nlevels = 500, seed = 42)
#' head(df_levels)
sim_postcode_levels <- function(nlevels = 100L, seed = 1001) {
set.seed(seed)
# Generate level names and sizes
size <- round(rlnorm(nlevels, meanlog = log(4000), sdlog = 1))
postcode <- paste0("z", formatC(sample.int(nlevels * 10L, size = nlevels, replace = FALSE), width = 5, flag = "0"))
data.frame("size" = size, "postcode" = postcode, stringsAsFactors = FALSE)
}
#' Simulate a high-cardinality feature and a binary response
#'
#' @param df_levels Number of levels.
#' @param n Number of samples.
#' @param threshold The threshold for determining if a postal code is rare.
#' @param prob Occurrence probability vector of the class 1 event in rare and non-rare postal codes.
#' @param seed Random seed.
#'
#' @return A data frame of samples with postal codes, response labels, and level rarity status.
#'
#' @export sim_postcode_samples
#'
#' @importFrom stats runif
#'
#' @note The code is derived from the example described in the "rare levels"
#' vignette in the \code{vtreat} package.
#'
#' @examples
#' df_levels <- sim_postcode_levels(nlevels = 500, seed = 42)
#' df_postcode <- sim_postcode_samples(
#' df_levels,
#' n = 10000, threshold = 3000, prob = c(0.2, 0.1), seed = 43
#' )
#' head(df_postcode)
sim_postcode_samples <- function(df_levels, n = 2000L, threshold = 1000, prob = c(0.3, 0.1), seed = 1001) {
set.seed(seed)
# Draw samples based on the levels
ords <- sort(sample.int(sum(df_levels$size), size = n, replace = TRUE))
cs <- cumsum(df_levels$size)
indexes <- findInterval(ords, cs) + 1
indexes <- indexes[sample.int(n, size = n, replace = FALSE)]
postcode <- df_levels$postcode[indexes]
is_rare <- df_levels$postcode[df_levels$size < threshold]
is_rare <- postcode %in% is_rare
label <- as.factor(as.integer(runif(n) < ifelse(is_rare, prob[1], prob[2])))
data.frame("postcode" = postcode, "label" = label, "is_rare" = is_rare, stringsAsFactors = FALSE)
}