Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sum visits function & tests #17

Merged
merged 2 commits into from
Jun 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion R/globals.R
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
# Register names that the package code uses as bare (unquoted) column names,
# e.g. inside data.table expressions, so that they are declared as known
# global variables for the package's code checks.
utils::globalVariables(
  c(
    "duration", "timestamp", "panelist_id", "domain", "visit", "day",
    "type", "prev_type", "tmp", "session", "host", "suffix", "domain_name",
    "url_next", "host_next", "domain_next",
    "url_previous", "host_previous", "domain_previous",
    # time-frame columns
    "date", "week", "month", "year", "wave"
  )
)
53 changes: 53 additions & 0 deletions R/summarize.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#' Summarize number of visits by person
#' @description Summarize the number of visits by person within a time frame,
#' and optionally by class of visit.
#' @details `sum_visits()` counts the visits per `panelist_id` for different
#' time periods (for example, by day). The counts can further be broken down by
#' any set of "classes" of visits, e.g. the type of the visit's domain.
#' The input object is left untouched: when a time-frame column has to be
#' derived from `timestamp`, it is added to an internal copy of `wt`.
#' @param wt webtrack data object.
#' @param timeframe character of length one. Indicates for what time frame to
#' aggregate visits. Possible values are "date", "week", "month", "year",
#' "wave" or "all". If set to "wave", the webtrack data object must contain a
#' column called "wave". Defaults to "all".
#' @param visit_class character vector. Column(s) that contain a
#' classification of visits. Visits will be grouped by the values in these
#' columns before being summarized. Defaults to NULL.
#' @importFrom data.table is.data.table shift .N
#' @return a data.frame with the column "panelist_id", a column named after the
#' time frame unless "all" was specified, one column per entry of
#' `visit_class` if specified, and "n_visits" indicating the number of visits.
#' @examples
#' data("test_data")
#' wt <- as.wt_dt(test_data)
#' # example of visit classification
#' wt <- extract_domain(wt)
#' wt[, google := ifelse(domain == "google.com", 1, 0)]
#' wt[, search := ifelse(grepl("search", url), 1, 0)]
#' wt_summary <- sum_visits(wt, timeframe = "month", visit_class = c("google", "search"))
#' @export
sum_visits <- function(wt, timeframe = "all", visit_class = NULL) {
  stopifnot("input is not a wt_dt object" = is.wt_dt(wt))
  vars_exist(wt, vars = c("url", "panelist_id", "timestamp"))

  # `timeframe` must be a single, non-missing string; anything else cannot
  # name a valid option.
  if (!is.character(timeframe) || length(timeframe) != 1L || is.na(timeframe)) {
    stop("unknown timeframe option specified", call. = FALSE)
  }

  # Fail early, with a readable message, on classification columns that are
  # not present (setdiff() on NULL yields an empty result).
  missing_class <- setdiff(visit_class, names(wt))
  if (length(missing_class) > 0L) {
    stop(
      paste0(
        "couldn't find the column(s) ",
        paste0("'", missing_class, "'", collapse = ", "),
        " in the webtrack data"
      ),
      call. = FALSE
    )
  }

  # strftime patterns used to derive the grouping column from `timestamp`;
  # the derived column is named after the time frame.
  time_formats <- c(
    date = "%F",
    week = "%Y week %W",
    month = "%Y month %m",
    year = "%Y"
  )

  if (timeframe == "all") {
    timeframe_var <- NULL
  } else if (timeframe %in% names(time_formats)) {
    # Work on a copy: adding the column by reference would alter the caller's
    # object and overwrite an existing column of the same name.
    wt <- data.table::copy(wt)
    data.table::set(
      wt,
      j = timeframe,
      value = format(wt[["timestamp"]], format = time_formats[[timeframe]])
    )
    timeframe_var <- timeframe
  } else if (timeframe == "wave") {
    # Exact match only: a partial match (e.g. a column "wave_id") must not
    # pass, because the grouping below needs a column named exactly "wave".
    if (!"wave" %in% names(wt)) {
      stop("couldn't find the column 'wave' in the webtrack data", call. = FALSE)
    }
    timeframe_var <- "wave"
  } else {
    stop("unknown timeframe option specified", call. = FALSE)
  }

  grp_vars <- c("panelist_id", timeframe_var, visit_class)
  visit_counts <- wt[, list("n_visits" = .N), by = grp_vars]
  as.data.frame(visit_counts)
}
23 changes: 23 additions & 0 deletions tests/testthat/test-summarize.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
test_that("sum_visits", {
  data("test_data")
  wt <- as.wt_dt(test_data)
  wt <- extract_domain(wt)
  wt[, google := ifelse(domain == "google.com", 1, 0)]
  wt[, search := ifelse(grepl("search", url), 1, 0)]

  # Default ("all"): grouped by panelist only, so no time-frame column.
  wt_sum <- sum_visits(wt)
  expect_true("n_visits" %in% names(wt_sum))
  expect_true("panelist_id" %in% names(wt_sum))
  expect_named(wt_sum, c("panelist_id", "n_visits"))
  expect_equal(nrow(wt_sum), length(unique(wt$panelist_id)))
  # Every visit is counted exactly once.
  expect_equal(sum(wt_sum$n_visits), nrow(wt))

  # Each derived time frame adds a grouping column named after the time frame.
  for (tf in c("date", "week", "month", "year")) {
    wt_tf <- sum_visits(wt, timeframe = tf)
    expect_named(wt_tf, c("panelist_id", tf, "n_visits"))
    expect_equal(sum(wt_tf$n_visits), nrow(wt))
  }

  # Classification columns show up as grouping columns after the time frame.
  wt_cls <- sum_visits(wt, timeframe = "month", visit_class = c("google", "search"))
  expect_named(wt_cls, c("panelist_id", "month", "google", "search", "n_visits"))
  expect_equal(sum(wt_cls$n_visits), nrow(wt))

  # "wave" groups by an existing column called "wave".
  wt[, wave := 1]
  wt_wave <- sum_visits(wt, timeframe = "wave")
  expect_named(wt_wave, c("panelist_id", "wave", "n_visits"))
  expect_equal(sum(wt_wave$n_visits), nrow(wt))
})

test_that("sum_visits errors", {
  data("test_data")
  wt <- as.wt_dt(test_data)

  # Pin the error messages: a bare expect_error() would also pass on any
  # unrelated failure raised while running sum_visits().

  # An unsupported time-frame keyword is rejected.
  expect_error(
    sum_visits(wt, timeframe = "something"),
    "unknown timeframe option specified",
    fixed = TRUE
  )

  # "wave" requires a column called "wave"; this expectation relies on
  # test_data not providing one.
  expect_error(
    sum_visits(wt, timeframe = "wave"),
    "couldn't find the column 'wave'",
    fixed = TRUE
  )
})