Skip to content

Commit

Permalink
Merge pull request #95 from schochastics/url_clean
Browse files: browse the repository at this point in the history
fix deduplicate issues (#97)
  • Loading branch information
schochastics authored Dec 11, 2023
2 parents a9a4669 + 5246b80 commit a74515c
Showing 1 changed file with 4 additions and 1 deletion.
5 changes: 4 additions & 1 deletion R/preprocess.R
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ deduplicate <- function(wt, method = "aggregate", within = 1, duration_var = "du
if (!is.null(add_grpvars)) grp_vars <- c(grp_vars, add_grpvars)


wt <- aggregate(cbind(visits = 1, duration = as.numeric(wt$duration), timestamp = wt$timestamp),
wt <- aggregate(data.frame(visits = 1, duration = as.numeric(wt$duration), timestamp = wt$timestamp),
by = wt[grp_vars], FUN = function(x) if (is.numeric(x)) sum(x, na.rm = TRUE) else min(x)
)
wt$day <- NULL
Expand Down Expand Up @@ -201,6 +201,7 @@ deduplicate <- function(wt, method = "aggregate", within = 1, duration_var = "du
wt$tmp_url_prev <- NULL
wt$tmp_timestamp_prev <- NULL
}
class(wt) <- c("wt_dt", class(wt))
return(wt)
}

Expand Down Expand Up @@ -278,6 +279,8 @@ extract_host <- function(wt, varname = "url") {
extract_domain <- function(wt, varname = "url") {
abort_if_not_wtdt(wt)
vars_exist(wt, varname)
protocol <- adaR::ada_get_protocol(wt[[varname]])
wt[[varname]][is.na(protocol)] <- paste0("https://", wt[[varname]][is.na(protocol)])
domain <- adaR::ada_get_domain(wt[[varname]])
if (varname == "url") {
wt[["domain"]] <- domain
Expand Down

0 comments on commit a74515c

Please sign in to comment.