Skip to content

Commit

Permalink
create rcpp internal wrapper as spark_rcpp_read_warc()
Browse files Browse the repository at this point in the history
  • Loading branch information
javierluraschi committed Aug 21, 2017
1 parent eada834 commit 73c4172
Showing 1 changed file with 17 additions and 2 deletions.
19 changes: 17 additions & 2 deletions R/sparkwarc.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
#' spark_disconnect(sc)
#'
#' @export
#' @useDynLib sparkwarc, .registration = TRUE
#' @import DBI
spark_read_warc <- function(sc,
name,
Expand Down Expand Up @@ -70,7 +69,7 @@ spark_read_warc <- function(sc,
path <- temp_warc
}

sparkwarc::rcpp_read_warc(path, filter = match_warc, include = match_line)
sparkwarc::spark_rcpp_read_warc(path, match_warc, match_line)
})

if (nrow(df) > 1) do.call("rbind", entries) else data.frame(entries)
Expand Down Expand Up @@ -98,3 +97,19 @@ spark_read_warc <- function(sc,

result_tbl
}

#' Read a WARC file using Rcpp
#'
#' Reads a WARC (Web ARChive) file by delegating to the package's
#' \code{rcpp_read_warc()} function.
#'
#' @param path The path to the file. Needs to be accessible from the cluster.
#'   Supports the \samp{"hdfs://"}, \samp{"s3n://"} and \samp{"file://"} protocols.
#' @param match_warc Include only warc files matching this character string.
#'   Forwarded to \code{rcpp_read_warc()} as its \code{filter} argument.
#' @param match_line Include only lines matching this character string.
#'   Forwarded to \code{rcpp_read_warc()} as its \code{include} argument.
#'
#' @return Whatever \code{rcpp_read_warc()} returns for the given arguments.
#'
#' @useDynLib sparkwarc, .registration = TRUE
#'
#' @export
spark_rcpp_read_warc <- function(path, match_warc, match_line) {
  # Thin exported wrapper: only renames the arguments for the underlying reader.
  rcpp_read_warc(
    path,
    filter = match_warc,
    include = match_line
  )
}

0 comments on commit 73c4172

Please sign in to comment.