-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from javierluraschi/feature/spark-apply-rcpp
Scale WARC processing using Rcpp and sparklyr::spark_apply
- Loading branch information
Showing
16 changed files
with
316 additions
and
38 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,3 +37,8 @@ log4j.spark* | |
.cache-main | ||
.settings | ||
.classpath | ||
# Rcpp | ||
/src/*.o | ||
/src/*.o-* | ||
/src/*.d | ||
/src/*.so |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,11 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export(cc_warc) | ||
export(rcpp_read_warc_sample) | ||
export(spark_read_warc) | ||
export(spark_read_warc_sample) | ||
export(spark_warc_sample_path) | ||
import(DBI) | ||
import(sparklyr) | ||
importFrom(utils,read.table) | ||
useDynLib(sparkwarc, .registration = TRUE) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Generated by using Rcpp::compileAttributes() -> do not edit by hand | ||
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 | ||
|
||
# Internal R wrapper around the registered native routine
# `_sparkwarc_rcpp_hello_world`; takes no arguments and returns whatever
# the native call produces.
rcpp_hello_world <- function() { | ||
.Call(`_sparkwarc_rcpp_hello_world`) | ||
} | ||
|
||
# Internal R wrapper around the registered native routine
# `_sparkwarc_rcpp_read_warc`. Forwards `path`, `filter` and `include`
# unchanged, in that order, to the native call and returns its result.
rcpp_read_warc <- function(path, filter, include) { | ||
.Call(`_sparkwarc_rcpp_read_warc`, path, filter, include) | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#' Path to the bundled sample warc file
#'
#' Looks up \code{samples/sample.warc.gz} inside the \code{sparkwarc}
#' package and normalizes the resulting path.
#'
#' @return A character string holding the normalized path.
#'
#' @export
spark_warc_sample_path <- function() {
  sample_file <- system.file(
    "samples/sample.warc.gz",
    package = "sparkwarc"
  )
  normalizePath(sample_file)
}
|
||
#' Loads the sample warc file in Rcpp
#'
#' Reads the sample file located by \code{spark_warc_sample_path()} with the
#' package's native \code{rcpp_read_warc} routine.
#'
#' @param filter A regular expression used to filter to each warc entry
#'   efficiently by running native code using \code{Rcpp}.
#' @param include A regular expression used to keep only matching lines
#'   efficiently by running native code using \code{Rcpp}.
#'
#' @return The value returned by the native reader (declared as a
#'   \code{DataFrame} on the C++ side).
#'
#' @export
rcpp_read_warc_sample <- function(filter = "", include = "") {
  sample_warc <- spark_warc_sample_path()

  # Call the internal wrapper directly: code living in the package namespace
  # can reach its own unexported objects without `sparkwarc:::`, which
  # R CMD check flags when used on the package's own namespace.
  rcpp_read_warc(sample_warc, filter, include)
}
|
||
#' Loads the sample warc file in Spark
#'
#' Convenience wrapper that passes the bundled sample file to
#' \code{spark_read_warc()}, registering it under the name
#' \code{"sample_warc"} with \code{overwrite = TRUE} and \code{group = TRUE}.
#'
#' @param sc An active \code{spark_connection}.
#' @param filter A regular expression used to filter to each warc entry
#'   efficiently by running native code using \code{Rcpp}.
#' @param include A regular expression used to keep only matching lines
#'   efficiently by running native code using \code{Rcpp}.
#'
#' @return Whatever \code{spark_read_warc()} returns for the sample file.
#'
#' @export
spark_read_warc_sample <- function(sc, filter = "", include = "") {
  sample_warc <- spark_warc_sample_path()

  spark_read_warc(
    sc,
    "sample_warc",
    sample_warc,
    overwrite = TRUE,
    group = TRUE,
    filter = filter,
    include = include
  )
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
// Generated by using Rcpp::compileAttributes() -> do not edit by hand | ||
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 | ||
|
||
#include <Rcpp.h> | ||
|
||
using namespace Rcpp; | ||
|
||
// rcpp_hello_world | ||
// Forward declaration of the C++ implementation, followed by the exported
// entry point that calls it and converts the returned List with Rcpp::wrap.
// The entry point takes no arguments and is listed in CallEntries below.
List rcpp_hello_world(); | ||
RcppExport SEXP _sparkwarc_rcpp_hello_world() { | ||
BEGIN_RCPP | ||
Rcpp::RObject rcpp_result_gen; | ||
Rcpp::RNGScope rcpp_rngScope_gen; | ||
rcpp_result_gen = Rcpp::wrap(rcpp_hello_world()); | ||
return rcpp_result_gen; | ||
END_RCPP | ||
} | ||
// rcpp_read_warc | ||
// Forward declaration of the C++ implementation, followed by the exported
// entry point. The entry point receives three SEXP arguments, converts each
// to std::string through Rcpp::traits::input_parameter, calls
// rcpp_read_warc(path, filter, include) and wraps the returned DataFrame.
DataFrame rcpp_read_warc(std::string path, std::string filter, std::string include); | ||
RcppExport SEXP _sparkwarc_rcpp_read_warc(SEXP pathSEXP, SEXP filterSEXP, SEXP includeSEXP) { | ||
BEGIN_RCPP | ||
Rcpp::RObject rcpp_result_gen; | ||
Rcpp::RNGScope rcpp_rngScope_gen; | ||
Rcpp::traits::input_parameter< std::string >::type path(pathSEXP); | ||
Rcpp::traits::input_parameter< std::string >::type filter(filterSEXP); | ||
Rcpp::traits::input_parameter< std::string >::type include(includeSEXP); | ||
rcpp_result_gen = Rcpp::wrap(rcpp_read_warc(path, filter, include)); | ||
return rcpp_result_gen; | ||
END_RCPP | ||
} | ||
|
||
// Table of the native routines passed to R_registerRoutines: each row is
// {name, function pointer, number of arguments}. The all-NULL row ends the
// table.
static const R_CallMethodDef CallEntries[] = { | ||
{"_sparkwarc_rcpp_hello_world", (DL_FUNC) &_sparkwarc_rcpp_hello_world, 0}, | ||
{"_sparkwarc_rcpp_read_warc", (DL_FUNC) &_sparkwarc_rcpp_read_warc, 3}, | ||
{NULL, NULL, 0} | ||
}; | ||
|
||
// Initialization hook for the sparkwarc shared library: registers the
// CallEntries table with R, then calls R_useDynamicSymbols with FALSE.
// NOTE(review): FALSE presumably restricts lookup to the registered
// routines only; confirm against the R API documentation.
RcppExport void R_init_sparkwarc(DllInfo *dll) { | ||
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); | ||
R_useDynamicSymbols(dll, FALSE); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# Compile the package's C++ sources as C++11 and link against Boost.Regex.
# Each variable needs its own line; on a single line make would treat
# everything after the first `=` as the value of CXX_STD.
CXX_STD=CXX11
PKG_LIBS=-lboost_regex
Oops, something went wrong.