Commit ab6ad6c

Adding RoBertaSentenceEmbeddings and XlmRoBertaSentenceEmbeddings annotators

1 parent af61a56 · commit ab6ad6c

8 files changed (+296, -0 lines)

NAMESPACE

Lines changed: 2 additions & 0 deletions
@@ -285,6 +285,7 @@ export(nlp_relation_extraction_dl)
 export(nlp_relation_extraction_dl_pretrained)
 export(nlp_relation_extraction_pretrained)
 export(nlp_roberta_embeddings_pretrained)
+export(nlp_roberta_sentence_embeddings_pretrained)
 export(nlp_sentence_detector)
 export(nlp_sentence_detector_dl)
 export(nlp_sentence_detector_dl_pretrained)
@@ -318,6 +319,7 @@ export(nlp_word_embeddings)
 export(nlp_word_embeddings_model)
 export(nlp_word_embeddings_pretrained)
 export(nlp_xlm_roberta_embeddings_pretrained)
+export(nlp_xlm_roberta_sentence_embeddings_pretrained)
 export(nlp_xlnet_embeddings_pretrained)
 export(nlp_yake_model)
 export(set_nlp_version)

R/roberta_sentence_embeddings.R

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
#' Load a pretrained Spark NLP RoBertaSentenceEmbeddings model
#'
#' Create a pretrained Spark NLP \code{RoBertaSentenceEmbeddings} model.
#' Sentence-level embeddings using RoBERTa. The RoBERTa model was proposed in
#' "RoBERTa: A Robustly Optimized BERT Pretraining Approach" by Yinhan Liu, Myle Ott,
#' Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis,
#' Luke Zettlemoyer, and Veselin Stoyanov. It is based on Google's BERT model
#' released in 2018.
#'
#' RoBERTa builds on BERT and modifies key hyperparameters: it removes the
#' next-sentence pretraining objective and trains with much larger mini-batches
#' and learning rates.
#' See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#robertabertsentenceembeddings}
#'
#' @template roxlate-pretrained-params
#' @template roxlate-inputs-output-params
#' @param batch_size batch size
#' @param case_sensitive whether the model treats tokens as case sensitive (if FALSE, tokens are lowercased)
#' @param dimension dimension of the output embeddings
#' @param max_sentence_length maximum sentence length to process
#'
#' @export
nlp_roberta_sentence_embeddings_pretrained <- function(sc, input_cols, output_col, case_sensitive = NULL,
                                                       batch_size = NULL, dimension = NULL,
                                                       max_sentence_length = NULL,
                                                       name = NULL, lang = NULL, remote_loc = NULL) {
  args <- list(
    input_cols = input_cols,
    output_col = output_col,
    case_sensitive = case_sensitive,
    batch_size = batch_size,
    dimension = dimension,
    max_sentence_length = max_sentence_length
  ) %>%
    validator_nlp_roberta_sentence_embeddings()

  model_class <- "com.johnsnowlabs.nlp.embeddings.RoBertaSentenceEmbeddings"
  model <- pretrained_model(sc, model_class, name, lang, remote_loc)

  spark_jobj(model) %>%
    sparklyr::jobj_set_param("setInputCols", args[["input_cols"]]) %>%
    sparklyr::jobj_set_param("setOutputCol", args[["output_col"]]) %>%
    sparklyr::jobj_set_param("setCaseSensitive", args[["case_sensitive"]]) %>%
    sparklyr::jobj_set_param("setBatchSize", args[["batch_size"]]) %>%
    sparklyr::jobj_set_param("setDimension", args[["dimension"]]) %>%
    sparklyr::jobj_set_param("setMaxSentenceLength", args[["max_sentence_length"]])

  new_nlp_roberta_sentence_embeddings(model)
}

# Cast and validate user-supplied arguments to the types the JVM setters expect
#' @import forge
validator_nlp_roberta_sentence_embeddings <- function(args) {
  args[["input_cols"]] <- cast_string_list(args[["input_cols"]])
  args[["output_col"]] <- cast_string(args[["output_col"]])
  args[["batch_size"]] <- cast_nullable_integer(args[["batch_size"]])
  args[["case_sensitive"]] <- cast_nullable_logical(args[["case_sensitive"]])
  args[["dimension"]] <- cast_nullable_integer(args[["dimension"]])
  args[["max_sentence_length"]] <- cast_nullable_integer(args[["max_sentence_length"]])
  args
}

new_nlp_roberta_sentence_embeddings <- function(jobj) {
  sparklyr::new_ml_transformer(jobj, class = "nlp_roberta_sentence_embeddings")
}
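
For reference, a minimal usage sketch of the new annotator (a sketch only: it assumes an active Spark connection `sc` and a Spark table `text_tbl` with a "text" column, and lets Spark NLP resolve the default pretrained model since `name` is left NULL; it mirrors the test added later in this commit):

library(sparklyr)

# Build the "document" column the annotator expects as input
assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document")
docs <- ml_fit_and_transform(ml_pipeline(assembler), text_tbl)

# Load the default pretrained model and append one embedding vector per document
model <- nlp_roberta_sentence_embeddings_pretrained(sc, input_cols = c("document"),
                                                    output_col = "sentence_embeddings")
result <- ml_transform(model, docs)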

R/xlm_roberta_sentence_embeddings.R

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
#' Load a pretrained Spark NLP XlmRoBertaSentenceEmbeddings model
#'
#' Create a pretrained Spark NLP \code{XlmRoBertaSentenceEmbeddings} model.
#' Sentence-level embeddings using XLM-RoBERTa. The XLM-RoBERTa model was proposed in
#' "Unsupervised Cross-lingual Representation Learning at Scale" by Alexis Conneau,
#' Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán,
#' Edouard Grave, Myle Ott, Luke Zettlemoyer, and Veselin Stoyanov. It is based on
#' Facebook's RoBERTa model released in 2019, and is a large multilingual language
#' model trained on 2.5TB of filtered CommonCrawl data.
#' See \url{https://nlp.johnsnowlabs.com/docs/en/annotators#xlmrobertasentenceembeddings}
#'
#' @template roxlate-pretrained-params
#' @template roxlate-inputs-output-params
#' @param batch_size batch size
#' @param case_sensitive whether the model treats tokens as case sensitive (if FALSE, tokens are lowercased)
#' @param dimension dimension of the output embeddings
#' @param max_sentence_length maximum sentence length to process
#'
#' @export
nlp_xlm_roberta_sentence_embeddings_pretrained <- function(sc, input_cols, output_col, case_sensitive = NULL,
                                                           batch_size = NULL, dimension = NULL,
                                                           max_sentence_length = NULL,
                                                           name = NULL, lang = NULL, remote_loc = NULL) {
  args <- list(
    input_cols = input_cols,
    output_col = output_col,
    case_sensitive = case_sensitive,
    batch_size = batch_size,
    dimension = dimension,
    max_sentence_length = max_sentence_length
  ) %>%
    validator_nlp_xlm_roberta_sentence_embeddings()

  model_class <- "com.johnsnowlabs.nlp.embeddings.XlmRoBertaSentenceEmbeddings"
  model <- pretrained_model(sc, model_class, name, lang, remote_loc)

  spark_jobj(model) %>%
    sparklyr::jobj_set_param("setInputCols", args[["input_cols"]]) %>%
    sparklyr::jobj_set_param("setOutputCol", args[["output_col"]]) %>%
    sparklyr::jobj_set_param("setCaseSensitive", args[["case_sensitive"]]) %>%
    sparklyr::jobj_set_param("setBatchSize", args[["batch_size"]]) %>%
    sparklyr::jobj_set_param("setDimension", args[["dimension"]]) %>%
    sparklyr::jobj_set_param("setMaxSentenceLength", args[["max_sentence_length"]])

  new_nlp_xlm_roberta_sentence_embeddings(model)
}

# Cast and validate user-supplied arguments to the types the JVM setters expect
#' @import forge
validator_nlp_xlm_roberta_sentence_embeddings <- function(args) {
  args[["input_cols"]] <- cast_string_list(args[["input_cols"]])
  args[["output_col"]] <- cast_string(args[["output_col"]])
  args[["batch_size"]] <- cast_nullable_integer(args[["batch_size"]])
  args[["case_sensitive"]] <- cast_nullable_logical(args[["case_sensitive"]])
  args[["dimension"]] <- cast_nullable_integer(args[["dimension"]])
  args[["max_sentence_length"]] <- cast_nullable_integer(args[["max_sentence_length"]])
  args
}

new_nlp_xlm_roberta_sentence_embeddings <- function(jobj) {
  sparklyr::new_ml_transformer(jobj, class = "nlp_xlm_roberta_sentence_embeddings")
}
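
An analogous sketch for the XLM-RoBERTa variant, this time also passing the optional tuning parameters (same assumptions about `sc` and `docs` as in the sketch above; the parameter values are purely illustrative):

model <- nlp_xlm_roberta_sentence_embeddings_pretrained(
  sc,
  input_cols = c("document"),
  output_col = "xlm_roberta_sentence_embeddings",
  case_sensitive = TRUE,      # keep token casing instead of lowercasing
  batch_size = 8L,            # documents per inference batch
  max_sentence_length = 128L  # longer inputs are truncated
)
result <- ml_transform(model, docs)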

inst/sparkml/class_mapping.json

Lines changed: 2 additions & 0 deletions
@@ -79,6 +79,8 @@
   "com.johnsnowlabs.nlp.embeddings.AlbertEmbeddings": "nlp_albert_embeddings",
   "com.johnsnowlabs.nlp.embeddings.BertEmbeddings": "nlp_bert_embeddings",
   "com.johnsnowlabs.nlp.embeddings.BertSentenceEmbeddings": "nlp_bert_sentence_embeddings",
+  "com.johnsnowlabs.nlp.embeddings.RoBertaSentenceEmbeddings": "nlp_roberta_sentence_embeddings",
+  "com.johnsnowlabs.nlp.embeddings.XlmRoBertaSentenceEmbeddings": "nlp_xlm_roberta_sentence_embeddings",
   "com.johnsnowlabs.nlp.embeddings.ChunkEmbeddings": "nlp_chunk_embeddings",
   "com.johnsnowlabs.nlp.embeddings.DistilBertEmbeddings": "nlp_distilbert_embeddings",
   "com.johnsnowlabs.nlp.embeddings.ElmoEmbeddings": "nlp_elmo_embeddings",

man/nlp_roberta_sentence_embeddings_pretrained.Rd

Lines changed: 55 additions & 0 deletions
Generated file; diff not rendered by default.

man/nlp_xlm_roberta_sentence_embeddings_pretrained.Rd

Lines changed: 55 additions & 0 deletions
Generated file; diff not rendered by default.
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
setup({
  sc <- testthat_spark_connection()
  text_tbl <- testthat_tbl("test_text")

  # Set up a pipeline that creates the columns needed for testing the annotator
  assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document")

  pipeline <- ml_pipeline(assembler)
  test_data <- ml_fit_and_transform(pipeline, text_tbl)

  assign("sc", sc, envir = parent.frame())
  assign("pipeline", pipeline, envir = parent.frame())
  assign("test_data", test_data, envir = parent.frame())
})

teardown({
  spark_disconnect(sc)
  rm(sc, envir = .GlobalEnv)
  rm(pipeline, envir = .GlobalEnv)
  rm(test_data, envir = .GlobalEnv)
})

test_that("nlp_roberta_sentence_embeddings pretrained", {
  model <- nlp_roberta_sentence_embeddings_pretrained(sc, input_cols = c("document"), output_col = "roberta_sentence_embeddings")
  transformed_data <- ml_transform(model, test_data)
  expect_true("roberta_sentence_embeddings" %in% colnames(transformed_data))

  expect_true(inherits(model, "nlp_roberta_sentence_embeddings"))
})
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
setup({
  sc <- testthat_spark_connection()
  text_tbl <- testthat_tbl("test_text")

  # Set up a pipeline that creates the columns needed for testing the annotator
  assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document")

  pipeline <- ml_pipeline(assembler)
  test_data <- ml_fit_and_transform(pipeline, text_tbl)

  assign("sc", sc, envir = parent.frame())
  assign("pipeline", pipeline, envir = parent.frame())
  assign("test_data", test_data, envir = parent.frame())
})

teardown({
  spark_disconnect(sc)
  rm(sc, envir = .GlobalEnv)
  rm(pipeline, envir = .GlobalEnv)
  rm(test_data, envir = .GlobalEnv)
})

test_that("nlp_xlm_roberta_sentence_embeddings pretrained", {
  model <- nlp_xlm_roberta_sentence_embeddings_pretrained(sc, input_cols = c("document"), output_col = "xlm_roberta_sentence_embeddings")
  transformed_data <- ml_transform(model, test_data)
  expect_true("xlm_roberta_sentence_embeddings" %in% colnames(transformed_data))

  expect_true(inherits(model, "nlp_xlm_roberta_sentence_embeddings"))
})
