
Commit 4abca45

Fixing DistilBertForTokenClassification annotator
1 parent b44f8d7

5 files changed (+48 / -104 lines)

NAMESPACE

Lines changed: 1 addition & 4 deletions
@@ -56,9 +56,6 @@ S3method(nlp_date_normalizer,tbl_spark)
 S3method(nlp_dependency_parser,ml_pipeline)
 S3method(nlp_dependency_parser,spark_connection)
 S3method(nlp_dependency_parser,tbl_spark)
-S3method(nlp_distilbert_for_token_classification,ml_pipeline)
-S3method(nlp_distilbert_for_token_classification,spark_connection)
-S3method(nlp_distilbert_for_token_classification,tbl_spark)
 S3method(nlp_doc2chunk,ml_pipeline)
 S3method(nlp_doc2chunk,spark_connection)
 S3method(nlp_doc2chunk,tbl_spark)
@@ -241,7 +238,7 @@ export(nlp_date_normalizer)
 export(nlp_dependency_parser)
 export(nlp_dependency_parser_pretrained)
 export(nlp_distilbert_embeddings_pretrained)
-export(nlp_distilbert_for_token_classification)
+export(nlp_distilbert_token_classification_pretrained)
 export(nlp_doc2chunk)
 export(nlp_document_assembler)
 export(nlp_document_logreg_classifier)
Lines changed: 20 additions & 64 deletions
@@ -1,6 +1,7 @@
 #' Spark NLP DistilBertForTokenClassification
 #'
-#' Spark ML transformer that
+#' DistilBertForTokenClassification can load DistilBERT models with a token classification head on top
+#' (a linear layer on top of the hidden-states output), e.g. for Named-Entity-Recognition (NER) tasks.
 #' See \url{https://nlp.johnsnowlabs.com/docs/en/transformers#distilbertfortokenclassification}
 #'
 #' @template roxlate-nlp-algo
@@ -10,75 +11,33 @@
 #' @param max_sentence_length Max sentence length to process (Default: 128)
 #'
 #' @export
-nlp_distilbert_for_token_classification <- function(x, input_cols, output_col,
-                                                    batch_size = NULL, case_sensitive = NULL, max_sentence_length = NULL,
-                                                    uid = random_string("distilbert_for_token_classification_")) {
-  UseMethod("nlp_distilbert_for_token_classification")
-}
-
-#' @export
-nlp_distilbert_for_token_classification.spark_connection <- function(x, input_cols, output_col,
-                                                    batch_size = NULL, case_sensitive = NULL, max_sentence_length = NULL,
-                                                    uid = random_string("distilbert_for_token_classification_")) {
+nlp_distilbert_token_classification_pretrained <- function(sc, input_cols, output_col,
+                                                           batch_size = NULL, case_sensitive = NULL,
+                                                           max_sentence_length = NULL,
+                                                           name = NULL, lang = NULL, remote_loc = NULL) {
   args <- list(
     input_cols = input_cols,
     output_col = output_col,
     batch_size = batch_size,
     case_sensitive = case_sensitive,
-    max_sentence_length = max_sentence_length,
-    uid = uid
+    max_sentence_length = max_sentence_length
   ) %>%
-    validator_nlp_distilbert_for_token_classification()
-
-  jobj <- sparklyr::spark_pipeline_stage(
-    x, "com.johnsnowlabs.nlp.annotators.classifier.dl.DistilBertForTokenClassification",
-    input_cols = args[["input_cols"]],
-    output_col = args[["output_col"]],
-    uid = args[["uid"]]
-  ) %>%
-    sparklyr::jobj_set_param("setBatchSize", args[["batch_size"]]) %>%
-    sparklyr::jobj_set_param("setCaseSensitive", args[["case_sensitive"]]) %>%
-    sparklyr::jobj_set_param("setMaxSentenceLength", args[["max_sentence_length"]])
-
-  new_nlp_distilbert_for_token_classification(jobj)
+    validator_nlp_distilbert_token_classification()
+
+  model_class <- "com.johnsnowlabs.nlp.annotators.classifier.dl.DistilBertForTokenClassification"
+  model <- pretrained_model(sc, model_class, name, lang, remote_loc)
+  spark_jobj(model) %>%
+    sparklyr::jobj_set_param("setInputCols", args[["input_cols"]]) %>%
+    sparklyr::jobj_set_param("setOutputCol", args[["output_col"]]) %>%
+    sparklyr::jobj_set_param("setCaseSensitive", args[["case_sensitive"]]) %>%
+    sparklyr::jobj_set_param("setBatchSize", args[["batch_size"]]) %>%
+    sparklyr::jobj_set_param("setMaxSentenceLength", args[["max_sentence_length"]])
+
+  new_ml_transformer(model)
 }
 
-#' @export
-nlp_distilbert_for_token_classification.ml_pipeline <- function(x, input_cols, output_col,
-                                                                batch_size = NULL, case_sensitive = NULL, max_sentence_length = NULL,
-                                                                uid = random_string("distilbert_for_token_classification_")) {
-
-  stage <- nlp_distilbert_for_token_classification.spark_connection(
-    x = sparklyr::spark_connection(x),
-    input_cols = input_cols,
-    output_col = output_col,
-    batch_size = batch_size,
-    case_sensitive = case_sensitive,
-    max_sentence_length = max_sentence_length,
-    uid = uid
-  )
-
-  sparklyr::ml_add_stage(x, stage)
-}
-
-#' @export
-nlp_distilbert_for_token_classification.tbl_spark <- function(x, input_cols, output_col,
-                                                              batch_size = NULL, case_sensitive = NULL, max_sentence_length = NULL,
-                                                              uid = random_string("distilbert_for_token_classification_")) {
-  stage <- nlp_distilbert_for_token_classification.spark_connection(
-    x = sparklyr::spark_connection(x),
-    input_cols = input_cols,
-    output_col = output_col,
-    batch_size = batch_size,
-    case_sensitive = case_sensitive,
-    max_sentence_length = max_sentence_length,
-    uid = uid
-  )
-
-  stage %>% sparklyr::ml_transform(x)
-}
 #' @import forge
-validator_nlp_distilbert_for_token_classification <- function(args) {
+validator_nlp_distilbert_token_classification <- function(args) {
   args[["input_cols"]] <- cast_string_list(args[["input_cols"]])
   args[["output_col"]] <- cast_string(args[["output_col"]])
   args[["batch_size"]] <- cast_nullable_integer(args[["batch_size"]])
@@ -87,9 +46,6 @@ validator_nlp_distilbert_for_token_classification <- function(args) {
   args
 }
 
-nlp_float_params.nlp_distilbert_for_token_classification <- function(x) {
-  return(c())
-}
 new_nlp_distilbert_for_token_classification <- function(jobj) {
   sparklyr::new_ml_transformer(jobj, class = "nlp_distilbert_for_token_classification")
 }
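
For context, the new entry point is used roughly as follows. This is a minimal sketch, not taken from the commit: it assumes a sparklyr connection `sc` and this package's annotator constructors (`nlp_document_assembler`, `nlp_sentence_detector`, `nlp_tokenizer`, all exported per the NAMESPACE) to produce the `sentence` and `token` inputs the classifier expects; exact constructor signatures may differ.

```r
library(sparklyr)
sc <- spark_connect(master = "local")

# Upstream annotators produce the "sentence" and "token" columns the
# classifier consumes (signatures assumed from this package's conventions).
assembler <- nlp_document_assembler(sc, input_col = "text", output_col = "document")
sentencer <- nlp_sentence_detector(sc, input_cols = c("document"), output_col = "sentence")
tokenizer <- nlp_tokenizer(sc, input_cols = c("sentence"), output_col = "token")

# Download (or load from cache) the default pretrained model and configure it.
token_classifier <- nlp_distilbert_token_classification_pretrained(
  sc,
  input_cols = c("sentence", "token"),
  output_col = "ner",
  case_sensitive = TRUE,
  max_sentence_length = 128
)
```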

R/utils.R

Lines changed: 6 additions & 2 deletions
@@ -7,7 +7,9 @@
 #'
 #' @export
 nlp_set_input_cols <- function(jobj, input_cols) {
-  invoke(spark_jobj(jobj), "setInputCols", cast_string_list(input_cols))
+  newobj <- sparklyr:::ml_set_param(jobj, "inputCols", input_cols)
+  return(newobj)
+  #invoke(spark_jobj(jobj), "setInputCols", cast_string_list(input_cols))
 }
 
 #' Set the output column name
@@ -19,7 +21,9 @@ nlp_set_input_cols <- function(jobj, input_cols) {
 #'
 #' @export
 nlp_set_output_col <- function(jobj, output_col) {
-  invoke(spark_jobj(jobj), "setOutputCol", cast_string(output_col))
+  newobj <- sparklyr:::ml_set_param(jobj, "outputCol", output_col)
+  return(newobj)
+  # invoke(spark_jobj(jobj), "setOutputCol", cast_string(output_col))
 }
 
 #' Spark NLP version
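
The switch from a raw `invoke()` on the jobj to sparklyr's internal `ml_set_param()` also changes the calling convention: the setters now return an updated object rather than mutating the one passed in, so callers must reassign the result. A short sketch of the intended use, with `<model_dir>` as a placeholder for a model directory saved under `~/cache_pretrained/`:

```r
# Reassign the result: the setters return an updated object instead of
# mutating in place. "<model_dir>" is a placeholder, not a real path.
model <- ml_load(sc, "~/cache_pretrained/<model_dir>")
model <- nlp_set_input_cols(model, c("sentence", "token"))
model <- nlp_set_output_col(model, "label")
```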

man/nlp_distilbert_for_token_classification.Rd renamed to man/nlp_distilbert_token_classification_pretrained.Rd

Lines changed: 11 additions & 8 deletions
Some generated files are not rendered by default.

tests/testthat/testthat-distilbert-for-token-classification.R

Lines changed: 10 additions & 26 deletions
@@ -22,34 +22,18 @@ teardown({
   rm(test_data, envir = .GlobalEnv)
 })
 
-test_that("distilbert_for_token_classification param setting", {
-  # TODO: edit these to make them legal values for the parameters
-  test_args <- list(
-    input_cols = c("string1", "string2"),
-    output_col = "string1",
-    batch_size = 100,
-    case_sensitive = FALSE,
-    max_sentence_length = 200
-  )
 
-  test_param_setting(sc, nlp_distilbert_for_token_classification, test_args)
+test_that("nlp_distilbert_token_classification pretrained", {
+  model <- nlp_distilbert_token_classification_pretrained(sc, input_cols = c("sentence", "token"), output_col = "distilbert")
+  transformed_data <- ml_transform(model, test_data)
+  expect_true("distilbert" %in% colnames(transformed_data))
 })
 
-test_that("nlp_distilbert_for_token_classification spark_connection", {
-  test_annotator <- nlp_distilbert_for_token_classification(sc, input_cols = c("token", "document"), output_col = "label")
-  transformed_data <- ml_transform(test_annotator, test_data)
+test_that("nlp_distilbert_token_classification load", {
+  model_files <- list.files("~/cache_pretrained/")
+  bert_model_file <- max(Filter(function(s) startsWith(s, "distilbert_base_token"), model_files))
+  model <- ml_load(sc, paste0("~/cache_pretrained/", bert_model_file))
+  model <- nlp_set_output_col(model, "label")
+  transformed_data <- ml_transform(model, test_data)
   expect_true("label" %in% colnames(transformed_data))
-  expect_true(inherits(test_annotator, "nlp_distilbert_for_token_classification"))
 })
-
-test_that("nlp_distilbert_for_token_classification ml_pipeline", {
-  test_annotator <- nlp_distilbert_for_token_classification(pipeline, input_cols = c("token", "document"), output_col = "label")
-  transformed_data <- ml_fit_and_transform(test_annotator, test_data)
-  expect_true("label" %in% colnames(transformed_data))
-})
-
-test_that("nlp_distilbert_for_token_classification tbl_spark", {
-  transformed_data <- nlp_distilbert_for_token_classification(test_data, input_cols = c("token", "document"), output_col = "label")
-  expect_true("label" %in% colnames(transformed_data))
-})
-
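
After `ml_transform()`, the `label` column produced in the load test holds arrays of Spark NLP annotation structs. A quick way to eyeball the predictions, as a sketch assuming dplyr is attached and the column follows Spark NLP's usual annotation schema (`annotatorType`, `begin`, `end`, `result`, `metadata`):

```r
library(dplyr)

# Peek at a few rows of the annotation column produced by the classifier.
transformed_data %>%
  select(label) %>%
  head(5) %>%
  collect()
```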
