Fix #46 (#47)

* Fix #46 * Make `remove_padding` work
gesistsa · Nov 22, 2023 · 75573a1 · 75573a1
1 parent 5f08a69
commit 75573a1
Show file tree

Hide file tree

Showing 4 changed files with 36 additions and 19 deletions.
diff --git a/R/get_dist.R b/R/get_dist.R
@@ -62,11 +62,8 @@ pp <- function(pattern) {
 #' @details Proximity is measured by the number of tokens away from the keyword. Given a tokenized sentence: \["I", "eat", "this", "apple"\] and suppose "eat" is the keyword. The vector of minimum proximity for each word from "eat" is \[2, 1, 2, 3\], if `count_from` is 1. In another case: \["I", "wash", "and", "eat", "this", "apple"\] and \["wash", "eat"\] are the keywords. The minimal distance vector is \[2, 1, 2, 1, 2, 3\]. If `get_min` is `FALSE`, the output is a list of two vectors. For "wash", the distance vector is \[1, 0, 1, 2, 3, 4\]. For "eat", \[3, 2, 1, 0, 1, 2\].
 #' Please conduct all text manipulation tasks with `tokens_*()` functions before calling this function. To convert the output back to a `tokens` object, use [quanteda::as.tokens()].
 #' @return a `tokens_with_proximity` object. It is similar to [quanteda::tokens()], but only [dfm.tokens_with_proximity()], [quanteda::convert()], [quanteda::docvars()], and [quanteda::meta()] methods are available. A `tokens_with_proximity` has a modified [print()] method. Also, additional data slots are included:
-#' * a document variation `dist`
-#' * a metadata slot `keywords`
-#' * a metadata slot `get_min`
-#' * a metadata slot `tolower`
-#' * a metadata slot `keep_acronyms`
+#' * a document variable `proximity`
+#' * metadata slots for all arguments used
 #' @examples
 #' library(quanteda)
 #' tok1 <- data_char_ukimmig2010 %>%
@@ -193,14 +190,14 @@ tokens_proximity_tolower <- function(x) {
 #' Construct a sparse document-feature matrix from the output of [tokens_proximity()].
 #' @param x output of [tokens_proximity()].
 #' @param tolower convert all features to lowercase.
-#' @param remove_padding ignored.
-#' @param remove_docvars_proximity boolean, remove the "proximity" document variable.
-#' @param verbose ignored,
+#' @param remove_padding logical; if `TRUE`, remove the "pads" left as empty tokens after calling [quanteda::tokens()] or [quanteda::tokens_remove()] with `padding = TRUE`.
+#' @param remove_docvars_proximity logical, remove the "proximity" document variable.
+#' @param verbose logical; display messages if `TRUE`.
 #' @param weight_function a weight function; defaults to the inverse of the distance.
 #' @param ... not used.
 #' @importFrom quanteda dfm
 #' @return a [quanteda::dfm-class] object
-#' @details By default, words closer to keywords are weighted higher. You might change that with another `weight_function`. Please also note that `tolower` and `remove_padding` have no effect. It is because changing tokens at this point would need to recalculate the proximity vectors. Please do all the text manipulation before running [tokens_proximity()].
+#' @details By default, words closer to keywords are weighted higher. You might change that with another `weight_function`.
 #' @examples
 #' library(quanteda)
 #' tok1 <- data_char_ukimmig2010 %>%
@@ -238,9 +235,13 @@ dfm.tokens_with_proximity <- function(x, tolower = TRUE, remove_padding = FALSE,
     x_attrs <- attributes(x)
     x_docvars <- quanteda::docvars(x)
     x_docnames <- attr(x, "docvars")$docname_
-    type <- attr(x, "types")
     temp <- unclass(x)
     index <- unlist(temp, use.names = FALSE)
+    type <- attr(x, "types")
+    if (0 %in% index) {
+        index <- index + 1
+        type <- c("", type)
+    }
     val <- weight_function(unlist(quanteda::docvars(x, "proximity"), use.names = FALSE))
     temp <- Matrix::sparseMatrix(
         j = index,
@@ -258,5 +259,9 @@ dfm.tokens_with_proximity <- function(x, tolower = TRUE, remove_padding = FALSE,
         x_docvars$proximity <- NULL
     }
     quanteda::docvars(output) <- x_docvars
+    if (remove_padding) {
+        output <- quanteda::dfm_select(output, pattern = "", select = "remove", valuetype = "fixed", padding = FALSE,
+                                       verbose = verbose)
+    }
     return(output)
 }
diff --git a/man/dfm.tokens_with_proximity.Rd b/man/dfm.tokens_with_proximity.Rd
diff --git a/man/tokens_proximity.Rd b/man/tokens_proximity.Rd
diff --git a/tests/testthat/test-dfm.R b/tests/testthat/test-dfm.R
@@ -29,3 +29,18 @@ test_that("tolower", {
     res %>% dfm(tolower = TRUE) -> output
     expect_true("turkish" %in% colnames(output))
 })
+
+test_that("Padding #46", {
+    suppressPackageStartupMessages(library(quanteda))
+    toks <- tokens(c("a b c", "A B C D")) %>% tokens_remove("b", padding = TRUE)
+    expect_error(toks %>% tokens_proximity("a") %>% dfm(), NA)
+})
+
+test_that("remove_padding", {
+    suppressPackageStartupMessages(library(quanteda))
+    toks <- tokens(c("a b c", "A B C D")) %>% tokens_remove("b", padding = TRUE)
+    output <- toks %>% tokens_proximity("a") %>% dfm()
+    expect_true("" %in% colnames(output))
+    output <- toks %>% tokens_proximity("a") %>% dfm(remove_padding = TRUE)
+    expect_false("" %in% colnames(output))
+})