diff --git a/R/get_dist.R b/R/get_dist.R index ed4657f..6757545 100644 --- a/R/get_dist.R +++ b/R/get_dist.R @@ -62,11 +62,8 @@ pp <- function(pattern) { #' @details Proximity is measured by the number of tokens away from the keyword. Given a tokenized sentence: \["I", "eat", "this", "apple"\] and suppose "eat" is the keyword. The vector of minimum proximity for each word from "eat" is \[2, 1, 2, 3\], if `count_from` is 1. In another case: \["I", "wash", "and", "eat", "this", "apple"\] and \["wash", "eat"\] are the keywords. The minimal distance vector is \[2, 1, 2, 1, 2, 3\]. If `get_min` is `FALSE`, the output is a list of two vectors. For "wash", the distance vector is \[1, 0, 1, 2, 3\]. For "eat", \[3, 2, 1, 0, 1, 2\]. #' Please conduct all text maniputation tasks with `tokens_*()` functions before calling this function. To convert the output back to a `tokens` object, use [quanteda::as.tokens()]. #' @return a `tokens_with_proximity` object. It is similar to [quanteda::tokens()], but only [dfm.tokens_with_proximity()], [quanteda::convert()], [quanteda::docvars()], and [quanteda::meta()] methods are available. A `tokens_with_proximity` has a modified [print()] method. Also, additional data slots are included -#' * a document variation `dist` -#' * a metadata slot `keywords` -#' * a metadata slot `get_min` -#' * a metadata slot `tolower` -#' * a metadata slot `keep_acronyms` +#' * a document variable `proximity` +#' * metadata slots for all arguments used #' @examples #' library(quanteda) #' tok1 <- data_char_ukimmig2010 %>% @@ -193,14 +190,14 @@ tokens_proximity_tolower <- function(x) { #' Construct a sparse document-feature matrix from the output of [tokens_proximity()]. #' @param x output of [tokens_proximity()]. #' @param tolower convert all features to lowercase. -#' @param remove_padding ignored. -#' @param remove_docvars_proximity boolean, remove the "proximity" document variable. 
-#' @param verbose ignored, +#' @param remove_padding logical; if `TRUE`, remove the "pads" left as empty tokens after calling [quanteda::tokens()] or [quanteda::tokens_remove()] with `padding = TRUE`. +#' @param remove_docvars_proximity logical, remove the "proximity" document variable. +#' @param verbose display messages if `TRUE`. #' @param weight_function a weight function, default to invert distance, #' @param ... not used. #' @importFrom quanteda dfm #' @return a [quanteda::dfm-class] object -#' @details By default, words closer to keywords are weighted higher. You might change that with another `weight_function`. Please also note that `tolower` and `remove_padding` have no effect. It is because changing tokens at this point would need to recalculate the proximity vectors. Please do all the text manipulation before running [tokens_proximity()]. +#' @details By default, words closer to keywords are weighted higher. You might change that with another `weight_function`. #' @examples #' library(quanteda) #' tok1 <- data_char_ukimmig2010 %>% @@ -238,9 +235,13 @@ dfm.tokens_with_proximity <- function(x, tolower = TRUE, remove_padding = FALSE, x_attrs <- attributes(x) x_docvars <- quanteda::docvars(x) x_docnames <- attr(x, "docvars")$docname_ - type <- attr(x, "types") temp <- unclass(x) index <- unlist(temp, use.names = FALSE) + type <- attr(x, "types") + if (0 %in% index) { + index <- index + 1 + type <- c("", type) + } val <- weight_function(unlist(quanteda::docvars(x, "proximity"), use.names = FALSE)) temp <- Matrix::sparseMatrix( j = index, @@ -258,5 +259,9 @@ dfm.tokens_with_proximity <- function(x, tolower = TRUE, remove_padding = FALSE, x_docvars$proximity <- NULL } quanteda::docvars(output) <- x_docvars + if (remove_padding) { + output <- quanteda::dfm_select(output, pattern = "", selection = "remove", valuetype = "fixed", padding = FALSE, + verbose = verbose) + } return(output) } diff --git a/man/dfm.tokens_with_proximity.Rd b/man/dfm.tokens_with_proximity.Rd 
index ffa5bcc..56133d8 100644 --- a/man/dfm.tokens_with_proximity.Rd +++ b/man/dfm.tokens_with_proximity.Rd @@ -21,11 +21,11 @@ \item{tolower}{convert all features to lowercase.} -\item{remove_padding}{ignored.} +\item{remove_padding}{logical; if \code{TRUE}, remove the "pads" left as empty tokens after calling \code{\link[quanteda:tokens]{quanteda::tokens()}} or \code{\link[quanteda:tokens_select]{quanteda::tokens_remove()}} with \code{padding = TRUE}.} -\item{verbose}{ignored,} +\item{verbose}{display messages if \code{TRUE}.} -\item{remove_docvars_proximity}{boolean, remove the "proximity" document variable.} +\item{remove_docvars_proximity}{logical, remove the "proximity" document variable.} \item{weight_function}{a weight function, default to invert distance,} @@ -38,7 +38,7 @@ a \link[quanteda:dfm-class]{quanteda::dfm} object Construct a sparse document-feature matrix from the output of \code{\link[=tokens_proximity]{tokens_proximity()}}. } \details{ -By default, words closer to keywords are weighted higher. You might change that with another \code{weight_function}. Please also note that \code{tolower} and \code{remove_padding} have no effect. It is because changing tokens at this point would need to recalculate the proximity vectors. Please do all the text manipulation before running \code{\link[=tokens_proximity]{tokens_proximity()}}. +By default, words closer to keywords are weighted higher. You might change that with another \code{weight_function}. } \examples{ library(quanteda) diff --git a/man/tokens_proximity.Rd b/man/tokens_proximity.Rd index 2d08d19..50e0a0d 100644 --- a/man/tokens_proximity.Rd +++ b/man/tokens_proximity.Rd @@ -35,11 +35,8 @@ tokens_proximity( \value{ a \code{tokens_with_proximity} object. 
It is similar to \code{\link[quanteda:tokens]{quanteda::tokens()}}, but only \code{\link[=dfm.tokens_with_proximity]{dfm.tokens_with_proximity()}}, \code{\link[quanteda:convert]{quanteda::convert()}}, \code{\link[quanteda:docvars]{quanteda::docvars()}}, and \code{\link[quanteda:meta]{quanteda::meta()}} methods are available. A \code{tokens_with_proximity} has a modified \code{\link[=print]{print()}} method. Also, additional data slots are included \itemize{ -\item a document variation \code{dist} -\item a metadata slot \code{keywords} -\item a metadata slot \code{get_min} -\item a metadata slot \code{tolower} -\item a metadata slot \code{keep_acronyms} +\item a document variable \code{proximity} +\item metadata slots for all arguments used } } \description{ diff --git a/tests/testthat/test-dfm.R b/tests/testthat/test-dfm.R index eaa3b38..b2ed545 100644 --- a/tests/testthat/test-dfm.R +++ b/tests/testthat/test-dfm.R @@ -29,3 +29,18 @@ test_that("tolower", { res %>% dfm(tolower = TRUE) -> output expect_true("turkish" %in% colnames(output)) }) + +test_that("Padding #46", { + suppressPackageStartupMessages(library(quanteda)) + toks <- tokens(c("a b c", "A B C D")) %>% tokens_remove("b", padding = TRUE) + expect_error(toks %>% tokens_proximity("a") %>% dfm(), NA) +}) + +test_that("remove_padding", { + suppressPackageStartupMessages(library(quanteda)) + toks <- tokens(c("a b c", "A B C D")) %>% tokens_remove("b", padding = TRUE) + output <- toks %>% tokens_proximity("a") %>% dfm() + expect_true("" %in% colnames(output)) + output <- toks %>% tokens_proximity("a") %>% dfm(remove_padding = TRUE) + expect_false("" %in% colnames(output)) +})