remove dependency on pkg ngram

ChrisMuir · Dec 20, 2017 · 5c3c78b · 5c3c78b
1 parent f32d23b
commit 5c3c78b
Show file tree

Hide file tree

Showing 6 changed files with 16 additions and 52 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -14,10 +14,9 @@ License: GPL-3
 Encoding: UTF-8
 LazyData: true
 Imports:
-        ngram, 
-        stringdist, 
         magrittr, 
-        Rcpp
+        Rcpp, 
+        stringdist
 RoxygenNote: 6.0.1
 LinkingTo: Rcpp
 URL: http://github.com/ChrisMuir/refinr

diff --git a/R/get_fingerprint_ngram.R b/R/get_fingerprint_ngram.R
@@ -27,9 +27,9 @@
 
 get_fingerprint_ngram <- function(vect, numgram = 2, bus_suffix = TRUE,
                                   ignore_strings = NULL) {
-  # Get minimum char length of each string post tokenization.
-  numgram_thres <- numgram + (numgram - 1)
+  numgram <- numgram - 1
 
+  # Compile variable ignore_strings.
   if (bus_suffix) {
     if (!is.null(ignore_strings)) {
       ignore_strings <- c(ignore_strings,
@@ -53,34 +53,23 @@ get_fingerprint_ngram <- function(vect, numgram = 2, bus_suffix = TRUE,
     vect <- vect %>%
       tolower %>%
       business_suffix %>%
-      {gsub(regex, "", .)} %>%
-      char_splitter(numgram_thres)
+      {gsub(regex, "", .)}
   } else {
     # Initial transformations given "ignore_strings" is NULL.
     vect <- vect %>%
       tolower %>%
-      {gsub("[[:punct:]]|\\s", "", .)} %>%
-      char_splitter(numgram_thres)
+      {gsub("[[:punct:]]|\\s", "", .)}
   }
-  # Get indices of vect that are not NA.
-  vect_non_na <- !is.na(vect)
 
-  if (numgram > 1) {
-    # If numgram > 1, use the ngram pkg to get char grams.
-    vect[vect_non_na] <- vect[vect_non_na] %>%
-      lapply(., function(x) ngram::get.ngrams(ngram::ngram(x, n=numgram))) %>%
-      cpp_list_unique(sort_vals = TRUE) %>%
-      cpp_paste_collapse_list %>%
-      iconv(., to = "ASCII//TRANSLIT") %>%
-      {gsub("\\s", "", .)}
-  } else if (numgram == 1) {
-    # Else if numgram == 1, use strsplit to get char unigrams.
-    vect[vect_non_na] <- vect[vect_non_na] %>%
-      strsplit(., " ", fixed = TRUE) %>%
-      cpp_list_unique(sort_vals = TRUE) %>%
-      cpp_paste_collapse_list %>%
-      iconv(., to = "ASCII//TRANSLIT") %>%
-      {gsub("\\s", "", .)}
-  }
+  vect <- vect %>%
+    strsplit(., "", fixed = TRUE) %>%
+    lapply(., function(strings) {
+      vapply(seq_len(length(strings) - numgram), function(char) {
+        paste(strings[char:(char + numgram)], collapse = "")
+      }, character(1))
+    }) %>%
+    cpp_list_unique(sort_vals = TRUE) %>%
+    cpp_paste_collapse_list %>%
+    iconv(., to = "ASCII//TRANSLIT")
   return(vect)
 }
diff --git a/R/n_gram_merge.R b/R/n_gram_merge.R
@@ -43,9 +43,6 @@
 #'  \item t: transposition, default value is 0.5
 #'  }
 #'
-#'  For parameter \code{numgram}, the function is unreliable for any values
-#'  other than 2.
-#'
 #' @return Character vector with similar values merged.
 #' @importFrom magrittr "%>%"
 #' @export

diff --git a/R/utils.R b/R/utils.R
@@ -13,22 +13,6 @@ business_suffix <- function(vect) {
     {gsub(" and ", " & ", ., fixed = TRUE)}
 }
 
-# For each element of an input character vector, insert a single space
-# between each char. This function is meant to mimic function ngram::splitter,
-# but is faster due to fewer input checks.
-# Arg numgram_thres is a numeric value. After the splitting, any string that
-# has length less than this number will be replaced with NA_character_.
-char_splitter <- function(vect, numgram_thres) {
-  vapply(vect, function(x) {
-    x <- paste0(strsplit(x, split = "", fixed = TRUE)[[1]], collapse = " ")
-    if (nchar(x) >= numgram_thres) {
-      x
-    } else {
-      NA_character_
-    }
-  }, character(1), USE.NAMES = FALSE)
-}
-
 # Flatten a nested list such that each character vector occupies its own
 # element in the return list. Can handle lists that have inconsistent nesting
 # levels.

diff --git a/README.md b/README.md
@@ -9,8 +9,6 @@ R package implementation of two algorithms from the open source software [OpenRe
 
 In addition, there are a few add-on features included, to make the clustering/merging functions more useful. These include approximate string matching to allow for merging despite minor misspellings, the option to pass a dictionary vector to dictate edit values, and the option to pass a vector of strings to ignore during the clustering process. Examples of these features are all shown below.
 
-This package is built using [stringdist](https://cran.r-project.org/web/packages/stringdist/index.html) for approximate string matching, [ngram](https://cran.r-project.org/web/packages/ngram/index.html) for string tokenization, and [Rcpp](https://cran.r-project.org/web/packages/Rcpp/index.html) to allow for functions written in C++ for faster performance.
-
 Installation
 ------------
 

diff --git a/man/n_gram_merge.Rd b/man/n_gram_merge.Rd