-
Notifications
You must be signed in to change notification settings - Fork 5
/
get_fingerprint.R
103 lines (99 loc) · 4.32 KB
/
get_fingerprint.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#' Given a character vector as input, get the key collision fingerprint for
#' each element.
#' @noRd
get_fingerprint_KC <- function(vect, bus_suffix = TRUE,
ignore_strings = NULL) {
# Remove char accent marks.
vect <- remove_accents(vect)
if(!is.null(ignore_strings)) ignore_strings <- remove_accents(ignore_strings)
# Replace some punctuation with an empty string (want "Ed's" to be 1 word).
vect <- gsub("[;'`\"]", "", tolower(vect), perl = TRUE)
# Replace other punct with a blank space (want "cats,inc" to be 2 words).
vect <- gsub("[[:punct:]]", " ", vect, perl = TRUE)
vect <- gsub(" {2,}", " ", vect, perl = TRUE)
if (bus_suffix) {
vect <- business_suffix(vect)
if (!is.null(ignore_strings)) {
ignore_strings <- c(ignore_strings,
c("inc", "corp", "co", "llc", "ltd", "div", "ent",
"lp", "and"))
} else {
ignore_strings <- c("inc", "corp", "co", "llc", "ltd", "div", "ent",
"lp", "and")
}
}
vect <- strsplit(cpp_trimws_left(vect), " ", fixed = TRUE)
# If "ignore_strings" is not NULL, for each element of list "vect", remove
# any string that has a match within vector "ignore_strings".
if (!is.null(ignore_strings)) vect <- remove_strings(vect, ignore_strings)
# Final transformations, then return object "out".
vect <- cpp_list_unique(vect, sort_vals = TRUE)
vect <- cpp_paste_list(vect, collapse_str = " ")
return(vect)
}
#' Given a character vector as input, get the ngram fingerprint value for each
#' element of the input.
#'@noRd
get_fingerprint_ngram <- function(vect, numgram = 2, bus_suffix = TRUE,
ignore_strings = NULL) {
# Remove char accent marks.
vect <- remove_accents(vect)
if(!is.null(ignore_strings)) ignore_strings <- remove_accents(ignore_strings)
# Replace some punctuation with an empty string (want "Ed's" to be 1 word).
vect <- gsub("[;'`\"]", "", cpp_tolower(vect), perl = TRUE)
# Replace other punct with a blank space (want "cats,inc" to be 2 words).
vect <- gsub("[[:punct:]]", " ", vect, perl = TRUE)
# Compile variable ignore_strings.
if (bus_suffix) {
vect <- business_suffix(vect)
if (!is.null(ignore_strings)) {
ignore_strings <- c(ignore_strings,
c("inc", "corp", "co", "llc", "ltd", "div", "ent",
"lp", "and"))
} else {
ignore_strings <- c("inc", "corp", "co", "llc", "ltd", "div", "ent",
"lp", "and")
}
}
if (!is.null(ignore_strings)) {
# Use values in "ignore_strings" to create a regex of substrings to
# eliminate from each element of "vect" (also remove all spaces).
regex <- paste0("\\b(",
paste(ignore_strings, collapse = "|"),
")\\b| ")
vect <- gsub(regex, "", vect, perl = TRUE)
} else {
# Otherwise, if ignore_strings is NULL, only remove spaces.
vect <- gsub(" ", "", vect, fixed = TRUE)
}
# Rest of the transformations. For each value in vect: get ngrams, filter by
# unique, sort alphabetically, paste back together, and normalize encoding.
if (numgram == 1) {
vect <- strsplit(vect, "", fixed = TRUE)
vect <- cpp_list_unique(vect, sort_vals = TRUE)
vect <- cpp_paste_list(vect, collapse_str = "")
} else {
vect <- cpp_get_char_ngrams(vect, numgram = numgram)
}
return(vect)
}
# Function that attempts to merge common business name suffixes within a
# character string.
business_suffix <- function(vect) {
vect <- gsub(" incorporated| incorporate", " inc", vect, perl = TRUE)
vect <- gsub(" corporation| corporations", " corp", vect, perl = TRUE)
vect <- gsub(" company| companys| companies", " co", vect, perl = TRUE)
vect <- gsub(" limited liability co", " llc", vect, fixed = TRUE)
vect <- gsub(" limited$", " ltd", vect, perl = TRUE)
vect <- gsub(" division| divisions", " div", vect, perl = TRUE)
vect <- gsub(" enterprises| enterprise", " ent", vect, perl = TRUE)
vect <- gsub(" limited partnership", " lp", vect, fixed = TRUE)
return(vect)
}
# Remove accents from chars, while properly handling UTF-8 strings.
remove_accents <- function(vect) {
enc <- Encoding(vect) == "UTF-8"
vect[enc] <- stri_trans_general(vect[enc], "latin-ASCII")
vect[!enc] <- iconv(vect[!enc], to = "ASCII//TRANSLIT")
vect
}