Skip to content

Commit

Permalink
changed the defaults for nnd algorithm and other changes
Browse files Browse the repository at this point in the history
  • Loading branch information
BERENZ committed May 7, 2024
1 parent 041fccf commit d9790fc
Show file tree
Hide file tree
Showing 16 changed files with 54 additions and 42 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Imports:
rnndescent,
igraph,
data.table,
RcppAlgos,
methods
Suggests:
tinytest,
Expand Down
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export(controls_ann)
export(controls_txt)
export(pair_ann)
import(data.table)
importFrom(RcppAlgos,comboGeneral)
importFrom(RcppAnnoy,AnnoyAngular)
importFrom(RcppAnnoy,AnnoyEuclidean)
importFrom(RcppAnnoy,AnnoyHamming)
Expand All @@ -31,6 +32,5 @@ importFrom(text2vec,create_vocabulary)
importFrom(text2vec,itoken)
importFrom(text2vec,itoken_parallel)
importFrom(text2vec,vocab_vectorizer)
importFrom(utils,combn)
importFrom(utils,setTxtProgressBar)
importFrom(utils,txtProgressBar)
22 changes: 11 additions & 11 deletions R/blocking.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#' @importFrom igraph graph_from_data_frame
#' @importFrom igraph make_clusters
#' @importFrom igraph compare
#' @importFrom utils combn
#' @importFrom RcppAlgos comboGeneral
#'
#'
#' @title Block records based on text data.
Expand Down Expand Up @@ -41,6 +41,7 @@
#' \itemize{
#' \item{\code{result} -- \code{data.table} with indices (rows) of x, y, block and distance between points}
#' \item{\code{method} -- name of the ANN algorithm used,}
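#' \item{\code{deduplication} -- whether deduplication was applied,}
#' \item{\code{metrics} -- metrics for quality assessment, if \code{true_blocks} is provided,}
#' \item{\code{confusion} -- confusion matrix (cross-tabulation of pairs sharing a block vs. pairs sharing a true block), if \code{true_blocks} is provided,}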
#' \item{\code{colnames} -- variable names (colnames) used for search,}
#' \item{\code{graph} -- \code{igraph} class object.}
Expand Down Expand Up @@ -96,12 +97,11 @@ blocking <- function(x,
is.character(x) | is.matrix(x) | inherits(x, "Matrix"))

## assuming rows (for nnd)
stopifnot("Minimum 3 cases required for x" = NROW(x) > 2)

if (!is.null(y)) {
stopifnot("Minimum 3 cases required for y" = NROW(y) > 2)
}

# stopifnot("Minimum 3 cases required for x" = NROW(x) > 2)
#
# if (!is.null(y)) {
# stopifnot("Minimum 3 cases required for y" = NROW(y) > 2)
# }

if (!is.null(ann_write)) {
stopifnot("Path provided in the `ann_write` is incorrect" = file.exists(ann_write) )
Expand Down Expand Up @@ -314,11 +314,10 @@ blocking <- function(x,

}

#consider using RcppAlgos::comboGeneral(nrow(pairs_to_eval_long), 2, nThreads=n_threads)
candidate_pairs <- utils::combn(nrow(pairs_to_eval_long), 2)
candidate_pairs <- RcppAlgos::comboGeneral(nrow(pairs_to_eval_long), 2, nThreads=n_threads)

same_block <- pairs_to_eval_long$block_id[candidate_pairs[1, ]] == pairs_to_eval_long$block_id[candidate_pairs[2, ]]
same_truth <- pairs_to_eval_long$true_id[candidate_pairs[1, ]] == pairs_to_eval_long$true_id[candidate_pairs[2, ]]
same_block <- pairs_to_eval_long$block_id[candidate_pairs[, 1]] == pairs_to_eval_long$block_id[candidate_pairs[,2]]
same_truth <- pairs_to_eval_long$true_id[candidate_pairs[,1]] == pairs_to_eval_long$true_id[candidate_pairs[,2]]

confusion <- table(same_block, same_truth)

Expand All @@ -343,6 +342,7 @@ blocking <- function(x,
method = ann,
deduplication = deduplication,
metrics = if (is.null(true_blocks)) NULL else eval_metrics,
confusion = if (is.null(true_blocks)) NULL else confusion,
colnames = colnames_xy,
graph = if (graph) {
igraph::graph_from_data_frame(x_df[, c("x", "y")], directed = F)
Expand Down
2 changes: 1 addition & 1 deletion R/controls.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ controls_ann <- function(
sparse = FALSE,
k_search = 30,
nnd = list(k_build = 30,
use_alt_metric = TRUE,
use_alt_metric = FALSE,
init = "tree",
n_trees = NULL,
leaf_size = NULL,
Expand Down
20 changes: 10 additions & 10 deletions R/method_nnd.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,19 +54,19 @@ method_nnd <- function(x,
## query k dependent on the study
## there is a problem when dataset is small

if (deduplication == T) {
k_nnd_query <- k
} else if (nrow(x) < 10) {
k_nnd_query <- k
} else if (nrow(x) < control$k_search) {
k_nnd_query <- nrow(x)
} else {
k_nnd_query <- control$k_search
}
# if (deduplication == T) {
# k_nnd_query <- k
# } else if (nrow(x) < 10) {
# k_nnd_query <- k
# } else if (nrow(x) < control$k_search) {
# k_nnd_query <- nrow(x)
# } else {
# k_nnd_query <- control$k_search
# }

l_1nn <- rnndescent::rnnd_query(index = l_ind,
query = y,
k = k_nnd_query,
k = if (nrow(x) < control$k_search) nrow(x) else control$k_search,
epsilon = 0.1,
max_search_fraction = 1,
init = NULL,
Expand Down
2 changes: 1 addition & 1 deletion R/methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ print.blocking <- function(x,...) {
cat("========================================================\n")
cat("Evaluation metrics (standard):\n" )
metrics <- as.numeric(sprintf("%.4f", x$metrics*100))
names(metrics) <- names(result2$metrics)
names(metrics) <- names(x$metrics)
print(metrics)

}
Expand Down
22 changes: 12 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,11 +131,11 @@ Table with blocking results contains:
blocking_result$result
#> x y block dist
#> <int> <int> <num> <num>
#> 1: 1 2 1 0.10000005
#> 2: 1 3 1 0.14188367
#> 3: 1 4 1 0.28286284
#> 4: 5 6 2 0.08333336
#> 5: 5 7 2 0.13397458
#> 1: 1 2 1 0.10000002
#> 2: 2 3 1 0.14188367
#> 3: 2 4 1 0.28286284
#> 4: 5 6 2 0.08333331
#> 5: 5 7 2 0.13397455
#> 6: 5 8 2 0.27831215
```

Expand All @@ -148,17 +148,19 @@ pair_ann(x = df_example, on = "txt") |>
score_simple("score", on = "txt") |>
select_threshold("threshold", score = "score", threshold = 0.55) |>
link(selection = "threshold")
#> Total number of pairs: 6 pairs
#> Total number of pairs: 8 pairs
#>
#> Key: <.y>
#> .y .x txt.x txt.y
#> <int> <int> <char> <char>
#> 1: 2 1 jankowalski kowalskijan
#> 2: 3 1 jankowalski kowalskimjan
#> 3: 4 1 jankowalski kowaljan
#> 4: 6 5 montypython pythonmonty
#> 5: 7 5 montypython cyrkmontypython
#> 6: 8 5 montypython monty
#> 3: 3 2 kowalskijan kowalskimjan
#> 4: 4 1 jankowalski kowaljan
#> 5: 4 2 kowalskijan kowaljan
#> 6: 6 5 montypython pythonmonty
#> 7: 7 5 montypython cyrkmontypython
#> 8: 8 5 montypython monty
```

Linking records using the same function where `df_base` is the
Expand Down
2 changes: 2 additions & 0 deletions inst/tinytest/test_annoy.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ expect_equal(
method = "annoy",
deduplication = FALSE,
metrics = NULL,
confusion = NULL,
colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo",
"nt", "ow", "py", "sk", "ty", "wa", "yp", "yt", "on", "th"),
graph = NULL),
Expand All @@ -50,6 +51,7 @@ expect_equal(
method = "annoy",
deduplication = FALSE,
metrics = NULL,
confusion = NULL,
colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "ow",
"py", "sk", "ty", "wa", "yp", "yt", "nt", "on", "th"),
graph = NULL),
Expand Down
2 changes: 1 addition & 1 deletion inst/tinytest/test_blocking.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ expect_equal(

expect_equal(
blocking(x = df_base$txt, y = df_example$txt, ann = "lsh")$result$block,
c(rep(2,3),rep(1,4), 3)
c(rep(2,3),rep(1,4),3)
)

expect_silent(
Expand Down
2 changes: 2 additions & 0 deletions inst/tinytest/test_hnsw.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ expect_equal(
method = "hnsw",
deduplication = FALSE,
metrics = NULL,
confusion = NULL,
colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "ow",
"py", "sk", "ty", "wa", "yp", "yt", "nt", "on", "th"),
graph = NULL),
Expand Down Expand Up @@ -56,6 +57,7 @@ expect_equal(
method = "hnsw",
deduplication = FALSE,
metrics = NULL,
confusion = NULL,
colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "nt",
"ow", "py", "sk", "ty", "wa", "yp", "yt", "on", "th"),
graph = NULL),
Expand Down
4 changes: 4 additions & 0 deletions inst/tinytest/test_mlpack.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ expect_equal(
method = "lsh",
deduplication = FALSE,
metrics = NULL,
confusion = NULL,
colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "ow",
"py", "sk", "ty", "wa", "yp", "yt", "nt", "on", "th"),
graph = NULL),
Expand All @@ -34,6 +35,7 @@ expect_equal(
method = "kd",
deduplication = FALSE,
metrics = NULL,
confusion = NULL,
colnames =c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "ow",
"py", "sk", "ty", "wa", "yp", "yt", "nt", "on", "th"),
graph = NULL),
Expand Down Expand Up @@ -67,6 +69,7 @@ expect_equal(
method = "lsh",
deduplication = FALSE,
metrics = NULL,
confusion = NULL,
colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "nt", "ow",
"py", "sk", "ty", "wa", "yp", "yt", "on", "th"),
graph = NULL),
Expand All @@ -88,6 +91,7 @@ expect_equal(
method = "kd",
deduplication = FALSE,
metrics = NULL,
confusion = NULL,
colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "nt", "ow",
"py", "sk", "ty", "wa", "yp", "yt", "on", "th"),
graph = NULL),
Expand Down
2 changes: 1 addition & 1 deletion inst/tinytest/test_reclin2.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ expect_silent(

expect_equal(
dim(pair_ann(x = df_example, on = "txt")),
c(6, 3)
c(8, 3)
)

expect_equal(
Expand Down
Empty file.
1 change: 1 addition & 0 deletions man/blocking.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/controls_ann.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 5 additions & 5 deletions vignettes/v2-reclin.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ The goal of this exercise is to link units from the CIS dataset to the CENSUS da

```{r}
set.seed(2024)
result1 <- blocking(x = census$txt, y = cis$txt, verbose = 1)
result1 <- blocking(x = census$txt, y = cis$txt, verbose = 1, n_threads = 8)
```

Distribution of distances for each pair.
Expand Down Expand Up @@ -140,7 +140,7 @@ So in our example we have `r nrow(matches)` pairs.
```{r}
set.seed(2024)
result2 <- blocking(x = census$txt, y = cis$txt, verbose = 1,
true_blocks = matches[, .(x, y, block)], n_threads = 4)
true_blocks = matches[, .(x, y, block)], n_threads = 8)
```

Let's see how our approach handled this problem.
Expand All @@ -149,7 +149,7 @@ Let's see how our approach handled this problem.
result2
```

It seems that the default parameters of the NND method result in an FNR of `r sprintf("%.2f",result2$metrics["fnr"]*100)`%, which is quite large. We can see if increasing the number of `k` (and thus `max_candidates`) as suggested in the [Nearest Neighbor Descent
It seems that the default parameters of the NND method result in an FNR of `r sprintf("%.2f",result2$metrics["fnr"]*100)`%. We can check whether increasing `k` (and thus `max_candidates`), as suggested in the [Nearest Neighbor Descent](https://jlmelville.github.io/rnndescent/articles/nearest-neighbor-descent.html) vignette, helps.


Expand All @@ -159,7 +159,7 @@ ann_control_pars <- controls_ann()
ann_control_pars$k_search <- 60
result3 <- blocking(x = census$txt, y = cis$txt, verbose = 1,
true_blocks = matches[, .(x, y, block)], n_threads = 4,
true_blocks = matches[, .(x, y, block)], n_threads = 8,
control_ann = ann_control_pars)
```

Expand All @@ -173,7 +173,7 @@ Finally, compare the NND and HNSW algorithm for this example.

```{r}
result4 <- blocking(x = census$txt, y = cis$txt, verbose = 1,
true_blocks = matches[, .(x, y, block)], n_threads = 4,
true_blocks = matches[, .(x, y, block)], n_threads = 8,
ann = "hnsw", seed = 2024)
```

Expand Down

0 comments on commit d9790fc

Please sign in to comment.