From 9a3a4cc713a91a3d0e3556d7793098b7ff31632f Mon Sep 17 00:00:00 2001 From: BERENZ Date: Sat, 4 May 2024 23:11:36 +0200 Subject: [PATCH] update vignettes --- .Rproj.user/shared/notebooks/paths | 3 + DESCRIPTION | 4 +- R/blocking.R | 29 +++++--- R/controls.R | 16 +++-- R/methods.R | 4 +- man/controls_txt.Rd | 17 +++-- vignettes/v2-reclin.Rmd | 8 ++- vignettes/v4-integration.Rmd | 103 +++++++++++++++++++++++++++++ 8 files changed, 159 insertions(+), 25 deletions(-) diff --git a/.Rproj.user/shared/notebooks/paths b/.Rproj.user/shared/notebooks/paths index 633f9e0..93c4658 100644 --- a/.Rproj.user/shared/notebooks/paths +++ b/.Rproj.user/shared/notebooks/paths @@ -1,4 +1,5 @@ /Users/berenz/Downloads/Template of Abstract in Latex.tex="A4C7846D" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/DESCRIPTION="019D16E4" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/controls.R="5BC637B7" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_annoy.R="684202BA" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_hnsw.R="A4FAA5A3" @@ -17,3 +18,5 @@ /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/tests/tinytest.R="D6BBCDC1" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v1-deduplication.Rmd="9D34DD44" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v2-reclin.Rmd="289A4D2F" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v3-evaluation.Rmd="E778A54F" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v4-integration.Rmd="E3EFC8F1" diff --git a/DESCRIPTION b/DESCRIPTION index a85a1a4..409791a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -30,5 +30,7 @@ Suggests: tinytest, reclin2, knitr, - rmarkdown + rmarkdown, + fastLink, + RecordLinkage VignetteBuilder: knitr diff --git a/R/blocking.R b/R/blocking.R index 69d9a62..efcec47 100644 --- a/R/blocking.R +++ b/R/blocking.R @@ -157,36 +157,46 @@ blocking <- function(x, if (.Platform$OS.type == "unix") { x_tokens <- text2vec::itoken_parallel( iterable = x, - tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = control_txt$n_shingles), + tokenizer = function(x) tokenizers::tokenize_character_shingles(x, + n = control_txt$n_shingles, + lowercase = control_txt$lowercase, + strip_non_alphanum = control_txt$strip_non_alphanum), n_chunks = control_txt$n_chunks, progressbar = verbose) } else { x_tokens <- text2vec::itoken( iterable = x, - tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = control_txt$n_shingles), + tokenizer = function(x) tokenizers::tokenize_character_shingles(x, + n = control_txt$n_shingles, + lowercase = control_txt$lowercase, + strip_non_alphanum = control_txt$strip_non_alphanum), n_chunks = control_txt$n_chunks, progressbar = verbose) } - x_voc <- text2vec::create_vocabulary(x_tokens) x_vec <- text2vec::vocab_vectorizer(x_voc) x_dtm <- text2vec::create_dtm(x_tokens, x_vec) - if (is.null(y_default)) { y_dtm <- x_dtm } else { if (.Platform$OS.type == "unix") { y_tokens <- text2vec::itoken_parallel( iterable = y, - tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = control_txt$n_shingles), + tokenizer = function(x) tokenizers::tokenize_character_shingles(x, + n = control_txt$n_shingles, + lowercase = control_txt$lowercase, + strip_non_alphanum = control_txt$strip_non_alphanum), n_chunks = control_txt$n_chunks, progressbar = verbose) } else { y_tokens <- text2vec::itoken( iterable = y, - tokenizer = function(x) tokenizers::tokenize_character_shingles(x, n = control_txt$n_shingles), + tokenizer = function(x) tokenizers::tokenize_character_shingles(x, + n = control_txt$n_shingles, + lowercase = control_txt$lowercase, + strip_non_alphanum = control_txt$strip_non_alphanum), n_chunks = control_txt$n_chunks, progressbar = verbose) } @@ -197,7 +207,6 @@ blocking <- function(x, } } - colnames_xy <- intersect(colnames(x_dtm), colnames(y_dtm)) if (verbose %in% 1:2) { @@ -205,7 +214,6 @@ blocking <- function(x, ann, nrow(x_dtm), nrow(y_dtm), length(colnames_xy))) } - x_df <- switch(ann, "nnd" = method_nnd(x = x_dtm[, colnames_xy], y = y_dtm[, colnames_xy], @@ -267,11 +275,10 @@ blocking <- function(x, x_df[, `:=`(block, x_block[names(x_block) %in% x_df$query_g])] - ## if true are given if (!is.null(true_blocks)) { - setDT(true_blocks) + setDT(true_blocks) ## move it somewhere else pairs_to_eval <- x_df[y %in% true_blocks$y, c("x", "y", "block")] pairs_to_eval[true_blocks, on = c("x", "y"), both := TRUE] @@ -306,7 +313,7 @@ blocking <- function(x, } - ## consider using RcppAlgos::comboGeneral(nrow(pairs_to_eval_long), 2, nThreads=n_threads) + #consider using RcppAlgos::comboGeneral(nrow(pairs_to_eval_long), 2, nThreads=n_threads) candidate_pairs <- utils::combn(nrow(pairs_to_eval_long), 2) same_block <- pairs_to_eval_long$block_id[candidate_pairs[1, ]] == pairs_to_eval_long$block_id[candidate_pairs[2, ]] diff --git a/R/controls.R b/R/controls.R index 37945e4..bc5dc28 100644 --- a/R/controls.R +++ b/R/controls.R @@ -69,19 +69,25 @@ controls_ann <- function( #' @author Maciej Beręsewicz #' #' @description -#' Controls for text data used in the \code{blocking} functions +#' Controls for text data used in the \code{blocking} functions, passed to [tokenizers::tokenize_character_shingles]. #' -#' @param n_shingles length of shingles (default 2L), passed to [tokenizers::tokenize_character_shingles], -#' @param n_chunks passed to (default 10L) [tokenizers::tokenize_character_shingles]. +#' @param n_shingles length of shingles (default `2L`), +#' @param n_chunks passed to (default `10L`), +#' @param lowercase should the caracters be made lowercase? (default `TRUE`) +#' @param strip_non_alphanum should punctuation and white space be stripped? (default `TRUE`) #' #' @returns Returns a list with parameters. #' #' @export controls_txt <- function( n_shingles = 2L, - n_chunks = 10L + n_chunks = 10L, + lowercase = TRUE, + strip_non_alphanum = TRUE ) { list(n_shingles = n_shingles, - n_chunks = n_chunks) + n_chunks = n_chunks, + lowercase = lowercase, + strip_non_alphanum = strip_non_alphanum) } diff --git a/R/methods.R b/R/methods.R index 45017ad..c18763e 100644 --- a/R/methods.R +++ b/R/methods.R @@ -10,7 +10,7 @@ print.blocking <- function(x,...) { cat("Blocking based on the", x$method, "method.\n") cat("Number of blocks: ", length(unique(block_ids)), ".\n",sep="") cat("Number of columns used for blocking: ", NROW(x$colnames), ".\n",sep="") - cat("Reduction ratio: ", round(rr, 4), ".\n",sep="") + cat("Reduction ratio: ", sprintf("%.4f", rr), ".\n",sep="") cat("========================================================\n") cat("Distribution of the size of the blocks:") @@ -20,7 +20,7 @@ print.blocking <- function(x,...) { if (!is.null(x$metrics)) { cat("========================================================\n") cat("Evaluation metrics (standard):\n" ) - print(round(x$metrics*100, 4)) + sprintf("%.4f", x$metrics*100) } invisible(x) diff --git a/man/controls_txt.Rd b/man/controls_txt.Rd index a53b6ce..5425f65 100644 --- a/man/controls_txt.Rd +++ b/man/controls_txt.Rd @@ -4,18 +4,27 @@ \alias{controls_txt} \title{Controls for processing text data} \usage{ -controls_txt(n_shingles = 2L, n_chunks = 10L) +controls_txt( + n_shingles = 2L, + n_chunks = 10L, + lowercase = TRUE, + strip_non_alphanum = TRUE +) } \arguments{ -\item{n_shingles}{length of shingles (default 2L), passed to \link[tokenizers:shingle-tokenizers]{tokenizers::tokenize_character_shingles},} +\item{n_shingles}{length of shingles (default \code{2L}),} -\item{n_chunks}{passed to (default 10L) \link[tokenizers:shingle-tokenizers]{tokenizers::tokenize_character_shingles}.} +\item{n_chunks}{passed to (default \code{10L}),} + +\item{lowercase}{should the caracters be made lowercase? (default \code{TRUE})} + +\item{strip_non_alphanum}{should punctuation and white space be stripped? (default \code{TRUE})} } \value{ Returns a list with parameters. } \description{ -Controls for text data used in the \code{blocking} functions +Controls for text data used in the \code{blocking} functions, passed to \link[tokenizers:shingle-tokenizers]{tokenizers::tokenize_character_shingles}. } \author{ Maciej Beręsewicz diff --git a/vignettes/v2-reclin.Rmd b/vignettes/v2-reclin.Rmd index 511a151..4a62db6 100644 --- a/vignettes/v2-reclin.Rmd +++ b/vignettes/v2-reclin.Rmd @@ -1,6 +1,10 @@ --- title: "Blocking records for record linkage" author: "Maciej Beręsewicz" +execute: + warning: false + message: false +lang: en output: html_vignette: df_print: kable @@ -88,7 +92,6 @@ cis[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, ``` - # Linking datasets ## Using basic functionalities of `blocking` package @@ -110,6 +113,7 @@ Example pairs ```{r} head(result1$result, n= 10) ``` + Let's look at the first pair. Clearly there is a typo on the `pername1` but all other variables are the same so it seems that this is a match. ```{r} @@ -127,7 +131,7 @@ cis[3901, ] ## Assessing the quality -For some records we have information on the correct linkage. We can use this information to assess our approach. +For some records we have information on the correct linkage. We can use this information to assess our approach but note that information on assessing the quality is described in detail in the other vignette. ```{r} matches <- merge(x = census[, .(x=1:.N, person_id)], diff --git a/vignettes/v4-integration.Rmd b/vignettes/v4-integration.Rmd index df5b06b..ebda676 100644 --- a/vignettes/v4-integration.Rmd +++ b/vignettes/v4-integration.Rmd @@ -1,6 +1,10 @@ --- title: "Integration with existing packages" author: "Maciej Beręsewicz" +execute: + warning: false + message: false +lang: en output: html_vignette: df_print: kable @@ -20,6 +24,105 @@ knitr::opts_chunk$set( ) ``` +# Setup + ```{r setup} library(blocking) +library(reclin2) +library(fastLink) +library(RecordLinkage) +``` + +# Data + +In the example we will use the same dataset as in the *Blocking records for record linkage* vignette. + +```{r} +census <- read.csv("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/census.csv") +cis <- read.csv("https://raw.githubusercontent.com/djvanderlaan/tutorial-reclin-uros2021/main/data/cis.csv") +setDT(census) +setDT(cis) +census[is.na(dob_day), dob_day := ""] +census[is.na(dob_mon), dob_mon := ""] +census[is.na(dob_year), dob_year := ""] +cis[is.na(dob_day), dob_day := ""] +cis[is.na(dob_mon), dob_mon := ""] +cis[is.na(dob_year), dob_year := ""] +census[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)] +cis[, txt:=paste0(pername1, pername2, sex, dob_day, dob_mon, dob_year, enumcap, enumpc)] +census[, x:=1:.N] +cis[, y:=1:.N] +``` + +# Integration with the `reclin2` package + +The package contains function `pair_ann` which aims at integration with `reclin2` package. This function works as follows + +```{r} +pair_ann(x = census[1:1000], + y = cis[1:1000], + on = "txt", + deduplication = FALSE) +``` + +Which provides you information on the total number of pairs. This can be further included in the pipeline of the `reclin2` package. + +```{r} +pair_ann(x = census[1:1000], + y = cis[1:1000], + on = "txt", + deduplication = FALSE, + ann = "hnsw") |> + compare_pairs(on = "txt", comparators = list(cmp_jarowinkler())) |> + score_simple("score", on = "txt") |> + select_threshold("threshold", score = "score", threshold = 0.75) |> + link(selection = "threshold") |> + head() +``` + +# Usage with `fastLink` package + +In order to use it with the `fastLink` package you need to add information on blocks to datasets and create appropriate subsets just like presented in the `blockData` function. + +```{r} +blocks <- blocking(x = census$txt[1:1000], + y = cis$txt[1:1000], + #verbose = 1, + seed = 2024) + +census[blocks$result, on = "x", block:=as.integer(i.block)] +cis[blocks$result, on = "y", block:=as.integer(i.block)] +``` + +Then you can use `blockData` function. + +```{r} +blocked_data <- blockData(dfA = cis[1:1000], + dfB = census[1:1000], + varnames = "block") +``` + +```{r} +blocked_data$block.3 +``` + +```{r} +dfA_block1 <- cis[1:1000][blocked_data$block.3$dfA.inds,] +dfB_block1 <- census[1:1000][blocked_data$block.3$dfB.inds,] ``` + + +# Usage with `RecordLinkage` package + +Te same can be done with the `RecordLinkage` package + +```{r} +pairs <- RecordLinkage::compare.linkage(dataset1 = cis[1:500], + dataset2 = census[1:500], + blockfld = list(12), + strcmp = c("pername1", "pername2", "sex", "enumcap", "enumpc")) +summary(pairs) +``` + + +