## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", message = FALSE ) library(ggplot2) set.seed(1234) ## ----ex-records--------------------------------------------------------------- # load library library(representr) # load data data("rl_reg1") # data for record linkage and regression data("identity.rl_reg1") # true identity of each record ## ----echo = FALSE------------------------------------------------------------- knitr::kable(head(rl_reg1)) ## ----------------------------------------------------------------------------- data("linkage.rl") ## ---- echo=FALSE-------------------------------------------------------------- # take the last iteration of linkage for lambda value lambda <- linkage.rl[nrow(linkage.rl),] # split data by linkage results clusters <- split(rl_reg1, lambda) ## ---- echo=FALSE-------------------------------------------------------------- knitr::kable(clusters[[names(which(table(lambda) == 4)[1])]]) ## ----random------------------------------------------------------------------- # ids for representative records (random) random_id <- represent(rl_reg1, lambda, "proto_random", parallel = FALSE) rep_random <- rl_reg1[random_id,] # representative records (random) ## ---- echo = FALSE------------------------------------------------------------ knitr::kable(rep_random[as.numeric(names(which(table(lambda) == 4)))[1:5],]) ## ----minimax------------------------------------------------------------------ # additional parameters for minimax prototyping # need column types, the order levels for any ordinal variables, and column weights col_type <- c("string", "string", "numeric", "numeric", "numeric", "categorical", "ordinal", "numeric", "numeric") orders <- list(education = c("Less than a high school diploma", "High school graduates, no college", "Some college or associate degree", "Bachelor's degree only", "Advanced degree")) weights <- c(.25, .25, .05, .05, .1, .15, .05, .05, .05) # ids for representative records (minimax) minimax_id <- represent(rl_reg1, linkage.rl[nrow(linkage.rl),], "proto_minimax", distance = dist_col_type, col_type = col_type, weights = weights, orders = orders, scale = TRUE, parallel = FALSE) rep_minimax <- rl_reg1[minimax_id,] # representative records (minimax) ## ---- echo = FALSE------------------------------------------------------------ knitr::kable(rep_minimax[as.numeric(names(which(table(lambda) == 4)))[1:5],]) ## ----composite---------------------------------------------------------------- # representative records (composite) rep_composite <- represent(rl_reg1, linkage.rl[nrow(linkage.rl),], "composite", col_type = col_type, parallel = FALSE) ## ---- echo = FALSE------------------------------------------------------------ knitr::kable(rep_composite[as.numeric(names(which(table(lambda) == 4)))[1:5],]) ## ----pp----------------------------------------------------------------------- # Posterior prototyping weights pp_weights <- pp_weights(rl_reg1, linkage.rl[seq(80000, 100000, by = 100), ], "proto_minimax", distance = dist_col_type, col_type = col_type, weights = weights, orders = orders, scale = TRUE, parallel = FALSE) ## ----thresh-plot, fig.width = 5, echo = FALSE, fig.align="center", fig.cap = "The distribution of PP weights as generated from posterior draws of $\\boldsymbol \\Lambda$ colored by if they are true or duplicated records in the original data set. The dotted vertical line shows the threshold value of 0.5. The true records have consistently higher PP weights and the proportion of duplicated records with high weights is relatively low."---- data.frame(pp_weights = pp_weights, true = seq_len(nrow(rl_reg1)) %in% unique(identity.rl_reg1), included = pp_weights >= .5) -> threshold_df ggplot(threshold_df) + geom_histogram(aes(pp_weights, fill = true), position = "dodge") + geom_vline(aes(xintercept = .5), lty = 2, alpha = .5) + xlab("Posterior Prototyping weights") + ylab("") + scale_fill_discrete("True record") + theme(legend.position = "bottom") ## ----pp_thresh---------------------------------------------------------------- # representative records (PP threshold) rep_pp_thresh <- rl_reg1[pp_weights > .5, ] ## ---- echo = FALSE------------------------------------------------------------ knitr::kable(head(rep_pp_thresh)) ## ----empkl_div, message = FALSE, warning = FALSE------------------------------ true_dat <- rl_reg1[unique(identity.rl_reg1),] # true records emp_kl_div(true_dat, rep_random, c("sex"), c("income", "bp")) emp_kl_div(true_dat, rep_minimax, c("sex"), c("income", "bp")) emp_kl_div(true_dat, rep_composite, c("sex"), c("income", "bp")) emp_kl_div(true_dat, rep_pp_thresh, c("sex"), c("income", "bp"))