## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 8, fig.height = 5 ) library(couplr) library(dplyr) library(ggplot2) ## ----------------------------------------------------------------------------- set.seed(42) # Treatment group: younger, more educated, higher prior earnings treatment <- tibble( id = 1:200, age = rnorm(200, mean = 35, sd = 8), education = rnorm(200, mean = 14, sd = 2), prior_earnings = rnorm(200, mean = 40000, sd = 12000), employed = rbinom(200, 1, 0.7), group = "treatment" ) # Control group: older, less educated, lower prior earnings (selection bias) control <- tibble( id = 201:700, age = rnorm(500, mean = 45, sd = 12), education = rnorm(500, mean = 12, sd = 3), prior_earnings = rnorm(500, mean = 32000, sd = 15000), employed = rbinom(500, 1, 0.5), group = "control" ) # Combine for packages that expect single data frame combined <- bind_rows(treatment, control) %>% mutate(treated = as.integer(group == "treatment")) cat("Treatment units:", nrow(treatment), "\n") cat("Control units:", nrow(control), "\n") # Baseline imbalance vars <- c("age", "education", "prior_earnings", "employed") for (v in vars) { diff <- mean(treatment[[v]]) - mean(control[[v]]) pooled_sd <- sqrt((var(treatment[[v]]) + var(control[[v]])) / 2) std_diff <- diff / pooled_sd cat(sprintf("%s: std diff = %.3f\n", v, std_diff)) } ## ----eval = requireNamespace("MatchIt", quietly = TRUE)----------------------- # if (requireNamespace("MatchIt", quietly = TRUE)) { # library(MatchIt) # # # MatchIt: Propensity score matching (default) # m_ps <- matchit( # treated ~ age + education + prior_earnings + employed, # data = combined, # method = "nearest", # distance = "glm" # Propensity score via logistic regression # ) # # cat("MatchIt (propensity score, nearest neighbor):\n") # cat(" Matched pairs:", sum(m_ps$weights[combined$treated == 1] > 0), "\n") # # # Extract matched data # matched_ps <- match.data(m_ps) # } ## ----------------------------------------------------------------------------- # couplr: Direct covariate matching result_couplr <- match_couples( left = treatment, right = control, vars = c("age", "education", "prior_earnings", "employed"), auto_scale = TRUE, scale = "robust" ) cat("\ncouplr (direct covariate matching):\n") cat(" Matched pairs:", result_couplr$info$n_matched, "\n") cat(" Mean distance:", round(mean(result_couplr$pairs$distance), 4), "\n") ## ----eval = requireNamespace("MatchIt", quietly = TRUE), fig.width=9, fig.height=5, fig.alt="Side-by-side comparison of covariate balance achieved by MatchIt and couplr, showing standardized mean differences for age, education, prior_earnings, and employed"---- # if (requireNamespace("MatchIt", quietly = TRUE)) { # # MatchIt balance # matched_treated_ps <- matched_ps %>% filter(treated == 1) # matched_control_ps <- matched_ps %>% filter(treated == 0) # # matchit_balance <- tibble( # variable = vars, # std_diff = sapply(vars, function(v) { # diff <- mean(matched_treated_ps[[v]]) - mean(matched_control_ps[[v]]) # pooled_sd <- sqrt((var(matched_treated_ps[[v]]) + var(matched_control_ps[[v]])) / 2) # diff / pooled_sd # }), # method = "MatchIt" # ) # # # couplr balance # couplr_balance <- balance_diagnostics( # result_couplr, treatment, control, vars # ) # # couplr_balance_df <- couplr_balance$var_stats %>% # select(variable, std_diff) %>% # mutate(method = "couplr") # # # Combine and plot # balance_comparison <- bind_rows(matchit_balance, couplr_balance_df) # # ggplot(balance_comparison, aes(x = variable, y = abs(std_diff), fill = method)) + # geom_col(position = "dodge") + # geom_hline(yintercept = 0.1, linetype = "dashed", color = "#93c54b") + # geom_hline(yintercept = 0.25, linetype = "dashed", color = "#f47c3c") + # labs( # title = "Covariate Balance: MatchIt vs couplr", # subtitle = "Green line = 0.1 (excellent), Orange line = 0.25 (acceptable)", # x = "Variable", # y = "|Standardized Difference|", # fill = "Method" # ) + # scale_fill_manual(values = c("MatchIt" = "#29abe0", "couplr" = "#93c54b")) + # theme_minimal() + # theme( # plot.background = element_rect(fill = "transparent", color = NA), # panel.background = element_rect(fill = "transparent", color = NA), # legend.position = "bottom" # ) # } ## ----eval = requireNamespace("optmatch", quietly = TRUE)---------------------- # if (requireNamespace("optmatch", quietly = TRUE)) { # library(optmatch) # # # Create distance matrix # dist_mat <- match_on( # treated ~ age + education + prior_earnings + employed, # data = combined, # method = "mahalanobis" # ) # # # Optimal pair matching # m_opt <- pairmatch(dist_mat, data = combined) # # n_matched <- sum(!is.na(m_opt)) / 2 # cat("optmatch (optimal pair matching):\n") # cat(" Matched pairs:", n_matched, "\n") # } ## ----------------------------------------------------------------------------- # couplr with Mahalanobis-like scaling result_couplr_maha <- match_couples( left = treatment, right = control, vars = vars, auto_scale = TRUE, scale = "standardize" # Similar to Mahalanobis diagonal ) cat("\ncouplr (optimal pair matching):\n") cat(" Matched pairs:", result_couplr_maha$info$n_matched, "\n") ## ----eval = requireNamespace("optmatch", quietly = TRUE)---------------------- # if (requireNamespace("optmatch", quietly = TRUE)) { # # Compare total distances # # (Note: Direct comparison is complex due to different distance scaling) # cat("\nBoth packages find globally optimal one-to-one assignments.\n") # cat("Total distance differences arise from distance metric choices.\n") # } ## ----eval = requireNamespace("designmatch", quietly = TRUE)------------------- # if (requireNamespace("designmatch", quietly = TRUE)) { # library(designmatch) # # # Prepare data for designmatch # t_ind <- combined$treated # # # Distance matrix (Mahalanobis) # X <- as.matrix(combined[, vars]) # dist_mat_dm <- distmat(t_ind, X) # # # Balance constraints: mean differences # mom <- list( # covs = X, # tols = rep(0.1, ncol(X)) # Tolerance for standardized difference # ) # # # Solve with balance constraints # tryCatch({ # m_dm <- bmatch( # t_ind = t_ind, # dist_mat = dist_mat_dm, # mom = mom, # solver = list(name = "glpk", approximate = 1) # ) # # cat("designmatch (balance-constrained matching):\n") # cat(" Matched pairs:", sum(m_dm$t_id > 0), "\n") # }, error = function(e) { # cat("designmatch: Balance constraints may be infeasible\n") # cat(" Try relaxing tolerances or reducing constraint count\n") # }) # } ## ----------------------------------------------------------------------------- # couplr: Optimize distance, then check balance result_couplr_dm <- match_couples( left = treatment, right = control, vars = vars, auto_scale = TRUE # No caliper - let algorithm find optimal matches ) balance_dm <- balance_diagnostics(result_couplr_dm, treatment, control, vars) cat("\ncouplr (distance-minimizing):\n") cat(" Matched pairs:", result_couplr_dm$info$n_matched, "\n") cat(" Mean |std diff|:", round(balance_dm$overall$mean_abs_std_diff, 4), "\n") cat(" Max |std diff|:", round(balance_dm$overall$max_abs_std_diff, 4), "\n") ## ----eval = requireNamespace("Matching", quietly = TRUE)---------------------- # if (requireNamespace("Matching", quietly = TRUE)) { # library(Matching) # # # Genetic matching (finds optimal weights) # X <- as.matrix(combined[, vars]) # # set.seed(123) # m_gen <- Match( # Tr = combined$treated, # X = X, # M = 1, # 1:1 matching # estimand = "ATT", # replace = FALSE # ) # # cat("Matching package (multivariate matching):\n") # cat(" Matched pairs:", length(m_gen$index.treated), "\n") # } ## ----eval = requireNamespace("Matching", quietly = TRUE)---------------------- # if (requireNamespace("Matching", quietly = TRUE)) { # # Check balance from Matching package # mb <- MatchBalance( # treated ~ age + education + prior_earnings + employed, # data = combined, # match.out = m_gen, # nboots = 0 # ) # } ## ----eval = FALSE------------------------------------------------------------- # # Stage 1: couplr for initial matching # matched <- match_couples( # left = treatment_data, # right = control_data, # vars = covariates, # auto_scale = TRUE # ) # # # Stage 2: Check balance # balance <- balance_diagnostics(matched, treatment_data, control_data, covariates) # # # Stage 3: If balance insufficient, consider alternatives # if (balance$overall$max_abs_std_diff > 0.25) { # # Try MatchIt with propensity scores # library(MatchIt) # combined <- bind_rows(treatment_data, control_data) # m_ps <- matchit(treated ~ ., data = combined, method = "full") # } # # # Stage 4: Analysis on matched data # matched_data <- join_matched(matched, treatment_data, control_data) # model <- lm(outcome ~ treatment, data = matched_data) ## ----lalonde-style------------------------------------------------------------ set.seed(1986) # NSW treatment group (randomized) - smaller sample for CRAN nsw_treat <- tibble( id = 1:100, age = pmax(17, rnorm(100, 25, 7)), education = pmax(0, pmin(16, rnorm(100, 10, 2))), black = rbinom(100, 1, 0.84), hispanic = rbinom(100, 1, 0.06), married = rbinom(100, 1, 0.19), nodegree = rbinom(100, 1, 0.71), re74 = pmax(0, rnorm(100, 2100, 5000)), re75 = pmax(0, rnorm(100, 1500, 3500)), group = "treatment" ) # CPS comparison group (observational - very different!) # Reduced from 15,815 to 500 for CRAN timing compliance cps_control <- tibble( id = 101:600, age = pmax(17, rnorm(500, 33, 11)), education = pmax(0, pmin(16, rnorm(500, 12, 3))), black = rbinom(500, 1, 0.07), hispanic = rbinom(500, 1, 0.07), married = rbinom(500, 1, 0.71), nodegree = rbinom(500, 1, 0.30), re74 = pmax(0, rnorm(500, 14000, 9000)), re75 = pmax(0, rnorm(500, 13500, 9000)), group = "control" ) cat("NSW treatment:", nrow(nsw_treat), "individuals\n") cat("CPS control:", nrow(cps_control), "individuals\n") cat("(Note: Real CPS has ~16,000 controls; reduced here for vignette timing)\n") # Baseline imbalance is severe vars_lalonde <- c("age", "education", "black", "hispanic", "married", "nodegree", "re74", "re75") cat("\nBaseline standardized differences:\n") for (v in vars_lalonde) { t_mean <- mean(nsw_treat[[v]]) c_mean <- mean(cps_control[[v]]) pooled_sd <- sqrt((var(nsw_treat[[v]]) + var(cps_control[[v]])) / 2) std_diff <- (t_mean - c_mean) / pooled_sd cat(sprintf(" %s: %.2f\n", v, std_diff)) } ## ----lalonde-matching--------------------------------------------------------- # Greedy matching (fast for large control pools) result_lalonde <- greedy_couples( left = nsw_treat, right = cps_control, vars = vars_lalonde, strategy = "pq", # Priority queue - efficient for large pools auto_scale = TRUE, scale = "robust" ) cat("Matched", result_lalonde$info$n_matched, "of", nrow(nsw_treat), "treatment units\n") cat("Mean distance:", round(mean(result_lalonde$pairs$distance), 4), "\n") ## ----lalonde-balance, fig.width=9, fig.height=5, fig.alt="Balance plot for Lalonde-style matching showing improvement in standardized differences"---- balance_lalonde <- balance_diagnostics( result_lalonde, nsw_treat, cps_control, vars_lalonde ) # Before/after comparison before_df <- tibble( variable = vars_lalonde, std_diff = sapply(vars_lalonde, function(v) { t_mean <- mean(nsw_treat[[v]]) c_mean <- mean(cps_control[[v]]) pooled_sd <- sqrt((var(nsw_treat[[v]]) + var(cps_control[[v]])) / 2) (t_mean - c_mean) / pooled_sd }), stage = "Before" ) after_df <- balance_lalonde$var_stats %>% select(variable, std_diff) %>% mutate(stage = "After") balance_plot_df <- bind_rows(before_df, after_df) %>% mutate(stage = factor(stage, levels = c("Before", "After"))) ggplot(balance_plot_df, aes(x = reorder(variable, abs(std_diff)), y = std_diff, fill = stage)) + geom_col(position = "dodge") + geom_hline(yintercept = c(-0.1, 0.1), linetype = "dashed", color = "#93c54b") + geom_hline(yintercept = c(-0.25, 0.25), linetype = "dashed", color = "#f47c3c") + coord_flip() + labs( title = "Covariate Balance: Before vs After Matching", subtitle = "Lalonde-style job training evaluation", x = "", y = "Standardized Difference", fill = "" ) + scale_fill_manual(values = c("Before" = "#d9534f", "After" = "#93c54b")) + theme_minimal() + theme( legend.position = "bottom", plot.title = element_text(face = "bold") ) ## ----lalonde-summary---------------------------------------------------------- cat("Balance summary:\n") cat(" Mean |std diff| before:", round(mean(abs(before_df$std_diff)), 3), "\n") cat(" Mean |std diff| after:", round(balance_lalonde$overall$mean_abs_std_diff, 3), "\n") cat(" Max |std diff| after:", round(balance_lalonde$overall$max_abs_std_diff, 3), "\n") if (balance_lalonde$overall$max_abs_std_diff < 0.25) { cat("\n✓ All variables within acceptable balance threshold (0.25)\n") } else { cat("\n⚠ Some variables exceed 0.25 threshold - consider calipers or blocking\n") }