## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5, error = TRUE ) library(couplr) library(dplyr) ## ----error=TRUE--------------------------------------------------------------- try({ # This fails: all assignments have Inf cost cost <- matrix(c(1, Inf, Inf, Inf, Inf, Inf, Inf, 2, 3), nrow = 3, byrow = TRUE) result <- lap_solve(cost) }) ## ----------------------------------------------------------------------------- # Check for infeasible rows check_feasibility <- function(cost_matrix) { finite_per_row <- rowSums(is.finite(cost_matrix)) infeasible_rows <- which(finite_per_row == 0) if (length(infeasible_rows) > 0) { cat("Infeasible rows (no finite costs):", infeasible_rows, "\n") return(FALSE) } finite_per_col <- colSums(is.finite(cost_matrix)) infeasible_cols <- which(finite_per_col == 0) if (length(infeasible_cols) > 0) { cat("Infeasible columns (no finite costs):", infeasible_cols, "\n") return(FALSE) } cat("Problem appears feasible\n") return(TRUE) } # Check the problematic matrix cost <- matrix(c(1, Inf, Inf, Inf, Inf, Inf, Inf, 2, 3), nrow = 3, byrow = TRUE) check_feasibility(cost) ## ----------------------------------------------------------------------------- # Remove rows with no valid assignments cost <- matrix(c(1, Inf, Inf, Inf, Inf, Inf, Inf, 2, 3), nrow = 3, byrow = TRUE) valid_rows <- rowSums(is.finite(cost)) > 0 cost_valid <- cost[valid_rows, , drop = FALSE] if (nrow(cost_valid) > 0) { result <- lap_solve(cost_valid) cat("Matched", nrow(result), "of", nrow(cost), "rows\n") } ## ----------------------------------------------------------------------------- # Replace Inf with high (but finite) penalty cost_with_fallback <- cost cost_with_fallback[!is.finite(cost_with_fallback)] <- 1e6 # Large penalty result <- lap_solve(cost_with_fallback) print(result) ## ----fig.width=7, fig.height=4, fig.alt="Density plot showing overlap between treatment and control groups on the age variable"---- # Simulate poor overlap scenario set.seed(123) left <- tibble(id = 1:50, age = rnorm(50, mean = 25, sd = 3)) right <- tibble(id = 1:50, age = rnorm(50, mean = 55, sd = 3)) # Visualize overlap library(ggplot2) combined <- bind_rows( left %>% mutate(group = "Left"), right %>% mutate(group = "Right") ) ggplot(combined, aes(x = age, fill = group)) + geom_density(alpha = 0.5) + labs(title = "Poor Covariate Overlap", subtitle = "No overlap means no good matches possible") + theme_minimal() + theme(plot.background = element_rect(fill = "transparent", color = NA), panel.background = element_rect(fill = "transparent", color = NA)) ## ----------------------------------------------------------------------------- # Example of poor balance set.seed(456) left <- tibble( id = 1:100, age = rnorm(100, 30, 5), income = rnorm(100, 80000, 20000) # Very different from right ) right <- tibble( id = 1:100, age = rnorm(100, 32, 5), income = rnorm(100, 40000, 10000) # Very different income ) result <- match_couples(left, right, vars = c("age", "income"), auto_scale = TRUE) balance <- balance_diagnostics(result, left, right, vars = c("age", "income")) print(balance) ## ----eval=FALSE--------------------------------------------------------------- # # Include additional relevant variables # result <- match_couples( # left, right, # vars = c("age", "income", "education", "region"), # More variables # auto_scale = TRUE # ) ## ----------------------------------------------------------------------------- result_strict <- match_couples( left, right, vars = c("age", "income"), max_distance = 0.3, # Stricter caliper auto_scale = TRUE ) cat("Original matches:", result$info$n_matched, "\n") cat("With caliper:", result_strict$info$n_matched, "\n") balance_strict <- balance_diagnostics(result_strict, left, right, vars = c("age", "income")) print(balance_strict) ## ----eval=FALSE--------------------------------------------------------------- # # Block on income tertiles to ensure exact balance # left$income_cat <- cut(left$income, breaks = 3, labels = c("low", "mid", "high")) # right$income_cat <- cut(right$income, breaks = 3, labels = c("low", "mid", "high")) # # blocks <- matchmaker(left, right, block_type = "group", block_by = "income_cat") # result_blocked <- match_couples( # blocks$left, blocks$right, # vars = c("age"), # Match on age within income blocks # block_id = "block_id" # ) ## ----------------------------------------------------------------------------- # Compare scaling methods for (scale_method in c("robust", "standardize", "range")) { res <- match_couples(left, right, vars = c("age", "income"), auto_scale = TRUE, scale = scale_method) bal <- balance_diagnostics(res, left, right, vars = c("age", "income")) cat(scale_method, "- max |std_diff|:", round(bal$overall$max_abs_std_diff, 3), "\n") } ## ----------------------------------------------------------------------------- # Estimate runtime estimate_runtime <- function(n, seconds_per_billion = 1) { ops <- n^3 time_sec <- ops / 1e9 * seconds_per_billion if (time_sec < 60) { sprintf("%.1f seconds", time_sec) } else if (time_sec < 3600) { sprintf("%.1f minutes", time_sec / 60) } else { sprintf("%.1f hours", time_sec / 3600) } } cat("Estimated runtime for optimal matching:\n") for (n in c(100, 500, 1000, 3000, 5000, 10000)) { cat(sprintf(" n = %5d: %s\n", n, estimate_runtime(n))) } ## ----------------------------------------------------------------------------- set.seed(789) n <- 500 large_left <- tibble(id = 1:n, x1 = rnorm(n), x2 = rnorm(n)) large_right <- tibble(id = 1:n, x1 = rnorm(n), x2 = rnorm(n)) # Greedy is much faster time_greedy <- system.time({ result_greedy <- greedy_couples( large_left, large_right, vars = c("x1", "x2"), strategy = "row_best" ) }) cat("Greedy matching (n=500):", round(time_greedy["elapsed"], 2), "seconds\n") cat("Quality (mean distance):", round(mean(result_greedy$pairs$distance), 4), "\n") ## ----eval=FALSE--------------------------------------------------------------- # # Create clusters to match within # blocks <- matchmaker( # large_left, large_right, # block_type = "cluster", # block_vars = c("x1", "x2"), # n_blocks = 10 # 10 blocks of ~200 each # ) # # # Match within blocks (10 x O(200^3) << O(2000^3)) # result_blocked <- match_couples( # blocks$left, blocks$right, # vars = c("x1", "x2"), # block_id = "block_id" # ) ## ----eval=FALSE--------------------------------------------------------------- # # For n > 1000, auction algorithm often faster # result <- match_couples( # large_left, large_right, # vars = c("x1", "x2"), # method = "auction" # ) # # # For sparse problems (many forbidden pairs) # result <- match_couples( # left, right, # vars = vars, # max_distance = 0.5, # Creates sparsity # method = "sap" # Sparse algorithm # ) ## ----------------------------------------------------------------------------- # Compute once, reuse multiple times dist_cache <- compute_distances( large_left, large_right, vars = c("x1", "x2"), scale = "robust" ) # Fast: reuse cached distances result1 <- match_couples(dist_cache, max_distance = 0.3) result2 <- match_couples(dist_cache, max_distance = 0.5) result3 <- match_couples(dist_cache, max_distance = 1.0) ## ----------------------------------------------------------------------------- # Memory requirements memory_needed <- function(n) { bytes <- 8 * n^2 if (bytes < 1e6) { sprintf("%.1f KB", bytes / 1e3) } else if (bytes < 1e9) { sprintf("%.1f MB", bytes / 1e6) } else { sprintf("%.1f GB", bytes / 1e9) } } cat("Memory for full distance matrix:\n") for (n in c(1000, 5000, 10000, 20000, 50000)) { cat(sprintf(" n = %5d: %s\n", n, memory_needed(n))) } ## ----eval=FALSE--------------------------------------------------------------- # # Greedy computes distances on-the-fly # result <- greedy_couples( # left, right, # vars = covariates, # strategy = "row_best" # Most memory-efficient # ) ## ----eval=FALSE--------------------------------------------------------------- # # Each block is much smaller # blocks <- matchmaker(left, right, block_type = "cluster", n_blocks = 20) # result <- match_couples(blocks$left, blocks$right, vars = vars, block_id = "block_id") ## ----eval=FALSE--------------------------------------------------------------- # # Caliper excludes distant pairs (sparse representation) # result <- match_couples( # left, right, # vars = covariates, # max_distance = 0.5, # method = "sap" # Sparse-optimized algorithm # ) ## ----eval=FALSE--------------------------------------------------------------- # # Increase to 16 GB (if available) # memory.limit(size = 16000) ## ----------------------------------------------------------------------------- cost <- matrix(c(1, 2, 2, 2, 1, 2, 2, 2, 1), nrow = 3, byrow = TRUE) result_jv <- lap_solve(cost, method = "jv") result_hungarian <- lap_solve(cost, method = "hungarian") cat("JV assignment: ", result_jv$target, "\n") cat("Hungarian assignment:", result_hungarian$target, "\n") cat("JV total cost: ", get_total_cost(result_jv), "\n") cat("Hungarian total cost:", get_total_cost(result_hungarian), "\n") ## ----------------------------------------------------------------------------- # Check for ties check_ties <- function(cost_matrix) { n <- nrow(cost_matrix) # Check if diagonal dominates (trivial ties) diag_costs <- diag(cost_matrix) if (length(unique(diag_costs)) < n) { cat("Tied costs on diagonal - multiple optima likely\n") } # Check cost uniqueness unique_costs <- length(unique(as.vector(cost_matrix))) total_entries <- length(cost_matrix) if (unique_costs < total_entries * 0.5) { cat("Many repeated costs - ties possible\n") } } check_ties(cost) ## ----------------------------------------------------------------------------- # Total cost should be identical stopifnot(get_total_cost(result_jv) == get_total_cost(result_hungarian)) cat("Both methods found optimal solutions (same total cost)\n") ## ----------------------------------------------------------------------------- # Hungarian has consistent tie-breaking result <- lap_solve(cost, method = "hungarian") ## ----------------------------------------------------------------------------- set.seed(42) cost_perturbed <- cost + matrix(rnorm(9, 0, 1e-10), 3, 3) result <- lap_solve(cost_perturbed) ## ----error=TRUE--------------------------------------------------------------- try({ left <- tibble(id = 1:5, age = c(25, 30, NA, 35, 40)) right <- tibble(id = 1:5, age = c(28, 32, 33, 36, 42)) # This may fail or give unexpected results result <- match_couples(left, right, vars = "age") }) ## ----------------------------------------------------------------------------- left_clean <- left %>% filter(!is.na(age)) right_clean <- right %>% filter(!is.na(age)) result <- match_couples(left_clean, right_clean, vars = "age") cat("Matched", result$info$n_matched, "pairs (excluded 1 left unit with NA)\n") ## ----------------------------------------------------------------------------- # Simple mean imputation left_imputed <- left %>% mutate(age = if_else(is.na(age), mean(age, na.rm = TRUE), age)) result <- match_couples(left_imputed, right, vars = "age") cat("Matched", result$info$n_matched, "pairs (imputed 1 NA with mean)\n") ## ----------------------------------------------------------------------------- health <- preprocess_matching_vars( left, right, vars = "age" ) print(health) ## ----eval=FALSE--------------------------------------------------------------- # # In R: # remove.packages("couplr") # install.packages("couplr") # # # Or from GitHub: # devtools::install_github("gcol33/couplr", force = TRUE) ## ----eval=FALSE--------------------------------------------------------------- # devtools::clean_dll() # devtools::load_all() ## ----------------------------------------------------------------------------- # Check if Rtools is properly configured Sys.which("make") Sys.which("g++") ## ----------------------------------------------------------------------------- cost <- matrix(c(1, NA, 3, 4), 2, 2) cost[is.na(cost)] <- Inf # Mark as forbidden result <- lap_solve(cost) ## ----------------------------------------------------------------------------- # Always verify data before matching stopifnot(nrow(left) > 0, nrow(right) > 0) ## ----eval=FALSE--------------------------------------------------------------- # # Minimal example template # library(couplr) # # # Minimal data that reproduces the issue # set.seed(123) # left <- tibble(id = 1:10, x = rnorm(10)) # right <- tibble(id = 1:10, x = rnorm(10)) # # # Code that causes the error # result <- match_couples(left, right, vars = "x") # # # Expected vs actual behavior # # Expected: ... # # Actual: [error message] # # # Session info # sessionInfo()