## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5 ) library(couplr) library(dplyr) library(ggplot2) ## ----basic-example------------------------------------------------------------ # Simulate observational data set.seed(123) n_left <- 100 n_right <- 150 # Treatment group (tends to be younger, higher income) left_data <- tibble( id = 1:n_left, age = rnorm(n_left, mean = 45, sd = 10), income = rnorm(n_left, mean = 60000, sd = 15000), education = sample(c("HS", "BA", "MA"), n_left, replace = TRUE), group = "treatment" ) # Control group (older, lower income on average) right_data <- tibble( id = 1:n_right, age = rnorm(n_right, mean = 52, sd = 12), income = rnorm(n_right, mean = 50000, sd = 18000), education = sample(c("HS", "BA", "MA"), n_right, replace = TRUE), group = "control" ) # Perform optimal matching result <- match_couples( left = left_data, right = right_data, vars = c("age", "income"), auto_scale = TRUE, return_diagnostics = TRUE ) # View matched pairs head(result$pairs) # Summary statistics result$info ## ----inspect-output, fig.alt="Histogram showing the distribution of match distances, with most matches having low distances indicating good match quality"---- # How many matched? cat("Matched pairs:", result$info$n_matched, "\n") cat("Unmatched left:", nrow(result$left_unmatched), "\n") cat("Unmatched right:", nrow(result$right_unmatched), "\n") # Distribution of match distances summary(result$pairs$distance) # Visualize match quality ggplot(result$pairs, aes(x = distance)) + geom_histogram(bins = 30, fill = "#29abe0", alpha = 0.7) + labs( title = "Distribution of Match Distances", x = "Euclidean Distance (scaled)", y = "Count" ) + theme_minimal() + theme(plot.background = element_rect(fill = "transparent", color = NA), panel.background = element_rect(fill = "transparent", color = NA), panel.grid = element_blank()) ## ----preprocessing-demo------------------------------------------------------- # Create data with scaling challenges set.seed(456) challenging_data <- tibble( id = 1:50, age = rnorm(50, 50, 10), # Years (reasonable scale) income = rnorm(50, 60000, 20000), # Dollars (large scale) bmi = rnorm(50, 25, 5), # Ratio (small scale) smoker = sample(0:1, 50, replace = TRUE), # Binary education = factor( sample(c("HS", "BA", "MA", "PhD"), 50, replace = TRUE), ordered = TRUE, levels = c("HS", "BA", "MA", "PhD") ) ) left_chal <- challenging_data[1:25, ] right_chal <- challenging_data[26:50, ] # Match WITHOUT auto-scaling (income dominates) result_no_scale <- match_couples( left_chal, right_chal, vars = c("age", "income", "bmi"), auto_scale = FALSE ) # Match WITH auto-scaling (all variables contribute) result_scaled <- match_couples( left_chal, right_chal, vars = c("age", "income", "bmi"), auto_scale = TRUE, scale = "robust" # Median/MAD scaling (robust to outliers) ) # Compare match quality cat("Without scaling - mean distance:", mean(result_no_scale$pairs$distance), "\n") cat("With scaling - mean distance:", mean(result_scaled$pairs$distance), "\n") ## ----scaling-comparison------------------------------------------------------- # Demonstrate scaling methods demo_var <- c(10, 20, 25, 30, 100) # Contains outlier (100) # Function to show scaling show_scaling <- function(x, method) { left_demo <- tibble(id = 1:3, var = x[1:3]) right_demo <- tibble(id = 1:2, var = x[4:5]) result <- match_couples( left_demo, right_demo, vars = "var", auto_scale = TRUE, scale = method, return_diagnostics = TRUE ) # Extract scaled values from cost matrix cat(method, "scaling:\n") cat(" Original:", x, "\n") cat(" Distance matrix diagonal:", diag(result$cost_matrix)[1:2], "\n\n") } show_scaling(demo_var, "robust") show_scaling(demo_var, "standardize") show_scaling(demo_var, "range") ## ----health-checks, eval=FALSE------------------------------------------------ # # Create data with issues # problematic_data <- tibble( # id = 1:100, # age = rnorm(100, 50, 10), # constant_var = 5, # No variation - will warn # mostly_missing = c(rnorm(20, 50, 10), rep(NA, 80)), # >50% missing # extreme_skew = rexp(100, rate = 0.1) # Very skewed # ) # # # Attempt matching - will show warnings # result <- match_couples( # problematic_data[1:50, ], # problematic_data[51:100, ], # vars = c("age", "constant_var", "mostly_missing", "extreme_skew"), # auto_scale = TRUE # ) # # # Warning messages will indicate: # # - "constant_var excluded (SD = 0)" # # - "mostly_missing has 80% missing values" # # - "extreme_skew has high skewness (3.2)" ## ----optimal-vs-greedy-------------------------------------------------------- # Create moderately large dataset set.seed(789) n <- 1000 large_left <- tibble( id = 1:n, x1 = rnorm(n), x2 = rnorm(n), x3 = rnorm(n) ) large_right <- tibble( id = 1:n, x1 = rnorm(n), x2 = rnorm(n), x3 = rnorm(n) ) # Optimal matching time_optimal <- system.time({ result_optimal <- match_couples( large_left, large_right, vars = c("x1", "x2", "x3"), method = "hungarian" ) }) # Greedy matching (row_best strategy) time_greedy <- system.time({ result_greedy <- greedy_couples( large_left, large_right, vars = c("x1", "x2", "x3"), strategy = "row_best" ) }) # Compare cat("Optimal matching:\n") cat(" Time:", round(time_optimal["elapsed"], 3), "seconds\n") cat(" Mean distance:", round(mean(result_optimal$pairs$distance), 4), "\n\n") cat("Greedy matching:\n") cat(" Time:", round(time_greedy["elapsed"], 3), "seconds\n") cat(" Mean distance:", round(mean(result_greedy$pairs$distance), 4), "\n") cat(" Speedup:", round(time_optimal["elapsed"] / time_greedy["elapsed"], 1), "x\n") ## ----greedy-strategies-------------------------------------------------------- # Compare greedy strategies on same data set.seed(101) test_left <- tibble(id = 1:200, x = rnorm(200)) test_right <- tibble(id = 1:200, x = rnorm(200)) strategies <- c("sorted", "row_best", "pq") results <- list() for (strat in strategies) { time <- system.time({ result <- greedy_couples( test_left, test_right, vars = "x", strategy = strat ) }) results[[strat]] <- list( time = time["elapsed"], mean_dist = mean(result$pairs$distance), total_dist = result$info$total_distance ) } # Display comparison comparison <- do.call(rbind, lapply(names(results), function(s) { data.frame( strategy = s, time_sec = round(results[[s]]$time, 4), mean_distance = round(results[[s]]$mean_dist, 4), total_distance = round(results[[s]]$total_dist, 2) ) })) print(comparison) ## ----calipers, fig.alt="Overlapping histograms comparing match distances with and without caliper constraint, showing the caliper excludes distant matches"---- # Create data where some units are far apart set.seed(202) left_cal <- tibble( id = 1:50, x = c(rnorm(40, mean = 0, sd = 1), rnorm(10, mean = 5, sd = 0.5)) # Some outliers ) right_cal <- tibble( id = 1:50, x = rnorm(50, mean = 0, sd = 1) ) # Match without caliper - pairs everything result_no_cal <- match_couples( left_cal, right_cal, vars = "x", auto_scale = FALSE ) # Match with caliper - excludes poor matches result_with_cal <- match_couples( left_cal, right_cal, vars = "x", max_distance = 1.5, # Caliper: max distance = 1.5 auto_scale = FALSE ) cat("Without caliper:\n") cat(" Matched:", result_no_cal$info$n_matched, "\n") cat(" Mean distance:", round(mean(result_no_cal$pairs$distance), 3), "\n") cat(" Max distance:", round(max(result_no_cal$pairs$distance), 3), "\n\n") cat("With caliper (1.5):\n") cat(" Matched:", result_with_cal$info$n_matched, "\n") cat(" Mean distance:", round(mean(result_with_cal$pairs$distance), 3), "\n") cat(" Max distance:", round(max(result_with_cal$pairs$distance), 3), "\n") # Visualize caliper effect ggplot(result_no_cal$pairs, aes(x = distance)) + geom_histogram(aes(fill = "No caliper"), bins = 30, alpha = 0.5) + geom_histogram( data = result_with_cal$pairs, aes(fill = "With caliper"), bins = 30, alpha = 0.5 ) + geom_vline(xintercept = 1.5, linetype = "dashed", color = "red") + labs( title = "Caliper Effect on Match Distances", x = "Distance", y = "Count", fill = "Condition" ) + theme_minimal() + theme(plot.background = element_rect(fill = "transparent", color = NA), panel.background = element_rect(fill = "transparent", color = NA), legend.background = element_rect(fill = "transparent", color = NA), panel.grid = element_blank()) ## ----caliper-sd, eval=FALSE--------------------------------------------------- # # Calculate pooled SD # combined <- bind_rows( # left_data %>% mutate(group = "left"), # right_data %>% mutate(group = "right") # ) # # pooled_sd <- sd(combined$age) # For single variable # caliper_width <- 0.2 * pooled_sd # # result <- match_couples( # left_data, right_data, # vars = "age", # max_distance = caliper_width # ) ## ----caliper-empirical-------------------------------------------------------- # Fit all matches first all_matches <- match_couples(left_cal, right_cal, vars = "x") # Choose caliper at 90th percentile caliper_90 <- quantile(all_matches$pairs$distance, 0.90) # Refit with caliper refined_matches <- match_couples( left_cal, right_cal, vars = "x", max_distance = caliper_90 ) cat("90th percentile caliper:", round(caliper_90, 3), "\n") cat("Matches retained:", round(100 * refined_matches$info$n_matched / all_matches$info$n_matched, 1), "%\n") ## ----blocking-exact----------------------------------------------------------- # Create multi-site data set.seed(303) multi_site <- bind_rows( tibble( id = 1:100, site = sample(c("A", "B", "C"), 100, replace = TRUE), age = rnorm(100, 50, 10), income = rnorm(100, 55000, 15000), group = "treatment" ), tibble( id = 101:250, site = sample(c("A", "B", "C"), 150, replace = TRUE), age = rnorm(150, 50, 10), income = rnorm(150, 55000, 15000), group = "control" ) ) left_site <- multi_site %>% filter(group == "treatment") right_site <- multi_site %>% filter(group == "control") # Create exact blocks by site blocks <- matchmaker( left = left_site, right = right_site, block_type = "group", block_by = "site" ) cat("Blocking structure:\n") print(blocks$block_summary) # Match within blocks result_blocked <- match_couples( left = blocks$left, right = blocks$right, vars = c("age", "income"), block_id = "block_id", # Use block IDs from matchmaker auto_scale = TRUE ) # Verify exact site balance result_blocked$pairs %>% mutate(left_id = as.integer(left_id), right_id = as.integer(right_id)) %>% left_join(left_site %>% select(id, site), by = c("left_id" = "id")) %>% left_join(right_site %>% select(id, site), by = c("right_id" = "id"), suffix = c("_left", "_right")) %>% count(site_left, site_right) ## ----blocking-cluster--------------------------------------------------------- # Create blocks based on age groups (data-driven) cluster_blocks <- matchmaker( left = left_site, right = right_site, block_type = "cluster", block_vars = "age", n_blocks = 3 ) cat("Cluster-based blocks:\n") print(cluster_blocks$block_summary) # Match within clusters result_clustered <- match_couples( left = cluster_blocks$left, right = cluster_blocks$right, vars = c("age", "income"), block_id = "block_id", auto_scale = TRUE ) # Show age distribution by cluster cluster_blocks$left %>% group_by(block_id) %>% summarise( n = n(), mean_age = mean(age), sd_age = sd(age) ) ## ----balance-assessment------------------------------------------------------- # Perform matching match_result <- match_couples( left = left_data, right = right_data, vars = c("age", "income"), auto_scale = TRUE ) # Get matched samples matched_left <- left_data %>% filter(id %in% match_result$pairs$left_id) matched_right <- right_data %>% filter(id %in% match_result$pairs$right_id) # Compute balance diagnostics balance <- balance_diagnostics( result = match_result, left = left_data, right = right_data, vars = c("age", "income") ) # Print balance summary print(balance) # Extract balance table for reporting balance_table(balance) ## ----balance-plots, fig.alt="Bar chart comparing standardized differences before and after matching, with threshold lines at 0.1 and 0.25 showing improved balance after matching"---- # Before-after balance plot # (Requires creating pre-match balance for comparison) # For pre-match comparison, just compute summary statistics directly pre_match_stats <- tibble( variable = c("age", "income"), std_diff = c( (mean(left_data$age) - mean(right_data$age)) / sqrt((sd(left_data$age)^2 + sd(right_data$age)^2) / 2), (mean(left_data$income) - mean(right_data$income)) / sqrt((sd(left_data$income)^2 + sd(right_data$income)^2) / 2) ), when = "Before" ) # Combine for plotting balance_comparison <- bind_rows( pre_match_stats, balance$var_stats %>% select(variable, std_diff) %>% mutate(when = "After") ) ggplot(balance_comparison, aes(x = variable, y = std_diff, fill = when)) + geom_hline(yintercept = c(-0.1, 0.1), linetype = "dashed", color = "#93c54b", linewidth = 0.8) + geom_hline(yintercept = c(-0.25, 0.25), linetype = "dashed", color = "#f47c3c", linewidth = 0.8) + geom_col(position = "dodge", width = 0.5) + geom_label(aes(x = 1.5, y = -0.1, label = "±0.1 excellent"), fill = "#93c54b", color = "white", inherit.aes = FALSE, size = 3, fontface = "bold", label.padding = unit(0.2, "lines")) + geom_label(aes(x = 1.5, y = 0.25, label = "±0.25 acceptable"), fill = "#f47c3c", color = "white", inherit.aes = FALSE, size = 3, fontface = "bold", label.padding = unit(0.2, "lines")) + labs( title = "Covariate Balance Before and After Matching", x = "Variable", y = "Standardized Difference", fill = "Timing" ) + theme_minimal() + theme(plot.background = element_rect(fill = "transparent", color = NA), panel.background = element_rect(fill = "transparent", color = NA), legend.background = element_rect(fill = "transparent", color = NA), panel.grid = element_blank()) + coord_flip() ## ----real-world-data---------------------------------------------------------- set.seed(404) # Simulate realistic scenario with selection bias # Program attracts younger, more educated, currently employed individuals create_participant <- function(n, is_treatment) { if (is_treatment) { tibble( id = 1:n, age = rnorm(n, mean = 35, sd = 8), education_years = rnorm(n, mean = 14, sd = 2), prior_earnings = rnorm(n, mean = 35000, sd = 10000), employed = sample(c(0, 1), n, replace = TRUE, prob = c(0.3, 0.7)), treatment = 1 ) } else { tibble( id = (n+1):(n+500), age = rnorm(500, mean = 42, sd = 12), education_years = rnorm(500, mean = 12, sd = 3), prior_earnings = rnorm(500, mean = 30000, sd = 12000), employed = sample(c(0, 1), 500, replace = TRUE, prob = c(0.5, 0.5)), treatment = 0 ) } } treatment_group <- create_participant(200, TRUE) control_group <- create_participant(500, FALSE) # Simulate outcome (earnings) with treatment effect # True effect: +$5,000, with heterogeneity treatment_group <- treatment_group %>% mutate( earnings = prior_earnings + 5000 + # True treatment effect 2000 * rnorm(n()) + # Random variation 100 * education_years # Education effect ) control_group <- control_group %>% mutate( earnings = prior_earnings + 2000 * rnorm(n()) + 100 * education_years ) # Examine baseline imbalance cat("Pre-matching differences:\n") cat("Age diff:", mean(treatment_group$age) - mean(control_group$age), "\n") cat("Education diff:", mean(treatment_group$education_years) - mean(control_group$education_years), "\n") cat("Prior earnings diff:", mean(treatment_group$prior_earnings) - mean(control_group$prior_earnings), "\n") ## ----real-world-matching------------------------------------------------------ # Match on baseline covariates job_match <- match_couples( left = treatment_group, right = control_group, vars = c("age", "education_years", "prior_earnings", "employed"), auto_scale = TRUE, scale = "robust", return_diagnostics = TRUE ) cat("Matching summary:\n") cat(" Treated units:", nrow(treatment_group), "\n") cat(" Matched treated:", job_match$info$n_matched, "\n") cat(" Match rate:", round(100 * job_match$info$n_matched / nrow(treatment_group), 1), "%\n") ## ----real-world-balance------------------------------------------------------- # Extract matched samples matched_treated <- treatment_group %>% filter(id %in% job_match$pairs$left_id) matched_control <- control_group %>% filter(id %in% job_match$pairs$right_id) # Compute balance job_balance <- balance_diagnostics( result = job_match, left = treatment_group, right = control_group, vars = c("age", "education_years", "prior_earnings", "employed") ) print(job_balance) # Check overall balance quality cat("\nOverall balance:\n") cat(" Mean |std diff|:", round(job_balance$overall$mean_abs_std_diff, 3), "\n") cat(" Max |std diff|:", round(job_balance$overall$max_abs_std_diff, 3), "\n") cat(" % with |std diff| > 0.1:", round(job_balance$overall$pct_large_imbalance, 1), "%\n") ## ----real-world-ate----------------------------------------------------------- # Naive estimate (without matching) - BIASED naive_effect <- mean(treatment_group$earnings) - mean(control_group$earnings) # Matched estimate - accounts for baseline differences matched_effect <- mean(matched_treated$earnings) - mean(matched_control$earnings) # Paired t-test for significance paired_comparison <- tibble( treated = matched_treated$earnings, control = matched_control$earnings[match( matched_treated$id, job_match$pairs$left_id )] ) t_test <- t.test(paired_comparison$treated, paired_comparison$control, paired = TRUE) # Report results cat("Treatment Effect Estimates:\n\n") cat("Naive (unmatched):\n") cat(" Difference: $", round(naive_effect, 0), "\n") cat(" (Upward biased due to selection)\n\n") cat("Matched estimate:\n") cat(" Difference: $", round(matched_effect, 0), "\n") cat(" 95% CI: ($", round(t_test$conf.int[1], 0), ", $", round(t_test$conf.int[2], 0), ")\n") cat(" P-value:", format.pval(t_test$p.value, digits = 3), "\n") cat(" (Closer to true effect of $5,000)\n") ## ----real-world-table, eval=FALSE--------------------------------------------- # # Table 1: Balance table # balance_publication <- balance_table(job_balance) # print(balance_publication) # # # Table 2: Sample characteristics # sample_table <- bind_rows( # matched_treated %>% # summarise( # Group = "Treatment", # N = n(), # `Age (mean ± SD)` = sprintf("%.1f ± %.1f", mean(age), sd(age)), # `Education (years)` = sprintf("%.1f ± %.1f", mean(education_years), sd(education_years)), # `Prior Earnings` = sprintf("$%s ± %s", # format(round(mean(prior_earnings)), big.mark = ","), # format(round(sd(prior_earnings)), big.mark = ",")), # `Employed (%)` = sprintf("%.1f", 100 * mean(employed)) # ), # matched_control %>% # summarise( # Group = "Control", # N = n(), # `Age (mean ± SD)` = sprintf("%.1f ± %.1f", mean(age), sd(age)), # `Education (years)` = sprintf("%.1f ± %.1f", mean(education_years), sd(education_years)), # `Prior Earnings` = sprintf("$%s ± %s", # format(round(mean(prior_earnings)), big.mark = ","), # format(round(sd(prior_earnings)), big.mark = ",")), # `Employed (%)` = sprintf("%.1f", 100 * mean(employed)) # ) # ) # # print(sample_table) # # # Table 3: Treatment effect # effect_table <- tibble( # Method = c("Unmatched", "Matched"), # `N (Treated)` = c(nrow(treatment_group), nrow(matched_treated)), # `N (Control)` = c(nrow(control_group), nrow(matched_control)), # `Effect Estimate` = sprintf("$%s", format(round(c(naive_effect, matched_effect)), big.mark = ",")), # `95% CI` = c("--", sprintf("($%s, $%s)", # format(round(t_test$conf.int[1]), big.mark = ","), # format(round(t_test$conf.int[2]), big.mark = ","))) # ) # # print(effect_table) ## ----optimization-blocking, eval=FALSE---------------------------------------- # # Instead of matching 10,000 × 10,000: # # Create 10 blocks of ~1,000 × 1,000 each # blocks <- matchmaker( # left_large, right_large, # block_type = "cluster", # cluster_vars = "age", # n_clusters = 10 # ) # # # Much faster: 10 * O(1000^3) << O(10000^3) # result <- match_couples( # blocks$left, blocks$right, # vars = covariates, # block_id = "block_id" # ) ## ----optimization-greedy, eval=FALSE------------------------------------------ # # Quick greedy match for exploration # quick <- greedy_couples( # left_data, right_data, # vars = covariates, # strategy = "row_best" # ) # # # Assess balance # balance_quick <- balance_diagnostics(quick, left_data, right_data, vars = covariates) # # # If balance is acceptable, done! # # If not, try optimal or add blocking ## ----optimization-caliper, eval=FALSE----------------------------------------- # # Caliper removes distant pairs from cost matrix # # Can dramatically reduce effective problem size # result <- match_couples( # left_data, right_data, # vars = covariates, # max_distance = 0.25, # Strict caliper # auto_scale = TRUE # ) ## ----poor-balance-fix, eval=FALSE--------------------------------------------- # # 1. Add more matching variables # result <- match_couples(left, right, # vars = c("age", "income", "education", "region"), # Added! # auto_scale = TRUE) # # # 2. Tighten caliper (fewer but better matches) # result <- match_couples(left, right, vars = vars, # max_distance = 0.1) # Was 0.5 # # # 3. Block on the problematic variable # blocks <- matchmaker(left, right, block_type = "group", block_by = "region") # result <- match_couples(blocks$left, blocks$right, vars = other_vars, # block_id = "block_id") ## ----few-matches-diagnosis, eval=FALSE---------------------------------------- # # Check covariate overlap # library(ggplot2) # combined <- bind_rows( # left %>% mutate(group = "treatment"), # right %>% mutate(group = "control") # ) # ggplot(combined, aes(x = age, fill = group)) + # geom_density(alpha = 0.5) + # labs(title = "Check for Overlap") ## ----slow-matching-fix, eval=FALSE-------------------------------------------- # # For n > 3000: use greedy # result <- greedy_couples(left, right, vars = vars, strategy = "sorted") # # # For n > 5000: add blocking # blocks <- matchmaker(left, right, block_type = "cluster", n_blocks = 20) # result <- match_couples(blocks$left, blocks$right, vars = vars, # block_id = "block_id") ## ----workflow-diagram, fig.width=8, fig.height=6, echo=FALSE, fig.alt="Flowchart showing recommended matching workflow with iterative refinement loop"---- library(ggplot2) # Define workflow nodes nodes <- data.frame( label = c("1. Explore data\nIdentify confounders", "2. First match\nmatch_couples()", "3. Check balance\nbalance_diagnostics()", "Balance\nOK?", "4. Estimate\neffect", "Refine: caliper,\nblocking, more vars"), x = c(4, 4, 4, 4, 2, 6), y = c(6, 5, 4, 3, 2, 2), type = c("step", "step", "step", "decision", "step", "step") ) # Define edges edges <- data.frame( from_x = c(4, 4, 4, 4, 4, 6), from_y = c(6, 5, 4, 3, 3, 2), to_x = c(4, 4, 4, 2, 6, 4), to_y = c(5, 4, 3, 2, 2, 4), label = c("", "", "", "Yes", "No", ""), label_x = c(NA, NA, NA, 2.8, 5.2, 5.5), label_y = c(NA, NA, NA, 2.6, 2.6, 3) ) ggplot() + # Draw straight edges (vertical arrows) geom_segment(data = edges[1:3, ], aes(x = from_x, y = from_y, xend = to_x, yend = to_y), arrow = arrow(length = unit(0.15, "cm"), type = "closed"), color = "#3e3f3a", linewidth = 0.7) + # Draw decision branches geom_segment(data = edges[4:5, ], aes(x = from_x, y = from_y, xend = to_x, yend = to_y), arrow = arrow(length = unit(0.15, "cm"), type = "closed"), color = "#3e3f3a", linewidth = 0.7) + # Draw return loop (curved) geom_curve(data = edges[6, ], aes(x = from_x, y = from_y, xend = to_x, yend = to_y), arrow = arrow(length = unit(0.15, "cm"), type = "closed"), color = "#3e3f3a", linewidth = 0.7, curvature = 0.3) + # Edge labels - no explicit color so CSS can control dark mode geom_text(data = edges[!is.na(edges$label_x), ], aes(x = label_x, y = label_y, label = label), size = 3, fontface = "italic") + # Step nodes (rectangles) - sandstone info color with white text geom_label(data = nodes[nodes$type == "step", ], aes(x = x, y = y, label = label), size = 3, fill = "#29abe0", color = "white", label.padding = unit(0.4, "lines"), label.r = unit(0.15, "lines"), lineheight = 0.9) + # Decision node (diamond) - sandstone warning color geom_point(data = nodes[nodes$type == "decision", ], aes(x = x, y = y), shape = 23, size = 20, fill = "#ffc107", color = "#f47c3c", stroke = 1.5) + # Decision text - use default color that adapts to dark mode geom_text(data = nodes[nodes$type == "decision", ], aes(x = x, y = y, label = label), size = 3, lineheight = 0.85) + # Title labs(title = "Matching Workflow") + theme_minimal() + theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12), axis.text = element_blank(), axis.title = element_blank(), panel.grid = element_blank(), plot.background = element_rect(fill = "transparent", color = NA), panel.background = element_rect(fill = "transparent", color = NA)) + coord_cartesian(xlim = c(0, 8), ylim = c(1.5, 6.5))