## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  message = FALSE,
  warning = FALSE,
  eval = TRUE
)

## ----example-data--------------------------------------------------------------
library(bioLeak)

set.seed(123)
n <- 160
subject <- rep(seq_len(40), each = 4)
batch <- sample(paste0("B", 1:6), n, replace = TRUE)
study <- sample(paste0("S", 1:4), n, replace = TRUE)
time <- seq_len(n)
x1 <- rnorm(n)
x2 <- rnorm(n)
x3 <- rnorm(n)
linpred <- 0.7 * x1 - 0.4 * x2 + 0.2 * x3 + rnorm(n, sd = 0.5)
p <- stats::plogis(linpred)
outcome <- factor(ifelse(runif(n) < p, "case", "control"),
                  levels = c("control", "case"))

df <- data.frame(
  subject = subject,
  batch = batch,
  study = study,
  time = time,
  outcome = outcome,
  x1 = x1,
  x2 = x2,
  x3 = x3
)

df_leaky <- within(df, {
  leak_subject <- ave(as.numeric(outcome == "case"), subject, FUN = mean)
  leak_batch <- ave(as.numeric(outcome == "case"), batch, FUN = mean)
  leak_global <- mean(as.numeric(outcome == "case"))
})

df_time <- df
df_time$leak_future <- c(as.numeric(df_time$outcome == "case")[-1], 0)

predictors <- c("x1", "x2", "x3")

# Example data (first 6 rows)
head(df)

# Outcome class counts
as.data.frame(table(df$outcome))

## ----leaky-splits---------------------------------------------------------------
leaky_splits <- make_split_plan(
  df,
  outcome = "outcome",
  mode = "subject_grouped",
  group = "row_id",
  v = 5,
  repeats = 1,
  stratify = TRUE
)
cat("Leaky splits summary (sample-wise CV):\n")
leaky_splits

## ----safe-splits----------------------------------------------------------------
safe_splits <- make_split_plan(
  df,
  outcome = "outcome",
  mode = "subject_grouped",
  group = "subject",
  v = 5,
  repeats = 1,
  stratify = TRUE,
  seed = 10
)
cat("Leakage-safe splits summary (subject-grouped CV):\n")
safe_splits

## ----splits-other-modes---------------------------------------------------------
batch_splits <- make_split_plan(
  df,
  outcome = "outcome",
  mode = "batch_blocked",
  batch = "batch",
  v = 4,
  stratify = TRUE
)
cat("Batch-blocked splits summary:\n")
batch_splits

study_splits <- make_split_plan(
  df,
  outcome = "outcome",
  mode = "study_loocv",
  study = "study"
)
cat("Study leave-one-out splits summary:\n")
study_splits

time_splits <- make_split_plan(
  df,
  outcome = "outcome",
  mode = "time_series",
  time = "time",
  v = 4,
  horizon = 2
)
cat("Time-series splits summary:\n")
time_splits

nested_splits <- make_split_plan(
  df,
  outcome = "outcome",
  mode = "subject_grouped",
  group = "subject",
  v = 3,
  nested = TRUE,
  stratify = TRUE
)
cat("Nested CV splits summary:\n")
nested_splits

## ----compact-splits-------------------------------------------------------------
# Efficient storage for large N
big_splits <- make_split_plan(
  df,
  outcome = "outcome",
  mode = "subject_grouped",
  group = "subject",
  v = 5,
  compact = TRUE  # <--- saves memory
)
cat("Compact-mode splits summary:\n")
big_splits
cat(sprintf("Compact storage enabled: %s\n", big_splits@info$compact))

## ----leaky-scaling--------------------------------------------------------------
df_leaky_scaled <- df
df_leaky_scaled[predictors] <- scale(df_leaky_scaled[predictors])

scaled_summary <- data.frame(
  feature = predictors,
  mean = colMeans(df_leaky_scaled[predictors]),
  sd = apply(df_leaky_scaled[predictors], 2, stats::sd)
)
scaled_summary$mean <- round(scaled_summary$mean, 3)
scaled_summary$sd <- round(scaled_summary$sd, 3)

# Leaky global scaling: means ~0 and SDs ~1 computed on all samples
scaled_summary
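## ----manual-fold-scaling-sketch-------------------------------------------------
# A minimal base-R sketch of the leakage-safe alternative to the global scaling
# above: estimate the centering/scaling parameters on the training rows of one
# fold only and apply them to that fold's held-out rows. This is an illustration
# of the idea; the guarded preprocessing in the next chunk does the same thing
# per fold automatically.
fold1 <- safe_splits@indices[[1]]
train_means <- colMeans(df[fold1$train, predictors])
train_sds <- apply(df[fold1$train, predictors], 2, stats::sd)
test_scaled <- sweep(df[fold1$test, predictors], 2, train_means, "-")
test_scaled <- sweep(test_scaled, 2, train_sds, "/")
# Held-out means are close to, but not exactly, zero because the parameters
# come from the training fold only
round(colMeans(test_scaled), 3)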
## ----guarded-preprocess---------------------------------------------------------
fold1 <- safe_splits@indices[[1]]
train_x <- df[fold1$train, predictors]
test_x <- df[fold1$test, predictors]

guard <- .guard_fit(
  X = train_x,
  y = df$outcome[fold1$train],
  steps = list(
    impute = list(method = "median", winsor = TRUE),
    normalize = list(method = "zscore"),
    filter = list(var_thresh = 0, iqr_thresh = 0),
    fs = list(method = "none")
  ),
  task = "binomial"
)

train_x_guarded <- predict_guard(guard, train_x)
test_x_guarded <- predict_guard(guard, test_x)

cat("GuardFit object:\n")
guard
cat("\nGuardFit summary:\n")
summary(guard)

# Guarded training data (first 6 rows)
head(train_x_guarded)

# Guarded test data (first 6 rows)
head(test_x_guarded)

## ----guard-ensure-levels--------------------------------------------------------
raw_levels <- data.frame(
  site = c("A", "B", "B"),
  status = c("yes", "no", "yes"),
  stringsAsFactors = FALSE
)
level_state <- .guard_ensure_levels(raw_levels)

# Aligned factor data with consistent levels
level_state$data

# Levels map
level_state$levels

## ----leaky-impute---------------------------------------------------------------
train <- data.frame(a = c(1, 2, NA, 4), b = c(NA, 1, 1, 0))
test <- data.frame(a = c(NA, 5), b = c(1, NA))

all_median <- vapply(rbind(train, test),
                     function(col) median(col, na.rm = TRUE),
                     numeric(1))
train_leaky <- as.data.frame(Map(function(col, m) { col[is.na(col)] <- m; col },
                                 train, all_median))
test_leaky <- as.data.frame(Map(function(col, m) { col[is.na(col)] <- m; col },
                                test, all_median))

# Leaky medians computed on train + test
data.frame(feature = names(all_median), median = all_median)

# Leaky-imputed training data
train_leaky

# Leaky-imputed test data
test_leaky

## ----safe-impute----------------------------------------------------------------
imp <- impute_guarded(
  train = train,
  test = test,
  method = "median",
  winsor = FALSE
)

# Guarded-imputed training data
imp$train

# Guarded-imputed test data
imp$test

## ----parsnip-spec---------------------------------------------------------------
spec <- parsnip::logistic_reg(mode = "classification") |>
  parsnip::set_engine("glm")

## ----fit-leaky------------------------------------------------------------------
fit_leaky <- fit_resample(
  df_leaky,
  outcome = "outcome",
  splits = leaky_splits,
  learner = spec,
  metrics = c("auc", "accuracy"),
  preprocess = list(
    impute = list(method = "median"),
    normalize = list(method = "zscore"),
    filter = list(var_thresh = 0),
    fs = list(method = "none")
  )
)
cat("Leaky fit summary:\n")
summary(fit_leaky)

metrics_leaky <- as.data.frame(fit_leaky@metric_summary)
num_cols <- vapply(metrics_leaky, is.numeric, logical(1))
metrics_leaky[num_cols] <- lapply(metrics_leaky[num_cols], round, digits = 3)

# Leaky fit: mean and SD of metrics across folds
metrics_leaky

## ----fit-safe-------------------------------------------------------------------
fit_safe <- fit_resample(
  df,
  outcome = "outcome",
  splits = safe_splits,
  learner = spec,
  metrics = c("auc", "accuracy"),
  preprocess = list(
    impute = list(method = "median"),
    normalize = list(method = "zscore"),
    filter = list(var_thresh = 0),
    fs = list(method = "none")
  ),
  positive_class = "case",
  class_weights = c(control = 1, case = 1),
  refit = TRUE
)
cat("Leakage-safe fit summary:\n")
summary(fit_safe)

metrics_safe <- as.data.frame(fit_safe@metric_summary)
num_cols <- vapply(metrics_safe, is.numeric, logical(1))
metrics_safe[num_cols] <- lapply(metrics_safe[num_cols], round, digits = 3)

# Leakage-safe fit: mean and SD of metrics across folds
metrics_safe

# Per-fold metrics (first 6 rows)
head(fit_safe@metrics)
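## ----leaky-vs-safe-comparison-sketch--------------------------------------------
# A quick side-by-side look at the two resampling schemes, using the rounded
# metric summaries computed above. This is plain base-R bookkeeping (not a
# bioLeak function) and assumes both summaries share the same columns, which
# they should since both come from fit_resample() with the same metrics.
comparison <- rbind(
  cbind(scheme = "sample-wise (leaky)", metrics_leaky),
  cbind(scheme = "subject-grouped (safe)", metrics_safe)
)
comparison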
## ----multiclass-example---------------------------------------------------------
if (requireNamespace("ranger", quietly = TRUE)) {
  set.seed(11)
  df_multi <- df
  df_multi$outcome3 <- factor(sample(c("A", "B", "C"), nrow(df_multi), replace = TRUE))

  multi_splits <- make_split_plan(
    df_multi,
    outcome = "outcome3",
    mode = "subject_grouped",
    group = "subject",
    v = 4,
    stratify = TRUE,
    seed = 11
  )

  fit_multi <- fit_resample(
    df_multi,
    outcome = "outcome3",
    splits = multi_splits,
    learner = "ranger",
    metrics = c("accuracy", "macro_f1", "log_loss"),
    refit = FALSE
  )
  cat("Multiclass fit summary:\n")
  summary(fit_multi)
} else {
  cat("ranger not installed; skipping multiclass example.\n")
}

## ----survival-example-----------------------------------------------------------
if (requireNamespace("survival", quietly = TRUE) &&
    requireNamespace("glmnet", quietly = TRUE)) {
  set.seed(12)
  df_surv <- df
  df_surv$time_to_event <- rexp(nrow(df_surv), rate = 0.1)
  df_surv$event <- rbinom(nrow(df_surv), 1, 0.7)

  surv_splits <- make_split_plan(
    df_surv,
    outcome = c("time_to_event", "event"),
    mode = "subject_grouped",
    group = "subject",
    v = 4,
    stratify = FALSE,
    seed = 12
  )

  fit_surv <- fit_resample(
    df_surv,
    outcome = c("time_to_event", "event"),
    splits = surv_splits,
    learner = "glmnet",
    metrics = "cindex",
    refit = FALSE
  )
  cat("Survival fit summary:\n")
  summary(fit_surv)
} else {
  cat("survival or glmnet not installed; skipping survival example.\n")
}

## ----learner-args---------------------------------------------------------------
if (requireNamespace("glmnet", quietly = TRUE)) {
  fit_glmnet <- fit_resample(
    df,
    outcome = "outcome",
    splits = safe_splits,
    learner = "glmnet",
    metrics = "auc",
    learner_args = list(glmnet = list(alpha = 0.5)),
    preprocess = list(
      impute = list(method = "median"),
      normalize = list(method = "zscore"),
      filter = list(var_thresh = 0),
      fs = list(method = "none")
    )
  )
  cat("GLMNET summary with learner-specific arguments:\n")
  summary(fit_glmnet)
} else {
  cat("glmnet not installed; skipping learner_args example.\n")
}

## ----se-example-----------------------------------------------------------------
if (requireNamespace("SummarizedExperiment", quietly = TRUE)) {
  se <- SummarizedExperiment::SummarizedExperiment(
    assays = list(counts = t(as.matrix(df[, predictors]))),
    colData = df[, c("subject", "batch", "study", "time", "outcome"), drop = FALSE]
  )

  se_splits <- make_split_plan(
    se,
    outcome = "outcome",
    mode = "subject_grouped",
    group = "subject",
    v = 3
  )

  se_fit <- fit_resample(
    se,
    outcome = "outcome",
    splits = se_splits,
    learner = spec,
    metrics = "auc"
  )
  cat("SummarizedExperiment fit summary:\n")
  summary(se_fit)
} else {
  cat("SummarizedExperiment not installed; skipping SE example.\n")
}
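## ----se-orientation-sketch------------------------------------------------------
# SummarizedExperiment stores features in rows and samples in columns, which is
# why the assay above is built with t(). A quick sanity check of that
# orientation (illustrative sketch; only runs when the package is installed and
# `se` was created above):
if (requireNamespace("SummarizedExperiment", quietly = TRUE) && exists("se")) {
  print(dim(SummarizedExperiment::assay(se, "counts")))  # features x samples
  print(dim(SummarizedExperiment::colData(se)))          # samples x metadata columns
}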
## ----tidymodels-interop---------------------------------------------------------
library(bioLeak)
library(parsnip)
library(recipes)
library(yardstick)

set.seed(123)
N <- 60
df_full <- df  # keep the 160-sample data set from above for the audit examples later
df <- data.frame(
  subject = factor(rep(paste0("S", 1:20), length.out = N)),
  outcome = factor(rep(c("ClassA", "ClassB"), length.out = N)),
  x1 = rnorm(N),
  x2 = rnorm(N),
  x3 = rnorm(N)
)

spec <- logistic_reg() |> set_engine("glm")

# Use bioLeak's native split planner (make_split_plan) rather than
# rsample::group_vfold_cv: it creates a subject-grouped CV plan that is
# directly compatible with fit_resample and avoids conversion errors.
set.seed(13)
splits <- make_split_plan(
  df,
  outcome = "outcome",
  mode = "subject_grouped",
  group = "subject",
  v = 3
)

rec <- recipes::recipe(outcome ~ x1 + x2 + x3, data = df) |>
  recipes::step_impute_median(recipes::all_numeric_predictors()) |>
  recipes::step_normalize(recipes::all_numeric_predictors())

metrics_set <- yardstick::metric_set(yardstick::roc_auc, yardstick::accuracy)

fit_rs <- fit_resample(
  df,
  outcome = "outcome",
  splits = splits,
  learner = spec,
  preprocess = rec,
  metrics = metrics_set,
  refit = FALSE
)

if (exists("as_rsample", where = asNamespace("bioLeak"), mode = "function")) {
  rs_export <- as_rsample(fit_rs@splits, data = df)
  print(rs_export)
}

## ----workflow-example-----------------------------------------------------------
if (requireNamespace("workflows", quietly = TRUE)) {
  wf <- workflows::workflow() |>
    workflows::add_model(spec) |>
    workflows::add_formula(outcome ~ x1 + x2 + x3)

  fit_wf <- fit_resample(
    df,
    outcome = "outcome",
    splits = splits,  # subject-grouped splits built on the current toy data above
    learner = wf,
    metrics = "auc",
    refit = FALSE
  )
  cat("Workflow fit summary:\n")
  summary(fit_wf)
} else {
  cat("workflows not installed; skipping workflow example.\n")
}

## ----tune-example---------------------------------------------------------------
library(bioLeak)
library(parsnip)
library(recipes)
library(tune)
library(glmnet)

# --- 1. Create data ---
set.seed(123)
N <- 60
df <- data.frame(
  subject = factor(rep(paste0("S", 1:20), length.out = N)),
  # Randomize the outcome with sample() so it is not perfectly confounded with subject
  outcome = factor(sample(c("ClassA", "ClassB"), N, replace = TRUE)),
  x1 = rnorm(N),
  x2 = rnorm(N),
  x3 = rnorm(N)
)

# --- 2. Generate nested splits ---
set.seed(1)
nested_splits <- make_split_plan(
  df,
  outcome = "outcome",
  mode = "subject_grouped",
  group = "subject",
  v = 3,
  nested = TRUE
)

# --- 3. Define recipe and model ---
rec <- recipes::recipe(outcome ~ x1 + x2 + x3, data = df) |>
  recipes::step_impute_median(recipes::all_numeric_predictors()) |>
  recipes::step_normalize(recipes::all_numeric_predictors())

spec_tune <- parsnip::logistic_reg(
  penalty = tune::tune(),
  mixture = 1,
  mode = "classification"
) |>
  parsnip::set_engine("glmnet")

# --- 4. Run tuning ---
tuned <- tune_resample(
  df,
  outcome = "outcome",
  splits = nested_splits,
  learner = spec_tune,
  preprocess = rec,
  inner_v = 2,
  grid = 3,
  metrics = c("auc", "accuracy"),
  seed = 14
)
summary(tuned)

## ----fit-xgboost----------------------------------------------------------------
if (requireNamespace("parsnip", quietly = TRUE) &&
    requireNamespace("xgboost", quietly = TRUE) &&
    requireNamespace("recipes", quietly = TRUE)) {

  # 1. Define the model spec
  xgb_spec <- parsnip::boost_tree(
    mode = "classification",
    trees = 100,
    tree_depth = 6,
    learn_rate = 0.01
  ) |>
    parsnip::set_engine("xgboost")

  # 2. Define the recipe on data without 'subject'. This matches the data
  #    bioLeak passes to the learner (the grouping column is removed), so no
  #    step_rm(subject) is needed.
  df_for_rec <- df[, !names(df) %in% "subject"]
  rec_xgb <- recipes::recipe(outcome ~ ., data = df_for_rec) |>
    recipes::step_dummy(recipes::all_nominal_predictors()) |>
    recipes::step_impute_median(recipes::all_numeric_predictors())

  # 3. Fit
  fit_xgb <- fit_resample(
    df,
    outcome = "outcome",
    splits = nested_splits,
    learner = xgb_spec,
    metrics = "auc",
    preprocess = rec_xgb
  )
  cat("XGBoost parsnip fit summary:\n")
  print(summary(fit_xgb))
} else {
  cat("parsnip/xgboost/recipes not installed.\n")
}
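## ----xgb-recipe-preview-sketch--------------------------------------------------
# To see exactly what the recipe hands to xgboost, prep it on the training data
# and bake it (standard recipes verbs; illustrative sketch only, guarded so it
# runs just when the objects above were created):
if (requireNamespace("recipes", quietly = TRUE) &&
    exists("rec_xgb") && exists("df_for_rec")) {
  baked <- recipes::bake(recipes::prep(rec_xgb, training = df_for_rec),
                         new_data = NULL)
  head(baked)
}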
## ----custom-learners------------------------------------------------------------
custom_learners <- list(
  glm = list(
    fit = function(x, y, task, weights, ...) {
      df_fit <- data.frame(y = y, x, check.names = FALSE)
      stats::glm(y ~ ., data = df_fit, family = stats::binomial(), weights = weights)
    },
    predict = function(object, newdata, task, ...) {
      as.numeric(stats::predict(object, newdata = as.data.frame(newdata), type = "response"))
    }
  )
)
cat("Custom learner names:\n")
names(custom_learners)
cat("Custom learner components (fit/predict):\n")
lapply(custom_learners, names)

## ----plot-fold-balance----------------------------------------------------------
if (requireNamespace("ggplot2", quietly = TRUE)) {
  plot_fold_balance(fit_safe)
} else {
  cat("ggplot2 not installed; skipping fold balance plot.\n")
}

## ----plot-calibration-----------------------------------------------------------
if (requireNamespace("ggplot2", quietly = TRUE)) {
  plot_calibration(fit_safe, bins = 10)
} else {
  cat("ggplot2 not installed; skipping calibration plot.\n")
}

## ----plot-confounder-sensitivity------------------------------------------------
if (requireNamespace("ggplot2", quietly = TRUE)) {
  plot_confounder_sensitivity(fit_safe, confounders = c("batch", "study"),
                              metric = "auc", min_n = 3)
} else {
  cat("ggplot2 not installed; skipping confounder sensitivity plot.\n")
}

## ----diagnostics-tables---------------------------------------------------------
cal <- calibration_summary(fit_safe, bins = 10, min_bin_n = 5)
conf_tbl <- confounder_sensitivity(fit_safe, confounders = c("batch", "study"),
                                   metric = "auc", min_n = 3)

# Calibration metrics
cal$metrics

# Confounder sensitivity table (first 6 rows)
head(conf_tbl)

## ----plot-overlap---------------------------------------------------------------
if (requireNamespace("ggplot2", quietly = TRUE)) {
  plot_overlap_checks(fit_leaky, column = "subject")
  plot_overlap_checks(fit_safe, column = "subject")
} else {
  cat("ggplot2 not installed; skipping overlap plots.\n")
}

## ----audit-leakage--------------------------------------------------------------
# Reference feature matrix for the samples fit_safe was trained on (df_full is
# the 160-sample data set kept before the tidymodels examples redefined df),
# with a duplicate planted in rows 1 and 5 to demonstrate near-duplicate detection.
X_ref <- df_full[, predictors]
X_ref[c(1, 5), ] <- X_ref[1, ]

audit <- audit_leakage(
  fit_safe,
  metric = "auc",
  B = 20,
  perm_stratify = TRUE,
  batch_cols = c("batch", "study"),
  X_ref = X_ref,
  sim_method = "cosine",
  sim_threshold = 0.995,
  return_perm = TRUE
)
cat("Leakage audit summary:\n")
summary(audit)

if (!is.null(audit@permutation_gap) && nrow(audit@permutation_gap) > 0) {
  # Permutation significance results
  audit@permutation_gap
}

if (!is.null(audit@batch_assoc) && nrow(audit@batch_assoc) > 0) {
  # Batch/study association with folds (Cramer's V)
  audit@batch_assoc
} else {
  cat("No batch or study associations detected.\n")
}

if (!is.null(audit@target_assoc) && nrow(audit@target_assoc) > 0) {
  # Top features by target association score
  head(audit@target_assoc)
} else {
  cat("No target leakage scan results available.\n")
}

if (!is.null(audit@duplicates) && nrow(audit@duplicates) > 0) {
  # Top duplicate/near-duplicate pairs by similarity
  head(audit@duplicates)
} else {
  cat("No near-duplicates detected.\n")
}

## ----plot-perm------------------------------------------------------------------
if (requireNamespace("ggplot2", quietly = TRUE)) {
  plot_perm_distribution(audit)
} else {
  cat("ggplot2 not installed; skipping permutation plot.\n")
}
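## ----cramers-v-sketch-----------------------------------------------------------
# The batch/study association check reported above is essentially a Cramer's V
# between a blocking variable and fold membership. A hand-rolled base-R version
# of that statistic for the subject-grouped plan (illustrative sketch only;
# bioLeak's exact computation may differ):
fold_id <- integer(nrow(df_full))
for (i in seq_along(safe_splits@indices)) {
  fold_id[safe_splits@indices[[i]]$test] <- i
}
tab <- table(batch = df_full$batch, fold = fold_id)
chi2 <- suppressWarnings(stats::chisq.test(tab)$statistic)
cramers_v <- sqrt(as.numeric(chi2) / (sum(tab) * (min(dim(tab)) - 1)))
round(cramers_v, 3)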
## ----audit-by-learner-----------------------------------------------------------
if (requireNamespace("ranger", quietly = TRUE) &&
    requireNamespace("parsnip", quietly = TRUE)) {

  # 1. Define specs
  # Standard GLM (no tuning)
  spec_glm <- parsnip::logistic_reg() |>
    parsnip::set_engine("glm")

  # Random forest
  spec_rf <- parsnip::rand_forest(
    mode = "classification",
    trees = 100
  ) |>
    parsnip::set_engine("ranger")

  # 2. Fit both learners on the subject-grouped nested splits defined above
  fit_multi <- fit_resample(
    df,
    outcome = "outcome",
    splits = nested_splits,
    learner = list(glm = spec_glm, rf = spec_rf),
    metrics = "auc"
  )

  # 3. Run the audit
  audits <- audit_leakage_by_learner(fit_multi, metric = "auc", B = 20)
  cat("Per-learner audit summary:\n")
  print(audits)
} else {
  cat("ranger/parsnip not installed.\n")
}

## ----audit-report, eval = FALSE-------------------------------------------------
# if (requireNamespace("rmarkdown", quietly = TRUE) && rmarkdown::pandoc_available()) {
#   report_path <- audit_report(audit, output_dir = ".")
#   cat("HTML report written to:\n", report_path, "\n")
# } else {
#   cat("rmarkdown or pandoc not available; skipping audit report rendering.\n")
# }

## ----time-series-fit------------------------------------------------------------
time_splits <- make_split_plan(
  df_time,
  outcome = "outcome",
  mode = "time_series",
  time = "time",
  v = 4,
  horizon = 1
)
cat("Time-series splits summary:\n")
time_splits

fit_time_leaky <- fit_resample(
  df_time,
  outcome = "outcome",
  splits = time_splits,
  learner = spec,
  metrics = "auc"
)
cat("Time-series leaky fit summary:\n")
summary(fit_time_leaky)

## ----time-series-safe-----------------------------------------------------------
df_time_safe <- df_time
df_time_safe$leak_future <- NULL

fit_time_safe <- fit_resample(
  df_time_safe,
  outcome = "outcome",
  splits = time_splits,
  learner = spec,
  metrics = "auc"
)
cat("Time-series safe fit summary:\n")
summary(fit_time_safe)

audit_time <- audit_leakage(
  fit_time_safe,
  metric = "auc",
  B = 20,
  time_block = "stationary",
  block_len = 5
)
cat("Time-series leakage audit summary:\n")
summary(audit_time)

if (!is.null(audit_time@permutation_gap) && nrow(audit_time@permutation_gap) > 0) {
  # Time-series permutation significance results
  audit_time@permutation_gap
}

if (requireNamespace("ggplot2", quietly = TRUE)) {
  plot_time_acf(fit_time_safe, lag.max = 20)
} else {
  cat("ggplot2 not installed; skipping ACF plot.\n")
}

## ----parallel-setup, eval=FALSE-------------------------------------------------
# library(future)
# 
# # Use multiple cores (works on all OSes)
# plan(multisession, workers = 4)
# 
# # Run a heavy simulation
# sim <- simulate_leakage_suite(..., parallel = TRUE)
# 
# # Parallel folds or audits
# # fit_resample(..., parallel = TRUE)
# # audit_leakage(..., parallel = TRUE)
# # audit_leakage_by_learner(..., parallel_learners = TRUE)
# 
# # Return to sequential processing
# plan(sequential)

## ----simulate-suite-------------------------------------------------------------
if (requireNamespace("glmnet", quietly = TRUE)) {
  sim <- simulate_leakage_suite(
    n = 80,
    p = 6,
    mode = "subject_grouped",
    learner = "glmnet",
    leakage = "subject_overlap",
    seeds = 1:2,
    B = 20,
    parallel = FALSE,
    signal_strength = 1
  )
  # Simulation results (first 6 rows)
  head(sim)
} else {
  cat("glmnet not installed; skipping simulation suite example.\n")
}
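## ----simulate-suite-aggregation-sketch-------------------------------------------
# Summarising the simulation grid with base R. The column names "leakage" and
# "auc" used here are assumptions about the output of simulate_leakage_suite()
# (hypothetical; check head(sim) above and adjust), so the call is guarded:
if (exists("sim") && all(c("leakage", "auc") %in% names(sim))) {
  aggregate(auc ~ leakage, data = sim, FUN = mean)
}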