# Purled vignette code for the BORG package: each chunk demonstrates one kind
# of data-leakage risk that borg_inspect()/borg_diagnose() are meant to flag.
# NOTE(review): this file is generated from the vignette; keep chunk labels
# (`## ----name----`) intact so it stays in sync with the .Rmd source.

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)
library(BORG)

## ----index-overlap------------------------------------------------------------
data <- data.frame(x = 1:100, y = rnorm(100))

# Accidental overlap: rows 51-60 appear in both splits
result <- borg_inspect(data, train_idx = 1:60, test_idx = 51:100)
result

## ----duplicate-rows-----------------------------------------------------------
# Data with duplicate rows
dup_data <- rbind(
  data.frame(x = 1:5, y = 1:5),
  data.frame(x = 1:5, y = 1:5) # Duplicates
)

result <- borg_inspect(dup_data, train_idx = 1:5, test_idx = 6:10)
result

## ----preprocessing-leak, eval=FALSE-------------------------------------------
# # BAD: Scale fitted on all data
# scaled_data <- scale(data) # Uses all rows!
# train <- scaled_data[1:70, ]
# test <- scaled_data[71:100, ]
#
# # BORG detects this
# borg_inspect(scaled_data, train_idx = 1:70, test_idx = 71:100)

## ----target-leakage-----------------------------------------------------------
# Simulate target leakage
leaky <- data.frame(
  x = rnorm(100),
  outcome = rnorm(100)
)
leaky$leaked <- leaky$outcome + rnorm(100, sd = 0.01) # Near-perfect correlation

result <- borg_inspect(leaky, train_idx = 1:70, test_idx = 71:100,
                       target = "outcome")
result

## ----group-leakage------------------------------------------------------------
# Clinical data with patient IDs (10 measurements per patient)
clinical <- data.frame(
  patient_id = rep(1:10, each = 10),
  measurement = rnorm(100)
)

# Random split ignoring patients
set.seed(123)
all_idx <- sample(100)
train_idx <- all_idx[1:70]
test_idx <- all_idx[71:100]

result <- borg_inspect(clinical, train_idx = train_idx, test_idx = test_idx,
                       groups = "patient_id")
result

## ----temporal-leak------------------------------------------------------------
# Time series data
ts_data <- data.frame(
  date = seq(as.Date("2020-01-01"), by = "day", length.out = 100),
  value = cumsum(rnorm(100))
)

# Wrong: random split ignores time
set.seed(42)
random_idx <- sample(100)
train_idx <- random_idx[1:70]
test_idx <- random_idx[71:100]

result <- borg_inspect(ts_data, train_idx = train_idx, test_idx = test_idx,
                       time = "date")
result

## ----proxy-leakage------------------------------------------------------------
# Strong but not extreme correlation
proxy <- data.frame(
  x = rnorm(100),
  outcome = rnorm(100)
)
proxy$strong_predictor <- proxy$outcome + rnorm(100, sd = 0.3) # r ~ 0.96

result <- borg_inspect(proxy, train_idx = 1:70, test_idx = 71:100,
                       target = "outcome")
result

## ----spatial-proximity--------------------------------------------------------
set.seed(42)
spatial <- data.frame(
  lon = runif(100, 0, 100),
  lat = runif(100, 0, 100),
  value = rnorm(100)
)

# Random split intermixes nearby points
train_idx <- sample(100, 70)
test_idx <- setdiff(1:100, train_idx)

result <- borg_inspect(spatial, train_idx = train_idx, test_idx = test_idx,
                       coords = c("lon", "lat"))
result

## ----random-cv-inflation------------------------------------------------------
# Diagnose data dependencies
spatial <- data.frame(
  lon = runif(200, 0, 100),
  lat = runif(200, 0, 100),
  response = rnorm(200)
)

diagnosis <- borg_diagnose(spatial, coords = c("lon", "lat"),
                           target = "response", verbose = FALSE)
diagnosis@recommended_cv

## ----risk-access--------------------------------------------------------------
# Create result with violations
result <- borg_inspect(
  data.frame(x = 1:100, y = rnorm(100)),
  train_idx = 1:60,
  test_idx = 51:100
)

# Summary (S4 slots on the inspection result)
cat("Valid:", result@is_valid, "\n")
cat("Hard violations:", result@n_hard, "\n")
cat("Soft warnings:", result@n_soft, "\n")

# Individual risks: each element is a list with type/severity/description,
# and optionally the affected row indices
for (risk in result@risks) {
  cat("\n", risk$type, "(", risk$severity, "):\n", sep = "")
  cat(" ", risk$description, "\n")
  if (!is.null(risk$affected)) {
    cat(" Affected:", head(risk$affected, 5), "...\n")
  }
}

# Tabular format
as.data.frame(result)