## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)

## ----load-libraries-----------------------------------------------------------
library(xplainfi)
library(mlr3)
library(mlr3learners)
library(data.table)

# Demonstration tasks reused by the chunks below
task_mixed = tsk("penguins")
task_numeric = sim_dgp_correlated(n = 200)

## ----feature-types------------------------------------------------------------
# Show the task's feature types next to the types the sampler reports
task_mixed$feature_types
permutation = MarginalPermutationSampler$new(task_mixed)
permutation$feature_types

## ----sample-methods-----------------------------------------------------------
# Variant 1: sample from the task stored in the sampler, selecting by row_ids
sampled_task = permutation$sample(feature = "bill_length", row_ids = 40:45)
sampled_task

# Variant 2: sample for "external" data that is handed in explicitly
test_data = task_mixed$data(rows = 40:45)
sampled_external = permutation$sample_newdata(
  feature = "bill_length",
  newdata = test_data
)
sampled_external

## ----permutation-example------------------------------------------------------
# Fresh permutation sampler on the mixed-type task
permutation = MarginalPermutationSampler$new(task_mixed)

# Draw new values for one continuous feature in the first ten rows
original = task_mixed$data(rows = 1:10)
sampled = permutation$sample("bill_length", row_ids = 1:10)

# Original and sampled values side by side
data.table(
  original_bill = original$bill_length,
  sampled_bill = sampled$bill_length,
  sex = original$sex # Unchanged
)

## ----marginal-ref-example-----------------------------------------------------
# Marginal reference sampler; n_samples sets the size of the reference pool
marginal_ref = MarginalReferenceSampler$new(task_mixed, n_samples = 30L)

# Each row gets its values from a randomly sampled reference row
original = task_mixed$data(rows = 1:5)
sampled = marginal_ref$sample("bill_length", row_ids = 1:5)

# Original and sampled values side by side
data.table(
  original_bill = original$bill_length,
  sampled_bill = sampled$bill_length,
  sex = original$sex # Unchanged
)

## ----correlation-preservation-------------------------------------------------
# MarginalPermutationSampler (breaks correlations)
perm = MarginalPermutationSampler$new(task_numeric)
sampled_perm = perm$sample(c("x1", "x2"), row_ids = 1:10)

# MarginalReferenceSampler (preserves within-row correlations)
ref = MarginalReferenceSampler$new(task_numeric, n_samples = 50L)
sampled_ref = ref$sample(c("x1", "x2"), row_ids = 1:10)

# Correlation of x1 and x2: full task data vs. the two sampled data sets
cor_original = with(task_numeric$data(), cor(x1, x2))
cor_perm = with(sampled_perm, cor(x1, x2))
cor_ref = with(sampled_ref, cor(x1, x2))

data.table(
  method = c("Original", "Permutation", "Reference"),
  correlation = c(cor_original, cor_perm, cor_ref)
)

## ----gaussian-sampler---------------------------------------------------------
# Gaussian conditional sampler on the numeric task
gaussian = ConditionalGaussianSampler$new(task_numeric)

# Reference values for the rows that get resampled
original = task_numeric$data(rows = 1:10)

# Sample x1 conditioned on the remaining features
sampled = gaussian$sample(
  feature = "x1",
  row_ids = 1:10,
  conditioning_set = c("x2", "x3", "x4")
)

# Original vs. conditionally sampled values
data.table(
  original = original$x1,
  sampled = sampled$x1,
  x2 = original$x2 # Conditioning feature (unchanged)
)

## ----arf-sampler--------------------------------------------------------------
# ARF sampler (works with the full task, including categorical features)
arf = ConditionalARFSampler$new(task_mixed, num_trees = 20, verbose = FALSE)

# Reference values for the rows that get resampled
original = task_mixed$data(rows = 1:10)

# Sample island conditioned on body measurements
sampled = arf$sample(
  feature = "island",
  row_ids = 1:10,
  conditioning_set = c("bill_length", "body_mass")
)

# Original vs. sampled island
data.table(
  original_island = original$island,
  sampled_island = sampled$island,
  bill_length = original$bill_length, # Conditioning feature
  body_mass = original$body_mass # Conditioning feature
)

## ----ctree-sampler------------------------------------------------------------
# ctree sampler with default parameters
ctree = ConditionalCtreeSampler$new(task_mixed)

# Reference values for the rows that get resampled
original = task_mixed$data(rows = 1:10)

# Sample bill_length conditioned on island
sampled = ctree$sample(
  feature = "bill_length",
  row_ids = 1:10,
  conditioning_set = "island"
)

data.table(
  island = original$island, # Conditioning feature
  original = original$bill_length,
  sampled = sampled$bill_length
)

## ----knn-sampler-numeric------------------------------------------------------
# kNN sampler with k=5 neighbors
knn_numeric = ConditionalKNNSampler$new(task_numeric, k = 5)

# Reference values for the rows that get resampled
original_numeric = task_numeric$data(rows = 1:5)

# Sample x1 based on nearest neighbors in (x2, x3) space
sampled_numeric = knn_numeric$sample(
  feature = "x1",
  row_ids = 1:5,
  conditioning_set = c("x2", "x3")
)

data.table(
  x2 = original_numeric$x2,
  x3 = original_numeric$x3,
  original_x1 = original_numeric$x1,
  sampled_x1 = sampled_numeric$x1
)

## ----knn-sampler-mixed--------------------------------------------------------
# Same sampler type, now on the task with categorical features
knn_mixed = ConditionalKNNSampler$new(task_mixed, k = 5)

# Reference values for the rows that get resampled
original_mixed = task_mixed$data(rows = 1:5)

# Sample bill_length conditioning on island (categorical) and
# body_mass (numeric)
sampled_mixed = knn_mixed$sample(
  feature = "bill_length",
  row_ids = 1:5,
  conditioning_set = c("island", "body_mass")
)

data.table(
  island = original_mixed$island,
  body_mass = original_mixed$body_mass,
  original_bill = original_mixed$bill_length,
  sampled_bill = sampled_mixed$bill_length
)

## ----knockoff-sampler---------------------------------------------------------
# Gaussian knockoff sampler (using task_numeric from earlier)
knockoff = KnockoffGaussianSampler$new(task_numeric)

# Generate knockoffs for every feature of the task
original = task_numeric$data(rows = 1:5)
knockoffs = knockoff$sample(feature = task_numeric$feature_names, row_ids = 1:5)

# Original vs. knockoff values
data.table(
  x1_original = original$x1,
  x1_knockoff = knockoffs$x1,
  x2_original = original$x2,
  x2_knockoff = knockoffs$x2
)

## ----cfi-knockoff, eval = FALSE-----------------------------------------------
# # CFI with knockoff sampler for conditional independence testing
# cfi_knockoff = CFI$new(
#   task = task_numeric,
#   learner = lrn("regr.ranger"),
#   measure = msr("regr.mse"),
#   sampler = knockoff
# )
#
# # Compute importance with CPI-based inference
# cfi_knockoff$compute()
# cfi_knockoff$importance(ci_method = "cpi")