---
title: Cross-validation for Dimensionality Reduction
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Cross-validation for Dimensionality Reduction}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
params:
  family: red
css: albers.css
resource_files:
  - albers.css
  - albers.js
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(collapse = TRUE, comment = "#>", fig.width = 6, fig.height = 4)
library(multivarious)
library(ggplot2)
```

## Why Cross-validate Dimensionality Reduction?

When using PCA or other dimensionality reduction methods, we often face questions like:

- How many components should I keep?
- How well does my model generalize to new data?
- Which preprocessing strategy works best?

Cross-validation provides principled answers by testing how well models trained on one subset of the data perform on held-out data.

## Quick Example: Finding the Right Number of Components

Let's use the iris dataset to find the optimal number of PCA components via reconstruction error.

```{r basic_example_setup}
set.seed(123)
X <- as.matrix(iris[, 1:4])  # 150 samples x 4 features

# Create 5-fold cross-validation splits
K <- 5
fold_ids <- sample(rep(1:K, length.out = nrow(X)))
folds <- lapply(1:K, function(k) list(
  train = which(fold_ids != k),
  test = which(fold_ids == k)
))
```

Define the fitting and measurement functions. The measurement function projects test data, reconstructs it, and computes RMSE:

```{r basic_example_functions}
fit_pca <- function(train_data, ncomp) {
  pca(train_data, ncomp = ncomp, preproc = center())
}

measure_reconstruction <- function(model, test_data) {
  # Project test data to score space
  scores <- project(model, test_data)

  # Reconstruct: scores %*% t(loadings), then reverse centering
  recon <- scores %*% t(model$v)
  recon <- inverse_transform(model$preproc, recon)

  # Compute RMSE
  rmse <- sqrt(mean((test_data - recon)^2))
  tibble::tibble(rmse = rmse)
}
```

Now run cross-validation for 1--4 components:

```{r basic_example_cv}
results_list <- lapply(1:4, function(nc) {
  cv_res <- cv_generic(
    data = X,
    folds = folds,
    .fit_fun = fit_pca,
    .measure_fun = measure_reconstruction,
    fit_args = list(ncomp = nc),
    backend = "serial"
  )
  # Extract RMSE from each fold and average
  fold_rmse <- sapply(cv_res$metrics, function(m) m$rmse)
  data.frame(ncomp = nc, rmse = mean(fold_rmse))
})

cv_results <- do.call(rbind, results_list)
print(cv_results)
```

Two components capture most of the structure; additional components yield diminishing returns.
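A quick plot makes the elbow easier to see. This is a minimal sketch that simply draws the `cv_results` table computed above, using the `ggplot2` package loaded in the setup chunk:

```{r basic_example_plot}
# Plot cross-validated reconstruction error against model size
ggplot(cv_results, aes(x = ncomp, y = rmse)) +
  geom_line() +
  geom_point(size = 2) +
  labs(x = "Number of components", y = "Reconstruction RMSE",
       title = "Cross-validated reconstruction error")
```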
## Understanding the Output

The `cv_generic()` function returns a tibble with three columns:

- **fold**: The fold number (integer)
- **model**: The fitted model for that fold (list column)
- **metrics**: Performance metrics for that fold (list column of tibbles)

```{r understanding_output}
# Run CV once to inspect the structure
cv_example <- cv_generic(
  X, folds,
  .fit_fun = fit_pca,
  .measure_fun = measure_reconstruction,
  fit_args = list(ncomp = 2)
)

# Structure overview
str(cv_example, max.level = 1)

# Extract metrics from all folds
all_metrics <- dplyr::bind_rows(cv_example$metrics)
print(all_metrics)
```

## Custom Cross-validation Scenarios

### Scenario 1: Comparing Preprocessing Strategies

Use `cv_generic()` to compare centering alone versus z-scoring:

```{r preprocessing_comparison}
prep_center <- center()
prep_zscore <- colscale(center(), type = "z")

fit_with_prep <- function(train_data, ncomp, preproc) {
  pca(train_data, ncomp = ncomp, preproc = preproc)
}

# Compare both strategies with 3 components
cv_center <- cv_generic(
  X, folds,
  .fit_fun = fit_with_prep,
  .measure_fun = measure_reconstruction,
  fit_args = list(ncomp = 3, preproc = prep_center)
)

cv_zscore <- cv_generic(
  X, folds,
  .fit_fun = fit_with_prep,
  .measure_fun = measure_reconstruction,
  fit_args = list(ncomp = 3, preproc = prep_zscore)
)

rmse_center <- mean(sapply(cv_center$metrics, `[[`, "rmse"))
rmse_zscore <- mean(sapply(cv_zscore$metrics, `[[`, "rmse"))

cat("Center only - RMSE:", round(rmse_center, 4), "\n")
cat("Z-score - RMSE:", round(rmse_zscore, 4), "\n")
```

For iris, centering alone performs slightly better since the variables are already on similar scales.

### Scenario 2: Parallel Cross-validation

For larger datasets, run folds in parallel:

```{r parallel_cv, eval=FALSE}
# Set up the parallel backend
library(future)
plan(multisession, workers = 4)

# Run CV in parallel
cv_parallel <- cv_generic(
  X, folds = folds,
  .fit_fun = fit_pca,
  .measure_fun = measure_reconstruction,
  fit_args = list(ncomp = 4),
  backend = "future"  # Use parallel backend
)

# Don't forget to reset
plan(sequential)
```

## Computing Multiple Metrics

You can return multiple metrics from the measurement function:

```{r multiple_metrics}
measure_multi <- function(model, test_data) {
  scores <- project(model, test_data)
  recon <- scores %*% t(model$v)
  recon <- inverse_transform(model$preproc, recon)

  residuals <- test_data - recon
  ss_res <- sum(residuals^2)
  ss_tot <- sum((test_data - mean(test_data))^2)

  tibble::tibble(
    rmse = sqrt(mean(residuals^2)),
    mae = mean(abs(residuals)),
    r2 = 1 - ss_res / ss_tot
  )
}

cv_multi <- cv_generic(
  X, folds,
  .fit_fun = fit_pca,
  .measure_fun = measure_multi,
  fit_args = list(ncomp = 3)
)

all_metrics <- dplyr::bind_rows(cv_multi$metrics)
print(all_metrics)

cat("\nMean across folds:\n")
cat("RMSE:", round(mean(all_metrics$rmse), 4), "\n")
cat("MAE: ", round(mean(all_metrics$mae), 4), "\n")
cat("R2:  ", round(mean(all_metrics$r2), 4), "\n")
```

| Metric | Description | Interpretation |
|--------|-------------|----------------|
| RMSE | Root mean squared error | Lower is better; in original units |
| MAE | Mean absolute error | Less sensitive to outliers than RMSE |
| R² | Coefficient of determination | Proportion of variance explained (1 = perfect) |

## Tips for Effective Cross-validation

### 1. Preprocessing Inside the Loop

Always fit preprocessing parameters inside the CV loop:

```{r preprocessing_tip, eval=FALSE}
# WRONG: Preprocessing outside CV leaks information
X_scaled <- scale(X)  # Uses mean/sd from ALL samples including test!
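# Each held-out fold is then scored against column means/SDs estimated
# partly from that very fold, so the CV error is optimistically biased.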
cv_wrong <- cv_generic(X_scaled, folds, ...)

# RIGHT: Let the model handle preprocessing internally
# Each fold fits centering/scaling on training data only
fit_pca <- function(train_data, ncomp) {
  pca(train_data, ncomp = ncomp, preproc = center())
}
```

### 2. Choose Appropriate Fold Sizes

- **Small datasets (< 100 samples)**: Use leave-one-out or 10-fold CV
- **Medium datasets (100-1000 samples)**: Use 5- to 10-fold CV
- **Large datasets (> 1000 samples)**: Use 3- to 5-fold CV or hold-out validation

### 3. Consider Metric Choice

- Use **RMSE** for general reconstruction quality
- Use **R²** to understand the proportion of variance explained
- Use **MAE** when outliers are a concern

## Advanced: Cross-validating Other Projectors

The CV framework works with any projector type; the key is writing appropriate fit and measure functions.

```{r other_projectors, eval=FALSE}
# Nyström approximation for kernel PCA
fit_nystrom <- function(train_data, ncomp) {
  nystrom_approx(train_data, ncomp = ncomp, nlandmarks = 50, preproc = center())
}

# Kernel-space reconstruction error
measure_kernel <- function(model, test_data) {
  S <- project(model, test_data)
  K_hat <- S %*% t(S)
  Xc <- reprocess(model, test_data)
  K_true <- Xc %*% t(Xc)
  tibble::tibble(kernel_rmse = sqrt(mean((K_hat - K_true)^2)))
}

cv_nystrom <- cv_generic(
  X, folds,
  .fit_fun = fit_nystrom,
  .measure_fun = measure_kernel,
  fit_args = list(ncomp = 10)
)
```

## Kernel PCA via Nyström (standard and double)

The `nystrom_approx()` function provides two variants:

- `method = "standard"`: Williams–Seeger single-stage Nyström with the usual scaling
- `method = "double"`: Lim–Jin–Zhang two-stage Nyström (efficient when `p` is large)

With a centered linear kernel and all points as landmarks (`m = N`), the Nyström eigen-decomposition recovers the exact top eigenpairs of the kernel matrix `K = X_c X_c^T`. Below is a reproducible snippet that demonstrates this and shows how to project new data.

```{r nystrom_demo}
set.seed(123)
X <- matrix(rnorm(80 * 10), 80, 10)
ncomp <- 5

# Exact setting: linear kernel + centering + m = N
fit_std <- nystrom_approx(
  X, ncomp = ncomp,
  landmarks = 1:nrow(X),
  preproc = center(),
  method = "standard"
)

# Compare kernel eigenvalues: eig(K) vs fit_std$sdev^2
Xc <- transform(fit_std$preproc, X)
K <- Xc %*% t(Xc)
lam_K <- eigen(K, symmetric = TRUE)$values[1:ncomp]

data.frame(
  component = 1:ncomp,
  nystrom = sort(fit_std$sdev^2, decreasing = TRUE),
  exact_K = sort(lam_K, decreasing = TRUE)
)

# Relationship with PCA: prcomp() returns singular values / sqrt(n - 1)
p <- prcomp(Xc, center = FALSE, scale. = FALSE)
lam_from_pca <- p$sdev[1:ncomp]^2 * (nrow(X) - 1)  # equals eig(K)
data.frame(
  component = 1:ncomp,
  from_pca = sort(lam_from_pca, decreasing = TRUE),
  exact_K = sort(lam_K, decreasing = TRUE)
)

# Out-of-sample projection for new rows
new_rows <- 1:5
scores_new <- project(fit_std, X[new_rows, , drop = FALSE])
head(scores_new)

# Double Nyström collapses to standard when l = m = N
fit_dbl <- nystrom_approx(
  X, ncomp = ncomp,
  landmarks = 1:nrow(X),
  preproc = center(),
  method = "double",
  l = nrow(X)
)
all.equal(sort(fit_std$sdev^2, decreasing = TRUE),
          sort(fit_dbl$sdev^2, decreasing = TRUE))
```

For large feature counts (`p >> n`), set `method = "double"` and choose a modest intermediate rank `l` to shrink the inner eigenproblem. Provide a custom `kernel_func` if you need a non-linear kernel (e.g., RBF).
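For instance, the sketch below (not evaluated) supplies a Gaussian (RBF) kernel through `kernel_func`. The bandwidth `sigma` is a free parameter here; in practice you would tune it, for example with the same cross-validation machinery.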
```{r nystrom_rbf, eval=FALSE}
# Example RBF kernel
gaussian_kernel <- function(A, B, sigma = 1) {
  # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a·b
  G <- A %*% t(B)
  a2 <- rowSums(A * A)
  b2 <- rowSums(B * B)
  D2 <- outer(a2, b2, "+") - 2 * G
  exp(-D2 / (2 * sigma^2))
}

fit_rbf <- nystrom_approx(
  X, ncomp = 8,
  nlandmarks = 40,
  preproc = center(),
  method = "double",
  l = 20,
  kernel_func = gaussian_kernel
)
scores_rbf <- project(fit_rbf, X[1:10, ])
```

### Test coverage for Nyström

This package includes unit tests that validate Nyström correctness:

- Standard Nyström recovers the exact kernel eigenpairs when `m = N` (centered linear kernel)
- Double Nyström matches standard when `l = m = N`
- `project()` reproduces training scores and matches manual formulas for both methods

See `tests/testthat/test_nystrom.R` in the source for details.

### Cross-validated kernel RMSE: Nyström vs PCA

Below we compare PCA and Nyström (linear kernel) via a kernel-space RMSE on held-out folds. For a test block with preprocessed data `X_test_c`, the true kernel is `K_true = X_test_c %*% t(X_test_c)`. With a rank-`k` model, the approximated kernel is `K_hat = S %*% t(S)`, where `S` are the component scores returned by `project()`.

```{r nystrom_cv_compare}
set.seed(202)

# PCA fit function (same as the earlier fit_pca; redefined so this chunk is self-contained)
fit_pca <- function(train_data, ncomp) {
  pca(train_data, ncomp = ncomp, preproc = center())
}

# Nyström fit function (standard variant, linear kernel; no RSpectra needed for small data)
fit_nystrom <- function(train_data, ncomp, nlandmarks = 50) {
  nystrom_approx(train_data, ncomp = ncomp, nlandmarks = nlandmarks,
                 preproc = center(), method = "standard",
                 use_RSpectra = FALSE)
}

# Kernel-space RMSE metric for a test fold
measure_kernel_rmse <- function(model, test_data) {
  S <- project(model, test_data)
  K_hat <- S %*% t(S)
  Xc <- reprocess(model, test_data)
  K_true <- Xc %*% t(Xc)
  tibble::tibble(kernel_rmse = sqrt(mean((K_hat - K_true)^2)))
}

# Use a local copy of the raw iris data and local folds for this comparison.
# Per Tip 1 above, there is no global preprocessing: centering is fit
# inside each fold via preproc = center().
X_cv <- as.matrix(iris[, 1:4])
K <- 5
fold_ids <- sample(rep(1:K, length.out = nrow(X_cv)))
folds_cv <- lapply(1:K, function(k) list(
  train = which(fold_ids != k),
  test = which(fold_ids == k)
))

# Compare for k = 3 components
k_sel <- 3

cv_pca_kernel <- cv_generic(
  X_cv, folds_cv,
  .fit_fun = fit_pca,
  .measure_fun = measure_kernel_rmse,
  fit_args = list(ncomp = k_sel)
)

cv_nys_kernel <- cv_generic(
  X_cv, folds_cv,
  .fit_fun = fit_nystrom,
  .measure_fun = measure_kernel_rmse,
  fit_args = list(ncomp = k_sel, nlandmarks = 50)
)

metrics_pca <- dplyr::bind_rows(cv_pca_kernel$metrics)
metrics_nys <- dplyr::bind_rows(cv_nys_kernel$metrics)

rmse_pca <- mean(metrics_pca$kernel_rmse, na.rm = TRUE)
rmse_nys <- mean(metrics_nys$kernel_rmse, na.rm = TRUE)

cv_summary <- data.frame(
  method = c("PCA", "Nyström (linear)"),
  kernel_rmse = c(rmse_pca, rmse_nys)
)
print(cv_summary)

# Simple bar plot
ggplot(cv_summary, aes(x = method, y = kernel_rmse, fill = method)) +
  geom_col(width = 0.6) +
  guides(fill = "none") +
  labs(title = "Cross-validated kernel RMSE (k = 3)",
       y = "Kernel RMSE", x = NULL)
```

## Summary

The `multivarious` CV framework provides:

- Easy cross-validation for any dimensionality reduction method
- Flexible metric calculation
- Parallel execution support
- Tidy output format for easy analysis

Use it to make informed decisions about model complexity and to ensure your dimensionality reduction generalizes well to new data.