## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.path = "figures/workflow-", fig.ext = "png", fig.width = 7, fig.height = 4 ) set.seed(19101991) ## ----message=FALSE------------------------------------------------------------ # IMPORTANT: do NOT use devtools in vignettes (it is not available on CRAN/CI runners). # Only load packages that are in Imports/Suggests. for (p in c("prospectr", "pls", "reticulate", "soilVAE")) { if (!requireNamespace(p, quietly = TRUE)) { stop("Package '", p, "' is required to build this vignette. ", "Please install it (it should be installed automatically from Suggests/Imports during checks).") } } library(prospectr) library(pls) library(reticulate) library(soilVAE) ## ----------------------------------------------------------------------------- data("datsoilspc", package = "soilVAE") str(datsoilspc, max.level = 1) ## ----------------------------------------------------------------------------- eval_quant <- function(y, yhat) { y <- as.numeric(y); yhat <- as.numeric(yhat) ok <- is.finite(y) & is.finite(yhat) y <- y[ok]; yhat <- yhat[ok] if (length(y) < 3) { return(list(n = length(y), ME = NA_real_, MAE = NA_real_, RMSE = NA_real_, R2 = NA_real_, RPIQ = NA_real_, RPD = NA_real_)) } err <- yhat - y me <- mean(err) mae <- mean(abs(err)) rmse <- sqrt(mean(err^2)) ss_res <- sum((y - yhat)^2) ss_tot <- sum((y - mean(y))^2) r2 <- if (ss_tot == 0) NA_real_ else 1 - ss_res / ss_tot rpiq <- stats::IQR(y) / rmse rpd <- stats::sd(y) / rmse list(n = length(y), ME = me, MAE = mae, RMSE = rmse, R2 = r2, RPIQ = rpiq, RPD = rpd) } as_df_metrics <- function(x) { data.frame( n = x$n, ME = round(x$ME, 2), MAE = round(x$MAE, 2), RMSE = round(x$RMSE, 2), R2 = round(x$R2, 2), RPIQ = round(x$RPIQ, 2), RPD = round(x$RPD, 2), stringsAsFactors = FALSE ) } ## ----------------------------------------------------------------------------- # Reflectance → absorbance spcA <- log(1 / as.matrix(datsoilspc$spc)) # Resample to 5 nm oldWavs <- as.numeric(colnames(spcA)) newWavs <- seq(min(oldWavs), max(oldWavs), by = 5) spcARs <- prospectr::resample( X = spcA, wav = oldWavs, new.wav = newWavs, interpol = "linear" ) # SNV + moving average spcASnv <- prospectr::standardNormalVariate(spcARs) spcAMovav <- prospectr::movav(spcASnv, w = 11) # Store in object to match book-style workflows datsoilspc$spcAMovav <- spcAMovav ## ----spectra-plot, fig.alt="Preprocessed spectra (SNV + movav)", fig.cap="Preprocessed spectra (SNV + movav)"---- matplot( x = as.numeric(colnames(datsoilspc$spcAMovav)), y = t(datsoilspc$spcAMovav), xlab = "Wavelength / nm", ylab = "Absorbance (SNV + movav)", type = "l", lty = 1, col = rgb(0.5, 0.5, 0.5, alpha = 0.25) ) ## ----------------------------------------------------------------------------- set.seed(19101991) calId <- sample(seq_len(nrow(datsoilspc)), size = round(0.75 * nrow(datsoilspc))) datC <- datsoilspc[calId, ] datV <- datsoilspc[-calId, ] ## ----pls, fig.alt="PLS CV performance", fig.cap="PLS CV performance"---------- maxc <- 30 pls_fit <- pls::plsr( TotalCarbon ~ spcAMovav, data = datC, method = "oscorespls", ncomp = maxc, validation = "CV" ) plot(pls_fit, "val", main = "PLS CV performance", xlab = "Number of components") ## ----------------------------------------------------------------------------- nc <- 14 pls_pred_C <- as.numeric(predict(pls_fit, ncomp = nc, newdata = datC$spcAMovav)) pls_pred_V <- as.numeric(predict(pls_fit, ncomp = nc, newdata = datV$spcAMovav)) pls_cal <- eval_quant(datC$TotalCarbon, pls_pred_C) pls_tst <- eval_quant(datV$TotalCarbon, pls_pred_V) rbind( cbind(Model = "PLS", Split = "Calibration (datC)", as_df_metrics(pls_cal)), cbind(Model = "PLS", Split = "TEST (datV)", as_df_metrics(pls_tst)) ) ## ----------------------------------------------------------------------------- has_py <- reticulate::py_available(initialize = FALSE) has_tf <- FALSE if (has_py) { try(reticulate::py_config(), silent = TRUE) has_tf <- reticulate::py_module_available("tensorflow") && reticulate::py_module_available("keras") } has_py has_tf ## ----------------------------------------------------------------------------- # Predictors X_tr <- scale(as.matrix(datC$spcAMovav)) X_center <- attr(X_tr, "scaled:center") X_scale <- attr(X_tr, "scaled:scale") X_te <- scale(as.matrix(datV$spcAMovav), center = X_center, scale = X_scale) # Target (no transform, keep original units like the PLS baseline) y_tr <- as.numeric(datC$TotalCarbon) y_te <- as.numeric(datV$TotalCarbon) dim(X_tr) length(y_tr) ## ----eval=has_tf-------------------------------------------------------------- # # If needed, pin an env *before* importing TF/Keras: # # soilVAE::vae_configure(conda = "soilvae-tf") # # # Internal split from datC -> train/val for early stopping + model selection # set.seed(19101991) # idx <- seq_len(nrow(X_tr)) # val_id <- sample(idx, size = max(1L, round(0.20 * length(idx)))) # tr_id <- setdiff(idx, val_id) # # X_int_tr <- X_tr[tr_id, , drop = FALSE] # y_int_tr <- y_tr[tr_id] # # X_int_va <- X_tr[val_id, , drop = FALSE] # y_int_va <- y_tr[val_id] # # # A small-but-useful grid # grid_vae <- data.frame( # latent_dim = c(8L, 16L, 32L, 64L), # dropout = c(0.2, 0.3), # lr = c(5e-4), # beta_kl = c(0.01), # alpha_y = c(5), # epochs = c(500L), # batch_size = c(64L, 128L), # patience = c(50L), # stringsAsFactors = FALSE # ) # grid_vae$hidden_enc <- list(c(512L, 256L, 128L)) # grid_vae$hidden_dec <- list(c(128L, 256L, 512L)) # # tuned <- soilVAE::tune_vae_train_val( # X_tr = X_int_tr, y_tr = y_int_tr, # X_va = X_int_va, y_va = y_int_va, # seed = 19101991, # grid_vae = grid_vae # ) # # best <- soilVAE::select_best_from_grid(tuned$tuning_df, selection_metric = "euclid") # cfg <- best$best # # m <- soilVAE::vae_build( # input_dim = ncol(X_tr), # hidden_enc = as.integer(strsplit(cfg$hidden_enc_str, "-")[[1]]), # hidden_dec = as.integer(strsplit(cfg$hidden_dec_str, "-")[[1]]), # latent_dim = as.integer(cfg$latent_dim), # dropout = as.numeric(cfg$dropout), # lr = as.numeric(cfg$lr), # beta_kl = as.numeric(cfg$beta_kl), # alpha_y = as.numeric(cfg$alpha_y) # ) # # soilVAE::vae_fit( # model = m, # X = X_int_tr, y = y_int_tr, # X_val = X_int_va, y_val = y_int_va, # epochs = as.integer(cfg$epochs), # batch_size = as.integer(cfg$batch_size), # patience = as.integer(cfg$patience), # verbose = 0L # ) # # # Predict on: internal train, internal val, and external TEST (datV) # yhat_int_tr <- soilVAE::vae_predict(m, X_int_tr) # yhat_int_va <- soilVAE::vae_predict(m, X_int_va) # yhat_te <- soilVAE::vae_predict(m, X_te) # # vae_tr <- eval_quant(y_int_tr, yhat_int_tr) # vae_va <- eval_quant(y_int_va, yhat_int_va) # vae_te <- eval_quant(y_te, yhat_te) # # rbind( # cbind(Model = "soilVAE", Split = "Train (internal)", as_df_metrics(vae_tr)), # cbind(Model = "soilVAE", Split = "Val (internal)", as_df_metrics(vae_va)), # cbind(Model = "soilVAE", Split = "TEST (datV)", as_df_metrics(vae_te)) # ) ## ----eval=has_tf-------------------------------------------------------------- # tab <- rbind( # cbind(Model = "PLS", Split = "Calibration (datC)", as_df_metrics(pls_cal)), # cbind(Model = "PLS", Split = "TEST (datV)", as_df_metrics(pls_tst)), # cbind(Model = "soilVAE",Split = "Train (internal)", as_df_metrics(vae_tr)), # cbind(Model = "soilVAE",Split = "Val (internal)", as_df_metrics(vae_va)), # cbind(Model = "soilVAE",Split = "TEST (datV)", as_df_metrics(vae_te)) # ) # # row.names(tab) <- NULL # tab