## -----------------------------------------------------------------------------
#| echo: false
options(cli.num_colors = 1)
Sys.setenv(OMP_NUM_THREADS = 2)


## -----------------------------------------------------------------------------
#| message: false
#| echo: false
library(resemble)


## -----------------------------------------------------------------------------
#| label: fit-methods
#| eval: true

# Creates an object with instructions to build PLS models
my_pls <- fit_pls(ncomp = 15)
my_pls

# Creates an object with instructions to build WAPLS models
my_wapls <- fit_wapls(min_ncomp = 3, max_ncomp = 20)
my_wapls

# Creates an object with instructions to build GPR models
my_gpr <- fit_gpr()
my_gpr


## -----------------------------------------------------------------------------
#| label: validation-control
#| eval: true
# Create an object with instructions to conduct both validation types
# "NNv" and "local_cv"
two_val_control <- mbl_control(
  validation_type = c("NNv", "local_cv"),
  number = 10,
  p = 0.75
)


## -----------------------------------------------------------------------------
#| message: false
library(resemble)
library(prospectr)

# obtain a numeric vector of the wavelengths at which spectra is recorded 
wavs <- as.numeric(colnames(NIRsoil$spc))

# pre-process the spectra:
# - use detrend
# - use first order derivative
diff_order <- 1
poly_order <- 1
window <- 7

# Preprocess spectra
NIRsoil$spc_pr <- savitzkyGolay(
  detrend(NIRsoil$spc, wav = wavs),
  m = diff_order, p = poly_order, w = window
)
train_x <- NIRsoil$spc_pr[NIRsoil$train == 1, ]
train_y <- NIRsoil$Ciso[NIRsoil$train == 1]

test_x  <- NIRsoil$spc_pr[NIRsoil$train == 0, ]
test_y  <- NIRsoil$Ciso[NIRsoil$train == 0]


## -----------------------------------------------------------------------------
#| label: mbl-local
#| eval: true
#| results: hide
# Define the neighborhood sizes to test
my_ks <- seq(80, 160, by = 40)

# Define how to use the dissimilarity information (ignore it)
ignore_diss <- "none"

# Define the regression method to be used at each neighborhood
my_wapls <- fit_wapls(min_ncomp = 3, max_ncomp = 25, scale = TRUE)

# For the moment use only "NNv" validation (faster)
nnv_val_control <- mbl_control()

# Predict Total Carbon
# (remove missing values)
local_ciso <- mbl(
  Xr = train_x[!is.na(train_y), ],
  Yr = train_y[!is.na(train_y)],
  Xu = test_x,
  neighbors = neighbors_k(my_ks),
  diss_method = diss_correlation(center = FALSE),
  diss_usage = ignore_diss,
  fit_method = my_wapls,
  gh = TRUE,
  control = nnv_val_control
)


## -----------------------------------------------------------------------------
#| label: fig-localresultsciso
#| fig-cap: "MBL results for Total Carbon predictions using the LOCAL algorithm. NNv: nearest-neighbor cross-validation."
#| fig-align: "center"
#| fig-width: 7.5
#| fig-height: 4
plot(local_ciso, main = "")
local_ciso


## -----------------------------------------------------------------------------
#| label: bestk
#| eval: true
#| echo: false
#| results: hide
bestk <- which.min(local_ciso$validation_results$nearest_neighbor_validation$rmse)
bestk <- local_ciso$validation_results$nearest_neighbor_validation$k[bestk]


## -----------------------------------------------------------------------------
#| label: get-predictions
#| eval: true
#| results: hide
#| fig-show: hide
bki <- which.min(local_ciso$validation_results$nearest_neighbor_validation$rmse)
bk <- local_ciso$validation_results$nearest_neighbor_validation$k[bki]

# All the prediction results are stored in:
local_ciso$results

# The get_predictions function makes easier to retrieve the
# predictions from the previous object
ciso_hat <- as.matrix(get_predictions(local_ciso))[, bki]


## -----------------------------------------------------------------------------
#| label: fig-plot-predictions
#| fig-cap: "Predicted vs reference values for Total Carbon using the LOCAL algorithm with the best neighborhood size according to nearest-neighbor validation."
#| eval: true
#| fig-width: 5
#| fig-height: 5
# Plot predicted vs reference
rng <- range(ciso_hat, test_y, na.rm = TRUE)
plot(ciso_hat, test_y,
     xlim = rng,
     ylim = rng,
     xlab = "Predicted Total Carbon, %",
     ylab = "Total Carbon, %",
     main = "LOCAL using a fixed k", 
     cex = 1.5,
     pch = 16, col = rgb(0.5, 0.5, 0.5, 0.6))
grid(lty = 1)
abline(0, 1, col = "red")


## -----------------------------------------------------------------------------
#| eval: true
# Prediction RMSE:
sqrt(mean((ciso_hat - test_y)^2, na.rm = TRUE))

# Squared R
cor(ciso_hat, test_y, use = "complete.obs")^2


## -----------------------------------------------------------------------------
#| label: mbl-diss-threshold
#| eval: true
#| results: hide
#| fig-show: hide
# Create a vector of dissimilarity thresholds to evaluate
# since the correlation dissimilarity will be used
# these thresholds need to be > 0 and <= 1
dths <- seq(0.025, 0.15, by = 0.025)

# Indicate the minimum and maximum sizes allowed for the neighborhood
k_min <- 30
k_max <- 150

local_ciso_diss <- mbl(
  Xr = train_x[!is.na(train_y), ],
  Yr = train_y[!is.na(train_y)],
  Xu = test_x,
  neighbors = neighbors_diss(threshold = dths, k_min = k_min, k_max = k_max),
  diss_method = diss_correlation(center = FALSE),
  diss_usage = ignore_diss,
  fit_method = my_wapls,
  control = nnv_val_control
)


## -----------------------------------------------------------------------------
#| label: plot-diss-results
#| eval: false
# plot(local_ciso_diss)


## -----------------------------------------------------------------------------
#| label: bestd
#| eval: true
#| echo: false
#| results: hide
bestd <- which.min(local_ciso_diss$validation_results$nearest_neighbor_validation$rmse)
bestd <- local_ciso_diss$validation_results$nearest_neighbor_validation$k_diss[bestd]


## -----------------------------------------------------------------------------
#| label: show-diss-results
#| eval: true
local_ciso_diss


## -----------------------------------------------------------------------------
#| label: diss-predictions
#| eval: true
#| results: hide
#| fig-show: hide
# Best distance threshold
bdi <- which.min(local_ciso_diss$validation_results$nearest_neighbor_validation$rmse)
bd <- local_ciso_diss$validation_results$nearest_neighbor_validation$k_diss[bdi]

# Predictions for the best distance
ciso_diss_hat <- as.matrix(get_predictions(local_ciso_diss))[, bdi]


## -----------------------------------------------------------------------------
#| label: fig-diss-predictions
#| fig-cap: "Predicted vs reference values for Total Carbon using the LOCAL algorithm with the best neighborhood size according to nearest-neighbor validation and distance thresholds."
#| eval: true
# Plot predicted vs reference
plot(ciso_diss_hat, test_y,
     xlim = rng,
     ylim = rng,
     xlab = "Predicted Total Carbon, %",
     ylab = "Total Carbon, %",
     main = "LOCAL using a distance threshold \nfor neighbor retrieval", 
     cex = 1.5,
     pch = 16, col = rgb(0.5, 0.5, 0.5, 0.6))
grid()
abline(0, 1, col = "red")

## -----------------------------------------------------------------------------
# RMSE
sqrt(mean((ciso_diss_hat - test_y)^2, na.rm = TRUE))

# Squared R
cor(ciso_diss_hat, test_y, use = "complete.obs")^2


## -----------------------------------------------------------------------------
train_x <- NIRsoil$spc_pr[NIRsoil$train == 1, ]
train_cec <- NIRsoil$CEC[NIRsoil$train == 1]

test_x  <- NIRsoil$spc_pr[NIRsoil$train == 0, ]
test_cec  <- NIRsoil$CEC[NIRsoil$train == 0]


## -----------------------------------------------------------------------------
#| label: cec-examples
#| results: hide
#| eval: true
#| fig-show: hide
# Define the WAPLS fitting method
my_wapls <- fit_wapls(min_ncomp = 2, max_ncomp = 25, scale = FALSE)

# mbl_cor: LOCAL algorithm with correlation dissimilarity
dth_cor <- seq(0.01, 0.3, by = 0.08)
mbl_cor <- mbl(
  Xr = train_x[!is.na(train_cec), ],
  Yr = train_cec[!is.na(train_cec)],
  Xu = test_x,
  neighbors = neighbors_diss(threshold = dth_cor, k_min = 80, k_max = 150),
  diss_method = diss_correlation(),
  diss_usage = "none",
  fit_method = my_wapls,
  control = nnv_val_control
)

# mbl_pc: PCA dissimilarity with dissimilarity matrix as predictors
dth_pc <- seq(0.05, 1, by = 0.4)
mbl_pc <- mbl(
  Xr = train_x[!is.na(train_cec), ],
  Yr = train_cec[!is.na(train_cec)],
  Xu = test_x,
  neighbors = neighbors_diss(threshold = dth_pc, k_min = 80, k_max = 150),
  diss_method = diss_pca(ncomp = ncomp_by_opc(), scale = TRUE),
  diss_usage = "predictors",
  fit_method = my_wapls,
  control = nnv_val_control
)

# mbl_pls: PLS dissimilarity without dissimilarity predictors
mbl_pls <- mbl(
  Xr = train_x[!is.na(train_cec), ],
  Yr = train_cec[!is.na(train_cec)],
  Xu = test_x,
  Yu = test_cec,
  neighbors = neighbors_diss(threshold = dth_pc, k_min = 80, k_max = 150),
  diss_method = diss_pls(ncomp = ncomp_by_opc(), scale = TRUE),
  diss_usage = "none",
  fit_method = my_wapls,
  control = nnv_val_control
)

# mbl_gpr: Gaussian process regression with PCA dissimilarity as predictors
mbl_gpr <- mbl(
  Xr = train_x[!is.na(train_cec), ],
  Yr = train_cec[!is.na(train_cec)],
  Xu = test_x,
  neighbors = neighbors_diss(threshold = dth_pc, k_min = 80, k_max = 150),
  diss_method = diss_pca(ncomp = ncomp_by_opc(), scale = TRUE),
  diss_usage = "predictors",
  fit_method = fit_gpr(),
  control = nnv_val_control
)


## -----------------------------------------------------------------------------
#| label: collect-predictions
#| eval: true
#| fig-show: hide
# Get the indices of the best results according to
# nearest neighbor validation statistics
c_val_name <- "validation_results"
c_nn_val_name <- "nearest_neighbor_validation"

bi_cor <- which.min(mbl_cor[[c_val_name]][[c_nn_val_name]]$rmse)
bi_pc  <- which.min(mbl_pc[[c_val_name]][[c_nn_val_name]]$rmse)
bi_pls <- which.min(mbl_pls[[c_val_name]][[c_nn_val_name]]$rmse)
bi_gpr <- which.min(mbl_gpr[[c_val_name]][[c_nn_val_name]]$rmse)

preds <- cbind(
  get_predictions(mbl_cor)[, bi_cor],
  get_predictions(mbl_pc)[, bi_pc],
  get_predictions(mbl_pls)[, bi_pls],
  get_predictions(mbl_gpr)[, bi_gpr]
)

colnames(preds) <- c("mbl_cor", "mbl_pc", "mbl_pls", "mbl_gpr")
preds <- as.matrix(preds)

# R2s
cor(test_cec, preds, use = "complete.obs")^2

# RMSEs
colMeans((preds - test_cec)^2, na.rm = TRUE)^0.5


## -----------------------------------------------------------------------------
#| label: cec-plots-code
#| eval: false
#| fig-show: hide
# old_par <- par("mfrow", "mar")
# par(mfrow = c(2, 2))
# 
# plot(test_cec, preds[, "mbl_cor"],
#      xlab = "CEC, meq/100g",
#      ylab = "Predicted CEC, meq/100g",
#      main = "mbl_cor (LOCAL)")
# abline(0, 1, col = "red")
# 
# plot(test_cec, preds[, "mbl_pc"],
#      xlab = "CEC, meq/100g",
#      ylab = "Predicted CEC, meq/100g",
#      main = "mbl_pc")
# abline(0, 1, col = "red")
# 
# plot(test_cec, preds[, "mbl_pls"],
#      xlab = "CEC, meq/100g",
#      ylab = "Predicted CEC, meq/100g",
#      main = "mbl_pls")
# abline(0, 1, col = "red")
# 
# plot(test_cec, preds[, "mbl_gpr"],
#      xlab = "CEC, meq/100g",
#      ylab = "Predicted CEC, meq/100g",
#      main = "mbl_gpr")
# abline(0, 1, col = "red")
# 
# par(old_par)


## -----------------------------------------------------------------------------
#| label: fig-mblcomparisons
#| fig-cap: "CEC prediction results for the different MBL configurations tested"
#| fig-align: center
#| fig-width: 8
#| fig-height: 8
#| echo: false
old_par <- par("mfrow", "mar")
par(mfrow = c(2, 2), pch = 16, mar = c(4, 4, 4, 4))

my_cols <- c(
  "mbl_cor" = "#D42B08CC",
  "mbl_pc"  = "#750E3380",
  "mbl_pls" = "#EFBF4780",
  "mbl_gpr" = "#5177A180"
)

# R2s
r2s <- drop(round(cor(test_cec, preds, use = "complete.obs")^2, 2))

# RMSEs
rmses <- round(colMeans((preds - test_cec)^2, na.rm = TRUE)^0.5, 2)

plot_titles <- c(
  "mbl_cor" = "mbl_cor (LOCAL)",
  "mbl_pc"  = "mbl_pc",
  "mbl_pls" = "mbl_pls",
  "mbl_gpr" = "mbl_gpr"
)

p <- sapply(
  colnames(preds),
  FUN = function(label, y, yhats, cols, rsq, rmse, titles) {
    plot(
      x = y,
      y = yhats[, label],
      xlim = range(y, na.rm = TRUE),
      ylim = range(y, na.rm = TRUE),
      xlab = "CEC, meq/100g",
      ylab = "Predicted CEC, meq/100g",
      col = cols[label]
    )
    title(titles[label])
    title(paste0("\n\n\n RMSE: ", rmse[label], "; R²: ", rsq[label]), cex.main = 1)
    grid(col = "#80808080", lty = 1)
    abline(0, 1, col = "#FF1A0080")
  },
  y = test_cec,
  yhats = preds,
  cols = my_cols,
  rsq = r2s,
  rmse = rmses,
  titles = plot_titles
)

par(old_par)


## -----------------------------------------------------------------------------
#| label: yu-argument
#| eval: true
# Use Yu argument to validate the predictions
pc_pred_cec_yu <- mbl(
  Xr = train_x[!is.na(train_cec), ],
  Yr = train_cec[!is.na(train_cec)],
  Xu = test_x,
  Yu = test_cec,
  neighbors = neighbors_k(80),
  diss_method = diss_pca(scale = TRUE),
  diss_usage = "none",
  verbose = FALSE,
  control = mbl_control()
)

pc_pred_cec_yu


## -----------------------------------------------------------------------------
#| label: parallel-example
#| eval: false
# # Running the mbl function using multiple cores
# 
# # Execute with two cores, if available, ...
# n_cores <- 2
# 
# # ... if not then go with 1 core
# if (parallel::detectCores() < 2) {
#   n_cores <- 1
# }
# 
# # Set the number of cores
# library(doParallel)
# clust <- makeCluster(n_cores)
# registerDoParallel(clust)
# 
# # Alternatively:
# # library(doSNOW)
# # clust <- makeCluster(n_cores, type = "SOCK")
# # registerDoSNOW(clust)
# # getDoParWorkers()
# 
# pc_pred_cec <- mbl(
#   Xr = train_x[!is.na(train_cec), ],
#   Yr = train_cec[!is.na(train_cec)],
#   Xu = test_x,
#   neighbors = neighbors_k(seq(40, 100, by = 10)),
#   diss_method = diss_pca(ncomp = ncomp_by_opc(), scale = TRUE),
#   diss_usage = "none",
#   control = mbl_control()
# )
# 
# # Go back to sequential processing
# registerDoSEQ()
# try(stopCluster(clust))
# 
# pc_pred_cec