## ---- include = FALSE---------------------------------------------------------
# Chunk options applied to the whole document: source and output are collapsed
# into one block, and printed output lines are prefixed with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## -----------------------------------------------------------------------------
# To install the development version from GitHub, uncomment and run:
# install.packages("devtools")
# devtools::install_github("SchlossLab/mikropml")
library(mikropml)

# Peek at the first rows of the example dataset used below.
head(otu_mini_bin, n = 6L)

## ---- eval = FALSE------------------------------------------------------------
#  results <- run_ml(otu_mini_bin,
#    "glmnet",
#    outcome_colname = "dx",
#    seed = 2019
#  )

## ---- echo = FALSE------------------------------------------------------------
# Use the precomputed results object instead of calling run_ml() here, which
# keeps the vignette build fast.
results <- otu_mini_bin_results_glmnet

## -----------------------------------------------------------------------------
# Top-level components of the results list.
names(results)

## -----------------------------------------------------------------------------
# Components of the trained model object.
names(results[["trained_model"]])

## -----------------------------------------------------------------------------
# First rows of the test data stored in the results.
head(results[["test_data"]])

## -----------------------------------------------------------------------------
results[["performance"]]

## -----------------------------------------------------------------------------
results[["feature_importance"]]

## -----------------------------------------------------------------------------
# Train glmnet with custom resampling arguments (kfold, cv_times) and a custom
# training fraction.
results_custom <- run_ml(
  dataset = otu_mini_bin,
  method = "glmnet",
  kfold = 2,
  cv_times = 5,
  training_frac = 0.5,
  seed = 2019
)

## ----custom_train_indices, warning=FALSE--------------------------------------
# Choose the training rows by hand: a random 80% of the observations, passed
# to run_ml() as a vector of row indices instead of a fraction.
n_obs <- nrow(otu_mini_bin)

# Make the requested sample size an explicit whole number rather than relying
# on sample() to truncate a fractional size.
training_size <- floor(0.8 * n_obs)

# Seed the RNG before drawing so the same rows are selected on every build.
# The `seed` argument of run_ml() is only passed further down, after the rows
# have already been drawn.
set.seed(2019)
training_rows <- sample.int(n_obs, training_size)

results_custom_train <- run_ml(otu_mini_bin,
  "glmnet",
  kfold = 2,
  cv_times = 5,
  training_frac = training_rows,
  seed = 2019
)

## ---- echo=FALSE--------------------------------------------------------------
# TODO: can we get these programmatically somehow instead of hard-coding them?
# First hard-coded vector of performance metric names.
c(
  "logLoss", "AUC", "prAUC", "Accuracy", "Kappa",
  "Mean_F1", "Mean_Sensitivity", "Mean_Specificity",
  "Mean_Pos_Pred_Value", "Mean_Neg_Pred_Value",
  "Mean_Precision", "Mean_Recall",
  "Mean_Detection_Rate", "Mean_Balanced_Accuracy"
)

## ---- echo=FALSE--------------------------------------------------------------
# Second hard-coded vector of performance metric names.
c(
  "RMSE",
  "Rsquared",
  "MAE"
)

## -----------------------------------------------------------------------------
# Train glmnet with "prAUC" passed as the performance metric name.
results_pr <- run_ml(
  dataset = otu_mini_bin,
  method = "glmnet",
  cv_times = 5,
  perf_metric_name = "prAUC",
  seed = 2019
)

## -----------------------------------------------------------------------------
results_pr[["performance"]]

## ----custom_groups, warning=FALSE---------------------------------------------
# Assign every observation to one of eight random groups ("A" through "H"),
# drawn with replacement under a fixed seed.
set.seed(2019)
grps <- sample(
  x = LETTERS[1:8],
  size = nrow(otu_mini_bin),
  replace = TRUE
)

# Pass the group labels to run_ml().
results_grp <- run_ml(
  dataset = otu_mini_bin,
  method = "glmnet",
  cv_times = 2,
  training_frac = 0.8,
  groups = grps,
  seed = 2019
)

## ----group_partitions, warning=FALSE------------------------------------------
# Same grouped run, additionally listing groups A and B under `train` and
# groups C and D under `test`.
results_grp_part <- run_ml(
  dataset = otu_mini_bin,
  method = "glmnet",
  cv_times = 2,
  training_frac = 0.8,
  groups = grps,
  group_partitions = list(train = LETTERS[1:2], test = LETTERS[3:4]),
  seed = 2019
)

## ----only_group_A_train, warning = FALSE--------------------------------------
# Grouped run where groups A-F are listed under `train` and all eight groups
# (A-H) are listed under `test`.
results_grp_trainA <- run_ml(
  dataset = otu_mini_bin,
  method = "glmnet",
  cv_times = 2,
  kfold = 2,
  training_frac = 0.5,
  groups = grps,
  group_partitions = list(
    train = LETTERS[1:6],
    test = LETTERS[1:8]
  ),
  seed = 2019
)

## ----calc-case-weights, message = FALSE---------------------------------------
set.seed(20221016)
library(dplyr)

# Row indices of `otu_mini_bin` that make up a 70% training partition.
train_set_indices <- get_partition_indices(otu_mini_bin %>% pull(dx),
  training_frac = 0.70
)

# Proportion of observations in each outcome class; used as the case weight.
dx_props <- otu_mini_bin %>%
  count(dx) %>%
  mutate(p = n / sum(n)) %>%
  select(dx, p)

# One row per training observation, with its class proportion `p` and its row
# index `row_num` in `otu_mini_bin`.
#
# `row_num` must be assigned BEFORE joining: it is later used to index rows of
# `otu_mini_bin`, and a join driven by the per-class table returns rows grouped
# by class rather than in the dataset's own order, which would pair weights
# with the wrong observations.
case_weights_dat <- otu_mini_bin %>%
  select(-starts_with("Otu")) %>%
  mutate(row_num = row_number()) %>%
  left_join(dx_props, by = "dx") %>%
  mutate(in_train = row_num %in% train_set_indices) %>%
  # keep the column order dx, p, row_num, in_train
  select(dx, p, everything()) %>%
  filter(in_train)
head(case_weights_dat)
tail(case_weights_dat)
# Fraction of the dataset that ended up in the training set.
nrow(case_weights_dat) / nrow(otu_mini_bin)

## ----weighted-results, eval = FALSE-------------------------------------------
#  results_weighted <- run_ml(otu_mini_bin,
#    "glmnet",
#    outcome_colname = "dx",
#    seed = 2019,
#    training_frac = case_weights_dat %>% pull(row_num),
#    weights = case_weights_dat %>% pull(p)
#  )

## ---- eval = FALSE------------------------------------------------------------
#  results_imp <- run_ml(otu_mini_bin,
#    "rf",
#    outcome_colname = "dx",
#    find_feature_importance = TRUE,
#    seed = 2019
#  )

## ---- echo = FALSE------------------------------------------------------------
# Use the precomputed random forest results object.
results_imp <- otu_mini_bin_results_rf

## -----------------------------------------------------------------------------
results_imp[["feature_importance"]]

## -----------------------------------------------------------------------------
# Train glmnet with feature importance enabled and corr_thresh set to 0.2,
# then show the feature importance table.
results_imp_corr <- run_ml(
  dataset = otu_mini_bin,
  method = "glmnet",
  cv_times = 5,
  find_feature_importance = TRUE,
  corr_thresh = 0.2,
  seed = 2019
)
results_imp_corr[["feature_importance"]]

## ---- eval = FALSE------------------------------------------------------------
#  results_rf <- run_ml(otu_mini_bin,
#    "rf",
#    cv_times = 5,
#    seed = 2019
#  )

## ---- eval = FALSE------------------------------------------------------------
#  results_rf_nt <- run_ml(otu_mini_bin,
#    "rf",
#    cv_times = 5,
#    ntree = 1000,
#    seed = 2019
#  )

## ---- eval = FALSE------------------------------------------------------------
#  results_dt <- run_ml(otu_mini_bin,
#    "rpart2",
#    cv_times = 5,
#    seed = 2019
#  )

## ---- eval = FALSE------------------------------------------------------------
#  results_svm <- run_ml(otu_mini_bin,
#    "svmRadial",
#    cv_times = 5,
#    seed = 2019
#  )

## -----------------------------------------------------------------------------
# Distinct values of the `dx` column in the multiclass example dataset.
unique(dplyr::pull(otu_mini_multi, "dx"))

## ---- eval = FALSE------------------------------------------------------------
#  results_multi <- run_ml(otu_mini_multi,
#    outcome_colname = "dx",
#    seed = 2019
#  )

## ---- echo = FALSE------------------------------------------------------------
# Use the precomputed multiclass glmnet results object.
results_multi <- otu_mini_multi_results_glmnet

## -----------------------------------------------------------------------------
results_multi[["performance"]]

## ---- eval = FALSE------------------------------------------------------------
#  results_cont <- run_ml(otu_mini_bin[, 2:11],
#    "glmnet",
#    outcome_colname = "Otu00001",
#    seed = 2019
#  )

## ---- echo = FALSE------------------------------------------------------------
# Use the precomputed continuous-outcome glmnet results object.
results_cont <- otu_mini_cont_results_glmnet

## -----------------------------------------------------------------------------
results_cont[["performance"]]