## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----echo=FALSE,include=FALSE,eval=TRUE----------------------------------------
# devtools::install_github("SMAC-Group/SWAG-R-Package")
library(swag) # load the new package

## ---- eval=F,echo=TRUE----------------------------------------------------------
# remotes::install_github("SMAC-Group/SWAG-R-Package")
#
# library(swag) # load the new package

## ----BreastCancer, eval=T--------------------------------------------------------
# After having installed the mlbench package
data(BreastCancer, package = "mlbench")

# Pre-processing of the data
y <- BreastCancer$Class # response variable
x <- as.matrix(BreastCancer[setdiff(names(BreastCancer), c("Id", "Class"))]) # features

# Remove observations with missing values and convert the features to numeric
id <- which(apply(x, 1, function(x) sum(is.na(x))) > 0)
y <- y[-id]
x <- x[-id, ]
x <- apply(x, 2, as.numeric)

# Training and test set
set.seed(180) # for replication
ind <- sample(1:dim(x)[1], dim(x)[1] * 0.2)
y_test <- y[ind]
y_train <- y[-ind]
x_test <- x[ind, ]
x_train <- x[-ind, ]

## ----caret, warning=FALSE---------------------------------------------------------
## if not installed
## install.packages("caret")
library(caret)

## ----control-swag, eval=T----------------------------------------------------------
# Meta-parameters chosen for the breast cancer dataset
swagcon <- swagControl(
  pmax = 4L,
  alpha = 0.5,
  m = 20L,
  seed = 163L,   # for replicability
  verbose = TRUE # keeps track of completed dimensions
)
# Given the low-dimensional dataset, we can afford a wider search by fixing
# alpha = 0.5, since a smaller alpha may stop the training procedure earlier
# than expected.

## ---- eval=FALSE, message=FALSE,warning=FALSE,echo=FALSE----------------------------
# library(caret) # swag is built around caret and uses it to train each learner

## ----SVM, eval=TRUE, warning=FALSE,message=FALSE------------------------------------
## SVM Linear Learner
## `kernlab` is needed
## if not installed, install.packages("kernlab")
train_swag_svml <- swag(
  # arguments for swag
  x = x_train,
  y = y_train,
  control = swagcon,
  auto_control = FALSE,
  # arguments for caret
  trControl = trainControl(method = "repeatedcv", number = 10, repeats = 1,
                           allowParallel = FALSE), # trainControl is from the caret package
  metric = "Accuracy",
  method = "svmLinear", # use method = "svmRadial" to train this specific learner
  preProcess = c("center", "scale")
)

## ----CVs, eval=T--------------------------------------------------------------------
# A list which contains the CV training errors of each learner explored in a
# given dimension
train_swag_svml$CVs

## ----VarMat, eval=T------------------------------------------------------------------
# A list which contains a matrix, for each dimension, with the attributes
# tested at that step
train_swag_svml$VarMat

## ----cv-alpha, eval= T----------------------------------------------------------------
# The cut-off CV training error, at each dimension, determined by the choice of alpha
train_swag_svml$cv_alpha
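## ----cv-summary-sketch, eval=FALSE----------------------------------------------------
# # A minimal sketch (not part of the original vignette): since `CVs` is a list
# # with one vector of CV training errors per dimension, base R is enough to
# # summarise the explored learner space. Assumes the SVM swag object trained
# # above is available.
# sapply(train_swag_svml$CVs, summary) # error distribution for each dimension
# boxplot(train_swag_svml$CVs,
#         xlab = "Model dimension",
#         ylab = "CV training error")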
## ----lasso, eval=TRUE-----------------------------------------------------------------
## Lasso Learner
## `glmnet` is needed
## if not installed, install.packages("glmnet")
train_swag_lasso <- swag(
  # arguments for swag
  x = x,
  y = y,
  control = swagcon,
  auto_control = FALSE,
  # arguments for caret
  trControl = trainControl(method = "repeatedcv", number = 10, repeats = 1,
                           allowParallel = FALSE),
  metric = "Accuracy",
  method = "glmnet",
  tuneGrid = expand.grid(alpha = 1,
                         lambda = seq(0, .35, length.out = 10)),
  family = "binomial",
  # dynamically modify arguments for caret: glmnet requires at least two
  # features, so fall back to glm for one-dimensional models
  caret_args_dyn = function(list_arg, iter) {
    if (iter == 1) {
      list_arg$method <- "glm"
      list_arg$tuneGrid <- NULL
    }
    list_arg
  }
)

## ----random-forest, eval=TRUE----------------------------------------------------------
## Random Forest Learner
## `randomForest` is needed
## if not installed, install.packages("randomForest")
train_swag_rf <- swag(
  # arguments for swag
  x = x,
  y = y,
  control = swagcon,
  auto_control = FALSE,
  # arguments for caret
  trControl = trainControl(method = "repeatedcv", number = 10, repeats = 1,
                           allowParallel = FALSE),
  metric = "Accuracy",
  method = "rf",
  # dynamically modify arguments for caret: adapt mtry to the current dimension
  caret_args_dyn = function(list_arg, iter) {
    list_arg$tuneGrid <- expand.grid(.mtry = sqrt(iter))
    list_arg
  }
)

## ---- eval=F, echo=FALSE----------------------------------------------------------------
# # IN-SAMPLE
#
# # predictions below a given CV error in-sample
# train_pred <- predict(train_swag_svml,
#                       newdata = x_train,
#                       type = "cv_performance",
#                       cv_performance = 0.05)
#
# # predictions for a given dimension in-sample
# train_pred_att <- predict(train_swag_svml, newdata = x_train,
#                           type = "attribute", attribute = 4)

## ----predictions, eval=T-----------------------------------------------------------------
# Best learner predictions: if `newdata` is not specified, then predict gives
# predictions based on the training sample
sapply(predict(object = train_swag_svml), function(x) head(x))

# Best learner predictions on the test set
best_pred <- predict(object = train_swag_svml, newdata = x_test)
sapply(best_pred, function(x) head(x))

# Predictions for a given dimension
dim_pred <- predict(
  object = train_swag_svml,
  newdata = x_test,
  type = "attribute",
  attribute = 4L
)
sapply(dim_pred, function(x) head(x))

# Predictions below a given CV error
cv_pred <- predict(
  object = train_swag_svml,
  newdata = x_test,
  type = "cv_performance",
  cv_performance = 0.04
)
sapply(cv_pred, function(x) head(x))

## ----confusion-matrix, eval=T--------------------------------------------------------------
# Transform predictions into a factor with the levels of `y_test`
best_learn <- factor(levels(y_test)[best_pred$predictions])
confusionMatrix(best_learn, y_test) # from the caret package
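## ----parallel-sketch, eval=FALSE------------------------------------------------------------
# # A minimal sketch (not part of the original vignette): the calls above fix
# # allowParallel = FALSE, but caret can train the CV folds in parallel once a
# # foreach backend is registered. This assumes the `doParallel` package is
# # installed and that swag forwards `trControl` to caret unchanged.
# library(doParallel)
# cl <- makePSOCKcluster(2) # two worker processes
# registerDoParallel(cl)
# train_swag_svml_par <- swag(
#   x = x_train, y = y_train, control = swagcon, auto_control = FALSE,
#   trControl = trainControl(method = "repeatedcv", number = 10, repeats = 1,
#                            allowParallel = TRUE),
#   metric = "Accuracy", method = "svmLinear",
#   preProcess = c("center", "scale")
# )
# stopCluster(cl)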