## ----style, echo = FALSE, results = 'asis'------------------------------------
# Tangled R code from a knitr/BiocStyle vignette (multiclassPairs package).
# Chunk headers (## ----name----) are preserved verbatim from the vignette.
BiocStyle::markdown()

## ----setup, include=FALSE, echo=F, warning= F, message=F----------------------
# Global chunk options for the vignette build.
knitr::opts_chunk$set(
  message = FALSE,
  #fig.height = 6,
  fig.width = 6,
  warning = FALSE,
  error = FALSE,
  tidy = FALSE,
  fig.align = "center",
  # dpi = 600,
  cache = TRUE,
  progress = FALSE,
  # NOTE(review): "quite" is not a documented knitr chunk option — presumably a
  # typo for "quiet"; knitr silently ignores unknown options, so this is
  # harmless, but confirm the intent before releasing.
  quite = TRUE
)
library(BiocStyle)
#BiocStyle::latex(relative.path = TRUE)
library(knitr)

## ---- message=FALSE-----------------------------------------------------------
# Install the released version from CRAN using
if (!requireNamespace("multiclassPairs", quietly = TRUE)) {
  install.packages("multiclassPairs")
}

# Or install the dev version from GitHub using
# if (!requireNamespace("multiclassPairs", quietly = TRUE)) {
#   if (!requireNamespace("devtools", quietly = TRUE)) {
#     install.packages("devtools")
#   }
#   library(devtools) # this package is needed to install from GitHub
#   install_github("NourMarzouka/multiclassPairs", build_vignettes = TRUE)
#}

# Install the dependencies from Bioconductor
# BiocManager, Biobase, and switchBox packages from Bioconductor are needed
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!requireNamespace("Biobase", quietly = TRUE)) {
  BiocManager::install("Biobase")
}
if (!requireNamespace("switchBox", quietly = TRUE)) {
  BiocManager::install("switchBox")
}

# load multiclassPairs library
library(multiclassPairs)

## ---- echo=FALSE,fig.cap="Workflow in multiclassPairs R package: Functions are colored in green."----
knitr::include_graphics("images/workflow_v0_3.png")

## ---- message=FALSE-----------------------------------------------------------
library(multiclassPairs)

# example of creating a data object from a plain matrix;
# we will generate fake data in this example

# 100x100 matrix with random values; rows are "genes" G1..G100,
# columns are "samples" S1..S100
Data <- matrix(runif(10000),
               nrow = 100,
               ncol = 100,
               dimnames = list(paste0("G", 1:100),
                               paste0("S", 1:100)))

# class labels (one of A/B/C per sample)
L1 <- sample(x = c("A","B","C"), size = 100, replace = TRUE)

# platform/study labels (one of P1/P2 per sample)
P1 <- sample(x = c("P1","P2"), size = 100, replace = TRUE)

# create the data object
object <- ReadData(Data = Data,
                   Labels = L1,
                   Platform = P1,
                   verbose = FALSE)
object

## ---- message=FALSE-----------------------------------------------------------
library(multiclassPairs, quietly = TRUE)

# install Leukemia cancers data
if (!requireNamespace("BiocManager", quietly = TRUE)){
  install.packages("BiocManager")
}
if (!requireNamespace("leukemiasEset", quietly = TRUE)){
  BiocManager::install("leukemiasEset")
}

# load the data package
library(leukemiasEset, quietly = TRUE)
data(leukemiasEset)

## ---- message=FALSE-----------------------------------------------------------
# check the ExpressionSet
leukemiasEset

# explore the phenotype data
knitr::kable(head(pData(leukemiasEset)))

# We are interested in LeukemiaType
knitr::kable(table(pData(leukemiasEset)[,"LeukemiaType"]))

# split the data
# 60% as training data and 40% as testing data
n <- ncol(leukemiasEset)
set.seed(1234) # for a reproducible train/test split
training_samples <- sample(1:n, size = n*0.6)

# only the first 1000 features are kept — presumably to keep the vignette fast
train <- leukemiasEset[1:1000, training_samples]
test  <- leukemiasEset[1:1000, -training_samples]

# just to be sure there are no shared samples between the training and testing data
sum(sampleNames(test) %in% sampleNames(train)) == 0

# create the data object
# when we use an ExpressionSet we can pass the name of the phenotype variable;
# ReadData will automatically extract that phenotype column and use it as class labels.
# The same can be used with the Platform/study labels.
# In this example we are not using any platform labels, so leave it NULL.
object <- ReadData(Data = train,
                   Labels = "LeukemiaType",
                   Platform = NULL,
                   verbose = FALSE)
object

## ---- echo=FALSE, fig.cap="One-vs-rest scheme"--------------------------------
knitr::include_graphics("images/one_vs_rest_scheme.png")

## ---- echo=FALSE, fig.cap="Gene filtering options"----------------------------
knitr::include_graphics("images/gene_filtering_TSP.png")

## ---- echo=FALSE, fig.cap="Platform-wise gene filtering"----------------------
knitr::include_graphics("images/platform_wise_gene_filtering_TSP.png")

## -----------------------------------------------------------------------------
# let's go with gene filtering using the one_vs_one option
# for the featureNo argument, a sufficient number of returned features is
# recommended if a large number of rules is used in the downstream training steps.
filtered_genes <- filter_genes_TSP(data_object = object,
                                   filter = "one_vs_one",
                                   platform_wise = FALSE,
                                   featureNo = 1000,
                                   UpDown = TRUE,
                                   verbose = TRUE)
filtered_genes

## ---- eval = FALSE------------------------------------------------------------
# Alternative (not evaluated): build a genes object containing ALL genes,
# to skip the filtering step entirely.
# # using the object that is generated by ReadData
# # we can create a genes object with all genes to skip the filtering step
#
# # Get the class names
# classes <- unique(object$data$Labels)
#
# # create empty genes object
# genes_all <- list(OnevsrestScheme = list(filtered_genes = NULL,
#                                          calls = c()))
# class(genes_all) <- "OnevsrestScheme_genes_TSP"
#
# # prepare the slots for each class
# tmp <- vector("list", length(classes))
# names(tmp) <- classes
#
# genes_all$OnevsrestScheme$filtered_genes <- tmp
# genes_all$OnevsrestScheme$calls <- c()
# genes_all
#
# # fill the gene object in each slot
# for (i in classes) {
#   genes_all$OnevsrestScheme$filtered_genes[[i]] <- rownames(object$data$Data)
# }
#
# # This is the gene object with all genes
# genes_all

## -----------------------------------------------------------------------------
# Let's train our one-vs-rest TSP model
classifier <- train_one_vs_rest_TSP(data_object = object,
                                    filtered_genes = filtered_genes,
                                    k_range = 5:50,
                                    include_pivot = FALSE,
                                    one_vs_one_scores = TRUE,
                                    platform_wise_scores = FALSE,
                                    seed = 1234,
                                    verbose = FALSE)
classifier

## -----------------------------------------------------------------------------
# apply on the training data
# To have the classes in the output in a specific order, we can use the classes argument
results_train <- predict_one_vs_rest_TSP(classifier = classifier,
                                         Data = object,
                                         tolerate_missed_genes = TRUE,
                                         weighted_votes = TRUE,
                                         classes = c("ALL","AML","CLL","CML","NoL"),
                                         verbose = TRUE)

# apply on the testing data
results_test <- predict_one_vs_rest_TSP(classifier = classifier,
                                        Data = test,
                                        tolerate_missed_genes = TRUE,
                                        weighted_votes = TRUE,
                                        classes=c("ALL","AML","CLL","CML","NoL"),
                                        verbose = TRUE)

# get a look over the scores in the testing data
knitr::kable(head(results_test))

## -----------------------------------------------------------------------------
# Confusion Matrix and Statistics on training data
caret::confusionMatrix(data = factor(results_train$max_score,
                                     levels = unique(object$data$Labels)),
                       reference = factor(object$data$Labels,
                                          levels = unique(object$data$Labels)),
                       mode="everything")

# Confusion Matrix and Statistics on testing data
caret::confusionMatrix(data = factor(results_test$max_score,
                                     levels = unique(object$data$Labels)),
                       reference = factor(pData(test)[,"LeukemiaType"],
                                          levels = unique(object$data$Labels)),
                       mode="everything")

## ---- include=FALSE, results="hide"-------------------------------------------
# Hidden chunk: recompute the same confusion matrices into x1/x2 —
# presumably for inline references in the vignette text; verify against the Rmd.
# Confusion Matrix and Statistics on training data
x1 <- caret::confusionMatrix(data = factor(results_train$max_score,
                                           levels = unique(object$data$Labels)),
                             reference = factor(object$data$Labels,
                                                levels = unique(object$data$Labels)),
                             mode="everything")

# Confusion Matrix and Statistics on testing data
x2 <- caret::confusionMatrix(data = factor(results_test$max_score,
                                           levels = unique(object$data$Labels)),
                             reference = factor(pData(test)[,"LeukemiaType"],
                                                levels = unique(object$data$Labels)),
                             mode="everything")

## -----------------------------------------------------------------------------
# plot for the rules and scores in the training data
plot_binary_TSP(Data = object, # we are using the data object here
                classifier = classifier,
                prediction = results_train,
                classes = c("ALL","AML","CLL","CML","NoL"),
                #margin = c(0,5,0,10),
                title = "Training data")

# plot for the rules and scores in the testing data
plot_binary_TSP(Data = test, # ExpressionSet
                ref = "LeukemiaType", # ref label names in pData
                classifier = classifier,
                prediction = results_test,
                classes = c("ALL","AML","CLL","CML","NoL"),
                title = "Testing data"#,
                #margin = c(0,5,0,10)
                )

## -----------------------------------------------------------------------------
# Random-forest scheme: sort genes by importance.
# (500 trees here just for a fast example)
genes_RF <- sort_genes_RF(data_object = object,
                          # featureNo_altogether, it is better not to specify a number here
                          # featureNo_one_vs_rest, it is better not to specify a number here
                          rank_data = TRUE,
                          platform_wise = FALSE,
                          num.trees = 500, # more features, more trees are recommended
                          seed=123456, # for reproducibility
                          verbose = TRUE)
genes_RF # sorted genes object

## -----------------------------------------------------------------------------
# to get an idea of how many genes we will use
# and how many rules will be generated
summary_genes <- summary_genes_RF(sorted_genes_RF = genes_RF,
                                  genes_altogether = c(10,20,50,100,150,200),
                                  genes_one_vs_rest = c(10,20,50,100,150,200))
knitr::kable(summary_genes)

# 50 genes_altogether and 50 genes_one_vs_rest seem
# to give enough rules and unique genes for our classes
# (500 trees here just for a fast example)
# Now let's run sort_rules_RF to create the rules and sort them
rules_RF <- sort_rules_RF(data_object = object,
                          sorted_genes_RF = genes_RF,
                          genes_altogether = 50,
                          genes_one_vs_rest = 50,
                          num.trees = 500, # more rules, more trees are recommended
                          seed=123456,
                          verbose = TRUE)
rules_RF # sorted rules object

## ---- results="hide"----------------------------------------------------------
# prepare a simple data.frame for the parameters we want to test
# names of arguments as column names
# this df has three sets (3 rows) of parameters
parameters <- data.frame(
  gene_repetition=c(3,2,1),
  rules_one_vs_rest=c(2,3,10),
  rules_altogether=c(2,3,10),
  run_boruta=c(FALSE,"make_error",TRUE), # I want to produce an error in the 2nd trial
  plot_boruta = FALSE,
  num.trees=c(100,200,300),
  stringsAsFactors = FALSE)

# parameters

# for overall and byclass possible options, check the help files
para_opt <- optimize_RF(data_object = object,
                        sorted_rules_RF = rules_RF,
                        parameters = parameters,
                        test_object = NULL,
                        overall = c("Accuracy","Kappa"), # wanted overall measurements
                        byclass = c("F1"), # wanted measurements per class
                        verbose = TRUE)

para_opt # results object
# para_opt$summary # the df with summarized information
knitr::kable(para_opt$summary)

## -----------------------------------------------------------------------------
# train the final model
# it is preferred to increase the number of trees and rules in case you have
# a large number of samples and features
# for a quick example, we have a small number of trees and rules here
# based on the optimize_RF results we will select the parameters
RF_classifier <- train_RF(data_object = object,
                          sorted_rules_RF = rules_RF,
                          gene_repetition = 1,
                          rules_altogether = 10,
                          rules_one_vs_rest = 10,
                          run_boruta = TRUE,
                          plot_boruta = FALSE,
                          probability = TRUE,
                          num.trees = 300,
                          boruta_args = list(),
                          verbose = TRUE)

## -----------------------------------------------------------------------------
# plot proximity matrix of the out-of-bag samples
# Note: this takes a lot of time if the data is big
proximity_matrix_RF(object = object,
                    classifier = RF_classifier,
                    plot = TRUE,
                    return_matrix = FALSE, # if we need to get the matrix itself
                    title = "Leukemias",
                    cluster_cols = TRUE)

## -----------------------------------------------------------------------------
# training accuracy
# get the prediction labels from the trained model
# if the classifier was trained using probability = FALSE, predictions is a factor
training_pred <- RF_classifier$RF_scheme$RF_classifier$predictions
if (is.factor(training_pred)) {
  x <- as.character(training_pred)
}

# if the classifier was trained using probability = TRUE, predictions is a
# probability matrix: take the column name (class) with the highest probability
if (is.matrix(training_pred)) {
  x <- colnames(training_pred)[max.col(training_pred)]
}

# training accuracy
caret::confusionMatrix(data =factor(x),
                       reference = factor(object$data$Labels),
                       mode = "everything")

## ---- include=FALSE, results="hide"-------------------------------------------
# Hidden chunk: recompute training confusion matrix into x1 —
# presumably for inline references in the vignette text; verify against the Rmd.
# training accuracy
x1 <- caret::confusionMatrix(data =factor(x),
                             reference = factor(object$data$Labels),
                             mode = "everything")

## -----------------------------------------------------------------------------
# apply on test data
results <- predict_RF(classifier = RF_classifier,
                      Data = test,
                      impute = TRUE) # can handle missed genes by imputation

# get the prediction labels
# if the classifier was trained using probability = FALSE
test_pred <- results$predictions
if (is.factor(test_pred)) {
  x <- as.character(test_pred)
}

# if the classifier was trained using probability = TRUE
if (is.matrix(test_pred)) {
  x <- colnames(test_pred)[max.col(test_pred)]
}

# testing accuracy (original comment said "training accuracy"; this uses test labels)
caret::confusionMatrix(data = factor(x),
                       reference = factor(pData(test)[,"LeukemiaType"]),
                       mode = "everything")

## ---- include=FALSE, results="hide"-------------------------------------------
# Hidden chunk: recompute testing confusion matrix into x2.
x2 <- caret::confusionMatrix(data = factor(x),
                             reference = factor(pData(test)[,"LeukemiaType"]),
                             mode = "everything")

## -----------------------------------------------------------------------------
# visualize the binary rules in the training dataset
plot_binary_RF(Data = object,
               classifier = RF_classifier,
               prediction = NULL,
               as_training = TRUE, # to extract the scores from the model
               show_scores = TRUE,
               top_anno = "ref",
               show_predictions = TRUE,
               #margin = c(0,5,0,8),
               title = "Training data")

# visualize the binary rules in the testing dataset
plot_binary_RF(Data = test,
               ref = "LeukemiaType", # Get ref labels from the test ExpressionSet
               classifier = RF_classifier,
               prediction = results,
               as_training = FALSE,
               show_scores = TRUE,
               top_anno = "ref",
               show_predictions = TRUE,
               title = "Testing data")

## ---- eval=FALSE, tidy=FALSE--------------------------------------------------
# Not evaluated: benchmark settings used to compare the three pair-based schemes.
# # For one-vs-rest scheme settings:
# # one_vs_rest gene filtering with k_range of 10:50
# object <- ReadData(Data = training_data,
#                    Labels = Train_Classes)
#
# filtered_genes <- filter_genes_TSP(data_object = object,
#                                    featureNo = 1000, UpDown = TRUE,
#                                    filter = "one_vs_rest")
#
# classifier_1_vs_r <- train_one_vs_rest_TSP(data_object = object,
#                                            filtered_genes = filtered_genes,
#                                            k_range = 10:50, disjoint = TRUE,
#                                            include_pivot = FALSE,
#                                            platform_wise_scores = FALSE,
#                                            one_vs_one_scores = FALSE)
#
# # For RF scheme settings:
# # Settings represents a large model to show calculation time with high number of genes/rules
# object <- ReadData(Data = training_data,
#                    Labels = Train_Classes)
#
# sorted_genes <- sort_genes_RF(object)
#
# sorted_rules <- sort_rules_RF(data_object = object,
#                               sorted_genes_RF = sorted_genes,
#                               genes_altogether = 200,
#                               genes_one_vs_rest = 200)
#
# classifier_RF <- train_RF(data_object = object,
#                           sorted_rules_RF = sorted_rules,
#                           rules_one_vs_rest = 200,
#                           rules_altogether = 200,
#                           run_boruta = FALSE, gene_repetition = 1)
#
# # For DT scheme from Rgtsp package:
# # default settings with min.score=0.6 (to avoid running out of rules with the default setting 0.75)
# #devtools::install_github("bioinfo-recetox/Rgtsp")
# library(Rgtsp)
# classifier_DT <- mtsp(X = t(as.matrix(training_data)),
#                       y = Train_Classes,
#                       min.score=0.60)

## ---- fig.cap="Overall accuracy for the three pair-based approaches. X-axis show the performance of different models trained on training datasets with different sizes.", echo=FALSE, message=FALSE----
knitr::include_graphics("images/Accuracy_final.png")

## ---- out.height="90%", fig.cap="Average of the overall training time including the gene and rules filtering and model training. Training repeated 5 times for each model and the bars show the average time.", echo=FALSE, message=FALSE----
knitr::include_graphics("images/Both_CPUs.png")

## -----------------------------------------------------------------------------
sessionInfo()