--- title: "leapR Paper Examples" output: BiocStyle::html_document: toc: true toc_depth: 2 vignette: | %\VignetteIndexEntry{leapR} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r knitr, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", cache = FALSE ) ``` # Load libraries needed ```{r libraries, message=FALSE, warning = FALSE} # load the core libraries library(leapR) library(gplots) library(rmarkdown) # plotting helpers used in this vignette library(ggplot2) library(dplyr) library(tibble) library(stringr) ``` # Example data A sample data set is included that is from the CPTAC study of 169 ovarian tumors. We include the dataset as a \code{SummarizedExperiment} object, containing three assays (transcriptomics, global proteomics, and phosphoproteomics) to enable interoperability with other tools, and store example file as `rda` on [Figshare](https://figshare.com/account/projects/251807/articles/29250845) as example. This data can be loaded as follows: ```{r omicsdata, message=FALSE, warning=FALSE} url <- "https://api.figshare.com/v2/file/download/56536217" pdata <- download.file(url, method = "libcurl", destfile = "protData.rda") # as.matrix() load("protData.rda") p <- file.remove("protData.rda") url <- "https://api.figshare.com/v2/file/download/56536214" tdata <- download.file(url, method = "libcurl", destfile = "transData.rda") load("transData.rda") p <- file.remove("transData.rda") url <- "https://api.figshare.com/v2/file/download/56536211" phdata <- download.file(url, method = "libcurl", destfile = "phosData.rda") load("phosData.rda") p <- file.remove("phosData.rda") ``` We also have local data we can load ```{r load lists} data(shortlist) data(longlist) data(ncipid) data(kinasesubstrates) ## columns that we want to use for results cols_to_display <- c("ingroup_n", "outgroup_n", "background_n", "pvalue", "BH_pvalue") ``` # Figure 2 We will compare the ability of transcriptomics, proteomics, and phosphoproteomics to inform about differences between short and long surviving patient groups. In addition to other methods, we also employ a `calcTTest` function that takes two sets of samples from the `SummarizedExperiment` object and computes the t-test between them. The results are then stored in the `rowData` of the same object, so that they can be used for enrichment later on. This spans the multiple enrichment methods in leapR and also includes multi-omics. The resulting heatmap is presented as Figure 2 in the paper. ``` {r figure_2, echo=TRUE, warning=FALSE, message=FALSE, error=FALSE} # load the single omic and multi-omic pathway databases data("krbpaths") data("mo_krbpaths") # comparison enrichment in transcriptional data transdata.comp.enrichment.svl <- leapR::leapR( geneset = krbpaths, enrichment_method = "enrichment_comparison", eset = tset, assay_name = "transcriptomics", primary_columns = shortlist, secondary_columns = longlist ) # comparison enrichment in proteomics data # this is the same code used above, just repeated here for clarity protdata.comp.enrichment.svl <- leapR::leapR( geneset = krbpaths, enrichment_method = "enrichment_comparison", eset = pset, assay_name = "proteomics", primary_columns = shortlist, secondary_columns = longlist ) # comparison enrichment in phosphoproteomics data phosphodata.comp.enrichment.svl <- leapR::leapR( geneset = krbpaths, enrichment_method = "enrichment_comparison", eset = phset, assay_name = "phosphoproteomics", primary_columns = shortlist, secondary_columns = longlist, id_column = "hgnc_id" ) # set enrichment in transcriptomics data # perform the comparison t-test tset <- leapR::calcTTest(tset, assay_name = "transcriptomics", shortlist, longlist) ## now we need to run enrichment in sets with target list, not eset transdata.set.enrichment.svl <- leapR::leapR( geneset = krbpaths, eset = tset, assay_name = "transcriptomics", enrichment_method = "enrichment_in_sets", primary_columns = "pvalue", greaterthan = FALSE, threshold = 0.05 ) pset <- leapR::calcTTest(pset, assay_name = "proteomics", shortlist, longlist) protdata.set.enrichment.svl <- leapR::leapR( geneset = krbpaths, eset = pset, assay_name = "proteomics", enrichment_method = "enrichment_in_sets", primary_columns = "pvalue", greaterthan = FALSE, threshold = 0.05 ) phset <- leapR::calcTTest(phset, assay_name = "phosphoproteomics", shortlist, longlist) phosphodata.set.enrichment.svl <- leapR::leapR( geneset = krbpaths, enrichment_method = "enrichment_in_sets", id_column = "hgnc_id", assay_name = "phosphoproteomics", eset = phset, primary_columns = "pvalue", greaterthan = FALSE, threshold = 0.05 ) # order enrichment in transcriptomics data transdata.order.enrichment.svl <- leapR::leapR( geneset = krbpaths, enrichment_method = "enrichment_in_order", eset = tset, assay_name = "transcriptomics", primary_columns = "difference" ) # order enrichment in proteomics data protdata.order.enrichment.svl <- leapR::leapR( geneset = krbpaths, enrichment_method = "enrichment_in_order", eset = pset, assay_name = "proteomics", primary_columns = "difference" ) # order enrichment in phosphoproteomics data phosphodata.order.enrichment.svl <- leapR::leapR( geneset = krbpaths, enrichment_method = "enrichment_in_order", id_column = "hgnc_id", method = 'ztest', eset = phset, assay_name = "phosphoproteomics", primary_columns = "difference" ) # correlation difference in transcriptomics data transdata.corr.enrichment.svl <- leapR::leapR( geneset = krbpaths, enrichment_method = "correlation_comparison", eset = tset, assay_name = "transcriptomics", primary_columns = shortlist, secondary_columns = longlist ) # correlation difference in proteomics data protdata.corr.enrichment.svl <- leapR::leapR( geneset = krbpaths, enrichment_method = "correlation_comparison", eset = pset, assay_name = "proteomics", primary_columns = shortlist, secondary_columns = longlist ) # correlation difference in phosphoproteomics data phosphodata.corr.enrichment.svl <- leapR::leapR( geneset = krbpaths, enrichment_method = "correlation_comparison", eset = phset, assay_name = "phosphoproteomics", primary_columns = shortlist, secondary_columns = longlist, id_column = "hgnc_id" ) # combine the omics data into one with prefix tags comboset <- leapR::combine_omics(list(pset, phset, tset), c(NA, "hgnc_id", NA)) # comparison enrichment for combodata # when we use expression set, we do not need to use the mo_krbpaths #since the id mapping column is used combodata.enrichment.svl <- leapR::leapR( geneset = krbpaths, # mo_krbpaths, enrichment_method = "enrichment_comparison", eset = comboset, assay_name = "combined", primary_columns = shortlist, secondary_columns = longlist, id_column = "id" ) # set enrichment in combo data # perform the comparison t test comboset <- leapR::calcTTest(comboset, assay_name = "combined", shortlist, longlist) combodata.set.enrichment.svl <- leapR::leapR( geneset = krbpaths, enrichment_method = "enrichment_in_sets", eset = comboset, primary_columns = "pvalue", assay_name = "combined", id_column = "id", greaterthan = FALSE, threshold = 0.05 ) # order enrichment in combo data combodata.order.enrichment.svl <- leapR::leapR( geneset = krbpaths, enrichment_method = "enrichment_in_order", assay_name = "combined", eset = comboset, primary_columns = "difference", id_column = "id" ) # correlation difference in combo data combodata.corr.enrichment.svl <- leapR::leapR( geneset = krbpaths, enrichment_method = "correlation_comparison", eset = comboset, assay_name = "combined", primary_columns = shortlist, id_column = "id", secondary_columns = longlist ) # now take all these results and combine them into one figure all_results <- list( transdata.comp.enrichment.svl, protdata.comp.enrichment.svl, phosphodata.comp.enrichment.svl, combodata.enrichment.svl, transdata.set.enrichment.svl, protdata.set.enrichment.svl, phosphodata.set.enrichment.svl, combodata.set.enrichment.svl, transdata.order.enrichment.svl, protdata.order.enrichment.svl, phosphodata.order.enrichment.svl, combodata.order.enrichment.svl, transdata.corr.enrichment.svl, protdata.corr.enrichment.svl, phosphodata.corr.enrichment.svl, combodata.corr.enrichment.svl ) pathways_of_interest <- c( "KEGG_APOPTOSIS", "KEGG_CELL_CYCLE", "KEGG_ERBB_SIGNALING_PATHWAY", "KEGG_FOCAL_ADHESION", "KEGG_INSULIN_SIGNALING_PATHWAY", "KEGG_MAPK_SIGNALING_PATHWAY", "KEGG_MISMATCH_REPAIR", "KEGG_MTOR_SIGNALING_PATHWAY", "KEGG_OXIDATIVE_PHOSPHORYLATION", "KEGG_P53_SIGNALING_PATHWAY", "KEGG_PATHWAYS_IN_CANCER", "KEGG_PROTEASOME", "KEGG_RIBOSOME", "KEGG_VEGF_SIGNALING_PATHWAY", "KEGG_WNT_SIGNALING_PATHWAY" ) results.frame <- data.frame( pathway = pathways_of_interest, td.comp = all_results[[1]][pathways_of_interest, "BH_pvalue"] < 0.05, pd.comp = all_results[[2]][pathways_of_interest, "BH_pvalue"] < 0.05, fd.comp = all_results[[3]][pathways_of_interest, "BH_pvalue"] < 0.05, cd.comp = all_results[[4]][pathways_of_interest, "BH_pvalue"] < 0.05, td.set = all_results[[5]][pathways_of_interest, "BH_pvalue"] < 0.05, pd.set = all_results[[6]][pathways_of_interest, "BH_pvalue"] < 0.05, fd.set = all_results[[7]][pathways_of_interest, "BH_pvalue"] < 0.05, cd.set = all_results[[8]][pathways_of_interest, "BH_pvalue"] < 0.05, td.order = all_results[[9]][pathways_of_interest, "BH_pvalue"] < 0.05, pd.order = all_results[[10]][pathways_of_interest, "BH_pvalue"] < 0.05, fd.order = all_results[[11]][pathways_of_interest, "BH_pvalue"] < 0.05, cd.order = all_results[[12]][pathways_of_interest, "BH_pvalue"] < 0.05, td.corr = all_results[[13]][pathways_of_interest, "BH_pvalue"] < 0.05, pd.corr = all_results[[14]][pathways_of_interest, "BH_pvalue"] < 0.05, fd.corr = all_results[[15]][pathways_of_interest, "BH_pvalue"] < 0.05, cd.corr = all_results[[16]][pathways_of_interest, "BH_pvalue"] < 0.05 ) results.frame.or <- data.frame( pathway = pathways_of_interest, td.comp = all_results[[1]][pathways_of_interest, "oddsratio"], pd.comp = all_results[[2]][pathways_of_interest, "oddsratio"], fd.comp = all_results[[3]][pathways_of_interest, "oddsratio"], cd.comp = all_results[[4]][pathways_of_interest, "oddsratio"], td.set = log(all_results[[5]][pathways_of_interest, "oddsratio"], 2), pd.set = log(all_results[[6]][pathways_of_interest, "oddsratio"], 2), fd.set = log(all_results[[7]][pathways_of_interest, "oddsratio"], 2), cd.set = log(all_results[[8]][pathways_of_interest, "oddsratio"], 2), td.order = all_results[[9]][pathways_of_interest, "oddsratio"], pd.order = all_results[[10]][pathways_of_interest, "oddsratio"], fd.order = all_results[[11]][pathways_of_interest, "oddsratio"], cd.order = all_results[[12]][pathways_of_interest, "oddsratio"], td.corr = all_results[[13]][pathways_of_interest, "oddsratio"], pd.corr = all_results[[14]][pathways_of_interest, "oddsratio"], fd.corr = all_results[[15]][pathways_of_interest, "oddsratio"], cd.corr = all_results[[16]][pathways_of_interest, "oddsratio"] ) rownames(results.frame) <- results.frame[, 1] rownames(results.frame.or) <- results.frame.or[, 1] results.frame.sig <- results.frame[, 2:17] * results.frame.or[, 2:17] heatmap.2(as.matrix(results.frame.sig[, c(1:4, 9:16)]), Colv = NA, trace = "none", breaks = c(-1, -.1, -0.0001, 0, 0.1, 1), col = c("blue", "lightblue", "grey", "pink", "red"), dendrogram = "none") ``` Figure 3. An application of the KSEA-like approach in leapR as applied to our example data. In this example we are looking for known substrate sets of kinases (from Phosphosite Plus) that are enriched in the short vs long comparison of phosphopeptides. ``` {r figure_3, warning=FALSE, message=FALSE} # this comparison of abundance in substrates between case and control # is lopsided in the sense that phosphorylation levels were previously # reported to be overall higher in the short survivors. Thus the # results are not terribly interesting (all kinases are in the same # direction) phosphodata.ksea.comp.svl <- leapR::leapR( geneset = kinasesubstrates, enrichment_method = "enrichment_comparison", eset = phset, assay_name = "phosphoproteomics", primary_columns = shortlist, secondary_columns = longlist ) # thus for the example we'll look at correlation between known substrates # in the case v control conditions phosphodata.ksea.corr.svl <- leapR::leapR( geneset = kinasesubstrates, enrichment_method = "correlation_comparison", eset = phset, assay_name = "phosphoproteomics", primary_columns = shortlist, secondary_columns = longlist ) # for the example we are using an UNCORRECTED PVALUE # which will allow us to plot more values, but # for real applications it's necessary to use the # CORRECTED PVALUE # here are all the kinases *significant (*uncorrected) from the analysis or <- order(phosphodata.ksea.corr.svl[, "pvalue"]) ksea_result <- phosphodata.ksea.corr.svl[or, ][1:9, ] ksea_cols <- rep("grey", 9) ksea_cols[which(ksea_result[, "oddsratio"] > 0)] <- "black" # plot left panel: correlation significance of top most significant kinases barplot(ksea_result[, "oddsratio"], horiz = TRUE, xlim = c(-1, 0.5), names.arg = rownames(ksea_result), las = 1, col = ksea_cols ) # plot right panel: abundance comparison results of the same kinases barplot(phosphodata.ksea.comp.svl[rownames(ksea_result), "oddsratio"], horiz = TRUE, names.arg = rownames(ksea_result), las = 1, col = "black" ) ```