--- title: "Guided tutorial to COTAN V.2" author: - name: "Silvia Giulia Galfrè" affiliation: "Department of Computer Science, University of Pisa" package: COTAN output: BiocStyle::html_document vignette: > %\VignetteIndexEntry{Guided tutorial to COTAN V.2} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ## Preamble ```{r, include = FALSE} knitr::opts_chunk[["set"]]( collapse = TRUE, comment = "#>", dev = "png", fig.width = 6L, fig.height = 4L, dpi = 72 ) ``` ```{r message=FALSE, warning=FALSE} library(COTAN) library(zeallot) library(rlang) library(data.table) library(Rtsne) library(qpdf) library(GEOquery) library(ComplexHeatmap) options(parallelly.fork.enable = TRUE) ``` ## Introduction This tutorial contains the same functionalities as the first release of the COTAN tutorial but done using the new and updated functions. ## Get the data-set Download the data-set for `"mouse cortex E17.5"`. ```{r eval=TRUE, include=TRUE} dataDir <- tempdir() GEO <- "GSM2861514" fName <- "GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz" dataSetFile <- file.path(dataDir, GEO, fName) dir.create(file.path(dataDir, GEO), showWarnings = FALSE) if (!file.exists(dataSetFile)) { getGEOSuppFiles(GEO, makeDirectory = TRUE, baseDir = dataDir, fetch_files = TRUE, filter_regex = fName) } sample.dataset <- read.csv(dataSetFile, sep = "\t", row.names = 1L) ``` Define a directory where the output will be stored. ```{r} outDir <- dataDir # Log-level 2 was chosen to showcase better how the package works # In normal usage a level of 0 or 1 is more appropriate setLoggingLevel(2L) # This file will contain all the logs produced by the package # as if at the highest logging level setLoggingFile(file.path(outDir, "vignette_v2.log")) message("COTAN uses the `torch` library when asked to `optimizeForSpeed`") message("Run the command 'options(COTAN.UseTorch = FALSE)'", " in your session to disable `torch` completely!") # this command does check whether the torch library is properly installed c(useTorch, deviceStr) %<-% COTAN:::canUseTorch(TRUE, "cuda") if (useTorch) { message("The `torch` library is available and ready to use") if (deviceStr == "cuda") { message("The `torch` library can use the `CUDA` GPU") } else { message("The `torch` library can only use the CPU") message("Please ensure you have the `OpenBLAS` libraries", " installed on the system") } } rm(useTorch, deviceStr) ``` # Analytical pipeline Initialize the `COTAN` object with the row count table and the metadata for the experiment. ```{r} cond <- "mouse_cortex_E17.5" obj <- COTAN(raw = sample.dataset) obj <- initializeMetaDataset(obj, GEO = GEO, sequencingMethod = "Drop_seq", sampleCondition = cond) logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])), logLevel = 1L) ``` Before we proceed to the analysis, we need to clean the data. The analysis will use a matrix of raw `UMI` counts as the input. To obtain this matrix, we have to remove any potential cell doublets or multiplets, as well as any low quality or dying cells. ## Data cleaning We can check the library size (`UMI` number) with an *Empirical Cumulative Distribution* function ```{r} plot(ECDPlot(obj)) ``` ```{r} plot(cellSizePlot(obj)) ``` ```{r} plot(genesSizePlot(obj)) ``` ```{r} plot(scatterPlot(obj)) ``` During the cleaning, every time we want to remove cells or genes we can use the `dropGenesCells()`function. Drop cells with too many reads as they are probably doublets or multiplets ```{r} cellsSizeThr <- 6000L obj <- addElementToMetaDataset(obj, "Cells size threshold", cellsSizeThr) cells_to_rem <- getCells(obj)[getCellsSize(obj) > cellsSizeThr] obj <- dropGenesCells(obj, cells = cells_to_rem) plot(cellSizePlot(obj)) ``` To drop cells by gene number: high genes count might also indicate doublets... ```{r} genesSizeHighThr <- 3000L obj <- addElementToMetaDataset(obj, "Num genes high threshold", genesSizeHighThr) cells_to_rem <- getCells(obj)[getNumExpressedGenes(obj) > genesSizeHighThr] obj <- dropGenesCells(obj, cells = cells_to_rem) plot(genesSizePlot(obj)) ``` Drop cells with too low genes expression as they are probably dead ```{r} genesSizeLowThr <- 500L obj <- addElementToMetaDataset(obj, "Num genes low threshold", genesSizeLowThr) cells_to_rem <- getCells(obj)[getNumExpressedGenes(obj) < genesSizeLowThr] obj <- dropGenesCells(obj, cells = cells_to_rem) plot(genesSizePlot(obj)) ``` Cells with a too high percentage of mitochondrial genes are likely dead (or at the last problematic) cells. So we drop them! ```{r} c(mitPlot, mitSizes) %<-% mitochondrialPercentagePlot(obj, genePrefix = "^Mt") plot(mitPlot) ``` ```{r} mitPercThr <- 1.0 obj <- addElementToMetaDataset(obj, "Mitoc. perc. threshold", mitPercThr) cells_to_rem <- rownames(mitSizes)[mitSizes[["mit.percentage"]] > mitPercThr] obj <- dropGenesCells(obj, cells = cells_to_rem) c(mitPlot, mitSizes) %<-% mitochondrialPercentagePlot(obj, genePrefix = "^Mt") plot(mitPlot) ``` If we do not want to consider the mitochondrial genes we can remove them before starting the analysis. ```{r} genes_to_rem <- getGenes(obj)[grep("^Mt", getGenes(obj))] cells_to_rem <- getCells(obj)[which(getCellsSize(obj) == 0L)] obj <- dropGenesCells(obj, genes_to_rem, cells_to_rem) ``` We want also to log the current status. ```{r} logThis(paste("n cells", getNumCells(obj)), logLevel = 1L) ``` The `clean()` function estimates all the parameters for the data. Therefore, we have to run it again every time we remove any genes or cells from the data. ```{r} obj <- addElementToMetaDataset(obj, "Num drop B group", 0L) obj <- clean(obj) c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj) plot(pcaCellsPlot) ``` ```{r} plot(genesPlot) ``` We can observe here that the red cells are really enriched in hemoglobin genes so we prefer to remove them. They can be extracted from the `pcaCellsData` object and removed. ```{r eval=TRUE, include=TRUE} cells_to_rem <- rownames(pcaCellsData)[pcaCellsData[["groups"]] == "B"] obj <- dropGenesCells(obj, cells = cells_to_rem) obj <- addElementToMetaDataset(obj, "Num drop B group", 1L) obj <- clean(obj) c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj) plot(pcaCellsPlot) ``` To color the PCA based on `nu` (so the cells' efficiency) ```{r} plot(UDEPlot) ``` `UDE` (color) should not correlate with principal components! This is very important. The next part is used to remove the cells with efficiency too low. ```{r} plot(nuPlot) ``` We can zoom on the smallest values and, if COTAN detects a clear elbow, we can decide to remove the cells. ```{r} plot(zoomedNuPlot) ``` We also save the defined threshold in the metadata and re-run the estimation ```{r} UDELowThr <- 0.30 obj <- addElementToMetaDataset(obj, "Low UDE cells' threshold", UDELowThr) obj <- addElementToMetaDataset(obj, "Num drop B group", 2L) obj <- estimateNuLinear(obj) cells_to_rem <- getCells(obj)[getNu(obj) < UDELowThr] obj <- dropGenesCells(obj, cells = cells_to_rem) ``` Repeat the estimation after the cells are removed ```{r} obj <- clean(obj) c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot, zoomedNuPlot) %<-% cleanPlots(obj) plot(pcaCellsPlot) ``` ```{r} plot(scatterPlot(obj)) ``` ```{r} logThis(paste("n cells", getNumCells(obj)), logLevel = 1L) ``` ## COTAN analysis In this part, all the contingency tables are computed and used to get the statistics necessary to `COEX` evaluation and storing ```{r} obj <- proceedToCoex(obj, calcCoex = TRUE, optimizeForSpeed = TRUE, cores = 6L, deviceStr = "cuda", saveObj = FALSE, outDir = outDir) ``` When `saveObj == TRUE`, in the previous step, this step can be skipped as the `COTAN` object has already been saved in the `outDir`. ```{r eval=FALSE, include=TRUE} # saving the structure saveRDS(obj, file = file.path(outDir, paste0(cond, ".cotan.RDS"))) ``` ## Automatic run It is also possible to run directly a single function if we don't want to clean anything. ```{r eval=FALSE, include=TRUE} obj2 <- automaticCOTANObjectCreation( raw = sample.dataset, GEO = GEO, sequencingMethod = "Drop_seq", sampleCondition = cond, calcCoex = TRUE, cores = 6L, optimizeForSpeed = TRUE, saveObj = TRUE, outDir = outDir) ``` # Analysis of the elaborated data ## `GDI` To calculate and store the Global Differentiation Index (`GDI`) we can run: ```{r} gdiDF <- calculateGDI(obj) head(gdiDF) # This will store only the $GDI column obj <- storeGDI(obj, genesGDI = gdiDF) ``` The next function can either plot the `GDI` for the dataset directly or use the pre-computed dataframe. It marks the given threshold 1.43 (in red) and the 50% and 75% quantiles (in blue). We can also specify some gene sets (three in this case) that we want to label explicitly in the `GDI` plot. ```{r} genesList <- list( "NPGs" = c("Nes", "Vim", "Sox2", "Sox1", "Notch1", "Hes1", "Hes5", "Pax6"), "PNGs" = c("Map2", "Tubb3", "Neurod1", "Nefm", "Nefl", "Dcx", "Tbr1"), "hk" = c("Calm1", "Cox6b1", "Ppia", "Rpl18", "Cox7c", "Erh", "H3f3a", "Taf1", "Taf2", "Gapdh", "Actb", "Golph3", "Zfr", "Sub1", "Tars", "Amacr") ) GDIPlot(obj, cond = cond, genes = genesList, GDIThreshold = 1.40) ``` The percentage of cells expressing the gene in the third column of this data-frame is reported. ## Heatmaps To perform the Gene Pair Analysis, we can plot a heatmap of the `COEX` values between two gene sets. We have to define the different gene sets (`list.genes`) in a list. Then we can choose which sets to use in the function parameter sets (for example, from 1 to 3). We also have to provide an array of the file name prefixes for each condition (for example, "mouse_cortex_E17.5"). In fact, this function can plot genes relationships across many different conditions to get a complete overview. ```{r} plot(heatmapPlot(obj, genesLists = genesList)) ``` We can also plot a general heatmap of `COEX` values based on some markers like the following one. ```{r, eval=TRUE, include=TRUE} invisible( genesHeatmapPlot(obj, primaryMarkers = c("Satb2", "Bcl11b", "Vim", "Hes1"), pValueThreshold = 0.001, symmetric = TRUE)) ``` ```{r, eval=FALSE, include=TRUE} invisible( genesHeatmapPlot(obj, primaryMarkers = c("Satb2", "Bcl11b", "Fezf2"), secondaryMarkers = c("Gabra3", "Meg3", "Cux1", "Neurod6"), pValueThreshold = 0.001, symmetric = FALSE)) ``` ## Get data tables Sometimes we can also be interested in the numbers present directly in the contingency tables for two specific genes. To get them we can use two functions: `contingencyTables()` to produce the observed and expected data ```{r} print("Contingency Tables:") contingencyTables(obj, g1 = "Satb2", g2 = "Bcl11b") print("Corresponding Coex") getGenesCoex(obj)["Satb2", "Bcl11b"] ``` Another useful function is `getGenesCoex()`. This can be used to extract the whole or a partial `COEX` matrix from a `COTAN` object. ```{r} # For the whole matrix coex <- getGenesCoex(obj, zeroDiagonal = FALSE) coex[1000L:1005L, 1000L:1005L] ``` ```{r} # For a partial matrix coex <- getGenesCoex(obj, genes = c("Satb2", "Bcl11b", "Fezf2")) coex[1000L:1005L, ] ``` ## Establishing genes' clusters `COTAN` provides a way to establish genes' clusters given some lists of markers ```{r eval=TRUE, include=TRUE} layersGenes <- list( "L1" = c("Reln", "Lhx5"), "L2/3" = c("Satb2", "Cux1"), "L4" = c("Rorb", "Sox5"), "L5/6" = c("Bcl11b", "Fezf2"), "Prog" = c("Vim", "Hes1", "Dummy") ) c(gSpace, eigPlot, pcaGenesClDF, treePlot) %<-% establishGenesClusters(obj, groupMarkers = layersGenes, numGenesPerMarker = 25L, kCuts = 5L) plot(eigPlot) ``` ```{r eval=TRUE, include=TRUE} plot(treePlot) ``` ```{r eval=TRUE, include=TRUE} colSelection <- vapply(pcaGenesClDF, is.numeric, logical(1L)) genesUmapPl <- UMAPPlot(pcaGenesClDF[, colSelection, drop = FALSE], clusters = getColumnFromDF(pcaGenesClDF, "hclust"), elements = layersGenes, title = "Genes' clusters UMAP Plot", numNeighbors = 32L, minPointsDist = 0.25) plot(genesUmapPl) ``` ## Uniform Clustering It is possible to obtain a cell clusterization based on the concept of *uniformity* of expression of the genes across the cells. That is the cluster satisfies the null hypothesis of the `COTAN` model: the genes expression is not dependent on the cell in consideration. There are two functions involved into obtaining a proper clusterization: the first is `cellsUniformClustering` that uses standard tools clusterization methods, but then discards and re-clusters any *non-uniform* cluster. Please note that the most important parameters for the users are the `GDIThreshold`s inside the **Uniform Transcript** checkers: they define how strict is the check. Default constructed advance check gives a pretty strong guarantee of uniformity for the *cluster*. ```{r eval=FALSE, include=TRUE} # This code is a little too computationally heavy to be used in an example # So we stored the result and we can load it in the next section # default constructed checker is OK advChecker <- new("AdvancedGDIUniformityCheck") c(splitClusters, splitCoexDF) %<-% cellsUniformClustering(obj, initialResolution = 0.8, checker = advChecker, optimizeForSpeed = TRUE, deviceStr = "cuda", cores = 6L, genesSel = "HGDI", saveObj = TRUE, outDir = outDir) obj <- addClusterization(obj, clName = "split", clusters = splitClusters, coexDF = splitCoexDF) table(splitClusters) ``` In the case one already has an existing *clusterization*, it is possible to calculate the *DEA* `data.frame` and add it to the `COTAN` object. ```{r eval=TRUE, include=TRUE} data("vignette.split.clusters", package = "COTAN") splitClusters <- vignette.split.clusters[getCells(obj)] splitCoexDF <- DEAOnClusters(obj, clusters = splitClusters) obj <- addClusterization(obj, clName = "split", clusters = splitClusters, coexDF = splitCoexDF, override = FALSE) ``` It is possible to have some statistics about the *clusterization* ```{r eval=TRUE, include=TRUE} c(summaryData, summaryPlot) %<-% clustersSummaryPlot(obj, clName = "split", plotTitle = "split summary") summaryData ``` The `ExpGenes` column contains the number of genes that are expressed in any cell of the relevant *cluster*, while the `ExpGenes25` column contains the number of genes expressed in at the least 25% of the cells of the relevant *cluster* ```{r eval=TRUE, include=TRUE} plot(summaryPlot) ``` It is possible to visualize how relevant are the *marker genes'* `lists` with respect to the given *clusterization* ```{r eval=TRUE, include=TRUE} c(splitHeatmap, scoreDF, pValueDF) %<-% clustersMarkersHeatmapPlot(obj, groupMarkers = layersGenes, clName = "split", kCuts = 5L, adjustmentMethod = "holm") draw(splitHeatmap) ``` Since the algorithm that creates the *clusters* is not directly geared to achieve cluster uniformity, there might be some *clusters* that can be merged back together and still be **uniform**. This is the purpose of the function `mergeUniformCellsClusters` that takes a *clusterization* and tries to merge cluster pairs after checking that together the pair forms a uniform cluster. In order to avoid running the totality of the possible checks (as it can explode quickly with the number of *clusters*), the function relies on a *related* distance the find the cluster pairs that have the highest chance to be merged. ```{r eval=FALSE, include=TRUE} c(mergedClusters, mergedCoexDF) %<-% mergeUniformCellsClusters(obj, clusters = splitClusters, checkers = advChecker, optimizeForSpeed = TRUE, deviceStr = "cuda", cores = 6L, saveObj = TRUE, outDir = outDir) # merges are: # 1 <- 06 + 07 # 2 <- '-1' + 08 + 11 # 3 <- 09 # 4 <- 01 # 5 <- 02 # 6 <- 12 # 7 <- 03 + 04 + 10 + 13 # 8 <- 05 obj <- addClusterization(obj, clName = "merge", override = TRUE, clusters = mergedClusters, coexDF = mergedCoexDF) table(mergedClusters) ``` Again, here, the most important parameters for the users are the `GDIThreshold`s inside the **Uniform Transcript** checkers: they define how strict is the check. Default constructed advance check gives a pretty strong guarantee of uniformity for the *cluster*. If one wants to achieve a more *relaxed* merge, it is possible to define a sequence of checkers, each less stringent than the previous, that will be applied sequentially: First all the merges with the stricter checker are performed, than those that satisfy the second, and so on. ```{r eval=FALSE, include=TRUE} checkersList <- list(advChecker, shiftCheckerThresholds(advChecker, 0.01), shiftCheckerThresholds(advChecker, 0.03)) prevCheckRes <- data.frame() # In this case we want to re-use the already calculated merge checks # Se we reload them from the output files. This, along a similar facility for # the split method, is also useful in the cases the execution is interrupted # prematurely... # if (TRUE) { # read from the last file among those named all_check_results_XX.csv mergeDir <- file.path(outDir, cond, "leafs_merge") resFiles <- list.files(path = mergeDir, pattern = "all_check_results_.*csv", full.names = TRUE) prevCheckRes <- read.csv(resFiles[length(resFiles)], header = TRUE, row.names = 1L) } c(mergedClusters2, mergedCoexDF2) %<-% mergeUniformCellsClusters(obj, clusters = splitClusters, checkers = checkersList, allCheckResults = prevCheckRes, optimizeForSpeed = TRUE, deviceStr = "cuda", cores = 6L, saveObj = TRUE, outDir = outDir) # merges are: # 1 <- '-1' + 06 + 09 # 2 <- 07 # 3 <- 03 + 04 + 10 + 13 # 4 <- 12 # 5 <- 01 + 08 + 11 # 6 <- 02 + 05 obj <- addClusterization(obj, clName = "merge2", override = TRUE, clusters = mergedClusters2, coexDF = mergedCoexDF2) table(mergedClusters2) ``` In the case one already has existing *clusterizations*, it is possible to calculate their *DEA* `data.frame` and add them to the `COTAN` object. ```{r eval=TRUE, include=TRUE} data("vignette.merge.clusters", package = "COTAN") mergedClusters <- vignette.merge.clusters[getCells(obj)] mergedCoexDF <- DEAOnClusters(obj, clusters = mergedClusters) obj <- addClusterization(obj, clName = "merge", clusters = mergedClusters, coexDF = mergedCoexDF, override = FALSE) data("vignette.merge2.clusters", package = "COTAN") mergedClusters2 <- vignette.merge2.clusters[getCells(obj)] mergedCoexDF2 <- DEAOnClusters(obj, clusters = mergedClusters2) obj <- addClusterization(obj, clName = "merge2", clusters = mergedClusters2, coexDF = mergedCoexDF2, override = FALSE) table(mergedClusters2, mergedClusters) ``` Here is possible to visualize how the `split` clusters are merged in to `merge` and `merge2` ```{r eval=TRUE, include=TRUE} c(mergeHeatmap, mergeScoresDF, mergePValuesDF) %<-% clustersMarkersHeatmapPlot(obj, clName = "merge", condNameList = "split", conditionsList = list(splitClusters)) draw(mergeHeatmap) c(merge2Heatmap, merge2ScoresDF, merge2PValuesDF) %<-% clustersMarkersHeatmapPlot(obj, clName = "merge2", condNameList = "split", conditionsList = list(splitClusters)) draw(merge2Heatmap) ``` In the following graph, it is possible to check that the found clusters align very well to the expression of the layers' genes defined above... It is possible to plot the *clusterization* on a `UMAP` plot ```{r} c(umapPlot, cellsRDM) %<-% cellsUMAPPlot(obj, clName = "merge2", clusters = NULL, useCoexEigen = TRUE, dataMethod = "SignLogL", numComp = 25L, genesSel = "HGDI", numGenes = 200L, colors = NULL, numNeighbors = 30L, minPointsDist = 0.3) plot(umapPlot) ``` ## Vignette clean-up stage The next few lines are just to clean. ```{r} if (file.exists(file.path(outDir, paste0(cond, ".cotan.RDS")))) { #Delete file if it exists file.remove(file.path(outDir, paste0(cond, ".cotan.RDS"))) } if (file.exists(file.path(outDir, paste0(cond, "_times.csv")))) { #Delete file if it exists file.remove(file.path(outDir, paste0(cond, "_times.csv"))) } if (dir.exists(file.path(outDir, cond))) { unlink(file.path(outDir, cond), recursive = TRUE) } if (dir.exists(file.path(outDir, GEO))) { unlink(file.path(outDir, GEO), recursive = TRUE) } # stop logging to file setLoggingFile("") file.remove(file.path(outDir, "vignette_v2.log")) options(parallelly.fork.enable = FALSE) ``` ```{r} Sys.time() ``` ```{r} sessionInfo() ```