--- title: "Examples and use cases" author: - name: Lukas M. Weber affiliation: - &id1 "Institute of Molecular Life Sciences, University of Zurich, Zurich, Switzerland" - &id2 "SIB Swiss Institute of Bioinformatics, Zurich, Switzerland" - name: Charlotte Soneson affiliation: - &id3 "Friedrich Miescher Institute for Biomedical Research, Basel, Switzerland" - &id4 "SIB Swiss Institute of Bioinformatics, Basel, Switzerland" package: HDCytoData output: BiocStyle::html_document vignette: > %\VignetteIndexEntry{2. Examples and use cases} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` # Examples and use cases This vignette demonstrates several examples and use cases for the datasets in the `HDCytoData` package. # Dimension reduction Using the clustering datasets, we can generate dimension reduction plots with colors indicating the ground truth cell population labels. This provides a visual representation of the cell population structure in these datasets, which is useful during exploratory data analysis and for representing the output of clustering or other downstream analysis algorithms. Below, we compare three different dimension reduction algorithms (principal component analysis [PCA], t-distributed stochastic neighbor embedding [tSNE], and uniform manifold approximation and projection [UMAP]), for one of the datasets (`Levine_32dim`). This dataset contains ground truth cell population labels for 14 immune cell populations. ```{r} suppressPackageStartupMessages(library(HDCytoData)) suppressPackageStartupMessages(library(SummarizedExperiment)) suppressPackageStartupMessages(library(Rtsne)) suppressPackageStartupMessages(library(umap)) suppressPackageStartupMessages(library(ggplot2)) ``` ```{r} # --------- # Load data # --------- d_SE <- Levine_32dim_SE() ``` ```{r} # ------------- # Preprocessing # ------------- # select 'cell type' marker columns for defining clusters d_sub <- assay(d_SE[, colData(d_SE)$marker_class == "type"]) # extract cell population labels population <- rowData(d_SE)$population_id dim(d_sub) stopifnot(nrow(d_sub) == length(population)) # transform data using asinh with cofactor 5 cofactor <- 5 d_sub <- asinh(d_sub / cofactor) summary(d_sub) # subsample cells for faster runtimes in vignette n <- 2000 set.seed(123) ix <- sample(seq_len(nrow(d_sub)), n) d_sub <- d_sub[ix, ] population <- population[ix] dim(d_sub) stopifnot(nrow(d_sub) == length(population)) # remove any near-duplicate rows (required by Rtsne) dups <- duplicated(d_sub) d_sub <- d_sub[!dups, ] population <- population[!dups] dim(d_sub) stopifnot(nrow(d_sub) == length(population)) ``` ```{r, fig.width = 6} # ------------------------ # Dimension reduction: PCA # ------------------------ n_dims <- 2 # run PCA # (note: no scaling, since asinh-transformed dimensions are already comparable) out_PCA <- prcomp(d_sub, center = TRUE, scale. = FALSE) dims_PCA <- out_PCA$x[, seq_len(n_dims)] colnames(dims_PCA) <- c("PC_1", "PC_2") head(dims_PCA) stopifnot(nrow(dims_PCA) == length(population)) colnames(dims_PCA) <- c("dimension_x", "dimension_y") dims_PCA <- cbind(as.data.frame(dims_PCA), population, type = "PCA") head(dims_PCA) str(dims_PCA) # generate plot d_plot <- dims_PCA str(d_plot) colors <- c(rainbow(14), "gray75") ggplot(d_plot, aes(x = dimension_x, y = dimension_y, color = population)) + facet_wrap(~ type, scales = "free") + geom_point(size = 0.7, alpha = 0.5) + scale_color_manual(values = colors) + labs(x = "dimension x", y = "dimension y") + theme_bw() + theme(aspect.ratio = 1, legend.key.height = unit(4, "mm")) ``` ```{r, fig.width = 6} # ------------------------- # Dimension reduction: tSNE # ------------------------- # run Rtsne set.seed(123) out_Rtsne <- Rtsne(as.matrix(d_sub), dims = n_dims) dims_Rtsne <- out_Rtsne$Y colnames(dims_Rtsne) <- c("tSNE_1", "tSNE_2") head(dims_Rtsne) stopifnot(nrow(dims_Rtsne) == length(population)) colnames(dims_Rtsne) <- c("dimension_x", "dimension_y") dims_Rtsne <- cbind(as.data.frame(dims_Rtsne), population, type = "tSNE") head(dims_Rtsne) str(dims_Rtsne) # generate plot d_plot <- dims_Rtsne ggplot(d_plot, aes(x = dimension_x, y = dimension_y, color = population)) + facet_wrap(~ type, scales = "free") + geom_point(size = 0.7, alpha = 0.5) + scale_color_manual(values = colors) + labs(x = "dimension x", y = "dimension y") + theme_bw() + theme(aspect.ratio = 1, legend.key.height = unit(4, "mm")) ``` ```{r, fig.width = 6} # ------------------------- # Dimension reduction: UMAP # ------------------------- # run umap set.seed(123) out_umap <- umap(d_sub) dims_umap <- out_umap$layout colnames(dims_umap) <- c("UMAP_1", "UMAP_2") head(dims_umap) stopifnot(nrow(dims_umap) == length(population)) colnames(dims_umap) <- c("dimension_x", "dimension_y") dims_umap <- cbind(as.data.frame(dims_umap), population, type = "UMAP") head(dims_umap) str(dims_umap) # generate plot d_plot <- dims_umap ggplot(d_plot, aes(x = dimension_x, y = dimension_y, color = population)) + facet_wrap(~ type, scales = "free") + geom_point(size = 0.7, alpha = 0.5) + scale_color_manual(values = colors) + labs(x = "dimension x", y = "dimension y") + theme_bw() + theme(aspect.ratio = 1, legend.key.height = unit(4, "mm")) ``` # Clustering We can also use the clustering datasets to calculate a new clustering using an algorithm of our choice, and then evaluate the performance of the clustering by comparing against the ground truth cell population labels. Common metrics for evaluating clustering performance include the mean F1 score and adjusted Rand index. Below, we use the FlowSOM clustering algorithm (Van Gassen et al. 2015) to calculate a new clustering on the `Samusik_01` dataset, and then calculate clustering performance using the ground truth labels. Note that for simplicity in this vignette, we only calculate the adjusted Rand index. This calculation does not take into account the number of cells per cluster, so could potentially be dominated by one or two large clusters. For more sophisticated evaluations based on the mean F1 score, where we (i) weight clusters equally (instead of weighting cells equally), and (ii) check for multi-mapping populations (i.e. prevent multiple clusters mapping to the same ground truth population), see the code in our GitHub repository from our previous publication (Weber and Robinson, 2016), available at https://github.com/lmweber/cytometry-clustering-comparison. ```{r} suppressPackageStartupMessages(library(HDCytoData)) suppressPackageStartupMessages(library(FlowSOM)) suppressPackageStartupMessages(library(flowCore)) suppressPackageStartupMessages(library(mclust)) suppressPackageStartupMessages(library(umap)) suppressPackageStartupMessages(library(ggplot2)) ``` ```{r} # --------- # Load data # --------- d_SE <- Samusik_01_SE() dim(d_SE) ``` ```{r} # ------------- # Preprocessing # ------------- # select 'cell type' marker columns for defining clusters d_sub <- assay(d_SE[, colData(d_SE)$marker_class == "type"]) # extract cell population labels population <- rowData(d_SE)$population_id dim(d_sub) stopifnot(nrow(d_sub) == length(population)) # transform data using asinh with cofactor 5 cofactor <- 5 d_sub <- asinh(d_sub / cofactor) # create flowFrame object (required input format for FlowSOM) d_FlowSOM <- flowFrame(d_sub) ``` ```{r} # ----------- # Run FlowSOM # ----------- # set seed for reproducibility set.seed(123) # run FlowSOM (initial steps prior to meta-clustering) out <- ReadInput(d_FlowSOM, transform = FALSE, scale = FALSE) out <- BuildSOM(out) out <- BuildMST(out) # optional FlowSOM visualization #PlotStars(out) # extract cluster labels (pre meta-clustering) from output object labels_pre <- out$map$mapping[, 1] # specify final number of clusters for meta-clustering k <- 40 # run meta-clustering seed <- 123 out <- metaClustering_consensus(out$map$codes, k = k, seed = seed) # extract cluster labels from output object labels <- out[labels_pre] # summary of cluster sizes and number of clusters table(labels) length(table(labels)) ``` ```{r} # ------------------------------- # Evaluate clustering performance # ------------------------------- # calculate adjusted Rand index # note: this calculation weights all cells equally, which may not be # appropriate for some datasets (see above) stopifnot(nrow(d_sub) == length(labels)) stopifnot(length(population) == length(labels)) # remove "unassigned" cells from cluster evaluation (but note these were # included for clustering) ix_unassigned <- population == "unassigned" d_sub_eval <- d_sub[!ix_unassigned, ] population_eval <- population[!ix_unassigned] labels_eval <- labels[!ix_unassigned] stopifnot(nrow(d_sub_eval) == length(labels_eval)) stopifnot(length(population_eval) == length(labels_eval)) # calculate adjusted Rand index adjustedRandIndex(population_eval, labels_eval) ``` ```{r, fig.width = 7} # ------------ # Plot results # ------------ # subsample cells for faster runtimes in vignette n <- 4000 set.seed(1004) ix <- sample(seq_len(nrow(d_sub)), n) d_sub <- d_sub[ix, ] population <- population[ix] labels <- labels[ix] dim(d_sub) stopifnot(nrow(d_sub) == length(population)) stopifnot(nrow(population) == length(labels)) # run umap set.seed(1234) out_umap <- umap(d_sub) dims_umap <- out_umap$layout colnames(dims_umap) <- c("UMAP_1", "UMAP_2") stopifnot(nrow(dims_umap) == length(population)) stopifnot(nrow(population) == length(labels)) d_plot <- cbind( as.data.frame(dims_umap), population, labels = as.factor(labels), type = "UMAP" ) # generate plots colors <- c(rainbow(24), "gray75") ggplot(d_plot, aes(x = UMAP_1, y = UMAP_2, color = population)) + geom_point(size = 0.7, alpha = 0.5) + scale_color_manual(values = colors) + ggtitle("Ground truth population labels") + theme_bw() + theme(aspect.ratio = 1, legend.key.height = unit(4, "mm")) ggplot(d_plot, aes(x = UMAP_1, y = UMAP_2, color = labels)) + geom_point(size = 0.7, alpha = 0.5) + ggtitle("FlowSOM cluster labels") + theme_bw() + theme(aspect.ratio = 1, legend.key.height = unit(4, "mm")) ``` # Differential analysis In this section, we use the semi-simulated differential analysis datasets (`Weber_AML_sim` and `Weber_BCR_XL_sim`) to demonstrate how to perform differential analyses using the `diffcyt` package (Weber et al. 2019). For more more details on how to use the `diffcyt` package, see the [Bioconductor vignette](http://bioconductor.org/packages/diffcyt). For a complete workflow for performing differential discovery analyses in high-dimensional cytometry data, including exploratory analyses, differential testing, and visualizations, see Nowicka et al. (2017, 2019) (also available as a [Bioconductor workflow package](http://bioconductor.org/packages/cytofWorkflow)). We perform two sets of differential analyses: testing for differential abundance (DA) of cell populations (using the `Weber_AML_sim` dataset), and testing for differential states (DS) within cell populations (using the `Weber_BCR_XL_sim` dataset). In both cases, clusters are defined using "cell type" markers, while for DS testing we also use additional "cell state" markers to test for differential expression within clusters. See our paper introducing the `diffcyt` framework (Weber et al. 2019) for more details. For extended evaluations showing how to calculate performance using the ground truth labels (spike-in cells), see the code in our GitHub repository accompanying our previous publication (Weber et al. 2019), available at https://github.com/lmweber/diffcyt-evaluations. ## Differential abundance (DA) ```{r} # suppressPackageStartupMessages(library(diffcyt)) suppressPackageStartupMessages(library(SummarizedExperiment)) ``` ```{r} # --------- # Load data # --------- d_SE <- Weber_AML_sim_main_5pc_SE() ``` ```{r} # --------------- # Set up metadata # --------------- # set column names colnames(d_SE) <- colData(d_SE)$marker_name # split input data into one matrix per sample d_input <- split(as.data.frame(assay(d_SE)), rowData(d_SE)$sample_id) # extract sample information experiment_info <- metadata(d_SE)$experiment_info experiment_info # extract marker information marker_info <- colData(d_SE) marker_info ``` ```{r} # # ----------------------------------- # # Differential abundance (DA) testing # # ----------------------------------- # # # create design matrix # design <- createDesignMatrix( # experiment_info, cols_design = c("group_id", "patient_id") # ) # design # # # create contrast matrix # # note: testing condition CN vs. healthy # contrast <- createContrast(c(0, 1, 0, 0, 0, 0, 0)) # contrast # # # test for differential abundance (DA) of clusters # out_DA <- diffcyt( # d_input, # experiment_info, # marker_info, # design = design, # contrast = contrast, # analysis_type = "DA", # seed_clustering = 1234 # ) # # # display results for top DA clusters # topTable(out_DA, format_vals = TRUE) ``` ## Differential states (DS) ```{r} # --------- # Load data # --------- d_SE <- Weber_BCR_XL_sim_main_SE() ``` ```{r} # --------------- # Set up metadata # --------------- # set column names colnames(d_SE) <- colData(d_SE)$marker_name # split input data into one matrix per sample d_input <- split(as.data.frame(assay(d_SE)), rowData(d_SE)$sample_id) # extract sample information experiment_info <- metadata(d_SE)$experiment_info experiment_info # extract marker information marker_info <- colData(d_SE) marker_info ``` ```{r} # # ------------------------------- # # Differential state (DS) testing # # ------------------------------- # # # create design matrix # design <- createDesignMatrix( # experiment_info, cols_design = c("group_id", "patient_id") # ) # design # # # create contrast matrix # # note: testing condition spike vs. base # contrast <- createContrast(c(0, 1, 0, 0, 0, 0, 0, 0, 0)) # contrast # # # test for differential abundance (DA) of clusters # out_DS <- diffcyt( # d_input, # experiment_info, # marker_info, # design = design, # contrast = contrast, # analysis_type = "DS", # seed_clustering = 1234 # ) # # # display results for top DA clusters # topTable(out_DS, format_vals = TRUE) ```