---
title: "Examples and use cases"
author: 
  - name: Lukas M. Weber
    affiliation: 
      - &id1 "Institute of Molecular Life Sciences, University of Zurich, Zurich, Switzerland"
      - &id2 "SIB Swiss Institute of Bioinformatics, Zurich, Switzerland"
  - name: Charlotte Soneson
    affiliation: 
      - &id3 "Friedrich Miescher Institute for Biomedical Research, Basel, Switzerland"
      - &id4 "SIB Swiss Institute of Bioinformatics, Basel, Switzerland"
package: HDCytoData
output: 
  BiocStyle::html_document
vignette: >
  %\VignetteIndexEntry{2. Examples and use cases}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```


# Examples and use cases

This vignette demonstrates several examples and use cases for the datasets in the `HDCytoData` package.


# Dimension reduction

Using the clustering datasets, we can generate dimension reduction plots with colors indicating the ground truth cell population labels. This provides a visual representation of the cell population structure in these datasets, which is useful during exploratory data analysis and for representing the output of clustering or other downstream analysis algorithms.

Below, we compare three different dimension reduction algorithms (principal component analysis [PCA], t-distributed stochastic neighbor embedding [tSNE], and uniform manifold approximation and projection [UMAP]), for one of the datasets (`Levine_32dim`). This dataset contains ground truth cell population labels for 14 immune cell populations.


```{r}
suppressPackageStartupMessages(library(HDCytoData))
suppressPackageStartupMessages(library(SummarizedExperiment))
suppressPackageStartupMessages(library(Rtsne))
suppressPackageStartupMessages(library(umap))
suppressPackageStartupMessages(library(ggplot2))
```


```{r}
# ---------
# Load data
# ---------

d_SE <- Levine_32dim_SE()
```


```{r}
# -------------
# Preprocessing
# -------------

# select 'cell type' marker columns for defining clusters
d_sub <- assay(d_SE[, colData(d_SE)$marker_class == "type"])

# extract cell population labels
population <- rowData(d_SE)$population_id

dim(d_sub)
stopifnot(nrow(d_sub) == length(population))

# transform data using asinh with cofactor 5
cofactor <- 5
d_sub <- asinh(d_sub / cofactor)

summary(d_sub)

# subsample cells for faster runtimes in vignette
n <- 2000

set.seed(123)
ix <- sample(seq_len(nrow(d_sub)), n)

d_sub <- d_sub[ix, ]
population <- population[ix]

dim(d_sub)
stopifnot(nrow(d_sub) == length(population))

# remove any near-duplicate rows (required by Rtsne)
dups <- duplicated(d_sub)
d_sub <- d_sub[!dups, ]
population <- population[!dups]

dim(d_sub)
stopifnot(nrow(d_sub) == length(population))
```


```{r, fig.width = 6}
# ------------------------
# Dimension reduction: PCA
# ------------------------

n_dims <- 2

# run PCA
# (note: no scaling, since asinh-transformed dimensions are already comparable)
out_PCA <- prcomp(d_sub, center = TRUE, scale. = FALSE)
dims_PCA <- out_PCA$x[, seq_len(n_dims)]
colnames(dims_PCA) <- c("PC_1", "PC_2")
head(dims_PCA)

stopifnot(nrow(dims_PCA) == length(population))

colnames(dims_PCA) <- c("dimension_x", "dimension_y")
dims_PCA <- cbind(as.data.frame(dims_PCA), population, type = "PCA")

head(dims_PCA)
str(dims_PCA)


# generate plot
d_plot <- dims_PCA
str(d_plot)

colors <- c(rainbow(14), "gray75")

ggplot(d_plot, aes(x = dimension_x, y = dimension_y, color = population)) + 
  facet_wrap(~ type, scales = "free") + 
  geom_point(size = 0.7, alpha = 0.5) + 
  scale_color_manual(values = colors) + 
  labs(x = "dimension x", y = "dimension y") + 
  theme_bw() + 
  theme(aspect.ratio = 1, 
        legend.key.height = unit(4, "mm"))
```


```{r, fig.width = 6}
# -------------------------
# Dimension reduction: tSNE
# -------------------------

# run Rtsne
set.seed(123)
out_Rtsne <- Rtsne(as.matrix(d_sub), dims = n_dims)
dims_Rtsne <- out_Rtsne$Y
colnames(dims_Rtsne) <- c("tSNE_1", "tSNE_2")
head(dims_Rtsne)

stopifnot(nrow(dims_Rtsne) == length(population))

colnames(dims_Rtsne) <- c("dimension_x", "dimension_y")
dims_Rtsne <- cbind(as.data.frame(dims_Rtsne), population, type = "tSNE")

head(dims_Rtsne)
str(dims_Rtsne)


# generate plot
d_plot <- dims_Rtsne

ggplot(d_plot, aes(x = dimension_x, y = dimension_y, color = population)) + 
  facet_wrap(~ type, scales = "free") + 
  geom_point(size = 0.7, alpha = 0.5) + 
  scale_color_manual(values = colors) + 
  labs(x = "dimension x", y = "dimension y") + 
  theme_bw() + 
  theme(aspect.ratio = 1, 
        legend.key.height = unit(4, "mm"))
```


```{r, fig.width = 6}
# -------------------------
# Dimension reduction: UMAP
# -------------------------

# run umap
set.seed(123)
out_umap <- umap(d_sub)
dims_umap <- out_umap$layout
colnames(dims_umap) <- c("UMAP_1", "UMAP_2")
head(dims_umap)

stopifnot(nrow(dims_umap) == length(population))

colnames(dims_umap) <- c("dimension_x", "dimension_y")
dims_umap <- cbind(as.data.frame(dims_umap), population, type = "UMAP")

head(dims_umap)
str(dims_umap)


# generate plot
d_plot <- dims_umap

ggplot(d_plot, aes(x = dimension_x, y = dimension_y, color = population)) + 
  facet_wrap(~ type, scales = "free") + 
  geom_point(size = 0.7, alpha = 0.5) + 
  scale_color_manual(values = colors) + 
  labs(x = "dimension x", y = "dimension y") + 
  theme_bw() + 
  theme(aspect.ratio = 1, 
        legend.key.height = unit(4, "mm"))
```


# Clustering

We can also use the clustering datasets to calculate a new clustering using an algorithm of our choice, and then evaluate the performance of the clustering by comparing against the ground truth cell population labels. Common metrics for evaluating clustering performance include the mean F1 score and adjusted Rand index.

Below, we use the FlowSOM clustering algorithm (Van Gassen et al. 2015) to calculate a new clustering on the `Samusik_01` dataset, and then calculate clustering performance using the ground truth labels.

Note that for simplicity in this vignette, we only calculate the adjusted Rand index. This calculation does not take into account the number of cells per cluster, so could potentially be dominated by one or two large clusters. For more sophisticated evaluations based on the mean F1 score, where we (i) weight clusters equally (instead of weighting cells equally), and (ii) check for multi-mapping populations (i.e. prevent multiple clusters mapping to the same ground truth population), see the code in our GitHub repository from our previous publication (Weber and Robinson, 2016), available at https://github.com/lmweber/cytometry-clustering-comparison.


```{r}
suppressPackageStartupMessages(library(HDCytoData))
suppressPackageStartupMessages(library(FlowSOM))
suppressPackageStartupMessages(library(flowCore))
suppressPackageStartupMessages(library(mclust))
suppressPackageStartupMessages(library(umap))
suppressPackageStartupMessages(library(ggplot2))
```


```{r}
# ---------
# Load data
# ---------

d_SE <- Samusik_01_SE()

dim(d_SE)
```


```{r}
# -------------
# Preprocessing
# -------------

# select 'cell type' marker columns for defining clusters
d_sub <- assay(d_SE[, colData(d_SE)$marker_class == "type"])

# extract cell population labels
population <- rowData(d_SE)$population_id

dim(d_sub)
stopifnot(nrow(d_sub) == length(population))

# transform data using asinh with cofactor 5
cofactor <- 5
d_sub <- asinh(d_sub / cofactor)

# create flowFrame object (required input format for FlowSOM)
d_FlowSOM <- flowFrame(d_sub)
```


```{r}
# -----------
# Run FlowSOM
# -----------

# set seed for reproducibility
set.seed(123)

# run FlowSOM (initial steps prior to meta-clustering)
out <- ReadInput(d_FlowSOM, transform = FALSE, scale = FALSE)
out <- BuildSOM(out)
out <- BuildMST(out)

# optional FlowSOM visualization
#PlotStars(out)

# extract cluster labels (pre meta-clustering) from output object
labels_pre <- out$map$mapping[, 1]

# specify final number of clusters for meta-clustering
k <- 40

# run meta-clustering
seed <- 123
out <- metaClustering_consensus(out$map$codes, k = k, seed = seed)

# extract cluster labels from output object
labels <- out[labels_pre]

# summary of cluster sizes and number of clusters
table(labels)
length(table(labels))
```


```{r}
# -------------------------------
# Evaluate clustering performance
# -------------------------------

# calculate adjusted Rand index
# note: this calculation weights all cells equally, which may not be 
# appropriate for some datasets (see above)

stopifnot(nrow(d_sub) == length(labels))
stopifnot(length(population) == length(labels))

# remove "unassigned" cells from cluster evaluation (but note these were
# included for clustering)
ix_unassigned <- population == "unassigned"
d_sub_eval <- d_sub[!ix_unassigned, ]
population_eval <- population[!ix_unassigned]
labels_eval <- labels[!ix_unassigned]

stopifnot(nrow(d_sub_eval) == length(labels_eval))
stopifnot(length(population_eval) == length(labels_eval))

# calculate adjusted Rand index
adjustedRandIndex(population_eval, labels_eval)
```


```{r, fig.width = 7}
# ------------
# Plot results
# ------------

# subsample cells for faster runtimes in vignette
n <- 4000

set.seed(1004)
ix <- sample(seq_len(nrow(d_sub)), n)

d_sub <- d_sub[ix, ]
population <- population[ix]
labels <- labels[ix]

dim(d_sub)
stopifnot(nrow(d_sub) == length(population))
stopifnot(nrow(population) == length(labels))


# run umap
set.seed(1234)
out_umap <- umap(d_sub)
dims_umap <- out_umap$layout
colnames(dims_umap) <- c("UMAP_1", "UMAP_2")

stopifnot(nrow(dims_umap) == length(population))
stopifnot(nrow(population) == length(labels))

d_plot <- cbind(
  as.data.frame(dims_umap), 
  population, 
  labels = as.factor(labels), 
  type = "UMAP"
)


# generate plots
colors <- c(rainbow(24), "gray75")

ggplot(d_plot, aes(x = UMAP_1, y = UMAP_2, color = population)) + 
  geom_point(size = 0.7, alpha = 0.5) + 
  scale_color_manual(values = colors) + 
  ggtitle("Ground truth population labels") + 
  theme_bw() + 
  theme(aspect.ratio = 1, 
        legend.key.height = unit(4, "mm"))

ggplot(d_plot, aes(x = UMAP_1, y = UMAP_2, color = labels)) + 
  geom_point(size = 0.7, alpha = 0.5) + 
  ggtitle("FlowSOM cluster labels") + 
  theme_bw() + 
  theme(aspect.ratio = 1, 
        legend.key.height = unit(4, "mm"))
```


# Differential analysis

In this section, we use the semi-simulated differential analysis datasets (`Weber_AML_sim` and `Weber_BCR_XL_sim`) to demonstrate how to perform differential analyses using the `diffcyt` package (Weber et al. 2019).

For more more details on how to use the `diffcyt` package, see the [Bioconductor vignette](http://bioconductor.org/packages/diffcyt).

For a complete workflow for performing differential discovery analyses in high-dimensional cytometry data, including exploratory analyses, differential testing, and visualizations, see Nowicka et al. (2017, 2019) (also available as a [Bioconductor workflow package](http://bioconductor.org/packages/cytofWorkflow)).

We perform two sets of differential analyses: testing for differential abundance (DA) of cell populations (using the `Weber_AML_sim` dataset), and testing for differential states (DS) within cell populations (using the `Weber_BCR_XL_sim` dataset). In both cases, clusters are defined using "cell type" markers, while for DS testing we also use additional "cell state" markers to test for differential expression within clusters. See our paper introducing the `diffcyt` framework (Weber et al. 2019) for more details. For extended evaluations showing how to calculate performance using the ground truth labels (spike-in cells), see the code in our GitHub repository accompanying our previous publication (Weber et al. 2019), available at https://github.com/lmweber/diffcyt-evaluations.


## Differential abundance (DA)

```{r}
# suppressPackageStartupMessages(library(diffcyt))
suppressPackageStartupMessages(library(SummarizedExperiment))
```


```{r}
# ---------
# Load data
# ---------

d_SE <- Weber_AML_sim_main_5pc_SE()
```


```{r}
# ---------------
# Set up metadata
# ---------------

# set column names
colnames(d_SE) <- colData(d_SE)$marker_name

# split input data into one matrix per sample
d_input <- split(as.data.frame(assay(d_SE)), rowData(d_SE)$sample_id)

# extract sample information
experiment_info <- metadata(d_SE)$experiment_info
experiment_info

# extract marker information
marker_info <- colData(d_SE)
marker_info
```


```{r}
# # -----------------------------------
# # Differential abundance (DA) testing
# # -----------------------------------
# 
# # create design matrix
# design <- createDesignMatrix(
#   experiment_info, cols_design = c("group_id", "patient_id")
# )
# design
# 
# # create contrast matrix
# # note: testing condition CN vs. healthy
# contrast <- createContrast(c(0, 1, 0, 0, 0, 0, 0))
# contrast
# 
# # test for differential abundance (DA) of clusters
# out_DA <- diffcyt(
#   d_input, 
#   experiment_info, 
#   marker_info, 
#   design = design, 
#   contrast = contrast, 
#   analysis_type = "DA", 
#   seed_clustering = 1234
# )
# 
# # display results for top DA clusters
# topTable(out_DA, format_vals = TRUE)
```


## Differential states (DS)

```{r}
# ---------
# Load data
# ---------

d_SE <- Weber_BCR_XL_sim_main_SE()
```


```{r}
# ---------------
# Set up metadata
# ---------------

# set column names
colnames(d_SE) <- colData(d_SE)$marker_name

# split input data into one matrix per sample
d_input <- split(as.data.frame(assay(d_SE)), rowData(d_SE)$sample_id)

# extract sample information
experiment_info <- metadata(d_SE)$experiment_info
experiment_info

# extract marker information
marker_info <- colData(d_SE)
marker_info
```


```{r}
# # -------------------------------
# # Differential state (DS) testing
# # -------------------------------
# 
# # create design matrix
# design <- createDesignMatrix(
#   experiment_info, cols_design = c("group_id", "patient_id")
# )
# design
# 
# # create contrast matrix
# # note: testing condition spike vs. base
# contrast <- createContrast(c(0, 1, 0, 0, 0, 0, 0, 0, 0))
# contrast
# 
# # test for differential abundance (DA) of clusters
# out_DS <- diffcyt(
#   d_input, 
#   experiment_info, 
#   marker_info, 
#   design = design, 
#   contrast = contrast, 
#   analysis_type = "DS", 
#   seed_clustering = 1234
# )
# 
# # display results for top DA clusters
# topTable(out_DS, format_vals = TRUE)
```