## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval = TRUE, crop = NULL ) library(BiocStyle) ## ----load-pkg----------------------------------------------------------------- suppressPackageStartupMessages({ library(sketchR) library(TENxPBMCData) library(scuttle) library(scran) library(scater) library(SingleR) library(celldex) library(cowplot) library(SummarizedExperiment) library(SingleCellExperiment) library(beachmat.hdf5) library(snifter) library(uwot) library(bluster) library(class) }) ## ----process-data------------------------------------------------------------- # load data pbmc3k <- TENxPBMCData(dataset = "pbmc3k") # set row and column names colnames(pbmc3k) <- paste0("Cell", seq_len(ncol(pbmc3k))) rownames(pbmc3k) <- uniquifyFeatureNames( ID = rowData(pbmc3k)$ENSEMBL_ID, names = rowData(pbmc3k)$Symbol_TENx ) # normalize and log-transform counts pbmc3k <- logNormCounts(pbmc3k) # find highly variable genes dec <- modelGeneVar(pbmc3k) top_hvgs <- getTopHVGs(dec, n = 2000) # perform dimensionality reduction set.seed(100) pbmc3k <- runPCA(pbmc3k, subset_row = top_hvgs) reducedDim(pbmc3k, "TSNE") <- fitsne( reducedDim(pbmc3k, "PCA"), n_jobs = 2L, random_state = 123L, perplexity = 30, n_components = 2L, pca = FALSE) reducedDim(pbmc3k, "UMAP") <- umap( X = reducedDim(pbmc3k, "PCA"), n_components = 2, n_neighbors = 50, pca = NULL, scale = FALSE, ret_model = TRUE, n_threads = 2L)$embedding # predict cell type labels ref_monaco <- MonacoImmuneData() pred_monaco_main <- SingleR(test = pbmc3k, ref = ref_monaco, labels = ref_monaco$label.main) pbmc3k$labels_main <- pred_monaco_main$labels ## ----sketch------------------------------------------------------------------- idx800gs <- geosketch(reducedDim(pbmc3k, "PCA"), N = 800, seed = 123) pbmc3k$sketched <- seq_len(ncol(pbmc3k)) %in% idx800gs table(pbmc3k$sketched) ## ----hvgs--------------------------------------------------------------------- ## Find highly variable genes dec_sketched <- modelGeneVar(pbmc3k[, pbmc3k$sketched]) top_hvgs_sketched <- getTopHVGs(dec_sketched, n = 2000) ## ----pca---------------------------------------------------------------------- set.seed(123) # don't scale features before PCA doscale <- FALSE # subset to sketched cells and run PCA, using the HVGs from the sketched subset pbmc3k_sketched <- pbmc3k[, pbmc3k$sketched] pca <- calculatePCA(pbmc3k_sketched, ncomponents = 30, subset_row = top_hvgs_sketched, scale = doscale) # store the PCA projection for the sketched cells as well as the weights for # the HVGs and the mean value for each HVG across the sketched cells projection <- pca rotation <- attr(pca, "rotation") centers <- rowMeans(assay(pbmc3k_sketched[top_hvgs_sketched, ], "logcounts")) # center the full data set using the mean values from the sketched subset and # project all cells onto the extracted principal components # note that if doscale = TRUE, we also need to calculate the standard # deviations across the sketched cells and use them here pca <- scale(t(assay(pbmc3k[names(centers), ], "logcounts")), center = centers, scale = doscale) %*% rotation # add the PCA coordinates to pbmc3k stopifnot(all(rownames(pca) == colnames(pbmc3k))) reducedDim(pbmc3k, "PCA_sketch") <- as.matrix(pca) # check that the "re-projections" of the sketched cells are the same as the # original coordinates obtained from applying PCA to the sketched cells # ... via correlations vapply(seq_len(ncol(projection)), function(i) { cor(projection[, i], reducedDim(pbmc3k, "PCA_sketch")[rownames(projection), i] ) }, NA_real_) # ... via plotting plot_grid(plotlist = lapply(seq_len(ncol(projection))[1:2], function(i) { ggplot(data.frame( Original = projection[, i], Reprojected = reducedDim(pbmc3k, "PCA_sketch")[pbmc3k$sketched, i]), aes(x = Original, y = Reprojected)) + geom_point() + labs(title = paste0("PCA dimension ", i)) + theme_bw() })) ## ----plot-pca----------------------------------------------------------------- plot_grid(plotlist = list( plotReducedDim(pbmc3k, dimred = "PCA", colour_by = "labels_main", ncomponents = c(1, 2)) + ggtitle("PCA"), plotReducedDim(pbmc3k, dimred = "PCA_sketch", colour_by = "labels_main", ncomponents = c(1, 2)) + ggtitle("PCA (sketch)") )) ## ----plot-pca-sketched-------------------------------------------------------- plotReducedDim(pbmc3k, dimred = "PCA_sketch", colour_by = "sketched", ncomponents = c(1, 2)) ## ----tsne--------------------------------------------------------------------- set.seed(123) # extract tSNE embedding from sketched cells tsne <- fitsne( reducedDim(pbmc3k[, pbmc3k$sketched], "PCA_sketch"), n_jobs = 2L, random_state = 123L, perplexity = 30, n_components = 2L, pca = FALSE) # project all cells proj <- project( x = tsne, new = reducedDim(pbmc3k, "PCA_sketch"), old = reducedDim(pbmc3k[, pbmc3k$sketched], "PCA_sketch"), perplexity = 30, k = 1L) rownames(proj) <- colnames(pbmc3k) reducedDim(pbmc3k, "TSNE_sketch") <- proj # representations of sketched cells are approximately retained # ... correlations vapply(seq_len(ncol(tsne)), function(i) { cor(tsne[, i], reducedDim(pbmc3k, "TSNE_sketch")[pbmc3k$sketched, i] ) }, NA_real_) # ... plotting plot_grid(plotlist = lapply(seq_len(ncol(tsne)), function(i) { ggplot(data.frame( Original = tsne[, i], Reprojected = reducedDim(pbmc3k, "TSNE_sketch")[pbmc3k$sketched, i]), aes(x = Original, y = Reprojected)) + geom_point() + labs(title = paste0("tSNE dimension ", i)) + theme_bw() })) ## ----plot-tsne---------------------------------------------------------------- plot_grid(plotlist = list( plotReducedDim(pbmc3k, dimred = "TSNE", colour_by = "labels_main", ncomponents = c(1, 2)) + ggtitle("t-SNE"), plotReducedDim(pbmc3k, dimred = "TSNE_sketch", colour_by = "labels_main", ncomponents = c(1, 2)) + ggtitle("t-SNE (sketch)") )) ## ----plot-tsne-sketched------------------------------------------------------- plotReducedDim(pbmc3k, dimred = "TSNE_sketch", colour_by = "sketched", ncomponents = c(1, 2)) ## ----umap--------------------------------------------------------------------- set.seed(123) # extract UMAP embedding from sketched cells umap <- umap( X = reducedDim(pbmc3k[, pbmc3k$sketched], "PCA_sketch"), n_components = 2, n_neighbors = 50, pca = NULL, scale = FALSE, ret_model = TRUE, n_threads = 2L) # project all cells proj <- umap_transform( X = reducedDim(pbmc3k, "PCA_sketch"), model = umap, n_threads = 2L) rownames(proj) <- colnames(pbmc3k) reducedDim(pbmc3k, "UMAP_sketch") <- proj # representations of sketched cells are approximately retained # ... correlations vapply(seq_len(ncol(umap$embedding)), function(i) { cor(umap$embedding[, i], reducedDim(pbmc3k, "UMAP_sketch")[pbmc3k$sketched, i] ) }, NA_real_) # ... plotting plot_grid(plotlist = lapply(seq_len(ncol(umap$embedding)), function(i) { ggplot(data.frame( Original = umap$embedding[, i], Reprojected = reducedDim(pbmc3k, "UMAP_sketch")[pbmc3k$sketched, i]), aes(x = Original, y = Reprojected)) + geom_point() + labs(title = paste0("UMAP dimension ", i)) + theme_bw() })) ## ----plot-umap---------------------------------------------------------------- plot_grid(plotlist = list( plotReducedDim(pbmc3k, dimred = "UMAP", colour_by = "labels_main", ncomponents = c(1, 2)) + ggtitle("UMAP"), plotReducedDim(pbmc3k, dimred = "UMAP_sketch", colour_by = "labels_main", ncomponents = c(1, 2)) + ggtitle("UMAP (sketch)") )) ## ----plot-umap-sketched------------------------------------------------------- plotReducedDim(pbmc3k, dimred = "UMAP_sketch", colour_by = "sketched", ncomponents = c(1, 2)) ## ----clustering--------------------------------------------------------------- # subset to only the sketched cells # note that since the PCA_sketch embedding was obtained by applying PCA to the # sketched cells only, subsetting like this is equivalent to applying PCA # again to the sketched cells only (see above for an illustration) pca_sketch <- reducedDim(pbmc3k[, pbmc3k$sketched], "PCA_sketch") set.seed(123) # run clustering on the sketched cells clst <- as.character(clusterRows( pca_sketch, BLUSPARAM = SNNGraphParam( k = 10, type = "rank", num.threads = 2L, cluster.fun = "leiden", cluster.args = list(resolution = 0.15)))) # predict labels of all cells using 1-nearest neighbors and add cluster labels # to pbmc3k preds <- knn(train = pca_sketch, test = reducedDim(pbmc3k, "PCA_sketch"), cl = clst, k = 1) |> as.character() pbmc3k$clusters <- preds # check that labels are retained for sketched cells all(clst == pbmc3k$clusters[pbmc3k$sketched]) ## ----session-info------------------------------------------------------------- sessionInfo()