## ----style, echo=FALSE, results='hide', message=FALSE, cache=FALSE--------- library(BiocStyle) library(knitr) opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE, cache=TRUE) opts_chunk$set(fig.asp=1) ## -------------------------------------------------------------------------- library(BiocFileCache) bfc <- BiocFileCache("raw_data", ask = FALSE) grun.fname <- bfcrpath(bfc, file.path("ftp://ftp.ncbi.nlm.nih.gov/geo/series", "GSE81nnn/GSE81076/suppl/GSE81076%5FD2%5F3%5F7%5F10%5F17%2Etxt%2Egz")) ## -------------------------------------------------------------------------- gse81076.df <- read.table(grun.fname, sep='\t', header=TRUE, stringsAsFactors=FALSE, row.names=1) dim(gse81076.df) head(rownames(gse81076.df)) head(colnames(gse81076.df)) ## -------------------------------------------------------------------------- donor.names <- sub("^(D[0-9]+).*", "\\1", colnames(gse81076.df)) table(donor.names) plate.id <- sub("^D[0-9]+(.*)_.*", "\\1", colnames(gse81076.df)) table(plate.id) ## -------------------------------------------------------------------------- gene.symb <- gsub("__chr.*$", "", rownames(gse81076.df)) is.spike <- grepl("^ERCC-", gene.symb) table(is.spike) library(org.Hs.eg.db) gene.ids <- mapIds(org.Hs.eg.db, keys=gene.symb, keytype="SYMBOL", column="ENSEMBL") gene.ids[is.spike] <- gene.symb[is.spike] keep <- !is.na(gene.ids) & !duplicated(gene.ids) gse81076.df <- gse81076.df[keep,] rownames(gse81076.df) <- gene.ids[keep] summary(keep) ## -------------------------------------------------------------------------- library(SingleCellExperiment) sce.gse81076 <- SingleCellExperiment(list(counts=as.matrix(gse81076.df)), colData=DataFrame(Donor=donor.names, Plate=plate.id), rowData=DataFrame(Symbol=gene.symb[keep])) isSpike(sce.gse81076, "ERCC") <- grepl("^ERCC-", rownames(gse81076.df)) sce.gse81076 ## -------------------------------------------------------------------------- library(scater) sce.gse81076 <- calculateQCMetrics(sce.gse81076, compact=TRUE) QC <- sce.gse81076$scater_qc low.lib <- isOutlier(QC$all$log10_total_counts, type="lower", nmad=3) low.genes <- isOutlier(QC$all$log10_total_features_by_counts, type="lower", nmad=3) high.spike <- isOutlier(QC$feature_control_ERCC$pct_counts, type="higher", nmad=3) data.frame(LowLib=sum(low.lib), LowNgenes=sum(low.genes), HighSpike=sum(high.spike, na.rm=TRUE)) ## -------------------------------------------------------------------------- discard <- low.lib | low.genes | high.spike sce.gse81076 <- sce.gse81076[,!discard] summary(discard) ## -------------------------------------------------------------------------- library(scran) library(BiocSingular) set.seed(1000) # for irlba. clusters <- quickCluster(sce.gse81076, use.ranks=FALSE, BSPARAM=IrlbaParam()) table(clusters) sce.gse81076 <- computeSumFactors(sce.gse81076, min.mean=0.1, clusters=clusters) summary(sizeFactors(sce.gse81076)) ## -------------------------------------------------------------------------- sce.gse81076 <- computeSpikeFactors(sce.gse81076, general.use=FALSE) summary(sizeFactors(sce.gse81076, "ERCC")) ## -------------------------------------------------------------------------- sce.gse81076 <- normalize(sce.gse81076) ## ----var-gse81076, fig.cap="Variance of normalized log-expression values for each gene in the GSE81076 dataset, plotted against the mean log-expression. The blue line represents the mean-dependent trend fitted to the variances of the spike-in transcripts (red)."---- block <- paste0(sce.gse81076$Plate, "_", sce.gse81076$Donor) fit <- trendVar(sce.gse81076, block=block, parametric=TRUE) dec <- decomposeVar(sce.gse81076, fit) plot(dec$mean, dec$total, xlab="Mean log-expression", ylab="Variance of log-expression", pch=16) is.spike <- isSpike(sce.gse81076) points(dec$mean[is.spike], dec$total[is.spike], col="red", pch=16) curve(fit$trend(x), col="dodgerblue", add=TRUE) ## -------------------------------------------------------------------------- dec.gse81076 <- dec dec.gse81076$Symbol <- rowData(sce.gse81076)$Symbol dec.gse81076 <- dec.gse81076[order(dec.gse81076$bio, decreasing=TRUE),] head(dec.gse81076) ## ---- echo=FALSE, results="hide"------------------------------------------- rm(gse81076.df) gc() ## -------------------------------------------------------------------------- muraro.fname <- bfcrpath(bfc, file.path("ftp://ftp.ncbi.nlm.nih.gov/geo/series", "GSE85nnn/GSE85241/suppl", "GSE85241%5Fcellsystems%5Fdataset%5F4donors%5Fupdated%2Ecsv%2Egz")) ## -------------------------------------------------------------------------- gse85241.df <- read.table(muraro.fname, sep='\t', header=TRUE, row.names=1, stringsAsFactors=FALSE) dim(gse85241.df) head(rownames(gse85241.df)) head(colnames(gse85241.df)) ## -------------------------------------------------------------------------- donor.names <- sub("^(D[0-9]+).*", "\\1", colnames(gse85241.df)) table(donor.names) plate.id <- sub("^D[0-9]+\\.([0-9]+)_.*", "\\1", colnames(gse85241.df)) table(plate.id) ## -------------------------------------------------------------------------- gene.symb <- gsub("__chr.*$", "", rownames(gse85241.df)) is.spike <- grepl("^ERCC-", gene.symb) table(is.spike) library(org.Hs.eg.db) gene.ids <- mapIds(org.Hs.eg.db, keys=gene.symb, keytype="SYMBOL", column="ENSEMBL") gene.ids[is.spike] <- gene.symb[is.spike] keep <- !is.na(gene.ids) & !duplicated(gene.ids) gse85241.df <- gse85241.df[keep,] rownames(gse85241.df) <- gene.ids[keep] summary(keep) ## -------------------------------------------------------------------------- sce.gse85241 <- SingleCellExperiment(list(counts=as.matrix(gse85241.df)), colData=DataFrame(Donor=donor.names, Plate=plate.id), rowData=DataFrame(Symbol=gene.symb[keep])) isSpike(sce.gse85241, "ERCC") <- grepl("^ERCC-", rownames(gse85241.df)) sce.gse85241 ## -------------------------------------------------------------------------- sce.gse85241 <- calculateQCMetrics(sce.gse85241, compact=TRUE) QC <- sce.gse85241$scater_qc low.lib <- isOutlier(QC$all$log10_total_counts, type="lower", nmad=3) low.genes <- isOutlier(QC$all$log10_total_features_by_counts, type="lower", nmad=3) high.spike <- isOutlier(QC$feature_control_ERCC$pct_counts, type="higher", nmad=3) data.frame(LowLib=sum(low.lib), LowNgenes=sum(low.genes), HighSpike=sum(high.spike, na.rm=TRUE)) ## -------------------------------------------------------------------------- discard <- low.lib | low.genes | high.spike sce.gse85241 <- sce.gse85241[,!discard] summary(discard) ## -------------------------------------------------------------------------- set.seed(1000) clusters <- quickCluster(sce.gse85241, use.ranks=FALSE, BSPARAM=IrlbaParam()) table(clusters) sce.gse85241 <- computeSumFactors(sce.gse85241, min.mean=0.1, clusters=clusters) summary(sizeFactors(sce.gse85241)) sce.gse85241 <- computeSpikeFactors(sce.gse85241, general.use=FALSE) summary(sizeFactors(sce.gse85241, "ERCC")) sce.gse85241 <- normalize(sce.gse85241) ## ----var-gse85241, fig.cap="Variance of normalized log-expression values for each gene in the GSE85241 dataset, plotted against the mean log-expression. The blue line represents the mean-dependent trend fitted to the variances of the spike-in transcripts (red)."---- block <- paste0(sce.gse85241$Plate, "_", sce.gse85241$Donor) fit <- trendVar(sce.gse85241, block=block, parametric=TRUE) dec <- decomposeVar(sce.gse85241, fit) plot(dec$mean, dec$total, xlab="Mean log-expression", ylab="Variance of log-expression", pch=16) is.spike <- isSpike(sce.gse85241) points(dec$mean[is.spike], dec$total[is.spike], col="red", pch=16) curve(fit$trend(x), col="dodgerblue", add=TRUE) ## -------------------------------------------------------------------------- dec.gse85241 <- dec dec.gse85241$Symbol <- rowData(sce.gse85241)$Symbol dec.gse85241 <- dec.gse85241[order(dec.gse85241$bio, decreasing=TRUE),] head(dec.gse85241) ## ---- echo=FALSE, results="hide"------------------------------------------- rm(gse85241.df) gc() ## -------------------------------------------------------------------------- universe <- intersect(rownames(dec.gse85241), rownames(dec.gse81076)) mean.bio <- (dec.gse85241[universe,"bio"] + dec.gse81076[universe,"bio"])/2 chosen <- universe[mean.bio > 0] length(chosen) ## -------------------------------------------------------------------------- rescaled <- batchelor::multiBatchNorm( sce.gse85241[universe,], sce.gse81076[universe,] ) rescaled.gse85241 <- rescaled[[1]] rescaled.gse81076 <- rescaled[[2]] ## -------------------------------------------------------------------------- set.seed(100) unc.gse81076 <- logcounts(rescaled.gse81076)[chosen,] unc.gse85241 <- logcounts(rescaled.gse85241)[chosen,] mnn.out <- batchelor::fastMNN( GSE81076=unc.gse81076, GSE85241=unc.gse85241, k=20, d=50, BSPARAM=IrlbaParam(deferred=TRUE) ) mnn.out ## -------------------------------------------------------------------------- dim(reducedDim(mnn.out, "corrected")) ## -------------------------------------------------------------------------- # Using an Rle for pretty-printing of batch IDs # (as all cells from the same batch are consecutive). Rle(mnn.out$batch) ## -------------------------------------------------------------------------- metadata(mnn.out)$merge.info$pairs[[1]] ## ---- echo=FALSE, results="hide"------------------------------------------- gc() ## ----tsne-batch, fig.width=10, fig.asp=0.6, fig.cap="t-SNE plots of the pancreas datasets, before and after MNN correction. Each point represents a cell and is coloured by the batch of origin."---- # Adding uncorrected values. sce <- mnn.out assay(sce, "original") <- cbind(unc.gse81076, unc.gse85241) # Using irlba to set up the t-SNE, for speed. set.seed(100) osce <- runPCA(sce, exprs_values="original", ntop=Inf, BSPARAM=IrlbaParam()) osce <- runTSNE(osce, use_dimred="PCA") ot <- plotTSNE(osce, colour_by="batch") + ggtitle("Original") # Corrected. set.seed(100) sce <- runTSNE(sce, use_dimred="corrected") ct <- plotTSNE(sce, colour_by="batch") + ggtitle("Corrected") multiplot(ot, ct, cols=2) ## ----tsne-markers, fig.width=10, fig.height=10, fig.cap="t-SNE plots after MNN correction, where each point represents a cell and is coloured by its corrected expression of key marker genes for known cell types in the pancreas."---- # Replacing the row names for easier reference. rowData(sce)$ENSEMBL <- rownames(sce) rowData(sce)$SYMBOL <- mapIds(org.Hs.eg.db, keytype="ENSEMBL", keys=rownames(sce), column="SYMBOL") rownames(sce) <- uniquifyFeatureNames(rownames(sce), rowData(sce)$SYMBOL) ct.gcg <- plotTSNE(sce, by_exprs_values="reconstructed", colour_by="GCG") ct.ins <- plotTSNE(sce, by_exprs_values="reconstructed", colour_by="INS") ct.sst <- plotTSNE(sce, by_exprs_values="reconstructed", colour_by="SST") ct.ppy <- plotTSNE(sce, by_exprs_values="reconstructed", colour_by="PPY") multiplot(ct.gcg + ggtitle("Alpha cells"), ct.ins + ggtitle("Beta cells"), ct.sst + ggtitle("Delta cells"), ct.ppy + ggtitle("PP cells"), cols=2) ## -------------------------------------------------------------------------- metadata(mnn.out)$merge.info$lost.var ## ---- echo=FALSE, results="hide"------------------------------------------- gc() ## -------------------------------------------------------------------------- mnn.out2 <- batchelor::fastMNN( GSE81076=unc.gse81076, GSE85241=unc.gse85241, k=20, d=50, auto.order=c(2,1), BSPARAM=IrlbaParam(deferred=TRUE) ) metadata(mnn.out2)$merge.order # batch 2 (GSE85241) is first in the order. metadata(mnn.out2)$merge.info$pairs[[1]] # 'first' now refers to GSE85241. ## -------------------------------------------------------------------------- Rle(mnn.out2$batch) # same as mnn.out$batch ## ---- echo=FALSE, results="hide"------------------------------------------- gc() ## -------------------------------------------------------------------------- all.donors <- unique(rescaled.gse85241$Donor) table(rescaled.gse85241$Donor) by.donor.85241 <- vector("list", length(all.donors)) names(by.donor.85241) <- sort(all.donors) for (x in all.donors) { by.donor.85241[[x]] <- rescaled.gse85241[,rescaled.gse85241$Donor==x] } ## -------------------------------------------------------------------------- adj.donors <- c(D101="A", D102="A", D10631="A", D17="B", D1713="B", D172444="B", D2="C", D3="C", D71="D", D72="D", D73="D", D74="D")[rescaled.gse81076$Donor] table(adj.donors) all.donors <- unique(adj.donors) by.donor.81076 <- vector("list", length(all.donors)) names(by.donor.81076) <- sort(all.donors) for (x in all.donors) { by.donor.81076[[x]] <- rescaled.gse81076[,adj.donors==x] } ## -------------------------------------------------------------------------- all.batches <- c(by.donor.85241, by.donor.81076) # Cosine normalizing for consistency with fastMNN() defaults. all.logcounts <- lapply(all.batches, logcounts) scaled <- lapply(all.logcounts, batchelor::cosineNorm) set.seed(1000) # for irlba. pc.all <- do.call(batchelor::multiBatchPCA, c(scaled, list(d=50, BSPARAM=IrlbaParam(deferred=TRUE)) )) names(pc.all) ## -------------------------------------------------------------------------- pcs.85241 <- pc.all[seq_along(by.donor.85241)] mnn.out.85241 <- do.call(batchelor::fastMNN, c(pcs.85241, list(pc.input=TRUE))) Rle(mnn.out.85241$batch) ## -------------------------------------------------------------------------- pcs.81076 <- tail(pc.all, length(by.donor.81076)) mnn.out.81076 <- do.call(batchelor::fastMNN, c(pcs.81076, list(pc.input=TRUE))) Rle(mnn.out.81076$batch) ## -------------------------------------------------------------------------- mnn.out3 <- batchelor::fastMNN( GSE81076=mnn.out.81076, GSE85241=mnn.out.85241, pc.input=TRUE, k=100 # see comments below. ) Rle(mnn.out3$batch) # by dataset Rle(c(mnn.out.81076$batch, mnn.out.85241$batch)) # by donor ## ----tsne-hmerge, fig.width=10, fig.asp=0.5, fig.cap="t-SNE plots after correcting for donor effects within each data set, and after correcting for batch effects between data sets (final). Each point represents a cell that is coloured according to its donor of origin (left, middle) or the data set of origin (right)."---- set.seed(1000) par(mfrow=c(1,3)) library(Rtsne) tout.85241 <- Rtsne(mnn.out.85241$corrected, pca=FALSE) plot(tout.85241$Y[,1], tout.85241$Y[,2], main="GSE85241 donors", col=as.factor(mnn.out.85241$batch), xlab="tSNE1", ylab="tSNE2") tout.81076 <- Rtsne(mnn.out.81076$corrected, pca=FALSE) plot(tout.81076$Y[,1], tout.81076$Y[,2], main="GSE81076 donors", col=as.factor(mnn.out.81076$batch), xlab="tSNE1", ylab="tSNE2") tout.all <- Rtsne(mnn.out3$corrected, pca=FALSE) plot(tout.all$Y[,1], tout.all$Y[,2], main="Final", col=as.factor(mnn.out3$batch), xlab="tSNE1", ylab="tSNE2") ## -------------------------------------------------------------------------- original.plate <- unlist(lapply(rescaled, "[[", i="Plate")) original.names <- unlist(lapply(rescaled, colnames)) # Needs unique names: trigger error otherwise. stopifnot(anyDuplicated(original.names)==0L) m <- match(rownames(mnn.out3$corrected), original.names) new.plate <- original.plate[m] ## ---- echo=FALSE, results="hide"------------------------------------------- gc() ## -------------------------------------------------------------------------- mnn.out.auto <- do.call(batchelor::fastMNN, c(pcs.81076, list(pc.input=TRUE, auto.order=TRUE))) names(by.donor.81076) # supplied order metadata(mnn.out.auto)$merge.order # automatically defined order ## -------------------------------------------------------------------------- Rle(mnn.out.auto$batch) ## ---- echo=FALSE, results="hide"------------------------------------------- gc() ## -------------------------------------------------------------------------- snn.gr <- buildSNNGraph(sce, use.dimred="corrected") clusters <- igraph::cluster_walktrap(snn.gr) table(clusters$membership, sce$batch) ## ----tsne-cluster, fig.cap="t-SNE plot after MMN correction, where each point represents a cell and is coloured by its cluster identity."---- sce$Cluster <- factor(clusters$membership) plotTSNE(sce, colour_by="Cluster") ## -------------------------------------------------------------------------- m.out <- findMarkers(sce, clusters$membership, block=sce$batch, direction="up", assay.type="original") demo <- m.out[["10"]] # probably alpha cells. demo <- demo[demo$Top <= 5,] as.data.frame(demo[,1:3]) # only first three columns for brevity. ## ---- echo=FALSE, results="hide"------------------------------------------- # Checking that cluster 10 is what we think it is. stopifnot(all(sapply(demo["GCG",-(1:3)], sign)==1)) ## -------------------------------------------------------------------------- assay(sce, "reconstructed") summary(assay(sce)["INS",]) ## -------------------------------------------------------------------------- rotations <- metadata(pc.all)$rotation cor.exp <- tcrossprod(mnn.out3$corrected, rotations["ENSG00000115263",,drop=FALSE]) summary(cor.exp) ## -------------------------------------------------------------------------- saveRDS(file="pancreas_data.rds", sce) ## -------------------------------------------------------------------------- sessionInfo()