## ----nanotubes----------------------------------------------------------- library(nanotubes) ## ----packages, results='hide', message=FALSE, warning=FALSE-------------- # CRAN packages for data manipulation and plotting library(knitr) library(pheatmap) library(ggseqlogo) library(viridis) library(magrittr) library(ggforce) library(ggthemes) library(tidyverse) # CAGEfightR and related packages library(CAGEfightR) library(GenomicRanges) library(SummarizedExperiment) library(GenomicFeatures) library(BiocParallel) library(InteractionSet) library(Gviz) # Bioconductor packages for differential expression library(DESeq2) library(limma) library(edgeR) library(sva) # Bioconductor packages for enrichment analyses library(TFBSTools) library(motifmatchr) library(pathview) # Bioconductor data packages library(BSgenome.Mmusculus.UCSC.mm9) library(TxDb.Mmusculus.UCSC.mm9.knownGene) library(org.Mm.eg.db) library(JASPAR2016) ## ----scriptwise, results='hide', message=FALSE, warning=FALSE------------ # Rename these for easier access bsg <- BSgenome.Mmusculus.UCSC.mm9 txdb <- TxDb.Mmusculus.UCSC.mm9.knownGene odb <- org.Mm.eg.db # Script wide settings register(MulticoreParam(3)) # Parallel execution when possible theme_set(theme_light()) # White theme for ggplot2 figures ## ----studyDesign--------------------------------------------------------- data(nanotubes) kable(nanotubes, caption = "The initial design matrix for the nanotubes experiment") ## ----fnames-------------------------------------------------------------- # Setup paths to file on hard drive bw_plus <- system.file("extdata", nanotubes$BigWigPlus, package = "nanotubes", mustWork = TRUE) bw_minus <- system.file("extdata", nanotubes$BigWigMinus, package = "nanotubes", mustWork = TRUE) # Save as named BigWigFileList bw_plus <- BigWigFileList(bw_plus) bw_minus <- BigWigFileList(bw_minus) names(bw_plus) <- names(bw_minus) <- nanotubes$Name ## ----quantifyCTSSs------------------------------------------------------- CTSSs <- quantifyCTSSs(plusStrand = bw_plus, minusStrand = bw_minus, genome = seqinfo(bsg), design = nanotubes) ## ----inspectCTSSs-------------------------------------------------------- # Get a summary CTSSs # Extract CTSS positions rowRanges(CTSSs) # Extract CTSS counts assay(CTSSs, "counts") %>% head ## ----pooled-------------------------------------------------------------- CTSSs <- CTSSs %>% calcTPM() %>% calcPooled() ## ----tagClustering------------------------------------------------------- TCs <- quickTSSs(CTSSs) ## ----TCfiltering--------------------------------------------------------- TSSs <- TCs %>% calcTPM() %>% subsetBySupport(inputAssay="TPM", unexpressed=1, minSamples=4) ## ----bidirClustering----------------------------------------------------- BCs <- quickEnhancers(CTSSs) ## ----BCfiltering--------------------------------------------------------- BCs <- subsetBySupport(BCs, inputAssay="counts", unexpressed=0, minSamples=4) ## ----TSSannotation------------------------------------------------------- # Annotate with transcript IDs TSSs <- assignTxID(TSSs, txModels = txdb, swap="thick") # Annotate with transcript context TSSs <- assignTxType(TSSs, txModels = txdb, swap="thick") ## ----BCannotation-------------------------------------------------------- # Annotate with transcript context BCs <- assignTxType(BCs, txModels = txdb, swap="thick") # Keep only non-exonic BCs as enhancer candidates Enhancers <- subset(BCs, txType %in% c("intergenic", "intron")) ## ----cleanClusters------------------------------------------------------- # Clean colData TSSs$totalTags <- NULL Enhancers$totalTags <- NULL # Clean rowData rowData(TSSs)$balance <- NA rowData(TSSs)$bidirectionality <- NA rowData(Enhancers)$txID <- NA # Add labels for making later retrieval easy rowData(TSSs)$clusterType <- "TSS" rowData(Enhancers)$clusterType <- "Enhancer" ## ----combineClusters----------------------------------------------------- RSE <- combineClusters(object1=TSSs, object2 = Enhancers, removeIfOverlapping="object1") ## ----finalTPM------------------------------------------------------------ RSE <- calcTPM(RSE) ## ----genomeTracks-------------------------------------------------------- # Genome track axis_track <- GenomeAxisTrack() # Annotation track tx_track <- GeneRegionTrack(txdb, name = "Gene Models", col = NA, fill = "bisque4", shape = "arrow", showId = TRUE) ## ----simpleTSS, fig.width=5, fig.height=5, fig.cap='Genome browser example of TSS candidate'---- # Extract 100 bp around the first TSS. plot_region <- RSE %>% rowRanges %>% subset(clusterType == "TSS") %>% .[1] %>% add(100) %>% unstrand() # CTSSs track ctss_track <- CTSSs %>% rowRanges %>% subsetByOverlaps(plot_region) %>% trackCTSS(name = "CTSSs") # Cluster track cluster_track <- RSE %>% subsetByOverlaps(plot_region) %>% trackClusters(name = "Clusters", col = NA, showId=TRUE) # Plot at tracks together plotTracks(list(axis_track, ctss_track, cluster_track, tx_track), from = start(plot_region), to=end(plot_region), chromosome = seqnames(plot_region)) ## ----simpleEnhancer, fig.width=5, fig.height=5, fig.cap='Genome browser example of enhancer candidate'---- # Make plotting region plot_region <- RSE %>% rowRanges %>% subset(clusterType == "Enhancer") %>% .[1] %>% add(100) %>% unstrand() # CTSSs track ctss_track <- CTSSs %>% rowRanges %>% subsetByOverlaps(plot_region) %>% trackCTSS(name = "CTSSs") # Cluster track cluster_track <- RSE %>% rowRanges %>% subsetByOverlaps(plot_region) %>% trackClusters(name = "Clusters", col = NA, showId=TRUE) # Plot at tracks together plotTracks(list(axis_track, ctss_track, cluster_track, tx_track), from = start(plot_region), to=end(plot_region), chromosome = as.character(seqnames(plot_region))) ## ----simplifyTxTypes----------------------------------------------------- cluster_info <- RSE %>% rowData() %>% as.data.frame() ## ----plotTxTypes1, fig.width=5, fig.height=3, fig.cap="Number of clusters within each annotation category"---- # Number of clusters ggplot(cluster_info, aes(x=txType, fill=clusterType)) + geom_bar(alpha=0.75, position="dodge", color="black") + scale_fill_colorblind("Cluster type") + labs(x="Cluster annotation", y="Frequency") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ## ----plotTxTypes2, fig.width=5, fig.height=3, fig.cap="Expression of clusters within each annotation category"---- # Expression of clusters ggplot(cluster_info, aes(x=txType, y=log2(score/ncol(RSE)), fill=clusterType)) + geom_violin(alpha=0.75, draw_quantiles = c(0.25, 0.50, 0.75)) + scale_fill_colorblind("Cluster type") + labs(x="Cluster annotation", y="log2(TPM)") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ## ----highTSSs------------------------------------------------------------ # Select highly expressed TSSs highTSSs <- subset(RSE, clusterType == 'TSS' & score / ncol(RSE) >= 10) # Calculate IQR as 10%-90% interval highTSSs <- calcShape(highTSSs, pooled=CTSSs, shapeFunction=shapeIQR, lower = 0.10, upper = 0.90) ## ----TSSShapes, fig.width=5, fig.height=3, fig.cap="Bimodal distribution of Interquartile Ranges (IQRs) of highly expressed TSSs"---- highTSSs %>% rowData %>% as.data.frame %>% ggplot(aes(x=IQR)) + geom_histogram(binwidth=1, fill="hotpink", alpha=0.75) + geom_vline(xintercept = 10, linetype="dashed", alpha=0.75, color="black") + facet_zoom(xlim = c(0,100)) + labs(x="10-90% IQR", y="Frequency") ## ----classifyShapes------------------------------------------------------ # Divide into groups rowData(highTSSs)$shape <- ifelse(rowData(highTSSs)$IQR < 10, "Sharp", "Broad") # Count group sizes table(rowData(highTSSs)$shape) ## ----BSGenome------------------------------------------------------------ promoter_seqs <- highTSSs %>% rowRanges() %>% swapRanges() %>% promoters(upstream=40, downstream=10) %>% getSeq(bsg, .) ## ----ggseqlogo, fig.width=5, fig.height=2.5, fig.cap='Sequence logos of core promoter regions of Sharp and Broad classes of TSSs'---- promoter_seqs %>% as.character %>% split(rowData(highTSSs)$shape) %>% ggseqlogo(data=., ncol=2, nrow=1) + theme_logo() + theme(axis.title.x=element_blank(), axis.text.x=element_blank(), axis.ticks.x=element_blank()) ## ----orientation--------------------------------------------------------- rowData(RSE)$clusterType <- RSE %>% rowData() %>% use_series("clusterType") %>% as_factor() %>% fct_relevel("TSS") ## ----findLinks----------------------------------------------------------- all_links <- RSE %>% swapRanges %>% findLinks(maxDist = 5e4L, directional="clusterType", inputAssay="TPM", method="kendall") all_links ## ----linkCorrelations---------------------------------------------------- # Subset to only positive correlation cor_links <- subset(all_links, estimate > 0) # Sort based on correlation cor_links <- cor_links[order(cor_links$estimate, decreasing = TRUE)] ## ----browseLinks, fig.width=5, fig.height=5, fig.cap="Genome browser example of TSS-enhancer link candidates"---- # Make plotting region plot_region <- cor_links[1] %>% anchors %>% GRangesList() %>% unlist() %>% reduce(ignore.strand=TRUE, min.gapwidth=1e5) %>% add(1000) # Cluster track cluster_track <- RSE %>% subsetByOverlaps(plot_region) %>% trackClusters(name = "Clusters", col = NA, showId=TRUE) # Cluster track link_track <- cor_links %>% subsetByOverlaps(plot_region) %>% trackLinks(name="Links", interaction.measure = "p.value", interaction.dimension.transform = "log", col.outside="grey", plot.anchors=FALSE, col.interactions="black") # Plot at tracks together plotTracks(list(axis_track, link_track, cluster_track, tx_track), from = start(plot_region), to=end(plot_region), chromosome = as.character(seqnames(plot_region))) ## ----findStretches------------------------------------------------------- # Subset to only enhancers Enhancers <- subset(RSE, clusterType == "Enhancer") # Find stretches stretches <- findStretches(Enhancers, inputAssay = "TPM", mergeDist = 12500L, minSize = 5, method = "kendall") ## ----stretchTypes-------------------------------------------------------- # Annotate stretches <- assignTxType(stretches, txModels=txdb) # Sort by correlation stretches <- stretches[order(stretches$aveCor, decreasing=TRUE)] # Inspect stretches ## ----browseStretches, fig.width=5, fig.height=5, fig.cap="Genome browser example of enhancer stretch"---- # Make plotting region plot_region <- stretches["chr17:26666593-26675486"] + 1000 # Cluster track cluster_track <- RSE %>% subsetByOverlaps(plot_region) %>% trackClusters(name = "Clusters", col = NA, showId=TRUE) # CTSS track ctss_track <- CTSSs %>% subsetByOverlaps(plot_region) %>% trackCTSS(name="CTSSs") # SE track stretch_track <- stretches %>% subsetByOverlaps(plot_region) %>% AnnotationTrack(name="Stretches", fill="hotpink", col=NULL) # Plot at tracks together plotTracks(list(axis_track, stretch_track, cluster_track, ctss_track), from = start(plot_region), to=end(plot_region), chromosome = as.character(seqnames(plot_region))) ## ----vstBlind------------------------------------------------------------ # Create DESeq2 object with blank design dds_blind <- DESeqDataSet(RSE, design = ~ 1) # Normalize and log transform vst_blind <- vst(dds_blind, blind = TRUE) ## ----PCA, fig.width=4, fig.height=4, fig.cap="PCA-plot of variance stabilized expression."---- plotPCA(vst_blind, "Class") ## ----batchVar------------------------------------------------------------ # Extract pca results pca_res <- plotPCA(vst_blind, "Class", returnData=TRUE) # Define a new variable using PC1 batch_var <- ifelse(pca_res$PC1 > 0, "A", "B") # Attach the batch variable as a factor to the experiment RSE$Batch <- factor(batch_var) # Show the new design RSE %>% colData() %>% subset(select=c(Class, Batch)) %>% kable(caption = "Design matrix after adding new batch covariate.") ## ----finalDesign--------------------------------------------------------- # Specify design dds <- DESeqDataSet(RSE, design = ~ Batch + Class) # Fit DESeq2 model dds <- DESeq(dds) ## ----results------------------------------------------------------------- # Extract results res <- results(dds, contrast=c("Class", "Nano", "Ctrl"), alpha=0.05, independentFiltering=TRUE, tidy = TRUE) %>% bind_cols(as.data.frame(rowData(RSE))) %>% as_tibble # Show the top hits res %>% top_n(-10, padj) %>% dplyr::select(cluster=row, clusterType, txType, baseMean, log2FoldChange, padj) %>% kable(caption = "Top differentially expressed TSS and enhancer candidates") ## ----diagnosticPlot, fig.width=5, fig.height=5, fig.cap="Diagnostic MA plot of the differential expression analysis"---- ggplot(res, aes(x=log2(baseMean), y=log2FoldChange, color=padj < 0.05)) + geom_point(alpha=0.25) + geom_hline(yintercept = 0, linetype="dashed", alpha=0.75) + facet_grid(clusterType~.) ## ----DEtable------------------------------------------------------------- table(clusterType=rowRanges(RSE)$clusterType, DE=res$padj<0.05) ## ----vstGuided----------------------------------------------------------- # Guided variance stabilizing transformation vst_guided <- varianceStabilizingTransformation(dds, blind=FALSE) ## ----ComBat-------------------------------------------------------------- # Design matrix of wanted effects bio_effects <- model.matrix(~Class, data=colData(RSE)) # Run ComBat = assay(RSE, "ComBat") <- ComBat(dat=assay(vst_guided), batch=RSE$Batch, # Unwanted batch mod=bio_effects) ## ----correctedPCA, fig.width=4, fig.height=4, fig.cap='PCA-plot of batch corrected expression.'---- # Overwrite assay assay(vst_guided) <- assay(RSE, "ComBat") # Plot as before plotPCA(vst_guided, "Class") ## ----findTop10----------------------------------------------------------- # Find top 10 DE enhancers top10 <- res %>% filter(clusterType == "Enhancer", padj < 0.05) %>% group_by(log2FoldChange >= 0) %>% top_n(5, wt=abs(log2FoldChange)) %>% pull(row) # Extract expression values in tidy format tidyEnhancers <- assay(RSE, "ComBat")[top10,] %>% t %>% as.data.frame %>% rownames_to_column("Sample") %>% mutate(Class=RSE$Class) %>% gather(key="Enhancer", value="Expression", -Sample, -Class, factor_key=TRUE) ## ----ploTop10, fig.width=6, fig.height=5, fig.cap="Expression profile of top 10 differentially expressed enhancer candidates."---- ggplot(tidyEnhancers, aes(x=Class, y=Expression, fill=Class)) + geom_dotplot(stackdir="center", binaxis="y", dotsize=3) + facet_wrap(~Enhancer, ncol=2, scales="free_y") ## ----promoter_seqs------------------------------------------------------- cluster_seqs <- RSE %>% rowRanges %>% swapRanges() %>% unstrand() %>% add(500) %>% getSeq(bsg, .) ## ----TFBStools----------------------------------------------------------- # Extract motifs as Position motif_pfms <- getMatrixSet(JASPAR2016, opts = list(species="10090")) # Look at the IDs and names of the first few motifs: head(name(motif_pfms)) ## ----motifmatch---------------------------------------------------------- # Find matches motif_hits <- matchMotifs(motif_pfms, subject=cluster_seqs) # Matches are returned as a sparse matrix: motifMatches(motif_hits)[1:5, 1:5] ## ----fishers------------------------------------------------------------- # 2x2 table for fishers table(FOSJUN = motifMatches(motif_hits)[,"MA0099.2"], DE = res$padj < 0.05) %>% print() %>% fisher.test() ## ----assignGeneIDs------------------------------------------------------- RSE <- assignGeneID(RSE, geneModels=txdb) ## ----quantifyGenes------------------------------------------------------- GSE <- RSE %>% subset(clusterType == "TSS") %>% quantifyGenes(genes="geneID", inputAssay="counts") ## ----geneLevelExamples--------------------------------------------------- rowRanges(GSE["100038347",]) ## ----symbols------------------------------------------------------------- # Translate symbols rowData(GSE)$symbol <- mapIds(odb, keys=rownames(GSE), column="SYMBOL", keytype="ENTREZID") ## ----limmaNorm----------------------------------------------------------- # Create DGElist object dge <- DGEList(counts=assay(GSE, "counts"), genes=as.data.frame(rowData(GSE))) # Calculate normalization factors dge <- calcNormFactors(dge) ## ----limmaVoom----------------------------------------------------------- # Design matrix mod <- model.matrix(~ Batch + Class, data = colData(GSE)) # Model mean-variance using voom v <- voom(dge, design=mod) # Fit and shrink DE model fit <- lmFit(v, design=mod) eb <- eBayes(fit, robust=TRUE) # Summarize the results dt <- decideTests(eb) ## ----limmaSummary-------------------------------------------------------- # Global summary dt %>% summary %>% kable(caption="Global summary of differentially expressed genes.") ## ----limmaTop------------------------------------------------------------ # Inspect top htis topTable(eb, coef="ClassNano") %>% dplyr::select(symbol, nClusters, AveExpr, logFC, adj.P.Val) %>% kable(caption="Top differentially expressed genes.") ## ----goanna-------------------------------------------------------------- # Find enriched GO-terms GO <- goana(eb, coef = "ClassNano", species = "Mm", trend = TRUE) # Show top hits topGO(GO, ontology = "BP", number = 10) %>% kable(caption="Top enriched or depleted GO-terms.") ## ----kegga--------------------------------------------------------------- # Find enriched KEGG-terms KEGG <- kegga(eb, coef="ClassNano", species = "Mm", trend = TRUE) # Show top hits topKEGG(KEGG, number = 10) %>% knitr::kable(caption="Top enriched of depleted KEGG-terms.") ## ----pathview, message = FALSE, fig.width=6, fig.height=6, fig.cap="Detailed view of differentially expressed gene in the KEGG TNF-signalling pathway."---- # Visualize a KEGG DE_genes <- Filter(function(x) x != 0, dt[, "ClassNano"]) # This will save a png file to a temporary directory pathview(DE_genes, species="mmu", pathway.id="mmu04668", kegg.dir = tempdir()) # Show the png file grid.newpage() grid.raster(png::readPNG("mmu04668.pathview.png")) ## ----subsetByComposition------------------------------------------------- # Filter away lowly expressed RSE_filtered <- RSE %>% subset(clusterType == "TSS" & !is.na(geneID)) %>% subsetByComposition(inputAssay="counts", genes="geneID", unexpressed=0.1, minSamples=5) ## ----TSSstructure, fig.width=5, fig.height=2.5, fig.cap="Overview of alternative TSS usage within genes."---- RSE_filtered %>% rowData %>% as.data.frame %>% as_tibble %>% dplyr::count(geneID) %>% ggplot(aes(x = n, fill = n >= 2)) + geom_bar(alpha=0.75) + scale_fill_colorblind("Multi-TSS") + labs(x = "Number of TSSs per gene", y = "Frequency") ## ----edgeRNorm----------------------------------------------------------- # Annotate with symbols like before: rowData(RSE_filtered)$symbol <- mapIds(odb, keys=rowData(RSE_filtered)$geneID, column="SYMBOL", keytype="ENTREZID") # Extract gene info TSS_info <- RSE_filtered %>% rowData %>% subset(select=c(score, txType, geneID, symbol)) %>% as.data.frame # Build DGEList dge <- DGEList(counts=assay(RSE_filtered, "counts"), genes=TSS_info) ## ----diffSpliceDGE------------------------------------------------------- # Estimate normalization factors dge <- calcNormFactors(dge) # Estimate dispersion and fit GLMs disp <- estimateDisp(dge, design = mod, tagwise = FALSE) QLfit <- glmQLFit(disp, design=mod, robust = TRUE) # Apply diffSpliceDGE ds <- diffSpliceDGE(QLfit, coef = "ClassNano", geneid = "geneID") ## ----dtuTSS-------------------------------------------------------------- dtu_TSSs <- topSpliceDGE(ds, test = "exon") dplyr::select(dtu_TSSs, txType, geneID, symbol, logFC, FDR) %>% kable(caption = "Top differentially used TSSs") ## ----dtuGene------------------------------------------------------------- dtu_genes <- topSpliceDGE(ds, test = "Simes") dplyr::select(dtu_genes, geneID, symbol, NExons, FDR) %>% kable(row.names = FALSE, caption = "Top genes showing any differential TSS usage.") ## ----heatmap, fig.width=3, fig.height=4, fig.cap="Heatmap showing expression of TSSs within Fblim1"---- RSE_filtered %>% subset(geneID == "74202") %>% assay("ComBat") %>% t %>% pheatmap(color = magma(100), cluster_cols = FALSE, main="Fblim1") ## ----dtubrowser, fig.width=5, fig.height=5, fig.cap="Genome-browser example of differential TSS usage within Fblim1"---- # Define plot area plot_region <- subset(RSE_filtered, geneID == "74202") %>% rowRanges %>% reduce(min.gapwidth=1e6) %>% unstrand() %>% add(5e3L) # Create cluster track cluster_track <- subsetByOverlaps(RSE_filtered, plot_region) %>% trackClusters(name = "Clusters", col = NA, showId=TRUE) # CTSS tracks for each group ctrl_track <- subset(CTSSs, select=Class == "Ctrl") %>% calcPooled() %>% subsetByOverlaps(plot_region) %>% trackCTSS(name="Ctrl") nano_track <- subset(CTSSs, select=Class == "Nano") %>% calcPooled() %>% subsetByOverlaps(plot_region) %>% trackCTSS(name="Nano") # Plot at tracks together plotTracks(list(axis_track, tx_track, cluster_track, Ctrl=ctrl_track, nano_track), from = start(plot_region), to=end(plot_region), chromosome = seqnames(plot_region))