## ----nanotubes-----------------------------------------------------------
library(nanotubes)

## ----packages, results='hide', message=FALSE, warning=FALSE--------------
# CRAN packages for data manipulation and plotting
library(knitr)
library(pheatmap)
library(ggseqlogo)
library(viridis)
library(magrittr)
library(ggforce)
library(ggthemes)
library(tidyverse)

# CAGEfightR and related packages
library(CAGEfightR)
library(GenomicRanges)
library(SummarizedExperiment)
library(GenomicFeatures)
library(BiocParallel)
library(InteractionSet)
library(Gviz)

# Bioconductor packages for differential expression
library(DESeq2)
library(limma)
library(edgeR)
library(sva)

# Bioconductor packages for enrichment analyses
library(TFBSTools)
library(motifmatchr)
library(pathview)

# Bioconductor data packages
library(BSgenome.Mmusculus.UCSC.mm9)
library(TxDb.Mmusculus.UCSC.mm9.knownGene)
library(org.Mm.eg.db)
library(JASPAR2016)

## ----scriptwise, results='hide', message=FALSE, warning=FALSE------------
# Rename these for easier access
bsg <- BSgenome.Mmusculus.UCSC.mm9
txdb <- TxDb.Mmusculus.UCSC.mm9.knownGene
odb <- org.Mm.eg.db

# Script wide settings
register(MulticoreParam(3)) # Parallel execution when possible
theme_set(theme_light()) # White theme for ggplot2 figures

## ----studyDesign---------------------------------------------------------
data(nanotubes)
kable(nanotubes, 
      caption = "The initial design matrix for the nanotubes experiment")

## ----fnames--------------------------------------------------------------
# Setup paths to file on hard drive
bw_plus <- system.file("extdata", nanotubes$BigWigPlus, 
                        package = "nanotubes", 
                        mustWork = TRUE)
bw_minus <- system.file("extdata", nanotubes$BigWigMinus, 
                        package = "nanotubes", 
                        mustWork = TRUE)

# Save as named BigWigFileList
bw_plus <- BigWigFileList(bw_plus)
bw_minus <- BigWigFileList(bw_minus)
names(bw_plus) <- names(bw_minus) <- nanotubes$Name

## ----quantifyCTSSs-------------------------------------------------------
CTSSs <- quantifyCTSSs(plusStrand = bw_plus,
                       minusStrand = bw_minus,
                       genome = seqinfo(bsg),
                       design = nanotubes)

## ----inspectCTSSs--------------------------------------------------------
# Get a summary
CTSSs

# Extract CTSS positions
rowRanges(CTSSs)

# Extract CTSS counts
assay(CTSSs, "counts") %>%
    head

## ----pooled--------------------------------------------------------------
CTSSs <- CTSSs %>%
    calcTPM() %>%
    calcPooled()

## ----tagClustering-------------------------------------------------------
TCs <- quickTSSs(CTSSs)

## ----TCfiltering---------------------------------------------------------
TSSs <- TCs %>%
    calcTPM() %>%
	subsetBySupport(inputAssay="TPM", 
	                unexpressed=1, 
	                minSamples=4)

## ----bidirClustering-----------------------------------------------------
BCs <- quickEnhancers(CTSSs)

## ----BCfiltering---------------------------------------------------------
BCs <- subsetBySupport(BCs, inputAssay="counts", unexpressed=0, minSamples=4)

## ----TSSannotation-------------------------------------------------------
# Annotate with transcript IDs
TSSs <- assignTxID(TSSs, txModels = txdb, swap="thick")

# Annotate with transcript context
TSSs <- assignTxType(TSSs, txModels = txdb, swap="thick")

## ----BCannotation--------------------------------------------------------
# Annotate with transcript context
BCs <- assignTxType(BCs, txModels = txdb, swap="thick")

# Keep only non-exonic BCs as enhancer candidates
Enhancers <- subset(BCs, txType %in% c("intergenic", "intron"))

## ----cleanClusters-------------------------------------------------------
# Clean colData
TSSs$totalTags <- NULL
Enhancers$totalTags <- NULL

# Clean rowData
rowData(TSSs)$balance <- NA
rowData(TSSs)$bidirectionality <- NA
rowData(Enhancers)$txID <- NA

# Add labels for making later retrieval easy
rowData(TSSs)$clusterType <- "TSS"
rowData(Enhancers)$clusterType <- "Enhancer"

## ----combineClusters-----------------------------------------------------
RSE <- combineClusters(object1=TSSs, 
                       object2 = Enhancers, 
                       removeIfOverlapping="object1")

## ----finalTPM------------------------------------------------------------
RSE <- calcTPM(RSE)

## ----genomeTracks--------------------------------------------------------
# Genome track
axis_track <- GenomeAxisTrack()

# Annotation track
tx_track <- GeneRegionTrack(txdb, 
                            name = "Gene Models", 
                            col = NA,
                            fill = "bisque4", 
                            shape = "arrow", 
                            showId = TRUE)

## ----simpleTSS, fig.width=5, fig.height=5, fig.cap='Genome browser example of TSS candidate'----
# Extract 100 bp around the first TSS.
plot_region <- RSE %>% 
    rowRanges %>% 
    subset(clusterType == "TSS") %>% 
    .[1] %>%
    add(100) %>%
    unstrand()

# CTSSs track
ctss_track <- CTSSs %>%
    rowRanges %>%
    subsetByOverlaps(plot_region) %>%
    trackCTSS(name = "CTSSs")

# Cluster track
cluster_track <- RSE %>%
    subsetByOverlaps(plot_region) %>%
    trackClusters(name = "Clusters", 
                  col = NA, 
                  showId=TRUE)

# Plot at tracks together
plotTracks(list(axis_track, 
                ctss_track,
                cluster_track,
                tx_track),
           from = start(plot_region), 
           to=end(plot_region), 
           chromosome = seqnames(plot_region))

## ----simpleEnhancer, fig.width=5, fig.height=5, fig.cap='Genome browser example of enhancer candidate'----
# Make plotting region
plot_region <- RSE %>% 
    rowRanges %>% 
    subset(clusterType == "Enhancer") %>% 
    .[1] %>%
    add(100) %>%
    unstrand()

# CTSSs track
ctss_track <- CTSSs %>%
    rowRanges %>%
    subsetByOverlaps(plot_region) %>%
    trackCTSS(name = "CTSSs")

# Cluster track
cluster_track <- RSE %>%
    rowRanges %>%
    subsetByOverlaps(plot_region) %>%
    trackClusters(name = "Clusters", 
                  col = NA, 
                  showId=TRUE)

# Plot at tracks together
plotTracks(list(axis_track, 
                ctss_track,
                cluster_track,
                tx_track),
           from = start(plot_region), 
           to=end(plot_region), 
           chromosome = as.character(seqnames(plot_region)))

## ----simplifyTxTypes-----------------------------------------------------
cluster_info <- RSE %>%
    rowData() %>%
    as.data.frame()

## ----plotTxTypes1, fig.width=5, fig.height=3, fig.cap="Number of clusters within each annotation category"----
# Number of clusters
ggplot(cluster_info, aes(x=txType, fill=clusterType)) +
	geom_bar(alpha=0.75, position="dodge", color="black") +
	scale_fill_colorblind("Cluster type") +
	labs(x="Cluster annotation", y="Frequency") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

## ----plotTxTypes2, fig.width=5, fig.height=3, fig.cap="Expression of clusters within each annotation category"----
# Expression of clusters
ggplot(cluster_info, aes(x=txType, 
                         y=log2(score/ncol(RSE)), 
                         fill=clusterType)) +
	geom_violin(alpha=0.75, draw_quantiles = c(0.25, 0.50, 0.75)) +
	scale_fill_colorblind("Cluster type") +
	labs(x="Cluster annotation", y="log2(TPM)") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))

## ----highTSSs------------------------------------------------------------
# Select highly expressed TSSs
highTSSs <- subset(RSE, clusterType == 'TSS' & score / ncol(RSE) >= 10)

# Calculate IQR as 10%-90% interval 
highTSSs <- calcShape(highTSSs, 
                      pooled=CTSSs, 
                      shapeFunction=shapeIQR, 
                      lower = 0.10, 
                      upper = 0.90)

## ----TSSShapes, fig.width=5, fig.height=3, fig.cap="Bimodal distribution of Interquartile Ranges (IQRs) of highly expressed TSSs"----
highTSSs %>%
	rowData %>%
	as.data.frame %>%
	ggplot(aes(x=IQR)) +
	geom_histogram(binwidth=1, fill="hotpink", alpha=0.75) +
	geom_vline(xintercept = 10, linetype="dashed", alpha=0.75, color="black") +
    facet_zoom(xlim = c(0,100)) +
	labs(x="10-90% IQR", y="Frequency")

## ----classifyShapes------------------------------------------------------
# Divide into groups
rowData(highTSSs)$shape <- ifelse(rowData(highTSSs)$IQR < 10, "Sharp", "Broad")

# Count group sizes
table(rowData(highTSSs)$shape)

## ----BSGenome------------------------------------------------------------
promoter_seqs <- highTSSs %>%
	rowRanges() %>%
	swapRanges() %>%
	promoters(upstream=40, downstream=10) %>%
	getSeq(bsg, .)

## ----ggseqlogo, fig.width=5, fig.height=2.5, fig.cap='Sequence logos of core promoter regions of Sharp and Broad classes of TSSs'----
promoter_seqs %>%
	as.character %>%
	split(rowData(highTSSs)$shape) %>%
	ggseqlogo(data=., ncol=2, nrow=1) +
	theme_logo() +
	theme(axis.title.x=element_blank(),
	      axis.text.x=element_blank(),
	      axis.ticks.x=element_blank())

## ----orientation---------------------------------------------------------
rowData(RSE)$clusterType <- RSE %>%
    rowData() %>%
    use_series("clusterType") %>%
    as_factor() %>%
    fct_relevel("TSS")

## ----findLinks-----------------------------------------------------------
all_links <- RSE %>%
    swapRanges %>%
    findLinks(maxDist = 5e4L,
              directional="clusterType",
              inputAssay="TPM",
              method="kendall")
all_links

## ----linkCorrelations----------------------------------------------------
# Subset to only positive correlation
cor_links <- subset(all_links, estimate > 0)

# Sort based on correlation
cor_links <- cor_links[order(cor_links$estimate, decreasing = TRUE)]

## ----browseLinks, fig.width=5, fig.height=5, fig.cap="Genome browser example of TSS-enhancer link candidates"----
# Make plotting region
plot_region <- cor_links[1] %>% 
    anchors %>% 
    GRangesList() %>% 
    unlist() %>% 
    reduce(ignore.strand=TRUE, 
           min.gapwidth=1e5) %>%
    add(1000)

# Cluster track
cluster_track <- RSE %>%
    subsetByOverlaps(plot_region) %>%
    trackClusters(name = "Clusters", 
                  col = NA, 
                  showId=TRUE)


# Cluster track
link_track <- cor_links %>%
    subsetByOverlaps(plot_region) %>%
    trackLinks(name="Links",
               interaction.measure = "p.value",
               interaction.dimension.transform = "log",
               col.outside="grey",
               plot.anchors=FALSE,
               col.interactions="black")

# Plot at tracks together
plotTracks(list(axis_track, 
                link_track,
                cluster_track,
                tx_track),
           from = start(plot_region), 
           to=end(plot_region), 
           chromosome = as.character(seqnames(plot_region)))

## ----findStretches-------------------------------------------------------
# Subset to only enhancers
Enhancers <- subset(RSE, clusterType == "Enhancer")

# Find stretches
stretches <- findStretches(Enhancers, 
                           inputAssay = "TPM",
                           mergeDist = 12500L,
                           minSize = 5,
                           method = "kendall")

## ----stretchTypes--------------------------------------------------------
# Annotate
stretches <- assignTxType(stretches, txModels=txdb)

# Sort by correlation
stretches <- stretches[order(stretches$aveCor, decreasing=TRUE)]

# Inspect
stretches

## ----browseStretches, fig.width=5, fig.height=5, fig.cap="Genome browser example of enhancer stretch"----
# Make plotting region
plot_region <- stretches["chr17:26666593-26675486"] + 1000

# Cluster track
cluster_track <- RSE %>%
    subsetByOverlaps(plot_region) %>%
    trackClusters(name = "Clusters", 
                  col = NA, 
                  showId=TRUE)

# CTSS track
ctss_track <- CTSSs %>%
    subsetByOverlaps(plot_region) %>%
    trackCTSS(name="CTSSs")

# SE track
stretch_track <- stretches %>%
    subsetByOverlaps(plot_region) %>%
    AnnotationTrack(name="Stretches", fill="hotpink", col=NULL)

# Plot at tracks together
plotTracks(list(axis_track, 
                stretch_track,
                cluster_track,
                ctss_track),
           from = start(plot_region), 
           to=end(plot_region), 
           chromosome = as.character(seqnames(plot_region)))

## ----vstBlind------------------------------------------------------------
# Create DESeq2 object with blank design
dds_blind <- DESeqDataSet(RSE, design = ~ 1)

# Normalize and log transform
vst_blind <- vst(dds_blind, blind = TRUE)

## ----PCA, fig.width=4, fig.height=4, fig.cap="PCA-plot of variance stabilized expression."----
plotPCA(vst_blind, "Class")

## ----batchVar------------------------------------------------------------
# Extract pca results
pca_res <- plotPCA(vst_blind, "Class", returnData=TRUE)

# Define a new variable using PC1
batch_var <- ifelse(pca_res$PC1 > 0, "A", "B")

# Attach the batch variable as a factor to the experiment
RSE$Batch <- factor(batch_var)

# Show the new design
RSE %>%
    colData() %>%
    subset(select=c(Class, Batch)) %>%
    kable(caption = "Design matrix after adding new batch covariate.")

## ----finalDesign---------------------------------------------------------
# Specify design
dds <- DESeqDataSet(RSE, design = ~ Batch + Class)

# Fit DESeq2 model
dds <- DESeq(dds)

## ----results-------------------------------------------------------------
# Extract results
res <- results(dds,
               contrast=c("Class", "Nano", "Ctrl"),
               alpha=0.05, 
               independentFiltering=TRUE, 
               tidy = TRUE) %>%
	bind_cols(as.data.frame(rowData(RSE))) %>%
	as_tibble

# Show the top hits
res %>%
    top_n(-10, padj) %>%
    dplyr::select(cluster=row, 
                  clusterType, 
                  txType, 
                  baseMean, 
                  log2FoldChange, 
                  padj) %>%
    kable(caption = "Top differentially expressed TSS and enhancer candidates")

## ----diagnosticPlot, fig.width=5, fig.height=5, fig.cap="Diagnostic MA plot of the differential expression analysis"----
ggplot(res, aes(x=log2(baseMean), y=log2FoldChange, color=padj < 0.05)) +
	geom_point(alpha=0.25) +
	geom_hline(yintercept = 0, linetype="dashed", alpha=0.75) +
	facet_grid(clusterType~.)

## ----DEtable-------------------------------------------------------------
table(clusterType=rowRanges(RSE)$clusterType, DE=res$padj<0.05)

## ----vstGuided-----------------------------------------------------------
# Guided variance stabilizing transformation
vst_guided <- varianceStabilizingTransformation(dds, blind=FALSE)


## ----ComBat--------------------------------------------------------------
# Design matrix of wanted effects
bio_effects <- model.matrix(~Class, data=colData(RSE))

# Run ComBat =
assay(RSE, "ComBat") <- ComBat(dat=assay(vst_guided),
                                    batch=RSE$Batch, # Unwanted batch
                                    mod=bio_effects)


## ----correctedPCA, fig.width=4, fig.height=4, fig.cap='PCA-plot of batch corrected expression.'----
# Overwrite assay 
assay(vst_guided) <- assay(RSE, "ComBat")

# Plot as before
plotPCA(vst_guided, "Class")

## ----findTop10-----------------------------------------------------------
# Find top 10 DE enhancers
top10 <- res %>%
	filter(clusterType == "Enhancer", padj < 0.05) %>%
	group_by(log2FoldChange >= 0) %>%
	top_n(5, wt=abs(log2FoldChange)) %>%
	pull(row)

# Extract expression values in tidy format
tidyEnhancers <- assay(RSE, "ComBat")[top10,] %>%
	t %>%
	as.data.frame %>%
	rownames_to_column("Sample") %>%
	mutate(Class=RSE$Class) %>%
	gather(key="Enhancer", 
	       value="Expression", 
	       -Sample, -Class, 
	       factor_key=TRUE)

## ----ploTop10, fig.width=6, fig.height=5, fig.cap="Expression profile of top 10 differentially expressed enhancer candidates."----
ggplot(tidyEnhancers, aes(x=Class, y=Expression, fill=Class)) +
	geom_dotplot(stackdir="center", binaxis="y", dotsize=3) +
	facet_wrap(~Enhancer, ncol=2, scales="free_y")

## ----promoter_seqs-------------------------------------------------------
cluster_seqs <- RSE %>% 
    rowRanges %>%
    swapRanges() %>%
    unstrand() %>%
    add(500) %>% 
    getSeq(bsg, .)

## ----TFBStools-----------------------------------------------------------
# Extract motifs as Position
motif_pfms <- getMatrixSet(JASPAR2016, opts = list(species="10090"))

# Look at the IDs and names of the first few motifs:
head(name(motif_pfms))

## ----motifmatch----------------------------------------------------------
# Find matches
motif_hits <- matchMotifs(motif_pfms, subject=cluster_seqs)

# Matches are returned as a sparse matrix:
motifMatches(motif_hits)[1:5, 1:5]

## ----fishers-------------------------------------------------------------
# 2x2 table for fishers
table(FOSJUN = motifMatches(motif_hits)[,"MA0099.2"],
      DE = res$padj < 0.05) %>%
    print() %>%
    fisher.test()

## ----assignGeneIDs-------------------------------------------------------
RSE <- assignGeneID(RSE, geneModels=txdb)

## ----quantifyGenes-------------------------------------------------------
GSE <- RSE %>%
    subset(clusterType == "TSS") %>%
    quantifyGenes(genes="geneID", inputAssay="counts")

## ----geneLevelExamples---------------------------------------------------
rowRanges(GSE["100038347",])

## ----symbols-------------------------------------------------------------
# Translate symbols
rowData(GSE)$symbol <- mapIds(odb, 
                              keys=rownames(GSE), 
                              column="SYMBOL", 
                              keytype="ENTREZID")

## ----limmaNorm-----------------------------------------------------------
# Create DGElist object
dge <- DGEList(counts=assay(GSE, "counts"),
               genes=as.data.frame(rowData(GSE)))

# Calculate normalization factors
dge <- calcNormFactors(dge)

## ----limmaVoom-----------------------------------------------------------
# Design matrix
mod <- model.matrix(~ Batch + Class, data = colData(GSE))

# Model mean-variance using voom
v <- voom(dge, design=mod)

# Fit and shrink DE model
fit <- lmFit(v, design=mod)
eb <- eBayes(fit, robust=TRUE)

# Summarize the results
dt <- decideTests(eb)

## ----limmaSummary--------------------------------------------------------
# Global summary
dt %>% 
    summary %>% 
    kable(caption="Global summary of differentially expressed genes.")

## ----limmaTop------------------------------------------------------------
# Inspect top htis
topTable(eb, coef="ClassNano") %>%
    dplyr::select(symbol, nClusters, AveExpr, logFC, adj.P.Val) %>%
    kable(caption="Top differentially expressed genes.")

## ----goanna--------------------------------------------------------------
# Find enriched GO-terms
GO <- goana(eb, coef = "ClassNano", species = "Mm", trend = TRUE)

# Show top hits
topGO(GO, ontology = "BP", number = 10) %>%
	kable(caption="Top enriched or depleted GO-terms.")

## ----kegga---------------------------------------------------------------
# Find enriched KEGG-terms
KEGG <- kegga(eb, coef="ClassNano", species = "Mm", trend = TRUE)

# Show top hits
topKEGG(KEGG, number = 10) %>%
	knitr::kable(caption="Top enriched of depleted KEGG-terms.")

## ----pathview, message = FALSE, fig.width=6, fig.height=6, fig.cap="Detailed view of differentially expressed gene in the KEGG TNF-signalling pathway."----
# Visualize a KEGG
DE_genes <- Filter(function(x) x != 0, dt[, "ClassNano"])

# This will save a png file to a temporary directory
pathview(DE_genes, species="mmu", pathway.id="mmu04668", kegg.dir = tempdir())

# Show the png file
grid.newpage()
grid.raster(png::readPNG("mmu04668.pathview.png"))

## ----subsetByComposition-------------------------------------------------
# Filter away lowly expressed
RSE_filtered <- RSE %>%
	subset(clusterType == "TSS" & !is.na(geneID)) %>%
	subsetByComposition(inputAssay="counts", 
	                    genes="geneID", 
	                    unexpressed=0.1, 
	                    minSamples=5)

## ----TSSstructure, fig.width=5, fig.height=2.5, fig.cap="Overview of alternative TSS usage within genes."----
RSE_filtered %>% 
    rowData %>% 
    as.data.frame %>% 
    as_tibble %>% 
    dplyr::count(geneID) %>% 
    ggplot(aes(x = n, fill = n >= 2)) + 
    geom_bar(alpha=0.75) +
    scale_fill_colorblind("Multi-TSS") +
    labs(x = "Number of TSSs per gene", y = "Frequency")

## ----edgeRNorm-----------------------------------------------------------
# Annotate with symbols like before:
rowData(RSE_filtered)$symbol <- mapIds(odb, 
                                       keys=rowData(RSE_filtered)$geneID,
                                       column="SYMBOL", 
                                       keytype="ENTREZID")

# Extract gene info
TSS_info <- RSE_filtered %>%
	rowData %>%
	subset(select=c(score, txType, geneID, symbol)) %>%
	as.data.frame

# Build DGEList
dge <- DGEList(counts=assay(RSE_filtered, "counts"),
               genes=TSS_info)

## ----diffSpliceDGE-------------------------------------------------------
# Estimate normalization factors
dge <- calcNormFactors(dge)

# Estimate dispersion and fit GLMs
disp <- estimateDisp(dge, design = mod, tagwise = FALSE)
QLfit <- glmQLFit(disp, design=mod, robust = TRUE)

# Apply diffSpliceDGE
ds <- diffSpliceDGE(QLfit, coef = "ClassNano", geneid = "geneID")

## ----dtuTSS--------------------------------------------------------------
dtu_TSSs <- topSpliceDGE(ds, test = "exon")
dplyr::select(dtu_TSSs, txType, geneID, symbol, logFC, FDR) %>%
    kable(caption = "Top differentially used TSSs")

## ----dtuGene-------------------------------------------------------------
dtu_genes <- topSpliceDGE(ds, test = "Simes")
dplyr::select(dtu_genes, geneID, symbol, NExons, FDR) %>%
    kable(row.names = FALSE, 
          caption = "Top genes showing any differential TSS usage.")

## ----heatmap, fig.width=3, fig.height=4, fig.cap="Heatmap showing expression of TSSs within Fblim1"----
RSE_filtered %>%
	subset(geneID == "74202") %>%
	assay("ComBat") %>%
	t %>%
	pheatmap(color = magma(100), 
	         cluster_cols = FALSE,
	         main="Fblim1")

## ----dtubrowser, fig.width=5, fig.height=5, fig.cap="Genome-browser example of differential TSS usage within Fblim1"----
# Define plot area
plot_region <- subset(RSE_filtered, geneID == "74202") %>%
	rowRanges %>%
	reduce(min.gapwidth=1e6) %>%
	unstrand() %>%
	add(5e3L)

# Create cluster track
cluster_track <- subsetByOverlaps(RSE_filtered, plot_region) %>%
    trackClusters(name = "Clusters", col = NA, showId=TRUE)

# CTSS tracks for each group
ctrl_track <- subset(CTSSs, select=Class == "Ctrl") %>%
	calcPooled() %>%
	subsetByOverlaps(plot_region) %>%
	trackCTSS(name="Ctrl")

nano_track <- subset(CTSSs, select=Class == "Nano") %>%
	calcPooled() %>%
	subsetByOverlaps(plot_region) %>%
	trackCTSS(name="Nano")

# Plot at tracks together
plotTracks(list(axis_track, 
                tx_track, 
                cluster_track, 
                Ctrl=ctrl_track, 
                nano_track),
           from = start(plot_region), 
           to=end(plot_region), 
           chromosome = seqnames(plot_region))