## ----setup, include = FALSE-------------------------------------------------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    crop = NULL ## Related to https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016656.html
)

## ----vignetteSetup, echo=FALSE, message=FALSE, warning = FALSE--------------------------------------------------------
## Bib setup
library("RefManageR")

## Write bibliography information
bib <- c(
    R = citation(),
    BiocStyle = citation("BiocStyle")[1],
    knitr = citation("knitr")[1],
    RefManageR = citation("RefManageR")[1],
    rmarkdown = citation("rmarkdown")[1],
    sessioninfo = citation("sessioninfo")[1],
    testthat = citation("testthat")[1],
    DeconvoBuddies = citation("DeconvoBuddies")[1]
)

## ----"install", eval = FALSE------------------------------------------------------------------------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE)) {
#     install.packages("BiocManager")
# }
# 
# BiocManager::install("DeconvoBuddies")
# 
# ## Check that you have a valid Bioconductor installation
# BiocManager::valid()

## ----"load_packages", message=FALSE, warning=FALSE--------------------------------------------------------------------
## Packages for different types of RNA-seq data structures in R
library("SingleCellExperiment")

## For downloading data
library("spatialLIBD")

## Other helper packages for this vignette
library("dplyr")
library("ggplot2")

## Our main package
library("DeconvoBuddies")

## ----"load snRNA-seq"-------------------------------------------------------------------------------------------------
## Use spatialLIBD to fetch the snRNA-seq dataset
sce_path_zip <- fetch_deconvo_data("sce")

## unzip and load the data
sce_path <- unzip(sce_path_zip, exdir = tempdir())
sce <- HDF5Array::loadHDF5SummarizedExperiment(
    file.path(tempdir(), "sce_DLPFC_annotated")
)

# lobstr::obj_size(sce)
# 172.28 MB

## exclude Ambiguous cell type
sce <- sce[, sce$cellType_broad_hc != "Ambiguous"]
sce$cellType_broad_hc <- droplevels(sce$cellType_broad_hc)

## Check the broad cell type distribution
table(sce$cellType_broad_hc)

## We're going to subset to the first 5k genes to save memory
## In a real application you'll want to use the full dataset
sce <- sce[seq_len(5000), ]

## check the final dimensions of the dataset
dim(sce)

## ----"Run get_mean_ratio"---------------------------------------------------------------------------------------------
# calculate the Mean Ratio of genes for each cell type in sce
marker_stats_MeanRatio <- get_mean_ratio(
    sce = sce, # sce is the SingleCellExperiment with our data
    assay_name = "logcounts", ## assay to use, we recommend logcounts [default]
    cellType_col = "cellType_broad_hc", # column in colData with cell type info
    gene_ensembl = "gene_id", # column in rowData with ensembl gene ids
    gene_name = "gene_name" # column in rowData with gene names/symbols
)

## ----"Explore get_mean_ratio output"----------------------------------------------------------------------------------
## Explore the tibble output
marker_stats_MeanRatio

## genes with the highest MeanRatio are the best marker genes for each cell type
marker_stats_MeanRatio |>
    filter(MeanRatio.rank == 1)

## ----`Run findMarkers_1vALL`------------------------------------------------------------------------------------------
## Run 1vALL DE to find markers for each cell type - takes 5min+

# marker_stats_1vAll <- findMarkers_1vAll(
#     sce = sce, # sce is the SingleCellExperiment with our data
#     assay_name = "counts",
#     cellType_col = "cellType_broad_hc", # column in colData with cell type info
#     mod = "~BrNum" # Control for donor stored in "BrNum" with mod
# )

## load 1vAll data to save time, data is equivalent to the above code
data("marker_stats_1vAll")

## ----"Explore findMarkers_1vALL output"-------------------------------------------------------------------------------
## Explore the tibble output
marker_stats_1vAll

## genes with the highest MeanRatio are the best marker genes for each cell type
marker_stats_1vAll |>
    filter(std.logFC.rank == 1)

## ----"volcano plots"--------------------------------------------------------------------------------------------------
# Create volcano plots of DE stats from 1vALL
marker_stats_1vAll |>
    ggplot(aes(logFC, -log.p.value)) +
    geom_point() +
    facet_wrap(~cellType.target) +
    geom_vline(xintercept = c(1, -1), linetype = "dashed", color = "red")

## ----"join marker_stats"----------------------------------------------------------------------------------------------
## join the two marker_stats tables
marker_stats <- marker_stats_MeanRatio |>
    left_join(marker_stats_1vAll, by = join_by(gene, cellType.target))

## Check stats for our top genes
marker_stats |>
    filter(MeanRatio.rank == 1) |>
    select(gene, cellType.target, MeanRatio, MeanRatio.rank, std.logFC, std.logFC.rank)

## ----"hockey stick plots"---------------------------------------------------------------------------------------------
# create hockey stick plots to compare MeanRatio and standard logFC values.
marker_stats |>
    ggplot(aes(MeanRatio, std.logFC)) +
    geom_point() +
    facet_wrap(~cellType.target)

## ----"plot_gene_express"----------------------------------------------------------------------------------------------
## plot expression of two genes from a list
plot_gene_express(
    sce = sce,
    category = "cellType_broad_hc",
    genes = c("SLC2A1", "CREG2")
)

## ----"plot_marker_express MeanRatio"----------------------------------------------------------------------------------
# plot the top 10 MeanRatio genes for Excit
plot_marker_express(
    sce = sce,
    stats = marker_stats,
    cell_type = "Excit",
    n_genes = 10,
    cellType_col = "cellType_broad_hc"
)

## ----"plot_marker_express logFC"--------------------------------------------------------------------------------------
# plot the top 10 1vAll genes for Excit
plot_marker_express(
    sce = sce,
    stats = marker_stats,
    cell_type = "Excit",
    n_genes = 10,
    rank_col = "std.logFC.rank", ## use logFC cols from 1vALL
    anno_col = "std.logFC.anno",
    cellType_col = "cellType_broad_hc"
)

## ----plot_marker_express_ALL, eval = FALSE----------------------------------------------------------------------------
# # plot the top 10 1vAll genes for all cell types
# print(plot_marker_express_ALL(
#     sce = sce,
#     stats = marker_stats,
#     n_genes = 10,
#     cellType_col = "cellType_broad_hc"
# ))

## ----reproduce3, echo=FALSE-------------------------------------------------------------------------------------------
## Session info
library("sessioninfo")
options(width = 120)
session_info()

## ----vignetteBiblio, results = "asis", echo = FALSE, warning = FALSE, message = FALSE---------------------------------
## Print bibliography
PrintBibliography(bib, .opts = list(hyperlink = "to.doc", style = "html"))