CSOA 0.99.2
CSOA is a tool for scoring gene set signatures in scRNA-seq data. It constructs, for each input gene, a set of cells highly expressing the gene, then assesses all pairs of cell sets for statistical significance. The significant overlaps are ranked, and the top-ranked overlaps receive scores used to calculate per-cell CSOA scores.
To install CSOA, run the following commands in an R session:
if (!require("BiocManager", quietly=TRUE))
install.packages("BiocManager")
BiocManager::install("CSOA")
In addition to CSOA, you need to install patchwork, scRNAseq and scuttle for this tutorial.
This tutorial uses an scRNA-seq human pancreas dataset with provided cell type annotations.
After loading the required packages, download the dataset using the
BaronPancreasData
function from scRNAseq
. The dataset will be stored as a SingleCellExperiment
object.
CSOA requires the input scRNA-seq data to be normalized and log-transformed.
We will employ the logNormCounts
function from scuttle
for this step.
library(CSOA)
library(ggplot2)
library(patchwork)
library(scRNAseq)
library(scuttle)
library(Seurat)
sceObj <- BaronPancreasData('human')
sceObj <- logNormCounts(sceObj)
We can take a look at the cell types found in this dataset:
table(colData(sceObj)$label)
#>
#> acinar activated_stellate alpha beta
#> 958 284 2326 2525
#> delta ductal endothelial epsilon
#> 601 1077 252 18
#> gamma macrophage mast quiescent_stellate
#> 255 55 25 173
#> schwann t_cell
#> 13 7
Next, we will define gene sets of acinar, alpha, ductal and gamma markers based on PanglaoDB. The gene sets must be provided as a named list of character vectors. The names of the list will be used as column names when storing the results.
acinarMarkers <- c('PRSS1', 'KLK1', 'CTRC', 'PNLIP', 'AKR1C3', 'CTRB1',
'DUOXA2', 'ALDOB', 'REG3A', 'SERPINA3', 'PRSS3', 'REG1B',
'CFB', 'GDF15', 'MUC1','ANPEP', 'ANGPTL4', 'OLFM4',
'GSTA1', 'LGALS2', 'PDZK1IP1', 'RARRES2', 'CXCL17',
'UBD', 'GSTA2', 'LYZ', 'RBPJL', 'PTF1A', 'CELA3A',
'SPINK1', 'ZG16', 'CEL', 'CELA2A', 'CPB1', 'CELA1',
'PNLIPRP1', 'RNASE1', 'AMY2B', 'CPA2','CPA1', 'CELA3B',
'CTRB2', 'PLA2G1B', 'PRSS2', 'CLPS', 'REG1A', 'SYCN')
alphaMarkers <- c('GCG', 'TTR', 'PCSK2', 'FXYD5', 'LDB2', 'MAFB',
'CHGA', 'SCGB2A1', 'GLS', 'FAP', 'DPP4', 'GPR119',
'PAX6', 'NEUROD1', 'LOXL4', 'PLCE1', 'GC', 'KLHL41',
'FEV', 'PTGER3', 'RFX6', 'SMARCA1', 'PGR', 'IRX1',
'UCP2', 'RGS4', 'KCNK16', 'GLP1R', 'ARX', 'POU3F4',
'RESP18', 'PYY', 'SLC38A5', 'TM4SF4', 'CRYBA2', 'SH3GL2',
'PCSK1', 'PRRG2', 'IRX2', 'ALDH1A1','PEMT', 'SMIM24',
'F10', 'SCGN', 'SLC30A8')
ductalMarkers <- c('CFTR', 'SERPINA5', 'SLPI', 'TFF1', 'CFB', 'LGALS4',
'CTSH', 'PERP', 'PDLIM3', 'WFDC2', 'SLC3A1', 'AQP1',
'ALDH1A3', 'VTCN1', 'KRT19', 'TFF2', 'KRT7', 'CLDN4',
'LAMB3', 'TACSTD2', 'CCL2', 'DCDC2','CXCL2', 'CLDN10',
'HNF1B', 'KRT20', 'MUC1', 'ONECUT1', 'AMBP', 'HHEX',
'ANXA4', 'SPP1', 'PDX1', 'SERPINA3', 'GDF15', 'AKR1C3',
'MMP7', 'DEFB1', 'SERPING1', 'TSPAN8', 'CLDN1', 'S100A10',
'PIGR')
gammaMarkers <- c('PPY', 'ABCC9', 'FGB', 'ZNF503', 'MEIS1', 'LMO3', 'EGR3',
'CHN2', 'PTGFR', 'ENTPD2', 'AQP3', 'THSD7A', 'CARTPT',
'ISL1', 'PAX6', 'NEUROD1', 'APOBEC2', 'SEMA3E', 'SLITRK6',
'SERTM1', 'PXK', 'PPY2P', 'ETV1', 'ARX', 'CMTM8', 'SCGB2A1',
'FXYD2', 'SCGN')
geneSets <- list(acinarMarkers, alphaMarkers, ductalMarkers, gammaMarkers)
names(geneSets) <- c('CSOA_acinar', 'CSOA_alpha', 'CSOA_ductal', 'CSOA_gamma')
Before running CSOA, we will convert the SingleCellExperiment
object to a
Seurat
object in order to employ the VlnPlot
function for visualization.
seuratObj <- as.Seurat(sceObj)
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
seuratObj <- runCSOA(seuratObj, geneSets)
#> Computing percentile sets...
#> Assessing gene overlaps...
#> 194 overlaps will be used in the calculation of CSOA scores.
#> Normalizing expression matrix by rows...
#> Computing per-cell scores for gene pairs...
#> Computing per-cell gene signature scores...
#> 105 overlaps will be used in the calculation of CSOA scores.
#> Normalizing expression matrix by rows...
#> Computing per-cell scores for gene pairs...
#> Computing per-cell gene signature scores...
#> 248 overlaps will be used in the calculation of CSOA scores.
#> Normalizing expression matrix by rows...
#> Computing per-cell scores for gene pairs...
#> Computing per-cell gene signature scores...
#> 18 overlaps will be used in the calculation of CSOA scores.
#> Normalizing expression matrix by rows...
#> Computing per-cell scores for gene pairs...
#> Computing per-cell gene signature scores...
We can now display the results:
plots <- lapply(names(geneSets), function(x) {
VlnPlot(seuratObj, x, group.by = 'label') + NoLegend() +
theme(axis.text = element_text(size=10),
axis.title.x = element_blank(),
plot.title = element_text (size=11))
})
(plots[[1]] | plots[[2]]) / (plots[[3]] | plots[[4]])
Alternatively, CSOA can be run on a SingleCellExperiment
object.
The results will be stored as a column in the object’s colData
matrix:
sceObj <- runCSOA(sceObj, geneSets)
#> Computing percentile sets...
#> Warning in percentileSets(geneSetExp, percentile): 1 gene(s) had no top cells
#> at the indicated percentile. These are now excluded from the gene signature.
#> Assessing gene overlaps...
#> 194 overlaps will be used in the calculation of CSOA scores.
#> Normalizing expression matrix by rows...
#> Computing per-cell scores for gene pairs...
#> Computing per-cell gene signature scores...
#> 105 overlaps will be used in the calculation of CSOA scores.
#> Normalizing expression matrix by rows...
#> Computing per-cell scores for gene pairs...
#> Computing per-cell gene signature scores...
#> 248 overlaps will be used in the calculation of CSOA scores.
#> Normalizing expression matrix by rows...
#> Computing per-cell scores for gene pairs...
#> Computing per-cell gene signature scores...
#> 18 overlaps will be used in the calculation of CSOA scores.
#> Normalizing expression matrix by rows...
#> Computing per-cell scores for gene pairs...
#> Computing per-cell gene signature scores...
We verify that the results are identical:
sceObj <- runCSOA(sceObj, geneSets)
#> Warning in percentileSets(geneSetExp, percentile): 1 gene(s) had no top cells
#> at the indicated percentile. These are now excluded from the gene signature.
vapply(names(geneSets),
function(x) identical(seuratObj[[]][[x]], colData(sceObj)[[x]]),
logical(1))
#> CSOA_acinar CSOA_alpha CSOA_ductal CSOA_gamma
#> TRUE TRUE TRUE TRUE
Additionally, CSOA can be run directly on an expression matrix. The expMat
function extracts the log-transformed and normalized expression matrix—data
forSeurat
, logcounts
for SingleCellExperiment
—from the input object,
and converts it to a dense matrix.
CSOA requires
a dense expression matrix, as opposed to the sparse matrix class (dgCMatrix
)
used by Seurat
and SingleCellExperiment
. To satisfy this requirement,
the expMat
function will be employed. This function extracts the
geneSetExp <- expMat(seuratObj)
#> Warning in asMethod(object): sparse->dense coercion: allocating vector of size
#> 1.3 GiB
Note: runCSOA
also accepts the sparse matrix class used by Seurat
and
SingleCellExperiment
(dgCMatrix
) as input. Internally, runCSOA
filters the
dgCMatrix
as to contain only the genes present in the input gene sets, then
densifies the matrix using expMat
.
When running directly on an expression matrix, runCSOA
will return a list in
which the first element is the expression matrix and the second is the data
frame of CSOA scores.
res <- runCSOA(geneSetExp, geneSets)
#> Warning in percentileSets(geneSetExp, percentile): 1 gene(s) had no top cells
#> at the indicated percentile. These are now excluded from the gene signature.
head(res[[2]])
#> CSOA_acinar CSOA_alpha CSOA_ductal CSOA_gamma
#> human1_lib1.final_cell_0001 0.7534719 0.0001117549 0.004928853 0.0000000000
#> human1_lib1.final_cell_0002 0.5009556 0.0008294738 0.023359437 0.0005709145
#> human1_lib1.final_cell_0003 0.5308047 0.0000000000 0.000000000 0.0000000000
#> human1_lib1.final_cell_0004 0.5052516 0.0010344557 0.011338792 0.0036947608
#> human1_lib1.final_cell_0005 0.5371408 0.0011376886 0.008792743 0.0000000000
#> human1_lib1.final_cell_0006 0.3647260 0.0001892905 0.006612631 0.0000000000
The results are identical with those obtained earlier for the Seurat object:
vapply(names(geneSets),
function(x) identical(seuratObj[[]][[x]], res[[2]][[x]]), logical(1))
#> CSOA_acinar CSOA_alpha CSOA_ductal CSOA_gamma
#> TRUE TRUE TRUE TRUE
sessionInfo()
#> R version 4.5.1 Patched (2025-08-23 r88802)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.3 LTS
#>
#> Matrix products: default
#> BLAS: /home/biocbuild/bbs-3.22-bioc/R/lib/libRblas.so
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: America/New_York
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] Seurat_5.3.0 SeuratObject_5.2.0
#> [3] sp_2.2-0 stringr_1.5.2
#> [5] scuttle_1.19.0 scRNAseq_2.23.0
#> [7] SingleCellExperiment_1.31.1 SummarizedExperiment_1.39.2
#> [9] Biobase_2.69.1 GenomicRanges_1.61.4
#> [11] Seqinfo_0.99.2 IRanges_2.43.2
#> [13] S4Vectors_0.47.2 BiocGenerics_0.55.1
#> [15] generics_0.1.4 MatrixGenerics_1.21.0
#> [17] matrixStats_1.5.0 patchwork_1.3.2
#> [19] ggplot2_4.0.0 CSOA_0.99.2
#> [21] BiocStyle_2.37.1
#>
#> loaded via a namespace (and not attached):
#> [1] ProtGenerics_1.41.0 spatstat.sparse_3.1-0 bitops_1.0-9
#> [4] httr_1.4.7 RColorBrewer_1.1-3 tools_4.5.1
#> [7] sctransform_0.4.2 alabaster.base_1.9.5 R6_2.6.1
#> [10] HDF5Array_1.37.0 lazyeval_0.2.2 uwot_0.2.3
#> [13] rhdf5filters_1.21.0 withr_3.0.2 gridExtra_2.3
#> [16] progressr_0.16.0 cli_3.6.5 spatstat.explore_3.5-2
#> [19] fastDummies_1.7.5 labeling_0.4.3 alabaster.se_1.9.0
#> [22] sass_0.4.10 S7_0.2.0 spatstat.data_3.1-8
#> [25] ggridges_0.5.7 pbapply_1.7-4 Rsamtools_2.25.3
#> [28] dichromat_2.0-0.1 parallelly_1.45.1 RSQLite_2.4.3
#> [31] RApiSerialize_0.1.4 BiocIO_1.19.0 ica_1.0-3
#> [34] spatstat.random_3.4-1 dplyr_1.1.4 wesanderson_0.3.7
#> [37] Matrix_1.7-4 ggbeeswarm_0.7.2 abind_1.4-8
#> [40] lifecycle_1.0.4 yaml_2.3.10 rhdf5_2.53.4
#> [43] SparseArray_1.9.1 BiocFileCache_2.99.6 Rtsne_0.17
#> [46] grid_4.5.1 blob_1.2.4 promises_1.3.3
#> [49] ExperimentHub_2.99.5 crayon_1.5.3 miniUI_0.1.2
#> [52] lattice_0.22-7 beachmat_2.25.5 cowplot_1.2.0
#> [55] GenomicFeatures_1.61.6 KEGGREST_1.49.1 poibin_1.6
#> [58] magick_2.9.0 pillar_1.11.1 knitr_1.50
#> [61] rjson_0.2.23 bayesbio_1.0.0 future.apply_1.20.0
#> [64] codetools_0.2-20 glue_1.8.0 spatstat.univar_3.1-4
#> [67] data.table_1.17.8 vctrs_0.6.5 png_0.1-8
#> [70] gypsum_1.5.0 spam_2.11-1 gtable_0.3.6
#> [73] kernlab_0.9-33 cachem_1.1.0 xfun_0.53
#> [76] S4Arrays_1.9.1 mime_0.13 tidygraph_1.3.1
#> [79] survival_3.8-3 tinytex_0.57 fitdistrplus_1.2-4
#> [82] ROCR_1.0-11 nlme_3.1-168 bit64_4.6.0-1
#> [85] alabaster.ranges_1.9.1 filelock_1.0.3 RcppAnnoy_0.0.22
#> [88] GenomeInfoDb_1.45.11 bslib_0.9.0 irlba_2.3.5.1
#> [91] vipor_0.4.7 KernSmooth_2.23-26 DBI_1.2.3
#> [94] ggrastr_1.0.2 tidyselect_1.2.1 bit_4.6.0
#> [97] compiler_4.5.1 curl_7.0.0 httr2_1.2.1
#> [100] h5mread_1.1.1 textshape_1.7.5 DelayedArray_0.35.3
#> [103] plotly_4.11.0 stringfish_0.17.0 bookdown_0.44
#> [106] rtracklayer_1.69.1 scales_1.4.0 lmtest_0.9-40
#> [109] ggeasy_0.1.6 sgof_2.3.5 rappdirs_0.3.3
#> [112] digest_0.6.37 goftest_1.2-3 spatstat.utils_3.1-5
#> [115] alabaster.matrix_1.9.0 rmarkdown_2.29 XVector_0.49.1
#> [118] htmltools_0.5.8.1 pkgconfig_2.0.3 ensembldb_2.33.2
#> [121] dbplyr_2.5.1 fastmap_1.2.0 UCSC.utils_1.5.0
#> [124] rlang_1.1.6 htmlwidgets_1.6.4 shiny_1.11.1
#> [127] farver_2.1.2 jquerylib_0.1.4 zoo_1.8-14
#> [130] jsonlite_2.0.0 BiocParallel_1.43.4 RCurl_1.98-1.17
#> [133] magrittr_2.0.4 dotCall64_1.2 Rhdf5lib_1.31.0
#> [136] Rcpp_1.1.0 kerntools_1.2.0 ggnewscale_0.5.2
#> [139] viridis_0.6.5 reticulate_1.43.0 stringi_1.8.7
#> [142] alabaster.schemas_1.9.0 ggraph_2.2.2 MASS_7.3-65
#> [145] AnnotationHub_3.99.6 plyr_1.8.9 parallel_4.5.1
#> [148] listenv_0.9.1 ggrepel_0.9.6 deldir_2.0-4
#> [151] Biostrings_2.77.2 graphlayouts_1.2.2 splines_4.5.1
#> [154] tensor_1.5.1 igraph_2.1.4 spatstat.geom_3.5-0
#> [157] RcppHNSW_0.6.0 reshape2_1.4.4 BiocVersion_3.22.0
#> [160] XML_3.99-0.19 evaluate_1.0.5 RcppParallel_5.1.11-1
#> [163] BiocManager_1.30.26 tweenr_2.0.3 httpuv_1.6.16
#> [166] RANN_2.6.2 tidyr_1.3.1 purrr_1.1.0
#> [169] polyclip_1.10-7 alabaster.sce_1.9.0 qs_0.27.3
#> [172] future_1.67.0 scattermore_1.2 ggforce_0.5.0
#> [175] xtable_1.8-4 AnnotationFilter_1.33.0 restfulr_0.0.16
#> [178] RSpectra_0.16-2 later_1.4.4 viridisLite_0.4.2
#> [181] tibble_3.3.0 beeswarm_0.4.0 memoise_2.0.1
#> [184] AnnotationDbi_1.71.1 GenomicAlignments_1.45.4 cluster_2.1.8.1
#> [187] globals_0.18.0