--- title: "Introduction to ClonalSim" author: "Gabriele Bucci" date: "`r Sys.Date()`" output: rmarkdown::html_vignette: toc: true number_sections: true vignette: > %\VignetteIndexEntry{Introduction to ClonalSim} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include=FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 8, fig.height = 6, warning = FALSE, message = FALSE ) ``` # Introduction **ClonalSim** is an R/Bioconductor package for simulating tumor clonal evolution with realistic sequencing noise. It generates mutational profiles of heterogeneous tumor samples with hierarchical clonal structure, making it ideal for: - **Benchmarking** variant callers and mutation detection pipelines - **Testing** clonal deconvolution algorithms (PyClone, SciClone, etc.) - **Education** about tumor heterogeneity and clonal evolution - **Method development** for phylogenetic analysis ## Key Features ClonalSim implements a **two-stage realistic noise model**: 1. **Biological Noise**: Intra-tumor heterogeneity using Beta distribution 2. **Technical Noise**: Sequencing artifacts including: - Depth overdispersion (negative binomial) - Stochastic read sampling (binomial) - Base calling errors (Illumina-like) # Installation ```{r install, eval=FALSE} # From Bioconductor (when available) if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("ClonalSim") # From GitHub (development version) BiocManager::install("gbucci/ClonalSim") ``` ```{r library} library(ClonalSim) ``` # Basic Usage ## Simple Simulation The main function is `simulateTumor()`, which generates a complete simulation: ```{r basic_sim} # Set seed for reproducibility set.seed(123) sim <- simulateTumor( subclone_freqs = c(0.15, 0.25, 0.30, 0.30), n_mut_per_clone = c(20, 25, 30, 15), n_mut_founder = 10 ) # Display summary sim ``` ## Accessing Results ```{r access_results} # Get mutation data mutations <- getMutations(sim) head(mutations) # Get simulation parameters params <- getSimParams(sim) params$subclone_freqs # Get true vs observed VAF true_vaf <- getTrueVAF(sim) obs_vaf <- getObservedVAF(sim) # Compare summary(true_vaf) summary(obs_vaf) ``` ## Visualization ClonalSim provides built-in plotting functions. **Note:** Make sure `ggplot2` is loaded: ```{r plot_vaf_density, fig.cap="VAF density plot showing mixed tumor sample"} # ggplot2 is already loaded above plot(sim, type = "vaf_density") ``` The density plot shows the distribution of variant allele frequencies as they would appear in real sequencing data. Expected peaks correspond to clone frequencies. ```{r plot_vaf_scatter, fig.cap="VAF scatter plot colored by mutation type"} plot(sim, type = "vaf_scatter") ``` ```{r plot_depth, fig.cap="Sequencing depth distribution"} plot(sim, type = "depth_histogram") ``` ```{r plot_matrix, fig.cap="Clonal matrix showing mutation presence"} plot(sim, type = "clone_matrix") ``` # Understanding the Noise Model ## Biological Noise: Beta Distribution Intra-tumor heterogeneity is modeled using a **Beta distribution**, which is more appropriate than Gaussian for frequency data (bounded 0-1). ```{r bio_noise_demo, fig.cap="Effect of biological noise concentration parameter"} # High heterogeneity (low concentration) set.seed(123) sim_high_het <- simulateTumor( subclone_freqs = c(0.3, 0.4, 0.3), n_mut_per_clone = c(30, 40, 30), biological_noise = list(enabled = TRUE, concentration = 20) ) # Low heterogeneity (high concentration) set.seed(123) sim_low_het <- simulateTumor( subclone_freqs = c(0.3, 0.4, 0.3), n_mut_per_clone = c(30, 40, 30), biological_noise = list(enabled = TRUE, concentration = 100) ) # Compare VAF spread par(mfrow = c(1, 2)) hist(getTrueVAF(sim_high_het), main = "High Heterogeneity\n(concentration = 20)", xlab = "True VAF", breaks = 30, col = "skyblue") hist(getTrueVAF(sim_low_het), main = "Low Heterogeneity\n(concentration = 100)", xlab = "True VAF", breaks = 30, col = "lightcoral") ``` ## Technical Sequencing Noise ### Depth Overdispersion Real NGS data shows **overdispersion** (variance > mean) due to GC bias, mappability, and PCR artifacts. ClonalSim uses negative binomial distribution: ```{r depth_demo, fig.cap="Depth distributions: negative binomial vs Poisson"} # Realistic (negative binomial) set.seed(123) sim_nb <- simulateTumor( sequencing_noise = list( mean_depth = 100, depth_variation = "negative_binomial", depth_dispersion = 20 ) ) # Unrealistic (Poisson) set.seed(124) sim_poisson <- simulateTumor( sequencing_noise = list( mean_depth = 100, depth_variation = "poisson" ) ) par(mfrow = c(1, 2)) hist(getMutations(sim_nb)$Depth, main = "Negative Binomial\n(realistic)", xlab = "Depth", breaks = 30, col = "skyblue") hist(getMutations(sim_poisson)$Depth, main = "Poisson\n(unrealistic)", xlab = "Depth", breaks = 30, col = "lightcoral") ``` ### Binomial Read Sampling Each sequencing read is independently sampled, introducing **stochastic variation**. This is especially important at low depth or low VAF: ```{r binomial_demo, fig.cap="Stochastic variation from binomial sampling"} # With binomial sampling (realistic) set.seed(123) sim_binom <- simulateTumor( sequencing_noise = list(binomial_sampling = TRUE) ) # Without binomial sampling (deterministic) set.seed(123) sim_det <- simulateTumor( sequencing_noise = list(binomial_sampling = FALSE) ) # Compare True_VAF vs observed VAF par(mfrow = c(1, 2)) with(getMutations(sim_binom), { plot(True_VAF, VAF, main = "With Binomial Sampling", xlab = "True VAF", ylab = "Observed VAF", pch = 16, col = rgb(0,0,1,0.3)) abline(0, 1, col = "red", lty = 2) }) with(getMutations(sim_det), { plot(True_VAF, VAF, main = "Without Binomial Sampling", xlab = "True VAF", ylab = "Observed VAF", pch = 16, col = rgb(0,0,1,0.3)) abline(0, 1, col = "red", lty = 2) }) ``` # Common Use Cases ## Low Purity Tumor Many clinical samples have significant normal cell contamination: ```{r low_purity} set.seed(123) sim_low_purity <- simulateTumor( subclone_freqs = c(0.05, 0.10, 0.15), # Sum = 0.30 (30% tumor) n_mut_per_clone = c(20, 25, 30) ) cat("Tumor purity:", sum(getSimParams(sim_low_purity)$subclone_freqs), "\n") plot(sim_low_purity, type = "vaf_density") ``` ## High Coverage Sequencing Deep targeted sequencing with uniform coverage: ```{r high_coverage} set.seed(123) sim_deep <- simulateTumor( sequencing_noise = list( enabled = TRUE, mean_depth = 500, depth_dispersion = 100, # More uniform error_rate = 0.0005 ) ) summary(getMutations(sim_deep)$Depth) plot(sim_deep, type = "depth_histogram") ``` ## Low Coverage Sequencing Exome sequencing with variable coverage: ```{r low_coverage} set.seed(123) sim_exome <- simulateTumor( sequencing_noise = list( mean_depth = 50, depth_dispersion = 10, # More variable error_rate = 0.002 ) ) summary(getMutations(sim_exome)$Depth) ``` ## Ideal Data (No Noise) For testing algorithms without noise: ```{r no_noise} set.seed(123) sim_ideal <- simulateTumor( biological_noise = list(enabled = FALSE), sequencing_noise = list(enabled = FALSE) ) # VAF exactly matches expected frequencies head(getMutations(sim_ideal)[, c("Clone", "True_VAF", "VAF")]) ``` ## Including Germline Variants Real tumor sequencing data contains both somatic and germline variants. Germline variants (heterozygous diploid) appear at VAF ~0.5 **regardless of tumor purity** because they are heterozygous in both tumor and normal cells: ```{r germline_variants, fig.cap="VAF distribution showing germline peak at 0.5"} # 70% purity tumor with germline variants set.seed(789) sim_germline <- simulateTumor( subclone_freqs = c(0.3, 0.4), # 70% tumor purity n_mut_per_clone = c(40, 50), germline_variants = list( enabled = TRUE, n_variants = 150, # Number of germline SNPs vaf_expected = 0.5 # Heterozygous diploid ) ) # Check mutation types table(getMutations(sim_germline)$Type) # Compare VAFs germline_vafs <- getMutations(sim_germline)[getMutations(sim_germline)$Type == "germline", "VAF"] somatic_vafs <- getMutations(sim_germline)[getMutations(sim_germline)$Type != "germline", "VAF"] cat("Germline VAF (expected ~0.5):", round(mean(germline_vafs), 3), "\n") cat("Somatic VAF mean:", round(mean(somatic_vafs), 3), "\n") # Plot showing germline peak plot(sim_germline, type = "vaf_density") ``` Key biological insight: Unlike somatic mutations (whose VAF depends on tumor purity), germline variants maintain VAF ~0.5 because: - In tumor cells: heterozygous germline variant → VAF = 0.5 - In normal cells: heterozygous germline variant → VAF = 0.5 - Mixed sample: 0.5 × purity + 0.5 × (1 - purity) = 0.5 This creates a characteristic peak at VAF = 0.5 in real sequencing data that is independent of tumor cellularity. # Bioconductor Integration ## Export to GRanges ```{r granges} library(GenomicRanges) gr <- toGRanges(sim) gr # Access metadata mcols(gr)[1:5, ] # Subset by chromosome gr_chr1 <- gr[seqnames(gr) == "chr1"] length(gr_chr1) ``` ## Export to VCF ```{r vcf} # Get VRanges object vr <- suppressWarnings(toVCF(sim, sample_name = "TumorSample")) vr # Write to VCF file (using tempdir to avoid polluting user's filesystem) vcf_file <- tempfile(fileext = ".vcf") suppressWarnings(toVCF(sim, output_file = vcf_file)) file.exists(vcf_file) ``` ## Export for Other Tools ### PyClone Format ```{r pyclone} pyclone_file <- tempfile(fileext = ".tsv") toPyClone(sim, file = pyclone_file, sample_id = "sample1") head(read.table(pyclone_file, header = TRUE, sep = "\t")) ``` ### SciClone Format ```{r sciclone} sciclone_file <- tempfile(fileext = ".tsv") toSciClone(sim, file = sciclone_file) head(read.table(sciclone_file, header = TRUE, sep = "\t")) ``` ### Simple CSV ```{r csv} df <- toDataFrame(sim) csv_file <- tempfile(fileext = ".csv") write.csv(df, csv_file, row.names = FALSE) head(read.csv(csv_file)) ``` # Benchmarking Workflow Here's a typical workflow for benchmarking a variant caller: ```{r benchmark_workflow, eval=FALSE} # 1. Generate ground truth set.seed(42) sim <- simulateTumor( subclone_freqs = c(0.2, 0.3, 0.5), n_mut_per_clone = c(50, 75, 50) ) # 2. Export to VCF (ground truth) toVCF(sim, output_file = "ground_truth.vcf") # 3. Get true positive set true_mutations <- getMutations(sim) # 4. Run your variant caller on simulated data # (your variant caller code here) # 5. Compare results # - Sensitivity: TP / (TP + FN) # - Precision: TP / (TP + FP) # - F1 score: 2 * (Precision * Sensitivity) / (Precision + Sensitivity) # 6. Evaluate VAF estimation accuracy # cor(true_mutations$VAF, called_vaf) ``` # Advanced: Custom Clonal Structures ## Complex Hierarchies ```{r complex_hierarchy} set.seed(123) sim_complex <- simulateTumor( subclone_freqs = c(0.1, 0.15, 0.2, 0.25, 0.3), n_mut_per_clone = c(30, 40, 50, 40, 30), n_mut_shared = list( "2 3 4 5" = 20, # Shared by clones 2,3,4,5 "3 4 5" = 15, # Shared by clones 3,4,5 "4 5" = 10, # Shared by clones 4,5 "1 2" = 8 # Shared by clones 1,2 ) ) plot(sim_complex, type = "vaf_density") ``` ## Accessing Clonal Structure ```{r clonal_structure} structure <- getClonalStructure(sim_complex) print(structure) ``` # Session Information ```{r sessioninfo} sessionInfo() ``` # References 1. McGranahan N, Swanton C. *Clonal Heterogeneity and Tumor Evolution: Past, Present, and the Future.* Cell. 2017 2. Dentro SC, Wedge DC, Van Loo P. *Principles of Reconstructing the Subclonal Architecture of Cancers.* Cold Spring Harb Perspect Med. 2017 3. Roth A, et al. *PyClone: statistical inference of clonal population structure in cancer.* Nat Methods. 2014 4. Miller CA, et al. *SciClone: inferring clonal architecture and tracking the spatial and temporal patterns of tumor evolution.* PLoS Comput Biol. 2014