\documentclass[a4paper]{article} \usepackage[top=1in, bottom=1in, left=1in, right=1in]{geometry} \usepackage[utf8]{inputenc} \usepackage{listings} \usepackage{xcolor} \usepackage{hyperref} \definecolor{mygreen}{rgb}{0,0.6,0} \definecolor{mygray}{rgb}{0.95,0.95,0.95} %\VignetteIndexEntry{Application example} \title{% Vignette rKOMICS \\ \large application example} \author{Manon Geerts \& Frederik Van Den Broeck} \begin{document} \SweaveOpts{concordance=TRUE} \lstset{language=R, breaklines=true, commentstyle=\color{mygreen}, backgroundcolor=\color{mygray}} \maketitle \section{The rKOMICS package} The core features of the rKOMICS package include data aggregation, analyses and visualization that allows to examine, summarize and extract meaningful information from minicircle sequence alignments as obtained by KOMICS or a custom bioinformatic pipeline, and from USEARCH cluster format (UC) files as generated by USEARCH or VSEARCH. In addition to storing data files, rKOMICS stores the analyses and visualization results into single list objects that can be called by the user at a later stage. \\ rKOMICS incorporates multiple methods of visualizations using the ggplot2 R package to plot the foundation of graphs. By adding ggplot2 functions to the rKOMICS visualization functions, the user has direct control over the finishing touches of the graph's appearances. Our package also utilizes sample-specific metadata that allows multi-group data visualizations to facilitate exploratory analysis. The overall data set can be examined using barplots, heatmaps, PCA plots and box plots that are generated for each specified minimum percent identity. This makes it possible to visualize population structure and diversity based on minicircle sequence composition. \\ To show the functionality of rKOMICS, we performed an example analysis using whole-genome sequencing data from a recently published study on the history of diversification of the \emph{Leishmania braziliensis} species complex in Peru. This species complex comprises two closely related species: the lowland and zoonotic \emph{L. braziliensis} parasite circulating in a diverse range of wild mammals in Neotropical rainforests, and the highland anthroponotic \emph{L. peruviana} parasite that is largely endemic to the Pacific slopes of the Peruvian Andes. A total of 67 \emph{Leishmania} parasites from 47 localities in Peru were cultured and subjected to whole genome sequencing.\\ \begin{lstlisting} data(exData, package = "rKOMICS") table(exData$species) \end{lstlisting} <>== library(rKOMICS) data(exData, package = "rKOMICS") table(exData$species) @ \section{Required R-packages} \begin{lstlisting} library(ggplot2) library(rKOMICS) library(ggpubr) library(viridis) \end{lstlisting} <>= library(ggplot2) library(ggpubr) library(viridis) @ \section{Quality of the assembly} The \texttt{msc.quality} function allows you to examine the quality of the assembly by alignment of reads to the assembled minicircles (see \url{https://github.com/FreBio/komics} for tutorial). We found that on average 77\% of all mapped reads aligned with a mapping quality larger than 20 (Figure 1) and on average 84\% aligned in proper pairs. On average 93\% of all CSB3-containing reads aligned against the assembled minicircle contigs and 88.5\% aligned perfectly, suggesting that KOMICS was able to retrieve a large proportion of the minicircle classes.\\ \begin{lstlisting} map <- msc.quality(mapstats = system.file("extdata", exData$mapstats, package = "rKOMICS"), groups = exData$species) lapply(map$proportions, mean)$MR_HQ \end{lstlisting} <>= map <- msc.quality(mapstats = system.file("extdata", exData$mapstats, package = "rKOMICS"),exData$species) lapply(map$proportions, mean)$MR_HQ @ \begin{lstlisting} map$plots$MR_HQ + labs(caption = paste0('Proportion of mapped reads with high quality, ', Sys.Date())) \end{lstlisting} <>= map$plots$MR_HQ + labs(caption = paste0('Proportion of mapped reads with high quality, ', Sys.Date())) @ \emph{\\Figure 1: Proportion of reads with a mapping quality higher than 20 per Leihmania isolate. \\} \section{Minicircle copy numbers} The depth statistics (see \url{https://github.com/FreBio/komics} for tutorial) include average, median, minimum and maximum per site read depth of every minicircle contig that has been assembled. The \texttt{msc.depth} function allows you to summarize those statistics (Figure 2) and to estimate minicircle copy numbers by standardizing median read depths per minicircle contig to the median genome-wide read depths.\\ \begin{lstlisting} depth <- msc.depth(depthstats = system.file("extdata", exData$depthstats, package = "rKOMICS"), groups = exData$species, HCN = exData$medGWD/2) # haploid copy number depth$CN \end{lstlisting} <>= depth <- msc.depth(depthstats = system.file("extdata", exData$depthstats, package = "rKOMICS"), groups = exData$species, HCN = exData$medGWD/2) # haploid copy number depth$CN @ \emph{\\Figure 2: Minicirle copy number distribution per Leihmania isolate.\\ } \section{Inspect minicircle lengths} A combined total of 7,760 minicircles were assembled for 67 \emph{Leishmania} isolates. When examining the length distribution of the circularized minicircle sequences using the function \texttt{msc.length}, we found that the majority of minicircles (95.2\%) were 720-760 bp long, which is within the expected length range of minicircles in \emph{Leishmania} parasites (Figure 3). 294 minicircle contigs (4.8\%) showed twice this length (1400-1700 bp) (Figure 3), which may suggest that these are artificial minicircle dimers introduced by the assembly process, and were removed. \\ \begin{lstlisting} bf <- msc.length(file = system.file("extdata", "all.minicircles.fasta", package = "rKOMICS"), samples = exData$samples, groups = exData$subspecies) bf$plot \end{lstlisting} <>= bf <- msc.length(file = system.file("extdata", "all.minicircles.fasta", package = "rKOMICS"), samples = exData$samples,groups = exData$subspecies) bf$plot @ \emph{\\Figure 3: Length distribution of the 7,760 assembled minicircles in 67 Leishmania isolates.\\} \begin{lstlisting} c(length(bf$length), length(which(bf$length < 800)), length(which(bf$length > 1400))) \end{lstlisting} <>= c(length(bf$length),length(which(bf$length < 800)),length(which(bf$length > 1400))) @ \section{Filter minicircle sequences} For downstream analyses, we only retained the circularized minicircles of the expected length (720-760bp) using the \texttt{preprocess} function (Figure 4; coloured barplots), resulting in a final set of 5,849 minicircles.\\ \begin{lstlisting} pre <- preprocess(files = system.file("extdata", exData$fastafiles, package = "rKOMICS"), groups = exData$species, circ = TRUE, min = 500, max = 1200, writeDNA = FALSE) pre$summary \end{lstlisting} <>= pre <- preprocess(files = system.file("extdata", exData$fastafiles,package = "rKOMICS"),groups = exData$species,circ = TRUE, min = 500, max = 1200, writeDNA = FALSE) pre$summary @ \begin{lstlisting} pre$plot + labs(caption = paste0('N of MC sequences before and after filtering, ', Sys.Date())) \end{lstlisting} <>= pre$plot + labs(caption = paste0('N of MC sequences before and after filtering, ', Sys.Date())) @ \emph{\\Figure 4: Gray barplots show the total number of minicircles found per Leishmania isolate, and coloured barplots indicate the number obtained after retaining only circularized minicircles of the expected length.\\} \section{Clustering results} We used the function \texttt{msc.uc} to then examine the combined number of minicircle sequence classes (MSCs) (based on overall identity) across all 67 isolates, and we identified a total of 3,811 MSCs at 100\% identity. This number decreased sharply to 918 MSCs at 97\% identity and 603 MSCs at 95\% identity (Figure 5). The proportion of perfectly aligned minicircle sequences (i.e. alignments without any insertion/deletion) during the clustering process decreased from 100\% (only perfect alignments) at 100\% identity to 79\% (79\% of the alignments were perfect) at 97\% identity and 68\% at 95\% identity (Figure 5). \\ \begin{lstlisting} ucs <- msc.uc(files = system.file("extdata", exData$ucs, package = "rKOMICS")) c(ucs$MSCs["100"],ucs$MSCs["97"],ucs$MSCs["95"]) \end{lstlisting} <>= ucs <- msc.uc(files = system.file("extdata", exData$ucs, package = "rKOMICS")) @ <>= c(ucs$MSCs["100"],ucs$MSCs["97"],ucs$MSCs["95"]) @ \begin{lstlisting} ucs$plots$`MSCs and perfect aligments` \end{lstlisting} <>= ucs$plots$`MSCs and perfect aligments` @ \emph{\\Figure 5: Number of MSCs (blue) and proportion of perfect alignments (red) as obtained following clustering analyses for a range of percent identities.\\} While insertions were mostly 1 bp long (Figure 6; left), the number of insertions per alignment increased with decreasing percent identity (Figure 6; right). Most notably, below 97\% identity, we found a steady increase in alignments with 3 or 4 insertions (Figure 6; right). Similar results were obtained for deletions (results not shown). Hence, we decided to focus most of our downstream analyses at the 97\% identity threshold, as this would capture sufficient minicircle sequence classes (Figure 5) while minimizing the number of alignment gaps (Figure 6).\\ \begin{lstlisting} ucs$plots$insertions \end{lstlisting} <>= ucs$plots$insertions @ \emph{\\Figure 6: Length and number of insertions in MSC alignments following clustering analysis for a range of percent identities.\\} \section{Build MSC matrix} Clustering based on a percent identity, performed with the VSEARCH tool, will generate files in uc format. The \texttt{msc.matrix} function will transform every input file into a cluster matrix. The columns of the matrix correspond to the samples and the rows of the matrix correspond to the minicircle sequence cluster (MSC). The absence of a MSC in a sample is indicated with the value of zero while the presence of a MSC in a sample will be indicated with a value >= 1.\\ \begin{lstlisting} matrices <- msc.matrix(files = system.file("extdata", exData$ucs, package = "rKOMICS"), samples = sort(exData$samples), groups = exData$species[order(exData$samples)]) ### or: data(matrices, package = "rKOMICS") # rowSums(matrices[["id97"]]) # --> frequency of MSC across all samples colSums(matrices[["id97"]]) # --> number of MSC per sample \end{lstlisting} <>= data(matrices, package = "rKOMICS") colSums(matrices[["id97"]]) # --> number of MSC per sample @ The \texttt{msc.heatmap} function generates a heatmap of the input cluster matrix that summarizes the presence or absence of Minicircle Cluster Sequences (MCSs) between groups of samples.\\ \begin{lstlisting} msc.heatmap(clustmatrix = matrices[["id97"]], groups = exData$species, samples = exData$samples) \end{lstlisting} <>= msc.heatmap(clustmatrix = matrices[["id97"]], groups = exData$species, samples = exData$samples) @ \emph{\\Figure 7: Heatmap of the MSC matrix at a percent identity of 97.\\} \section{MSC Richness} Focusing on the results at 97\% identity, we observed that the Andean and near-clonal \emph{L. peruviana} parasites harbored substantially less MSCs (mean = 62 MSCs per isolate) compared to the Amazonian and recombining \emph{L. braziliensis} parasites (mean = 124 MSCs per isolate).\\ \begin{lstlisting} richness <- msc.richness(matrices, samples = exData$samples, groups = exData$species) apply(richness$table[which(richness$table$group=="L. peruviana"),-(1:2)], 2, mean) \end{lstlisting} <>= richness <- msc.richness(matrices, samples = exData$samples, groups = exData$species) apply(richness$table[which(richness$table$group=="L. peruviana"),-(1:2)], 2, mean) @ \begin{lstlisting} apply(richness$table[which(richness$table$group=="L. braziliensis"),-(1:2)], 2, mean) \end{lstlisting} <>= apply(richness$table[which(richness$table$group=="L. braziliensis"),-(1:2)], 2, mean) @ \begin{lstlisting} apply(richness$table[which(richness$table$group=="hybrid"),-(1:2)], 2, mean) \end{lstlisting} <>= apply(richness$table[which(richness$table$group=="hybrid"),-(1:2)], 2, mean) @ \begin{lstlisting} richness$plot \end{lstlisting} <>= richness$plot @ \emph{\\Figure 8: Number of MSC per isolate across different percent identities. \\} \section{Similarity} Using the function \texttt{msc.similarity}, we found that 48.9\% and 25.9\% of the MSCs were unique to \emph{L. braziliensis} and \emph{L. peruviana}, respectively, while hybrid \emph{L. braziliensis} x \emph{L. peruviana} parasites shared MSCs with both parents (Figure 9). This confirms that hybrid parasites inherited minicircles from both Leishmania parental species. We also confirmed that the Andean and near-clonal \emph{L. peruviana} parasites harbored substantially less MSCs (mean = 62 MSCs per isolate) compared to the Amazonian and recombining \emph{L. braziliensis} parasites (mean = 124 MSCs per isolate). \\ \begin{lstlisting} sim <- msc.similarity(matrices, samples = exData$samples, groups = exData$species) sim$relfreq.plot + scale_fill_viridis(discrete = TRUE) \end{lstlisting} <>= sim <- msc.similarity(matrices, samples = exData$samples, groups = exData$species) sim$relfreq.plot + scale_fill_viridis(discrete = TRUE) @ \emph{\\Figure 9: Barplots show the proportion of minicircle sequence classes that are unique or shared between L. braziliensis, L. peruviana and their hybrids, for each \% identity threshold used during the clustering analyses.\\} \begin{lstlisting} c(sim$relfreq$id97["2"]*100, sim$relfreq$id97["3"]*100) \end{lstlisting} <>= c(sim$relfreq$id97["2"]*100, sim$relfreq$id97["3"]*100) @ \section{PCA} Principal Component Analysis based on minicircle sequence similarity (i.e. MSC presence/absence per isolate) separated \emph{L. braziliensis} from \emph{L. peruviana} on the first axis and three \emph{L. peruviana} populations on the second axis (Figure 10). The three \emph{L. peruviana} populations correspond to the Porculla lineage that circulates in the tropical deciduous forests of Peru, and the two Surco lineages that circulate in desert shrubland on the Pacific Coast (Surco North/Central and Surco Central/South). Hybrids did not cluster with either parental species, in contrast to what was observed for the uniparentally inherited kinetoplast, but instead occupied an intermediate position between \emph{L. braziliensis} and the \emph{L. peruviana} Surco Central/South lineage (Figure 10), again consistent with mixing of the parental minicircle populations.\\ \begin{lstlisting} res.pca <- lapply(matrices, function(x) msc.pca(x, samples = exData$samples, groups = exData$species, n=30, labels=FALSE, title=NULL)) res.pca$id97$plot \end{lstlisting} <>= res.pca <- lapply(matrices, function(x) msc.pca(x, samples = exData$samples, groups = exData$species, n=30, labels=FALSE, title=NULL)) res.pca$id97$plot @ \emph{\\Figure 10: Principal Component Analysis based on sequence similarity between MSCs at 97\% identity.\\} \end{document}