--- title: "FASTA p1875 db10 / WU160118 / R444589 - Postprocessing / Summary" author: - name: Christian Panse affiliation: Functional Genomics Center Zurich email: cp@fgcz.ethz.ch - name: Bernd Roschitzki affiliation: Functional Genomics Center Zurich package: NestLink bibliography: - NestLink.bib output: BiocStyle::html_document abstract: | This vignette contains code snippets to display results of the NestLink project (p1875). Briefly, it analyzes the amno acid frequencies of the in-silico composed and measured flycodes. vignette: | %\VignetteIndexEntry{2. Analyze flycode detectability using ESP and SSRC prediction} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} date: '2017-11-02, 2017-11-03, 2017-11-04, 2017-11-06, 2017-11-08, 2018-01-30, 2018-02-01, 2018-12-13' --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` # Load requiered R packages ```{r message=FALSE} library(NestLink) ``` # Display ESP ESP_Prediction was generated using an application https://genepattern.broadinstitute.org @pmid19169246. ```{r fig.retina=1} library(ggplot2) ESP <- rbind(getFC(), getNB()) p <- ggplot(ESP, aes(x = ESP_Prediction, fill = cond)) + geom_histogram(bins = 50, alpha = 0.4, position="identity") + labs(x = "detectability in LC-MS (ESP prediction)") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black")) print(p) ESP <- rbind(getFC(), NB.unambiguous(getNB())) p <- ggplot(ESP, aes(x = ESP_Prediction, fill = cond)) + geom_histogram(bins = 50, alpha = 0.4, position="identity") + labs(x = "detectability in LC-MS (ESP prediction)") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black")) print(p) ESP <- rbind(getFC(), NB.unique(getNB())) p <- ggplot(ESP, aes(x = ESP_Prediction, fill = cond)) + geom_histogram(bins = 50, alpha = 0.4, position="identity") + labs(x = "detectability in LC-MS (ESP prediction)") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black")) print(p) ESP <- rbind(getNB(), NB.unambiguous(getNB()), NB.unique(getNB())) p <- ggplot(ESP, aes(x = ESP_Prediction, fill = cond)) + geom_histogram(bins = 50, alpha = 0.4, position="identity") + labs(x = "detectability in LC-MS (ESP prediction)") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black")) print(p) table(ESP$cond) ``` # Charge State Frequency ## Load workunit 160118 ```{r} # library(ExperimentHub) # eh <- ExperimentHub(); # load(query(eh, c("NestLink", "WU160118.RData"))[[1]]) load(getExperimentHubFilename("WU160118.RData")) WU <- WU160118 ``` filtering ```{r} PATTERN <- "^GS[ASTNQDEFVLYWGP]{7}(WR|WLTVR|WQEGGR|WLR|WQSR)$" idx <- grepl(PATTERN, WU$pep_seq) WU <- WU[idx & WU$pep_score > 25,] ``` determine charge state frequency through counting ```{r} WU$chargeInt <- as.integer(substr(WU$query_charge, 0, 1)) table(WU$chargeInt) ``` in percent ```{r} round(100 * (table(WU$chargeInt) / nrow(WU)), 1) ``` as histograms ```{r fig.retina=1, fig.height=8} library(scales) ggplot(WU, aes(x = moverz * chargeInt, fill = (query_charge), colour = (query_charge))) + facet_wrap(~ datfilename, ncol = 2) + geom_histogram(binwidth= 10, alpha=.3, position="identity") + labs(title = "Precursor mass to charge frequency plot", subtitle = "Plotting frequency of precursor masses for each charge state", x = "Precursor mass [neutral mass]", y = "Frequency [counts]", fill = "Charge State", colour = "Charge State") + scale_x_continuous(breaks = pretty_breaks(8)) + theme_light() ``` We confirmed this prediction by experimental data and found that 99.9 percent of flycode precursor ions correspond to doubly charge species (data not shown). The omission of positively charged residues is also critical in order to render trypsin a site-specific protease. ## FlyCode Mass Distribution ```{r} WU$suffix <- substr(WU$pep_seq, 10, 100) ggplot(WU, aes(x = moverz * chargeInt, fill = suffix, colour = suffix)) + geom_histogram(binwidth= 10, alpha=.2, position="identity") + #facet_wrap(~ substr(pep_seq, 10, 100)) + theme_light() ``` ```{r} ggplot(WU, aes(x = moverz * chargeInt, fill = suffix)) + geom_histogram(binwidth= 10, alpha=.9, position="identity") + facet_wrap(~ substr(pep_seq, 10, 100)) + theme_light() ``` ## Mascot Ion Score Distribution ```{r} ggplot(WU, aes(x = pep_score, fill = query_charge, colour = query_charge)) + geom_histogram(binwidth= 2, alpha=.5, position="identity") + facet_wrap(~ substr(pep_seq, 10, 100)) + theme_light() ``` # Compare Real and in-silico FlyCodes Reads the flycodes from p1875 db10 fasta file ```{r} # FC <- read.table(system.file("extdata/FC.tryptic", package = "NestLink"), # header = TRUE) FC <- getFC() FC$peptide <- (as.character(FC$peptide)) idx <- grep (PATTERN, FC$peptide) FC$cond <- "FC" FC$pim <- parentIonMass(FC$peptide) FC <- FC[nchar(FC$peptide) >2, ] FC$ssrc <- sapply(FC$peptide, ssrc) FC$peptideLength <- nchar(as.character(FC$peptide)) FC <- FC[idx,] head(FC) ``` ## AA design frequency ```{r} aa_pool_x7 <- c(rep('A', 18), rep('S', 6), rep('T', 12), rep('N', 1), rep('Q', 1), rep('D', 11), rep('E', 11), rep('V', 12), rep('L', 2), rep('F', 1), rep('Y', 4), rep('W', 1), rep('G', 8), rep('P', 12)) aa_pool_x7.post <- c(rep('WR', 20), rep('WLTVR', 20), rep('WQEGGR', 20), rep('WLR', 20), rep('WQSR', 20)) ``` ```{r} FC.pep_seq.freq <- table(unlist(strsplit(substr(FC$peptide, 3, 9), ""))) FC.freq <- round(FC.pep_seq.freq / sum(FC.pep_seq.freq) * 100, 3) FC.pep_seq.freq.post <- table(unlist((substr(FC$peptide, 10, 100)))) FC.freq.post <- round(FC.pep_seq.freq.post / sum(FC.pep_seq.freq.post) * 100, 3) WU.pep_seq.freq <- table(unlist(strsplit(substr(WU$pep_seq, 3, 9), ""))) WU.freq <- round(WU.pep_seq.freq / sum(WU.pep_seq.freq) * 100,3) WU.pep_seq.freq.post <- table(unlist(substr(WU$pep_seq, 10, 100))) WU.freq.post <- round(WU.pep_seq.freq.post / sum(WU.pep_seq.freq.post) *100,3) ``` ```{r} table(aa_pool_x7) FC.freq WU.freq ``` ```{r} AAfreq1 <- data.frame(aa = table(aa_pool_x7)) names(AAfreq1) <- c('AA', 'freq') AAfreq1 <-rbind(AAfreq1, df<-data.frame(AA=c('C','H','I','M','K','R'), freq=rep(0,6))) AAfreq1$cond <- 'theoretical frequency' AAfreq2 <- data.frame(aa=FC.freq) names(AAfreq2) <- c('AA', 'freq') AAfreq2$cond <- "db10.fasta" AAfreq3 <- data.frame(aa=WU.freq) names(AAfreq3) <- c('AA', 'freq') AAfreq3$cond <- "WU" #FC.all <- read.table(system.file("extdata/FC.tryptic", package = "NestLink"), # header = TRUE) FC.all <- getFC() PATTERN.all <- "^GS[A-Z]{7}(WR|WLTVR|WQEGGR|WLR|WQSR)$" FC.all <- as.character(FC.all$peptide)[grep(PATTERN.all, as.character(FC.all$peptide))] FC.all.pep_seq.freq <- table(unlist(strsplit(substr(FC.all, 3, 9), ""))) FC.all.freq <- round(FC.all.pep_seq.freq / sum(FC.all.pep_seq.freq) * 100, 3) FC.all.freq[20] <- 0 names(FC.all.freq) <- c(names(FC.all.freq)[1:19], "K") AAfreq4 <- data.frame(AA=names(FC.all.freq), freq=as.numeric(FC.all.freq)) AAfreq4$cond <- "sequenced frequency" AAfreq <- do.call('rbind', list(AAfreq1, AAfreq2, AAfreq3, AAfreq4)) AAfreq$freq <- as.numeric(AAfreq$freq) ``` ```{r fig.retina=1} ggplot(data=AAfreq, aes(x=AA, y=freq, fill=cond)) + geom_bar(stat="identity", position="dodge") + labs(title = "amino acid frequency plot") + theme_light() ``` ```{r} AAfreq1.post <- data.frame(aa=table(aa_pool_x7.post)) names(AAfreq1.post) <- c('AA', 'freq') AAfreq1.post$cond <- "aa_pool_x7" AAfreq2.post <- data.frame(aa=FC.freq.post) names(AAfreq2.post) <- c('AA', 'freq') AAfreq2.post$cond <- "db10.fasta" AAfreq3.post <- data.frame(aa=WU.freq.post) names(AAfreq3.post) <- c('AA', 'freq') AAfreq3.post$cond <- "WU" AAfreq.post <- do.call('rbind', list(AAfreq1.post, AAfreq2.post, AAfreq3.post)) ``` ```{r NestLink_AAFreqPercent, fig.retina=1, fig.path='Figs/'} df<-AAfreq[(AAfreq$cond != 'WU' & AAfreq$cond != 'db10.fasta'), ] df$freq<- as.numeric(df$freq) ggplot(data=df, aes(x=AA, y=freq, fill=cond)) + geom_bar(stat="identity", position="dodge") + labs(title = "Amino acid frequency plot") + ylab("Frequency [%]") + theme_light() ``` ```{r NestLink_FCAAfreqAbsolut, fig.retina=1, fig.path='Figs/'} FCfreq <- data.frame(table(unlist(strsplit(substr(FC$peptide, 3, 9), "")))) colnames(FCfreq) <- c('AA', 'freq') ggplot(data=FCfreq, aes(x=AA, y=freq)) + geom_bar(stat="identity", position="dodge") + labs(title = "Amino acid frequency plot") + theme_light() ``` ```{r fig.retina=1} ggplot(data=AAfreq.post, aes(x=AA, y=freq, fill=cond)) + geom_bar(stat="identity", position="dodge") + labs(title = "suffix frequency plot") + theme_light() ``` ## SSRC Prediction vs. Measured Retention Time The RT is predicted using the method described in @pmid15238601 and implemented using the `r Biocpkg("specL")` package @pmid25712692. ```{r fig.retina=1, fig.height=12} WU$ssrc <- sapply(as.character(WU$pep_seq), ssrc) WU$suffix <- substr(WU$pep_seq, 10,100) ggplot(WU, aes(x = RTINSECONDS, y = ssrc)) + geom_point(aes(alpha=pep_score, colour = datfilename)) + facet_wrap(~ datfilename * suffix, ncol = 5, nrow = 8) + geom_smooth(method = 'lm') ``` ```{r} cor(WU$RTINSECONDS, WU$ssrc, method = 'spearman') ``` ## SSRC and Retention Time Distribution Considers only Mascot Result File F255737 ```{r} WU <- WU160118 PATTERN <- "^GS[ASTNQDEFVLYWGP]{7}(WR|WLTVR|WQEGGR|WLR|WQSR)$" idx <- grepl(PATTERN, WU$pep_seq) # Mascot score cut-off valye 25 WU <- WU[idx & WU$pep_score > 25,] ``` ```{r fig.retina=1} WU$suffix <- substr(WU$pep_seq, 10, 100) WU$ssrc <- sapply(as.character(WU$pep_seq), ssrc) ggplot(WU, aes(x = RTINSECONDS, fill = datfilename)) + geom_histogram(bins = 50) ggplot(WU, aes(x = ssrc, fill = datfilename)) + geom_histogram(bins = 50) ggplot(WU, aes(x = RTINSECONDS, fill = suffix)) + geom_histogram(bins = 50) ggplot(WU, aes(x = ssrc, fill = suffix)) + geom_histogram(bins = 50) ``` ```{r} WU <- WU[WU$datfilename == "F255737",] WU <- aggregate(WU$RTINSECONDS ~ WU$pep_seq, FUN = min) names(WU) <-c("pep_seq", "RTINSECONDS") WU$suffix <- substr(WU$pep_seq, 10, 100) WU$peptide_mass <- parentIonMass(as.character(WU$pep_seq)) WU$ssrc <- sapply(as.character(WU$pep_seq), ssrc) WU$datfilename <- "F255737" ``` ```{r fig.retina=1, fig.width=12, fig.height=4} ggplot(WU, aes(x = RTINSECONDS, y = peptide_mass)) + geom_point(aes(colour = suffix)) + facet_wrap(~ datfilename * suffix, ncol=5) ggplot(WU, aes(x = RTINSECONDS, fill = suffix)) + geom_histogram(bins = 20) + facet_wrap(~ datfilename * suffix, ncol=5) ``` ```{r fig.retina=1, fig.width=12, fig.height=4} ggplot(WU, aes(x = ssrc, y = peptide_mass)) + geom_point(aes(colour = suffix)) + facet_wrap(~ datfilename * suffix, ncol = 5) ggplot(WU, aes(x = ssrc,fill = suffix)) + geom_histogram(bins = 20) + facet_wrap(~ datfilename * suffix, ncol=5) ``` ```{r fig.retina=1, fig.width=8, fig.height=4} ggplot(WU, aes(x = RTINSECONDS, y = peptide_mass)) + geom_point(aes(colour = suffix), size = 3.0, alpha = 0.1) ``` ```{r fig.retina=1, fig.width=8, fig.height=4} ggplot(WU, aes(x = ssrc, y = peptide_mass)) + geom_point(aes(colour = suffix), size = 3.0, alpha = 0.1) ``` ## Mass vs. RT Panels - Mascot Ion Score cut-off ```{r fig.retina=1, fig.height=12} WU <- do.call('rbind', lapply(c(25,35,45), function(cutoff){ WU <- WU160118 PATTERN <- "^GS[ASTNQDEFVLYWGP]{7}(WR|WLTVR|WQEGGR|WLR|WQSR)$" idx <- grepl(PATTERN, WU$pep_seq) WU <- WU[idx & WU$pep_score > cutoff, ] WU <- WU[WU$datfilename == "F255737",] WU <- aggregate(WU$RTINSECONDS ~ WU$pep_seq, FUN=min) names(WU) <-c("pep_seq","RTINSECONDS") WU$suffix <- substr(WU$pep_seq, 10, 100) WU$peptide_mass <- parentIonMass(as.character(WU$pep_seq)) WU$ssrc <- sapply(as.character(WU$pep_seq), ssrc) WU$datfilename <- "F255737" WU$mascotScoreCutOff <- cutoff WU})) ggplot(WU, aes(x = RTINSECONDS, y = peptide_mass)) + geom_point(aes(colour = suffix), size = 3.0, alpha = 0.1) + facet_wrap(~ mascotScoreCutOff, ncol = 1) ggplot(WU, aes(x = ssrc, y = peptide_mass)) + geom_point(aes(colour = suffix), size = 3.0, alpha = 0.1) + facet_wrap(~ mascotScoreCutOff, ncol = 1 ) ``` # Session info Here is the output of the `sessionInfo()` command. ```{r sessionInfo, echo=FALSE} sessionInfo() ``` # References