## ----load_libraries----------------------------------------------------------- suppressPackageStartupMessages({ library("QFeatures") library("dplyr") library("tidyr") library("ggplot2") library("msqrob2") library("stringr") library("ExploreModelMatrix") library("kableExtra") library("ComplexHeatmap") library("scater") }) ## ----file_location------------------------------------------------------------ peptideFile <- "https://github.com/statOmics/msqrob2data/raw/refs/heads/main/dda/cptacAvsB_lab3/peptides.txt" ## ----import_data-------------------------------------------------------------- peptides <- data.table::fread(peptideFile, check.names = TRUE) head(peptides) ## ----extract_quantCols-------------------------------------------------------- quantCols <- grep("Intensity[.]", names(peptides), value = TRUE) ## ----add_species_info--------------------------------------------------------- peptides <- peptides |> mutate(species = grepl(pattern = "UPS",Proteins) |> as.factor() |> recode("TRUE"="ups","FALSE" = "yeast")) ## ----make_annotation---------------------------------------------------------- (annot <- data.frame(quantCols = quantCols) |> #1. mutate(sampleId = gsub("Intensity[.]6","", quantCols), #2. condition = substr(sampleId,1,1), #3. rep = substr(sampleId, 3, 3)) #4. ) ## ----convert_to_QFeatures----------------------------------------------------- (qf <- readQFeatures(assayData = peptides, colData = annot, name = "peptides", fnames = "Sequence") ) ## ----convert0_to_NA----------------------------------------------------------- qf <- zeroIsNA(qf, "peptides") # convert 0 to NA ## ----log2_transformation------------------------------------------------------ qf <- logTransform(qf, base = 2, i = "peptides", name = "peptides_log") ## ----failed_protein_inference_filtering--------------------------------------- qf <- filterFeatures( qf, ~ Proteins != "" & ## Remove failed protein inference !grepl(";", Proteins)) ## Remove protein groups ## ----filtering_decoys_contaminants-------------------------------------------- qf <- filterFeatures(qf, ~ Reverse != "+" & ## Remove decoys Potential.contaminant != "+") ## Remove contaminants ## ----assess_NA_features------------------------------------------------------- nNaRes <- nNA(qf, "peptides") range(nNaRes$nNAcols$pNA) ## ----plot_NA_features--------------------------------------------------------- data.frame(nNaRes$nNArows) |> ggplot() + aes(x = nNA) + geom_histogram() + ggtitle("Number of missing values for each peptide") + theme_bw() ## ----filter_features_with_many_NA--------------------------------------------- nObs <- 3 n <- ncol(qf[["peptides_log"]]) (qf <- filterNA(qf, i = "peptides_log", pNA = (n - nObs) / n)) ## ----normalisation_needed----------------------------------------------------- qf[, , "peptides_log"] |> #1. longForm(colvars = colnames(colData(qf))) |> #2. data.frame() |> filter(!is.na(value)) |> ggplot() + #3. aes(x = value, colour = condition, group = colname) + geom_density() + theme_minimal() ## ----normalize_peptides------------------------------------------------------- qf <- sweep( #Subtract log2 norm factor column-by-column (MARGIN = 2) qf, MARGIN = 2, STATS = nfLogMedian(qf,"peptides_log"), i = "peptides_log", name = "peptides_norm" ) ## ----assess_normalisation----------------------------------------------------- qf[, , "peptides_norm"] |> longForm(colvars = colnames(colData(qf))) |> data.frame() |> filter(!is.na(value)) |> ggplot() + aes(x = value, colour = condition, group = colname) + geom_density() + labs(subtitle = "Normalised log2 precursor intensities") + theme_minimal() ## ----summarize_peptides_to_proteins, warning=FALSE---------------------------- qf <- aggregateFeatures(qf, i = "peptides_norm", fcol = "Proteins", na.rm = TRUE, name = "proteins" ) ## ----qc_normalisation_peptides------------------------------------------------ qf[, , "peptides_norm"] |> longForm(colvars = colnames(colData(qf))) |> data.frame() |> filter(!is.na(value)) |> ggplot() + aes(x = value, colour = condition, group = colname) + geom_density() + theme_minimal() + labs(subtitle = "Normalised log2 precursor intensities") ## ----qc_normalisation_peptides_boxplot---------------------------------------- qf[, , "peptides_norm"] |> longForm(colvars = colnames(colData(qf))) |> data.frame() |> filter(!is.na(value)) |> ggplot() + aes(x = sampleId, y = value, colour = condition, group = colname) + xlab("sample") + geom_boxplot() + theme_minimal() + labs(subtitle = "Normalised log2 precursor intensities") ## ----qc_normalisation_proteins------------------------------------------------ qf[, , "proteins"] |> longForm(colvars = colnames(colData(qf))) |> data.frame() |> filter(!is.na(value)) |> ggplot() + aes(x = value, colour = condition, group = colname) + geom_density() + theme_minimal() + labs(subtitle = "Normalised log2 protein intensities") ## ----qc_normalisation_proteins_boxplot---------------------------------------- qf[, , "proteins"] |> longForm(colvars = colnames(colData(qf))) |> data.frame() |> filter(!is.na(value)) |> ggplot() + aes(x = sampleId, y = value, colour = condition, group = colname) + xlab("sample") + geom_boxplot() + theme_minimal() + labs(subtitle = "Normalised log2 protein intensities") ## ----qc_identifications------------------------------------------------------- qf[,,"peptides_norm"] |> longForm(colvars = colnames(colData(qf)), rowvars= c("Sequence", "Proteins")) |> data.frame() |> filter(!is.na(value)) |> group_by(condition, sampleId) |> summarise(Precursors = length(unique(Sequence)), `Protein Groups` = length(unique(Proteins))) |> pivot_longer(-(1:2), names_to = "Feature", values_to = "IDs") |> ggplot(aes(x = sampleId, y = IDs, fill = condition)) + geom_col() + #scale_fill_observable() + facet_wrap(~Feature, scales = "free_y") + labs(title = "Peptide and protein group identificiations per sample", x = "Sample", y = "Identifications") + theme_bw() + theme(axis.text.x = element_text(angle = 90)) ## ----qc_mds_proteins---------------------------------------------------------- getWithColData(qf, "proteins") |> as("SingleCellExperiment") |> runMDS(exprs_values = 1) |> plotMDS(colour_by = "condition", shape_by = "rep", point_size = 3) ## ----qc_correlation_proteins-------------------------------------------------- corMat <- qf[["proteins"]] |> assay() |> cor(method = "spearman", use = "pairwise.complete.obs") colnames(corMat) <- qf$sampleId rownames(corMat) <- qf$sampleId corMat |> ggcorrplot::ggcorrplot() + scale_fill_viridis_c() + labs(title = "Correlation matrix", fill = "Correlation") + theme(axis.text.x = element_text(angle = 90)) ## ----define_model------------------------------------------------------------- model <- ~ condition ## ----fit_models, warning=FALSE------------------------------------------------ qf <- msqrob( qf, i = "proteins", formula = model, robust = TRUE) ## ----print_models------------------------------------------------------------- models <- rowData(qf[["proteins"]])[["msqrobModels"]] models[1:3] ## ----explore_model_matrix----------------------------------------------------- vd <- ExploreModelMatrix::VisualizeDesign( sampleData = colData(qf), designFormula = model, textSizeFitted = 4 ) vd$plotlist ## ----make_contrasts----------------------------------------------------------- (L <- makeContrast( "conditionB = 0", parameterNames = colnames(vd$designmatrix) )) ## ----hypthesis_tests---------------------------------------------------------- qf <- hypothesisTest(qf, i = "proteins", contrast = L) ## ----collect_results---------------------------------------------------------- inferences <- msqrobCollect(qf[["proteins"]], L) head(inferences) ## ----significant_table, results='asis'---------------------------------------- alpha <- 0.05 #1. sigList <- inferences |> filter(adjPval < alpha) #3. kable(sigList) |> kable_styling(full_width = FALSE) |> scroll_box(height = "250px") #4. ## ----volcanoplot-------------------------------------------------------------- inferences |> plotVolcano() ## ----heatmap------------------------------------------------------------------ sig <- sigList |> pull(feature) #1. quants <- assay(qf,"proteins")[sig,] |> t() |> scale() |> t() #2. colnames(quants) <- qf$sampleId #3. annotations <- columnAnnotation(condition = qf$condition) #3. set.seed(1234) ## annotation colours are randomly generated by default Heatmap( quants, name = "log2 intensity", top_annotation = annotations ) #4. ## ----define_alpha------------------------------------------------------------- alpha <- 0.05 ## ----real_logFC--------------------------------------------------------------- realLogFC <- log2(0.74/0.25) ## ----collect_results_species-------------------------------------------------- inferences <- msqrobCollect(qf[["proteins"]], L) |> mutate(species = rowData(qf[["proteins"]])$species) ## ----assess_logFC------------------------------------------------------------- logFC <- inferences |> filter(!is.na(logFC)) |> ggplot(aes(x = species, y = logFC, color = species)) + #1. geom_boxplot() + #2. theme_bw() + scale_color_manual( values = c("grey20", "firebrick"), #3. name = "", labels = c("yeast", "ups") ) + geom_hline(yintercept = 0, color="grey20") + # 4. geom_hline(yintercept = realLogFC, color="firebrick") #5. logFC ## ----assess_TP_FP_FDP--------------------------------------------------------- inferences |> filter(adjPval < alpha) |> summarise("TP" = sum(species == "ups"), "FP" = sum(species != "ups"), "FDP" = mean(species != "ups")) ## ----FDR_TPR_functions-------------------------------------------------------- computeFDP <- function(pval, tp) { ord <- order(pval) fdp <- cumsum(!tp[ord]) / 1:length(tp) fdp[order(ord)] } computeTPR <- function(pval, tp, nTP = NULL) { if (is.null(nTP)) nTP <- sum(tp) ord <- order(pval) tpr <- cumsum(tp[ord]) / nTP tpr[order(ord)] } ## ----calculate_performance---------------------------------------------------- performance <- inferences |> group_by(contrast) |> na.exclude() |> mutate(tpr = computeTPR(pval, species == "ups"), fdp = computeFDP(pval, species == "ups")) |> arrange(fdp) ## ----calculate_workpoints----------------------------------------------------- workPoints <- performance |> group_by(contrast) |> filter(adjPval < 0.05) |> slice_max(pval) ## ----plot_tpr_fdp------------------------------------------------------------- ggplot(performance) + aes( y = fdp, x = tpr, ) + geom_line() + geom_point(data = workPoints, size = 3) + geom_hline(yintercept = 0.05, linetype = 2) + coord_flip(ylim = c(0, 0.2)) + theme_minimal() ## ----sessionInfo-------------------------------------------------------------- sessionInfo()