## ---- include = FALSE, setup--------------------------------------------------
# Chunk options used for every chunk, plus the path to the example MultiQC
# JSON report that ships inside the TidyMultiqc package.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  cols.print = 3
)
multiqc_data_path <- system.file(
  "extdata",
  "wgs/multiqc_data.json",
  package = "TidyMultiqc"
)

## ---- eval=FALSE--------------------------------------------------------------
#  install.packages("TidyMultiqc")

## ---- eval=FALSE--------------------------------------------------------------
#  library(TidyMultiqc)

## ----paged.print=TRUE---------------------------------------------------------
# Load the report with the default arguments and print the result.
df <- TidyMultiqc::load_multiqc(multiqc_data_path)
df

## -----------------------------------------------------------------------------
# Request only the "raw" section.
TidyMultiqc::load_multiqc(multiqc_data_path, sections = "raw")

## -----------------------------------------------------------------------------
# Request the "raw" and "general" sections together and count the columns.
df_both <- TidyMultiqc::load_multiqc(
  multiqc_data_path,
  sections = c("raw", "general")
)
ncol(df_both)

## -----------------------------------------------------------------------------
# magrittr provides the `%>%` pipe used throughout the rest of the script.
library(magrittr)

# Bar chart of percent duplication per sample.
df %>%
  ggplot2::ggplot(
    ggplot2::aes(x = metadata.sample_id, y = general.percent_duplication)
  ) +
  ggplot2::geom_col()

## -----------------------------------------------------------------------------
# One-sample t-test of the GC percentage column against mu = 41.
t.test(df$general.percent_gc, mu = 41)

## -----------------------------------------------------------------------------
# Custom metadata: derive two named values from the sample ID.
TidyMultiqc::load_multiqc(
  multiqc_data_path,
  find_metadata = function(sample, parsed) {
    # Split the sample ID to obtain some metadata
    segments <- stringr::str_split(sample, "_")[[1]]
    c(
      batch = segments[[1]],
      sample = segments[[2]]
    )
  }
)

## -----------------------------------------------------------------------------
# Custom metadata: derive values from the FastQC file name recorded in the
# parsed report.
TidyMultiqc::load_multiqc(
  multiqc_data_path,
  find_metadata = function(sample, parsed) {
    # This gives us the path to the fastqc output file
    filepath <- parsed$report_data_sources$FastQC$all_sections[[sample]]
    # Split into path segments
    path_segments <- stringr::str_split(filepath, "/")[[1]]
    # The filename is the last path segment
    filename <- dplyr::last(path_segments)
    # Split the filename using dots and underscores
    name_segments <- stringr::str_split(filename, "[_\\.]")[[1]]
    # Arbitrarily assign names for the outputs.
    # seq_along() (rather than 1:length()) stays correct for empty input.
    name_segments %>%
      purrr::set_names(LETTERS[seq_along(name_segments)])
  }
)

## -----------------------------------------------------------------------------
# Custom metadata: pull two top-level fields straight out of the parsed report.
TidyMultiqc::load_multiqc(
  multiqc_data_path,
  find_metadata = function(sample, parsed) {
    parsed[c(
      "config_creation_date",
      "config_version"
    )]
  }
)

## ----message=FALSE, warning=FALSE---------------------------------------------
# Keep only the columns whose name contains "quality".
df_both %>%
  dplyr::select(dplyr::contains("quality"))

## ---- eval = FALSE------------------------------------------------------------
#  TidyMultiqc::list_plots(multiqc_data_path)

## ---- echo = FALSE------------------------------------------------------------
# Same call as the chunk above, with every column truncated to 50 characters
# for display.
TidyMultiqc::list_plots(multiqc_data_path) %>%
  dplyr::mutate(
    dplyr::across(dplyr::everything(), ~ stringr::str_trunc(., 50))
  )

## -----------------------------------------------------------------------------
# Load a single named plot; note that `df` is reassigned here.
df <- TidyMultiqc::load_multiqc(
  multiqc_data_path,
  sections = "plot",
  plots = "fastqc_per_sequence_quality_scores_plot"
)
df

## -----------------------------------------------------------------------------
# Inspect the first element of the plot column.
df$plot.fastqc_per_sequence_quality_scores_plot[[1]]

## -----------------------------------------------------------------------------
# Expand the nested plot column into regular rows.
df %>%
  tidyr::unnest(cols = plot.fastqc_per_sequence_quality_scores_plot)

## -----------------------------------------------------------------------------
# Sum the `y` values per sample after unnesting.
df %>%
  tidyr::unnest(cols = plot.fastqc_per_sequence_quality_scores_plot) %>%
  dplyr::group_by(metadata.sample_id) %>%
  dplyr::summarise(total_reads = sum(y))

## -----------------------------------------------------------------------------
# Same total, computed by mapping over the nested column instead of unnesting;
# the plot column is dropped afterwards.
df %>%
  dplyr::mutate(
    total_reads = purrr::map_dbl(
      plot.fastqc_per_sequence_quality_scores_plot,
      ~ sum(.$y)
    ),
    plot.fastqc_per_sequence_quality_scores_plot = NULL
  )

## -----------------------------------------------------------------------------
# Summary statistics via HistDat, using the unnest-and-group approach: build
# one HistDat object per sample from the x/y columns, summarise it, and keep a
# single row per sample.
df %>%
  tidyr::unnest(cols = plot.fastqc_per_sequence_quality_scores_plot) %>%
  dplyr::group_by(metadata.sample_id) %>%
  dplyr::mutate(
    hist = list(HistDat::HistDat(vals = x, counts = y)),
    .keep = "unused"
  ) %>%
  dplyr::mutate(
    mean_coverage = hist %>% dplyr::first() %>% mean(),
    median_coverage = hist %>% dplyr::first() %>% median(),
    max_coverage = hist %>% dplyr::first() %>% max(),
    hist = NULL
  ) %>%
  dplyr::slice(1) %>%
  # Drop the grouping so the result matches the ungrouped output of the
  # map-based version below.
  dplyr::ungroup()

## -----------------------------------------------------------------------------
# Same statistics, computed by mapping over the nested column; the unnamed
# data frame returned by map_dfr() is spliced into the result as new columns.
df %>%
  dplyr::mutate(
    purrr::map_dfr(
      plot.fastqc_per_sequence_quality_scores_plot,
      function(plot_df) {
        hist <- HistDat::HistDat(vals = plot_df$x, counts = plot_df$y)
        list(
          mean_coverage = mean(hist),
          median_coverage = median(hist),
          max_coverage = max(hist)
        )
      }
    ),
    plot.fastqc_per_sequence_quality_scores_plot = NULL
  )

## -----------------------------------------------------------------------------
# Supply a custom parser for the "xy_line" plot type.
TidyMultiqc::load_multiqc(
  multiqc_data_path,
  sections = "plot",
  plots = "fastqc_per_sequence_quality_scores_plot",
  plot_parsers = list(
    # This fake parser function takes a plot and just returns the iris dataset
    xy_line = function(plot_data, name) {
      list(
        sample_1 = list(
          plot_name = list(iris)
        )
      )
    }
  )
)