## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.dim = c(6, 4) ) ## ----setup-------------------------------------------------------------------- library(EZbakR) # Going to show one tidyr trick to help with cB filtering library(tidyr) # Going to use a bit of dplyr magic in my demonstrations library(dplyr) ## ----------------------------------------------------------------------------- simdata <- EZSimulate(nfeatures = 300, nreps = 2) ezbdo <- EZbakRData(simdata$cB, simdata$metadf) ## ----------------------------------------------------------------------------- ezbdo <- EstimateFractions(ezbdo, features = "feature") # NOTE: # For real data from fastq2EZbakR, you will often # want to set `features = "XF"`, specifying analysis # of reads mapping to exonic regions of genes. ## ----------------------------------------------------------------------------- ezbdo <- EstimateFractions(ezbdo, mutrate_populations = "TC") ## ----echo = T, results = 'hide'----------------------------------------------- # Observe contents of cB standard_fraction_design ## ----echo = FALSE, warning = FALSE-------------------------------------------- knitr::kable(standard_fraction_design, "pipe") ## ----results = "hide"--------------------------------------------------------- # Observe contents of cB tilac_fraction_design ## ----echo = FALSE, warning = FALSE-------------------------------------------- knitr::kable(tilac_fraction_design, "pipe") ## ----results = "hide"--------------------------------------------------------- # Three populations for fun: fd_table <- create_fraction_design(c("TC", "GA", "CG")) fd_table ## ----echo = FALSE, warning = FALSE-------------------------------------------- knitr::kable(fd_table, "pipe") ## ----------------------------------------------------------------------------- ezbdo <- EstimateFractions(ezbdo, features = 'feature') ## ----------------------------------------------------------------------------- ezbdo <- EstimateFractions(ezbdo, remove_features = c("__no_feature", "feature1")) ## ----------------------------------------------------------------------------- ezbdo <- EstimateFractions(ezbdo, filter_condition = `|`) ## ----results = 'hide'--------------------------------------------------------- example_df <- data.frame(feature = c('A', NA, 'C'), other_feature = c(NA, 'Y', 'Z')) replaced_df <- replace_na(example_df, list(feature = '__no_feature', other_feature = 'NA')) replaced_df ## ----echo = FALSE, warning = FALSE-------------------------------------------- example_df <- data.frame(feature = c('A', NA, 'C'), other_feature = c(NA, 'Y', 'Z')) replaced_df <- replace_na(example_df, list(feature = '__no_feature', other_feature = 'NA')) knitr::kable(replaced_df, "pipe") ## ----------------------------------------------------------------------------- ezbdo <- EstimateFractions(ezbdo, split_multi_features = TRUE, multi_feature_cols = "feature") ## ----------------------------------------------------------------------------- ezbdo <- CorrectDropout(ezbdo, grouping_factors = "treatment") ## ----------------------------------------------------------------------------- ezbdo <- EstimateFractions(ezbdo, pold_from_nolabel = TRUE) ## ----------------------------------------------------------------------------- ezbdo <- EstimateFractions(ezbdo, pold_from_nolabel = TRUE, grouping_factors = 'treatment') ## ----------------------------------------------------------------------------- ezbdo <- EstimateFractions(ezbdo, strategy = 'hierarchical') ## ----------------------------------------------------------------------------- est <- EZget(ezbdo, type = 'fractions') truth <- simdata$PerRepTruth compare <- dplyr::inner_join(est, truth, by = c('sample', 'feature')) plot(compare$true_fraction_highTC, compare$fraction_highTC) abline(0,1) ## ----------------------------------------------------------------------------- # Simulates a single sample worth of data simdata_iso <- SimulateIsoforms(nfeatures = 300) # We have to manually create the metadf in this case metadf <- data.frame(sample = 'sampleA', tl = 4, condition = 'A') ezbdo <- EZbakRData(simdata_iso$cB, metadf) ## ----------------------------------------------------------------------------- ezbdo <- EstimateFractions(ezbdo) ## ----------------------------------------------------------------------------- ### Hack in the true, simulated isoform levels reads <- simdata_iso$ground_truth %>% dplyr::select(transcript_id, true_count, true_TPM) %>% dplyr::mutate(sample = 'sampleA', effective_length = 10000) %>% dplyr::rename(expected_count = true_count, TPM = true_TPM) # Name of table needs to have "isoform_quant" in it ezbdo[['readcounts']][['simulated_isoform_quant']] <- reads ### Perform deconvolution ezbdo <- EstimateIsoformFractions(ezbdo) ## ----fig.align='center'------------------------------------------------------- est <- EZget(ezbdo, type = 'fractions', features = "transcript_id") truth <- simdata_iso$ground_truth compare <- truth %>% dplyr::inner_join(est, by = c("feature", "transcript_id")) plot(compare$true_fn, compare$fraction_highTC) abline(0,1) ## ----eval=FALSE--------------------------------------------------------------- # library(arrow) # # ### Move into the directory with your cB file # setwd("Path/to/cB/containing/directory") # # # ### This will not load the cB into memory # # You can run `read_csv("cB.csv.gz", n_max = 1)` to check to see what # # the order of the columns are, as this order needs to match your provided # # schema. # ds <- open_dataset("cB.csv.gz", # format = "csv", # schema = schema( # sample = string(), # rname = string(), # GF = string(), # XF = string(), # exon_bin = string(), # bamfile_transcripts = string(), # junction_start = string(), # junction_end = string(), # TC = int64(), # nT = int32(), # sj = bool(), # n = int64() # ), # skip_rows=1) # # # ### Create Arrow dataset # setwd("Path/to/where/you/want/to/write/Arrow/Dataset") # ds %>% # group_by(sample) %>% # write_dataset("fulldataset/", # format = "parquet") ## ----eval = FALSE------------------------------------------------------------- # ds <- arrow::open_dataset("Path/to/where/you/want/to/Arrow/Dataset/") # # # metadf <- tibble(sample = c("WT_1", "WT_2", "WT_ctl", # "KO_1", "KO_2", "KO_ctl"), # tl = c(2, 2, 0, # 2, 2, 0), # genotype = rep(c('WT', 'KO'), each = 3)) # # # ezbado <- EZbakRArrowData(ds, metadf) # ## ----eval = FALSE------------------------------------------------------------- # ezbado <- EstimateFractions(ezbado, # features = c("GF", "XF", # "junction_start", "junction_end"), # filter_cols = c("XF", "junction_start", # "junction_end"), # filter_condition = `|`, # split_multi_features = TRUE, # multi_feature_cols = c("junction_start", # "junction_end")) # # ## ----------------------------------------------------------------------------- library(arrow) simdata <- EZSimulate(nfeatures = 250) outdir <- tempdir() dataset_dir <- file.path(outdir, "arrow_dataset") write_dataset( simdata$cB, path = dataset_dir, format = "parquet", partitioning = "sample" ) ds <- open_dataset(dataset_dir) ezbdo <- EZbakRArrowData(ds, simdata$metadf) ezbdo <- EstimateFractions(ezbdo)