## ----packages----------------------------------------------------------------- library("biomformat"); packageVersion("biomformat") ## ----read-biom-examples------------------------------------------------------- min_dense_file = system.file("extdata", "min_dense_otu_table.biom", package = "biomformat") min_sparse_file = system.file("extdata", "min_sparse_otu_table.biom", package = "biomformat") rich_dense_file = system.file("extdata", "rich_dense_otu_table.biom", package = "biomformat") rich_sparse_file = system.file("extdata", "rich_sparse_otu_table.biom", package = "biomformat") min_dense_file = system.file("extdata", "min_dense_otu_table.biom", package = "biomformat") rich_dense_char = system.file("extdata", "rich_dense_char.biom", package = "biomformat") rich_sparse_char = system.file("extdata", "rich_sparse_char.biom", package = "biomformat") x1 = read_biom(min_dense_file) x2 = read_biom(min_sparse_file) x3 = read_biom(rich_dense_file) x4 = read_biom(rich_sparse_file) x5 = read_biom(rich_dense_char) x6 = read_biom(rich_sparse_char) x1 ## ----accessor-examples-table-------------------------------------------------- biom_data(x1) biom_data(x2) ## ----matrix-coercion---------------------------------------------------------- as(biom_data(x2), "matrix") ## ----observ-meta-------------------------------------------------------------- observation_metadata(x1) observation_metadata(x2) observation_metadata(x3) observation_metadata(x4)[1:2, 1:3] class(observation_metadata(x4)) ## ----plot-examples------------------------------------------------------------ sample_metadata(x1) sample_metadata(x2) sample_metadata(x3) sample_metadata(x4)[1:2, 1:3] class(sample_metadata(x4)) ## ----plot--------------------------------------------------------------------- plot(biom_data(x4)) boxplot(as(biom_data(x4), "vector")) heatmap(as(biom_data(x4), "matrix")) ## ----write-biom-examples------------------------------------------------------ outfile = tempfile() write_biom(x4, outfile) y = read_biom(outfile) identical(x4, y) ## ----compare-files-diff, eval=FALSE------------------------------------------- # # On Unix OSes # system(paste0("diff ", rich_sparse_file, outfile)) # # On windows # system(paste0("FC ", rich_sparse_file, outfile)) ## ----hdf5-available, include=FALSE-------------------------------------------- hdf5_ok <- requireNamespace("rhdf5", quietly = TRUE) ## ----hdf5-read, eval=hdf5_ok-------------------------------------------------- hdf5_file <- system.file("extdata", "rich_sparse_otu_table_hdf5.biom", package = "biomformat") xh <- read_biom(hdf5_file) xh biom_data(xh) ## ----hdf5-write, eval=hdf5_ok------------------------------------------------- hdf5_out <- tempfile(fileext = ".biom") write_hdf5_biom(xh, hdf5_out) xh2 <- read_biom(hdf5_out) # Count matrices are identical identical(biom_data(xh), biom_data(xh2)) # Observation metadata (taxonomy) is preserved identical(observation_metadata(xh), observation_metadata(xh2)) # Sample metadata is preserved identical(sample_metadata(xh), sample_metadata(xh2)) ## ----hdf5-convert, eval=hdf5_ok----------------------------------------------- json_to_hdf5 <- tempfile(fileext = ".biom") write_hdf5_biom(x4, json_to_hdf5) x4_hdf5 <- read_biom(json_to_hdf5) identical(biom_data(x4), biom_data(x4_hdf5)) ## ----tidy-dataframe----------------------------------------------------------- long_df <- as.data.frame(x4) head(long_df) dim(long_df) # n_obs * n_samples rows, feature/sample/count + metadata cols ## ----tidy-colnames------------------------------------------------------------ names(long_df) ## ----tidy-tibble, eval=requireNamespace("tibble", quietly=TRUE)--------------- library(tibble) long_tbl <- as_tibble.biom(x4) long_tbl ## ----purrr-available, include=FALSE------------------------------------------- purrr_ok <- requireNamespace("purrr", quietly = TRUE) && requireNamespace("dplyr", quietly = TRUE) ## ----purrr-summary, eval=purrr_ok--------------------------------------------- library(purrr) library(dplyr) # Total counts per sample (purrr + dplyr) long_df |> group_by(sample_id) |> summarise(total_counts = sum(count), .groups = "drop") |> arrange(desc(total_counts)) ## ----purrr-map, eval=purrr_ok------------------------------------------------- # Per-sample Shannon diversity using purrr::map_dbl long_df |> group_by(sample_id) |> group_split() |> purrr::map_dbl(function(df) { p <- df$count / sum(df$count) p <- p[p > 0] -sum(p * log(p)) }) |> setNames(unique(long_df$sample_id)) ## ----tapply-fallback, eval=!purrr_ok------------------------------------------ # # Base-R equivalent when purrr is not available # tapply(long_df$count, long_df$sample_id, function(counts) { # p <- counts / sum(counts) # p <- p[p > 0] # -sum(p * log(p)) # }) ## ----se-available, include=FALSE---------------------------------------------- se_ok <- requireNamespace("SummarizedExperiment", quietly = TRUE) && requireNamespace("S4Vectors", quietly = TRUE) ## ----se-constructor, eval=se_ok----------------------------------------------- library(SummarizedExperiment) se <- biom_to_SummarizedExperiment(x4) se ## ----se-slots, eval=se_ok----------------------------------------------------- # Count matrix (same as biom_data(x4)) head(assay(se, "counts")) # Sample metadata in colData colData(se)[, 1:3] # Observation (OTU) metadata in rowData rowData(se)[, 1:3] ## ----se-as-coercion, eval=se_ok----------------------------------------------- se2 <- as(x4, "SummarizedExperiment") identical(assay(se, "counts"), assay(se2, "counts")) ## ----make_biom_workflow------------------------------------------------------- # Simulate a small ASV count table (rows = features, cols = samples) set.seed(42) mat <- matrix( sample(0:50, 15, replace = TRUE), nrow = 3, ncol = 5, dimnames = list( paste0("ASV", 1:3), paste0("Samp", 1:5) ) ) # Taxonomy table: one row per feature, one list-valued column "taxonomy" # Each element is a character vector of rank assignments (kingdom -> species). tax <- data.frame( taxonomy = I(list( c("Bacteria", "Firmicutes", "Clostridia", "Clostridiales", "Lachnospiraceae", "Blautia", "NA"), c("Bacteria", "Proteobacteria", "Gammaproteobacteria", "Enterobacteriales", "Enterobacteriaceae", "Escherichia", "coli"), c("Bacteria", "Bacteroidetes", "Bacteroidia", "Bacteroidales", "Bacteroidaceae", "Bacteroides", "fragilis") )), row.names = rownames(mat) ) # Build the biom object x <- make_biom(data = mat, observation_metadata = tax, matrix_element_type = "int") x # Write to a JSON BIOM file and read back tmp <- tempfile(fileext = ".biom") write_biom(x, tmp) y <- read_biom(tmp) # Count data survives the round-trip identical(biom_data(x), biom_data(y)) # Observation metadata is preserved (taxonomy values, not column name) head(observation_metadata(y)) ## ----make_biom_hdf5, eval=requireNamespace("rhdf5", quietly=TRUE)------------- # For large datasets, write HDF5 (BIOM v2) instead hdf5_tmp <- tempfile(fileext = ".biom") write_hdf5_biom(x, hdf5_tmp) z <- read_biom(hdf5_tmp) identical(biom_data(x), biom_data(z)) ## ----named_subsetting--------------------------------------------------------- biom_file <- system.file("extdata", "rich_sparse_otu_table.biom", package = "biomformat") x <- read_biom(biom_file) # All samples for a specific feature biom_data(x, rows = "GG_OTU_3") # A specific set of samples for two features biom_data(x, rows = c("GG_OTU_1", "GG_OTU_3"), columns = c("Sample1", "Sample3", "Sample5")) ## ----session-info------------------------------------------------------------- sessionInfo()