## ----packages-----------------------------------------------------------------
library("biomformat"); packageVersion("biomformat")

## ----read-biom-examples-------------------------------------------------------
min_dense_file   = system.file("extdata", "min_dense_otu_table.biom", 
                               package = "biomformat")
min_sparse_file  = system.file("extdata", "min_sparse_otu_table.biom", 
                               package = "biomformat")
rich_dense_file  = system.file("extdata", "rich_dense_otu_table.biom", 
                               package = "biomformat")
rich_sparse_file = system.file("extdata", "rich_sparse_otu_table.biom", 
                               package = "biomformat")
min_dense_file   = system.file("extdata", "min_dense_otu_table.biom", package = "biomformat")
rich_dense_char  = system.file("extdata", "rich_dense_char.biom", package = "biomformat")
rich_sparse_char  = system.file("extdata", "rich_sparse_char.biom", package = "biomformat")
x1 = read_biom(min_dense_file)
x2 = read_biom(min_sparse_file)
x3 = read_biom(rich_dense_file)
x4 = read_biom(rich_sparse_file)
x5 = read_biom(rich_dense_char)
x6 = read_biom(rich_sparse_char)
x1

## ----accessor-examples-table--------------------------------------------------
biom_data(x1)
biom_data(x2)

## ----matrix-coercion----------------------------------------------------------
as(biom_data(x2), "matrix")

## ----observ-meta--------------------------------------------------------------
observation_metadata(x1)
observation_metadata(x2)
observation_metadata(x3)
observation_metadata(x4)[1:2, 1:3]
class(observation_metadata(x4))

## ----plot-examples------------------------------------------------------------
sample_metadata(x1)
sample_metadata(x2)
sample_metadata(x3)
sample_metadata(x4)[1:2, 1:3]
class(sample_metadata(x4))

## ----plot---------------------------------------------------------------------
plot(biom_data(x4))
boxplot(as(biom_data(x4), "vector"))
heatmap(as(biom_data(x4), "matrix"))

## ----write-biom-examples------------------------------------------------------
outfile = tempfile()
write_biom(x4, outfile)
y = read_biom(outfile)
identical(x4, y)

## ----compare-files-diff, eval=FALSE-------------------------------------------
# # On Unix OSes
# system(paste0("diff ", rich_sparse_file, outfile))
# # On windows
# system(paste0("FC ", rich_sparse_file, outfile))

## ----hdf5-available, include=FALSE--------------------------------------------
hdf5_ok <- requireNamespace("rhdf5", quietly = TRUE)

## ----hdf5-read, eval=hdf5_ok--------------------------------------------------
hdf5_file <- system.file("extdata", "rich_sparse_otu_table_hdf5.biom",
                          package = "biomformat")
xh <- read_biom(hdf5_file)
xh
biom_data(xh)

## ----hdf5-write, eval=hdf5_ok-------------------------------------------------
hdf5_out <- tempfile(fileext = ".biom")
write_hdf5_biom(xh, hdf5_out)

xh2 <- read_biom(hdf5_out)
# Count matrices are identical
identical(biom_data(xh), biom_data(xh2))
# Observation metadata (taxonomy) is preserved
identical(observation_metadata(xh), observation_metadata(xh2))
# Sample metadata is preserved
identical(sample_metadata(xh), sample_metadata(xh2))

## ----hdf5-convert, eval=hdf5_ok-----------------------------------------------
json_to_hdf5 <- tempfile(fileext = ".biom")
write_hdf5_biom(x4, json_to_hdf5)
x4_hdf5 <- read_biom(json_to_hdf5)
identical(biom_data(x4), biom_data(x4_hdf5))

## ----tidy-dataframe-----------------------------------------------------------
long_df <- as.data.frame(x4)
head(long_df)
dim(long_df)   # n_obs * n_samples rows, feature/sample/count + metadata cols

## ----tidy-colnames------------------------------------------------------------
names(long_df)

## ----tidy-tibble, eval=requireNamespace("tibble", quietly=TRUE)---------------
library(tibble)
long_tbl <- as_tibble.biom(x4)
long_tbl

## ----purrr-available, include=FALSE-------------------------------------------
purrr_ok <- requireNamespace("purrr", quietly = TRUE) &&
            requireNamespace("dplyr", quietly = TRUE)

## ----purrr-summary, eval=purrr_ok---------------------------------------------
library(purrr)
library(dplyr)

# Total counts per sample (purrr + dplyr)
long_df |>
  group_by(sample_id) |>
  summarise(total_counts = sum(count), .groups = "drop") |>
  arrange(desc(total_counts))

## ----purrr-map, eval=purrr_ok-------------------------------------------------
# Per-sample Shannon diversity using purrr::map_dbl
long_df |>
  group_by(sample_id) |>
  group_split() |>
  purrr::map_dbl(function(df) {
    p <- df$count / sum(df$count)
    p <- p[p > 0]
    -sum(p * log(p))
  }) |>
  setNames(unique(long_df$sample_id))

## ----tapply-fallback, eval=!purrr_ok------------------------------------------
# # Base-R equivalent when purrr is not available
# tapply(long_df$count, long_df$sample_id, function(counts) {
#   p <- counts / sum(counts)
#   p <- p[p > 0]
#   -sum(p * log(p))
# })

## ----se-available, include=FALSE----------------------------------------------
se_ok <- requireNamespace("SummarizedExperiment", quietly = TRUE) &&
         requireNamespace("S4Vectors",            quietly = TRUE)

## ----se-constructor, eval=se_ok-----------------------------------------------
library(SummarizedExperiment)

se <- biom_to_SummarizedExperiment(x4)
se

## ----se-slots, eval=se_ok-----------------------------------------------------
# Count matrix (same as biom_data(x4))
head(assay(se, "counts"))

# Sample metadata in colData
colData(se)[, 1:3]

# Observation (OTU) metadata in rowData
rowData(se)[, 1:3]

## ----se-as-coercion, eval=se_ok-----------------------------------------------
se2 <- as(x4, "SummarizedExperiment")
identical(assay(se, "counts"), assay(se2, "counts"))

## ----make_biom_workflow-------------------------------------------------------
# Simulate a small ASV count table (rows = features, cols = samples)
set.seed(42)
mat <- matrix(
  sample(0:50, 15, replace = TRUE), nrow = 3, ncol = 5,
  dimnames = list(
    paste0("ASV", 1:3),
    paste0("Samp", 1:5)
  )
)

# Taxonomy table: one row per feature, one list-valued column "taxonomy"
# Each element is a character vector of rank assignments (kingdom -> species).
tax <- data.frame(
  taxonomy = I(list(
    c("Bacteria", "Firmicutes",      "Clostridia",   "Clostridiales",
      "Lachnospiraceae", "Blautia", "NA"),
    c("Bacteria", "Proteobacteria", "Gammaproteobacteria", "Enterobacteriales",
      "Enterobacteriaceae", "Escherichia", "coli"),
    c("Bacteria", "Bacteroidetes",  "Bacteroidia",  "Bacteroidales",
      "Bacteroidaceae", "Bacteroides", "fragilis")
  )),
  row.names = rownames(mat)
)

# Build the biom object
x <- make_biom(data = mat, observation_metadata = tax,
               matrix_element_type = "int")
x

# Write to a JSON BIOM file and read back
tmp <- tempfile(fileext = ".biom")
write_biom(x, tmp)
y <- read_biom(tmp)

# Count data survives the round-trip
identical(biom_data(x), biom_data(y))

# Observation metadata is preserved (taxonomy values, not column name)
head(observation_metadata(y))

## ----make_biom_hdf5, eval=requireNamespace("rhdf5", quietly=TRUE)-------------
# For large datasets, write HDF5 (BIOM v2) instead
hdf5_tmp <- tempfile(fileext = ".biom")
write_hdf5_biom(x, hdf5_tmp)
z <- read_biom(hdf5_tmp)
identical(biom_data(x), biom_data(z))

## ----named_subsetting---------------------------------------------------------
biom_file <- system.file("extdata", "rich_sparse_otu_table.biom",
                          package = "biomformat")
x <- read_biom(biom_file)

# All samples for a specific feature
biom_data(x, rows = "GG_OTU_3")

# A specific set of samples for two features
biom_data(x, rows = c("GG_OTU_1", "GG_OTU_3"),
              columns = c("Sample1", "Sample3", "Sample5"))

## ----session-info-------------------------------------------------------------
sessionInfo()