This section provides examples of utilizing the report tables for
broader analysis and queries.
Package Growth Over Time
Let’s explore how the number of Bioconductor packages has grown over
time by counting the unique packages recorded on each branch:
# Load the "info" report table.
info <- get_bbs_table("info")

# Tally the distinct package names found on each git branch and list the
# branches from the largest total to the smallest.
package_counts <- info |>
  group_by(git_branch) |>
  summarise(n_packages = n_distinct(Package), .groups = "drop") |>
  arrange(desc(n_packages))

# Show the per-branch totals.
package_counts
#> # A tibble: 13 × 2
#> git_branch n_packages
#> <chr> <int>
#> 1 devel 3132
#> 2 RELEASE_3_23 2915
#> 3 RELEASE_3_22 2885
#> 4 RELEASE_3_21 2859
#> 5 RELEASE_3_19 2816
#> 6 RELEASE_3_20 2807
#> 7 RELEASE_3_18 2778
#> 8 RELEASE_3_17 2723
#> 9 RELEASE_3_16 2670
#> 10 RELEASE_3_15 2622
#> 11 RELEASE_3_14 2528
#> 12 RELEASE_3_13 2475
#> 13 master 33
# Horizontal bar chart of the per-branch totals; reorder() sorts the bars by
# package count rather than alphabetically by branch name.
package_counts |>
  ggplot(aes(x = reorder(git_branch, n_packages), y = n_packages)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Number of Packages by Bioconductor Branch",
    x = "Branch",
    y = "Number of Packages"
  ) +
  theme_minimal()

Build Status Distribution
Understanding the distribution of build statuses helps assess
overall system health:
# Load the "build_summary" report table.
build_summary <- get_bbs_table("build_summary")

# Number of rows per build status, most frequent status first.
status_counts <- count(build_summary, status, sort = TRUE)

status_counts
#> # A tibble: 4 × 2
#> status n
#> <chr> <int>
#> 1 OK 14127577
#> 2 WARNINGS 697948
#> 3 ERROR 466206
#> 4 TIMEOUT 18322
# Visualize status distribution.
# The names in the manual palette must match the status values exactly
# (OK, WARNINGS, ERROR, TIMEOUT); a value without a matching name is drawn
# in the scale's NA colour instead of the intended one.
ggplot(status_counts, aes(x = reorder(status, n), y = n)) +
  geom_col(aes(fill = status)) +
  scale_fill_manual(values = c(
    "OK" = "green3",
    "WARNINGS" = "orange",
    "ERROR" = "red",
    "TIMEOUT" = "darkred"
  )) +
  coord_flip() +
  labs(
    title = "Distribution of Build Statuses",
    x = "Status",
    y = "Count"
  ) +
  theme_minimal() +
  # The bars are already labelled on the axis, so the fill legend is redundant.
  theme(legend.position = "none")

Build Stage Analysis
Identify which build stage fails most often:
# Keep only failed builds (ERROR or TIMEOUT) and count them for every
# stage/status combination, largest count first.
stage_failures <- build_summary |>
  filter(status %in% c("ERROR", "TIMEOUT")) |>
  count(stage, status, sort = TRUE)

stage_failures
#> # A tibble: 8 × 3
#> stage status n
#> <chr> <chr> <int>
#> 1 buildsrc ERROR 242483
#> 2 checksrc ERROR 126893
#> 3 install ERROR 90980
#> 4 checksrc TIMEOUT 9712
#> 5 buildsrc TIMEOUT 8521
#> 6 buildbin ERROR 5850
#> 7 buildbin TIMEOUT 57
#> 8 install TIMEOUT 32
# Stacked bars: one bar per build stage, split by failure status.
ggplot(stage_failures) +
  geom_col(aes(x = stage, y = n, fill = status)) +
  scale_fill_manual(values = c("ERROR" = "red", "TIMEOUT" = "darkred")) +
  labs(
    title = "Build Failures by Stage",
    x = "Build Stage",
    y = "Number of Failures",
    fill = "Status"
  ) +
  theme_minimal()

Most Problematic Packages
Identify the packages with the highest number of failed builds (ERROR or TIMEOUT):
# Total number of failed builds (ERROR or TIMEOUT) per package, with the
# most failure-prone packages listed first.
package_errors <- build_summary |>
  filter(status %in% c("ERROR", "TIMEOUT")) |>
  count(package, name = "total_errors", sort = TRUE)

# Ten packages with the largest failure totals.
head(package_errors, 10)
#> # A tibble: 10 × 2
#> package total_errors
#> <chr> <int>
#> 1 lapmix 2254
#> 2 netZooR 1999
#> 3 hypeR 1700
#> 4 ChemmineOB 1688
#> 5 XNAString 1566
#> 6 ccfindR 1566
#> 7 Repitools 1554
#> 8 gpuMagic 1508
#> 9 slalom 1485
#> 10 ADAPT 1467
Maintainer Analysis
Analyze package maintenance patterns:
# For every maintainer, count the distinct packages they are listed on and
# keep a comma-separated list of those package names.
maintainer_packages <- info |>
  group_by(Maintainer) |>
  summarise(
    n_packages = n_distinct(Package),
    packages = toString(unique(Package)),
    .groups = "drop"
  ) |>
  arrange(desc(n_packages))

# Ten maintainers with the most packages.
head(maintainer_packages, 10)
#> # A tibble: 10 × 3
#> Maintainer n_packages packages
#> <chr> <int> <chr>
#> 1 Bioconductor Package Maintainer 72 annotate, AnnotationDbi, Annotati…
#> 2 Aaron Lun 68 alabaster.base, alabaster.bumpy, …
#> 3 Marcel Ramos 27 AnVIL, AnVILAz, AnVILBase, AnVILG…
#> 4 Hervé Pagès 26 BiocGenerics, Biostrings, BSgenom…
#> 5 VJ Carey 25 gwascat, ivygapSE, keggorthology,…
#> 6 Laurent Gatto 24 CTdata, hpar, MsDataHub, MsExperi…
#> 7 Michael Love 20 DESeq2, fishpond, nullranges, ply…
#> 8 Jianhong Ou 17 annoLinker, ATACseqQC, ATACseqTFE…
#> 9 Guangchuang Yu 16 ChIPseeker, clusterProfiler, DOSE…
#> 10 Mike Smith 16 BeadDataPackR, BiocWorkflowTools,…
# Histogram of how many packages each maintainer has; a bin width of 1 gives
# one bar per package count.
maintainer_packages |>
  ggplot(aes(x = n_packages)) +
  geom_histogram(binwidth = 1, fill = "steelblue", color = "white") +
  labs(
    title = "Distribution of Packages per Maintainer",
    x = "Number of Packages",
    y = "Number of Maintainers"
  ) +
  theme_minimal()

Temporal Analysis
Analyze build patterns over time:
# Add calendar columns derived from the build start time.
# NOTE(review): assumes `startedat` is a date-time (POSIXct) column - confirm.
# `month` is derived from `date` rather than from the raw timestamp so both
# columns use the same day boundary: as.Date() converts POSIXct values in UTC
# by default, whereas format() on the raw timestamp would use the object's or
# the session's time zone and could disagree with `date` near month ends.
build_summary <- build_summary |>
  mutate(
    date = as.Date(startedat),
    month = format(date, "%Y-%m")
  )
# Number of build records per "YYYY-MM" month; month_date pins each month to
# its first day so it can be plotted on a continuous date axis.
monthly_builds <- build_summary |>
  count(month) |>
  mutate(month_date = as.Date(paste0(month, "-01")))

# Line chart of monthly build volume.
monthly_builds |>
  ggplot(aes(x = month_date, y = n)) +
  geom_line(color = "steelblue", linewidth = 1) +
  geom_point(color = "steelblue") +
  labs(
    title = "Build Activity Over Time",
    x = "Month",
    y = "Number of Builds"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Monthly failure rate: share of build records whose status is ERROR or
# TIMEOUT, expressed as a percentage of all records in that month.
monthly_errors <- build_summary |>
  group_by(month) |>
  summarise(
    total = n(),
    errors = sum(status %in% c("ERROR", "TIMEOUT")),
    error_rate = errors / total * 100,
    .groups = "drop"
  ) |>
  # First day of each month, used as the x position on the date axis.
  mutate(month_date = as.Date(paste0(month, "-01")))

# Line chart of the monthly failure rate.
monthly_errors |>
  ggplot(aes(x = month_date, y = error_rate)) +
  geom_line(color = "red", linewidth = 1) +
  geom_point(color = "red") +
  labs(
    title = "Build Error Rate Over Time",
    x = "Month",
    y = "Error Rate (%)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
