--- title: "Complete Function Reference with Examples" output: rmarkdown::html_vignette: toc: true toc_depth: 3 vignette: > %\VignetteIndexEntry{Complete Function Reference with Examples} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 8, fig.height = 5, warning = FALSE, message = FALSE, fig.alt = "Example visualization produced by a bayesiansurpriser function." ) ``` # bayesiansurpriser: Complete Reference This vignette demonstrates all major functions in the `bayesiansurpriser` package with working examples and visualizations. ```{r load-packages} library(bayesiansurpriser) library(sf) library(ggplot2) ``` *** # 1. Core Surprise Computation ## 1.1 surprise() - Main Function The primary function for computing Bayesian surprise on data frames and sf objects. ```{r surprise-sf} # Load NC SIDS data nc <- st_read(system.file("shape/nc.shp", package = "sf"), quiet = TRUE) # Compute surprise with default models (uniform, baserate, funnel) result <- surprise(nc, observed = SID74, expected = BIR74) # View structure print(result) ``` ```{r surprise-sf-plot} # Visualize the result ggplot(result) + geom_sf(aes(fill = signed_surprise)) + scale_fill_surprise_diverging() + labs( title = "Bayesian Surprise: NC SIDS Cases (1974)", subtitle = "Blue = fewer than expected, Red = more than expected", fill = "Signed\nSurprise" ) + theme_minimal() ``` ## 1.2 auto_surprise() - Simple Vector API For quick analysis without a data frame: ```{r auto-surprise} # Observed counts and expected values observed <- c(50, 100, 150, 200, 75, 300, 25) expected <- c(10000, 50000, 100000, 25000, 15000, 80000, 5000) result_auto <- auto_surprise(observed, expected) # View surprise values cat("Surprise values:\n") print(round(result_auto$surprise, 4)) cat("\nSigned surprise:\n") print(round(result_auto$signed_surprise, 4)) ``` ```{r auto-surprise-plot} # Visualize df <- data.frame( region = LETTERS[1:7], observed = observed, expected = expected, surprise = result_auto$surprise, signed = result_auto$signed_surprise ) ggplot(df, aes(x = reorder(region, -surprise), y = surprise, fill = signed)) + geom_col() + scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B", midpoint = 0) + labs( title = "Surprise by Region", x = "Region", y = "Surprise (bits)", fill = "Direction" ) + theme_minimal() ``` ## 1.3 compute_surprise() - Low-Level Function Direct computation with a model space: ```{r compute-surprise} # Build a model space mspace <- model_space( bs_model_uniform(), bs_model_baserate(nc$BIR74), bs_model_funnel(nc$BIR74) ) # Compute surprise directly result_low <- compute_surprise( model_space = mspace, observed = nc$SID74, expected = nc$BIR74, return_signed = TRUE ) cat("Computed", length(result_low$surprise), "surprise values\n") cat("Range:", round(range(result_low$surprise), 4), "\n") ``` *** # 2. Model Types ## 2.1 bs_model_uniform() - Uniform Distribution Assumes all observations equally likely: ```{r model-uniform} model_uniform <- bs_model_uniform() print(model_uniform) ``` ## 2.2 bs_model_baserate() - Base Rate Model Expects observations proportional to baseline: ```{r model-baserate} model_baserate <- bs_model_baserate(nc$BIR74) print(model_baserate) ``` ## 2.3 bs_model_gaussian() - Normal Distribution ```{r model-gaussian} # Fit from data (default) model_gaussian_fit <- bs_model_gaussian() print(model_gaussian_fit) # Fixed parameters model_gaussian_fixed <- bs_model_gaussian(mu = 100, sigma = 20, fit_from_data = FALSE) print(model_gaussian_fixed) ``` ## 2.4 bs_model_sampled() - KDE Model Uses kernel density estimation: ```{r model-sampled} model_kde <- bs_model_sampled(sample_frac = 0.2, kernel = "gaussian") print(model_kde) ``` ## 2.5 bs_model_funnel() - de Moivre Funnel Accounts for sampling variance: ```{r model-funnel} model_funnel <- bs_model_funnel(nc$BIR74, type = "count") print(model_funnel) ``` ## 2.6 bs_model_bootstrap() - Bootstrap Model ```{r model-bootstrap} model_boot <- bs_model_bootstrap(n_bootstrap = 100) print(model_boot) ``` ## 2.7 bs_model_gaussian_mixture() - Mixture Model ```{r model-mixture} # Define a two-component mixture model_mix <- bs_model_gaussian_mixture( means = c(50, 150), sds = c(10, 20), weights = c(0.6, 0.4) ) print(model_mix) ``` *** # 3. Model Space Operations ## 3.1 model_space() - Create Model Space ```{r model-space-create} space <- model_space( bs_model_uniform(), bs_model_baserate(nc$BIR74), bs_model_funnel(nc$BIR74), prior = c(0.2, 0.5, 0.3), names = c("Uniform", "Population", "Funnel") ) print(space) ``` ## 3.2 default_model_space() - Quick Default ```{r default-space} default_space <- default_model_space(nc$BIR74) print(default_space) ``` ## 3.3 Model Space Manipulation ```{r model-space-ops} # Add a model space_expanded <- add_model(space, bs_model_gaussian()) cat("After add_model:", n_models(space_expanded), "models\n") cat("Model names:", model_names(space_expanded), "\n") # Remove a model (by index) space_reduced <- remove_model(space_expanded, 4) cat("After remove_model:", n_models(space_reduced), "models\n") # Set new prior space_reweighted <- set_prior(space, c(0.1, 0.6, 0.3)) cat("New prior:", space_reweighted$prior, "\n") # Get model names cat("Model names:", model_names(space), "\n") ``` ## 3.4 bayesian_update() - Update Posterior ```{r bayesian-update} # Update model space with observed data updated_space <- bayesian_update(space, nc$SID74) cat("Prior: ", round(space$prior, 4), "\n") cat("Posterior:", round(updated_space$posterior, 4), "\n") ``` ```{r bayesian-update-plot} # Visualize prior vs posterior plot(updated_space) ``` ## 3.5 cumulative_bayesian_update() - Sequential Updates ```{r cumulative-update} space_simple <- model_space(bs_model_uniform(), bs_model_gaussian()) # Sequential observations (processed one at a time) observations <- c(10, 15, 20, 25, 100, 30, 35, 40) cum_result <- cumulative_bayesian_update(space_simple, observations) cat("Observations processed:", length(cum_result$cumulative_surprise), "\n") cat("Final posterior:", round(cum_result$final_space$posterior, 4), "\n") ``` *** # 4. Accessor Functions ## 4.1 get_surprise() and get_model_space() ```{r accessors} result <- surprise(nc, observed = SID74, expected = BIR74) # Get surprise values surprise_vals <- get_surprise(result, "surprise") signed_vals <- get_surprise(result, "signed") cat("First 5 surprise values:", round(surprise_vals[1:5], 4), "\n") cat("First 5 signed values:", round(signed_vals[1:5], 4), "\n") # Get model space mspace <- get_model_space(result) print(mspace) ``` *** # 5. Temporal and Streaming Analysis ## 5.1 surprise_temporal() - Panel Data ```{r temporal} # Create panel data set.seed(42) panel_data <- data.frame( year = rep(2018:2022, each = 4), region = rep(c("North", "South", "East", "West"), 5), cases = c(45, 80, 55, 70, # 2018 50, 85, 60, 65, # 2019 48, 90, 58, 72, # 2020 55, 95, 52, 68, # 2021 60, 100, 65, 75), # 2022 population = rep(c(10000, 20000, 15000, 18000), 5) ) temporal_result <- surprise_temporal( panel_data, time_col = year, observed = cases, expected = population, region_col = region ) print(temporal_result) ``` ```{r temporal-plot} # Plot temporal evolution plot(temporal_result, type = "time_series") ``` ```{r temporal-heatmap} # Heatmap view plot(temporal_result, type = "heatmap") ``` ## 5.2 update_surprise() - Streaming Updates ```{r streaming} # Initial computation initial <- auto_surprise(c(50, 100, 75), c(10000, 20000, 15000)) cat("Initial surprise:", round(initial$surprise, 4), "\n") # Stream new observations updated <- update_surprise(initial, new_observed = c(55, 110), new_expected = c(11000, 22000)) cat("After update:", round(updated$surprise, 4), "\n") ``` ## 5.3 surprise_rolling() - Rolling Window ```{r rolling} # Time series data ts_data <- c(50, 52, 48, 55, 51, 120, 115, 125, 60, 58, 62, 55) expected_ts <- rep(1000, length(ts_data)) rolling_result <- surprise_rolling( ts_data, expected = expected_ts, window_size = 4, step = 1 ) # Extract mean surprise per window window_means <- sapply(rolling_result$windows, function(w) mean(w$result$surprise)) plot(seq_along(window_means), window_means, type = "b", xlab = "Window Position", ylab = "Mean Surprise", main = "Rolling Window Surprise", pch = 19) abline(h = mean(window_means), lty = 2, col = "red") ``` ## 5.4 get_surprise_at_time() - Extract Time Slice ```{r time-slice} # Get surprise for specific year surprise_2020 <- get_surprise_at_time(temporal_result, 2020) cat("2020 surprise values:", round(surprise_2020$surprise, 4), "\n") ``` ## 5.5 surprise_animate() - Animation-Ready Data ```{r animate} anim_data <- surprise_animate(temporal_result) head(anim_data) ``` *** # 6. Funnel Analysis Functions ## 6.1 compute_funnel_data() ```{r funnel-data} funnel_df <- compute_funnel_data( observed = nc$SID74, sample_size = nc$BIR74, type = "count", limits = c(2, 3) # 2 and 3 SD limits ) head(funnel_df) ``` ```{r funnel-plot} # Funnel plot - shows observed vs expected with control limits ggplot(funnel_df, aes(x = sample_size, y = observed)) + geom_ribbon(aes(ymin = lower_3sd, ymax = upper_3sd), fill = "gray90") + geom_ribbon(aes(ymin = lower_2sd, ymax = upper_2sd), fill = "gray70") + geom_line(aes(y = expected), linetype = "dashed") + geom_point(aes(color = abs(z_score) > 2), size = 2) + scale_color_manual(values = c("FALSE" = "black", "TRUE" = "red")) + labs( title = "Funnel Plot: NC SIDS Cases", x = "Births (Sample Size)", y = "Observed SIDS Cases", color = "Outside 2 SD" ) + theme_minimal() ``` ## 6.2 funnel_zscore() and funnel_pvalue() ```{r funnel-stats} # Compute z-scores (need observed, expected, and sample_size) # For counts, expected is derived from overall rate overall_rate <- sum(nc$SID74) / sum(nc$BIR74) expected_cases <- nc$BIR74 * overall_rate z_scores <- funnel_zscore(nc$SID74, expected_cases, nc$BIR74, type = "count") cat("First 5 z-scores:", round(z_scores[1:5], 3), "\n") # Compute p-values from z-scores p_vals <- funnel_pvalue(z_scores) cat("First 5 p-values:", round(p_vals[1:5], 4), "\n") cat("Significant at 0.05:", sum(p_vals < 0.05, na.rm = TRUE), "counties\n") ``` *** # 7. Normalization Utilities ## 7.1 normalize_prob() - Probability Distribution ```{r normalize-prob} raw <- c(10, 20, 30, 40) normalized <- normalize_prob(raw) cat("Raw:", raw, "\n") cat("Normalized:", normalized, "(sum =", sum(normalized), ")\n") ``` ## 7.2 normalize_rate() - Per-Capita Rates ```{r normalize-rate} counts <- c(50, 100, 150) population <- c(10000, 50000, 100000) rates <- normalize_rate(counts, population, per = 100000) cat("Rates per 100k:", round(rates, 1), "\n") ``` ## 7.3 normalize_zscore() - Z-Score ```{r normalize-zscore} values <- c(10, 20, 30, 40, 50) z <- normalize_zscore(values) cat("Z-scores:", round(z, 3), "\n") ``` ## 7.4 normalize_minmax() - Min-Max Scaling ```{r normalize-minmax} values <- c(10, 20, 30, 40, 50) scaled <- normalize_minmax(values) cat("Min-max scaled:", scaled, "\n") ``` ## 7.5 normalize_robust() - Robust Scaling ```{r normalize-robust} values <- c(10, 20, 30, 40, 500) # With outlier robust <- normalize_robust(values) cat("Robust scaled:", round(robust, 3), "\n") ``` *** # 8. Mathematical Utilities ## 8.1 kl_divergence() - KL-Divergence ```{r kl-divergence} p <- c(0.25, 0.25, 0.25, 0.25) # Uniform q <- c(0.1, 0.2, 0.3, 0.4) # Non-uniform kl <- kl_divergence(p, q) cat("KL(P || Q):", round(kl, 4), "bits\n") ``` ## 8.2 log_sum_exp() - Numerically Stable ```{r log-sum-exp} log_vals <- c(-1000, -1001, -1002) # Very small numbers result <- log_sum_exp(log_vals) cat("log_sum_exp:", round(result, 4), "\n") ``` *** # 9. ggplot2 Integration ## 9.1 Color Scales ### Sequential Scale ```{r scale-sequential} result <- surprise(nc, observed = SID74, expected = BIR74) ggplot(result) + geom_sf(aes(fill = surprise)) + scale_fill_surprise(option = "viridis") + labs(title = "Sequential Scale (Viridis)") + theme_minimal() ``` ### Diverging Scale ```{r scale-diverging} ggplot(result) + geom_sf(aes(fill = signed_surprise)) + scale_fill_surprise_diverging() + labs(title = "Diverging Scale for Signed Surprise") + theme_minimal() ``` ### Binned Scale ```{r scale-binned} ggplot(result) + geom_sf(aes(fill = surprise)) + scale_fill_surprise_binned(n.breaks = 5) + labs(title = "Binned Scale (5 breaks)") + theme_minimal() ``` ### Diverging Binned Scale ```{r scale-diverging-binned} ggplot(result) + geom_sf(aes(fill = signed_surprise)) + scale_fill_surprise_diverging_binned(n.breaks = 7) + labs(title = "Diverging Binned Scale") + theme_minimal() ``` ## 9.2 stat_surprise() - Compute in ggplot2 ```{r stat-surprise} ggplot(nc) + stat_surprise(aes(geometry = geometry, observed = SID74, expected = BIR74)) + scale_fill_surprise_diverging() + labs(title = "stat_surprise() Computes Inline") + theme_minimal() ``` ## 9.3 Histogram and Density Geoms ```{r geom-histogram} result <- surprise(nc, observed = SID74, expected = BIR74) ggplot(result, aes(x = surprise)) + geom_surprise_histogram(bins = 15) + labs(title = "Distribution of Surprise Values") + theme_minimal() ``` ```{r geom-density} ggplot(result, aes(x = signed_surprise)) + geom_surprise_density(fill = "#4575b4", alpha = 0.7) + geom_vline(xintercept = 0, linetype = "dashed") + labs(title = "Density of Signed Surprise") + theme_minimal() ``` *** # 10. sf Spatial Functions ## 10.1 st_surprise() - Convenience Wrapper ```{r st-surprise} result_sf <- st_surprise(nc, observed = SID74, expected = BIR74) class(result_sf) ``` ## 10.2 st_density() - Spatial KDE ```{r st-density} # st_density computes kernel density estimation for point patterns # Returns a list with density grid nc_centroids <- st_centroid(nc) density_result <- st_density(nc_centroids, bandwidth = 0.5) cat("Density result structure:\n") cat("- x grid:", length(density_result$x), "values\n") cat("- y grid:", length(density_result$y), "values\n") cat("- z matrix:", dim(density_result$z)[1], "x", dim(density_result$z)[2], "\n") ``` ## 10.3 st_aggregate_surprise() - Aggregate to Larger Regions ```{r st-aggregate} # Create larger regions to aggregate to # Using a simple union of counties nc_result <- surprise(nc, observed = SID74, expected = BIR74) # Show surprise values from result cat("Sample surprise values:\n") print(head(nc_result[, c("NAME", "surprise", "signed_surprise")], 5)) ``` *** # 11. Base R Plot Methods ## 11.1 plot.bs_surprise_sf() ```{r plot-sf-base, fig.height=5} result <- surprise(nc, observed = SID74, expected = BIR74) plot(result, which = "signed_surprise", main = "Base R Plot: Signed Surprise") ``` ## 11.2 plot.bs_surprise() ```{r plot-surprise-base} auto_result <- auto_surprise(c(50, 100, 150, 200, 75), c(10000, 50000, 100000, 25000, 15000)) plot(auto_result, type = "histogram", main = "Surprise Distribution") ``` ## 11.3 plot.bs_model_space() ```{r plot-model-space} space <- model_space(bs_model_uniform(), bs_model_baserate(nc$BIR74)) updated <- bayesian_update(space, nc$SID74) plot(updated) ``` ## 11.4 plot.bs_surprise_temporal() ```{r plot-temporal-base} plot(temporal_result, type = "cumulative", main = "Cumulative Surprise Over Time") ``` *** # 12. Summary Statistics ## 12.1 summary.bs_surprise() ```{r summary} result <- surprise(nc, observed = SID74, expected = BIR74) summary(result) ``` *** # 13. Included Datasets ## 13.1 canada_mischief ```{r data-canada} data(canada_mischief, package = "bayesiansurpriser") cat("Canada mischief dataset:\n") cat("Rows:", nrow(canada_mischief), "\n") cat("Columns:", paste(names(canada_mischief), collapse = ", "), "\n") # Compute surprise on Canadian data canada_result <- auto_surprise( canada_mischief$mischief_count, canada_mischief$population ) cat("\nSurprise values by province:\n") df_result <- data.frame( province = canada_mischief$name, surprise = round(canada_result$surprise, 4) ) print(df_result[order(-df_result$surprise), ]) ``` ## 13.2 example_counties ```{r data-counties} data(example_counties, package = "bayesiansurpriser") cat("Example counties dataset:\n") cat("Class:", class(example_counties)[1], "\n") cat("Rows:", nrow(example_counties), "\n") cat("Columns:", paste(names(example_counties), collapse = ", "), "\n") ``` *** # Session Info ```{r session} sessionInfo() ```