## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 8,
  fig.height = 6,
  warning = FALSE,
  message = FALSE,
  fig.alt = "US Census example visualization of Bayesian surprise or funnel plot values."
)

is_pkgdown <- identical(Sys.getenv("IN_PKGDOWN"), "true")
has_census_key <- nzchar(Sys.getenv("CENSUS_API_KEY"))
run_tidycensus_examples <- is_pkgdown && has_census_key

## ----load-packages------------------------------------------------------------
library(bayesiansurpriser)
library(tidycensus)
library(tigris)
library(sf)
library(ggplot2)
library(dplyr)

# Set your Census API key (do this once)
# census_api_key("YOUR_KEY_HERE", install = TRUE)

## ----state-poverty, eval=run_tidycensus_examples------------------------------
# # Get state-level poverty data
# state_poverty <- get_acs(
#   geography = "state",
#   variables = c(
#     total_pop = "B17001_001",
#     in_poverty = "B17001_002"
#   ),
#   year = 2022,
#   survey = "acs5",
#   geometry = TRUE,
#   output = "wide"
# )
# 
# state_poverty <- state_poverty %>%
#   filter(!is.na(in_povertyE) & !is.na(total_popE)) %>%
#   filter(total_popE > 0) %>%
#   mutate(poverty_rate = in_povertyE / total_popE)
# 
# # Compute surprise on the poverty-rate distribution.
# # This asks which state rates are unusual relative to the cross-state pattern.
# state_surprise <- surprise(
#   state_poverty,
#   observed = poverty_rate,
#   models = c("uniform", "gaussian", "sampled")
# )
# 
# # View results
# cat("Surprise value range:", round(range(state_surprise$surprise), 4), "\n")

## ----state-results, eval=run_tidycensus_examples------------------------------
# # States with positive signed surprise
# state_surprise %>%
#   st_drop_geometry() %>%
#   filter(signed_surprise > 0) %>%
#   arrange(desc(surprise)) %>%
#   select(NAME, poverty_rate, total_popE, surprise, signed_surprise) %>%
#   head(10)

## ----state-low, eval=run_tidycensus_examples----------------------------------
# # States with negative signed surprise
# state_surprise %>%
#   st_drop_geometry() %>%
#   filter(signed_surprise < 0) %>%
#   arrange(desc(surprise)) %>%
#   select(NAME, poverty_rate, total_popE, surprise, signed_surprise) %>%
#   head(10)

## ----state-map, fig.width=10, fig.height=6, eval=run_tidycensus_examples------
# # Shift AK/HI for visualization
# state_surprise_shifted <- state_surprise %>%
#   shift_geometry()
# 
# ggplot(state_surprise_shifted) +
#   geom_sf(aes(fill = signed_surprise), color = "white", linewidth = 0.3) +
#   scale_fill_surprise_diverging(name = "Signed\nsurprise") +
#   labs(
#     title = "States with Atypical Poverty Rates",
#     subtitle = "Signed surprise from the cross-state poverty-rate distribution",
#     caption = "Source: ACS 2018-2022 5-Year Estimates"
#   ) +
#   theme_void() +
#   theme(
#     legend.position = "right",
#     plot.title = element_text(hjust = 0.5, face = "bold"),
#     plot.subtitle = element_text(hjust = 0.5)
#   )

## ----funnel-plot, fig.width=9, fig.height=6, eval=run_tidycensus_examples-----
# library(ggrepel)
# 
# # Compute funnel data
# funnel_df <- compute_funnel_data(
#   observed = state_poverty$in_povertyE,
#   sample_size = state_poverty$total_popE,
#   type = "count",
#   limits = c(2, 3)
# )
# funnel_df$state <- state_poverty$NAME
# funnel_df$rate <- funnel_df$observed / funnel_df$sample_size
# 
# # National rate
# national_rate <- sum(state_poverty$in_povertyE) / sum(state_poverty$total_popE)
# 
# # Create funnel bands
# n_range <- range(state_poverty$total_popE)
# n_seq <- exp(seq(log(n_range[1] * 0.8), log(n_range[2] * 1.2), length.out = 200))
# se_seq <- sqrt(national_rate * (1 - national_rate) / n_seq)
# 
# funnel_bands <- data.frame(
#   n = n_seq,
#   lower_2sd = national_rate - 2 * se_seq,
#   upper_2sd = national_rate + 2 * se_seq,
#   lower_3sd = national_rate - 3 * se_seq,
#   upper_3sd = national_rate + 3 * se_seq
# )
# 
# # Plot
# ggplot() +
#   geom_ribbon(
#     data = funnel_bands,
#     aes(x = n, ymin = lower_3sd, ymax = upper_3sd),
#     fill = "#9ecae1", alpha = 0.5
#   ) +
#   geom_ribbon(
#     data = funnel_bands,
#     aes(x = n, ymin = lower_2sd, ymax = upper_2sd),
#     fill = "#3182bd", alpha = 0.5
#   ) +
#   geom_hline(yintercept = national_rate, linetype = "dashed", color = "#636363") +
#   geom_point(
#     data = funnel_df,
#     aes(x = sample_size, y = rate, color = abs(z_score) > 2),
#     size = 3
#   ) +
#   geom_text_repel(
#     data = funnel_df %>% filter(abs(z_score) > 2),
#     aes(x = sample_size, y = rate, label = state),
#     size = 3, fontface = "bold"
#   ) +
#   scale_color_manual(values = c("FALSE" = "#636363", "TRUE" = "#e6550d"), guide = "none") +
#   scale_x_log10(labels = scales::comma) +
#   scale_y_continuous(labels = scales::percent) +
#   labs(
#     title = "Funnel Plot: State Poverty Rates",
#     subtitle = "States outside bands have rates that are unusual for their population size",
#     x = "Population (log scale)",
#     y = "Poverty Rate"
#   ) +
#   theme_minimal()

## ----county-funnel, fig.width=9, fig.height=7, eval=run_tidycensus_examples----
# # Get all county poverty data (no geometry needed for funnel plot)
# county_poverty_all <- get_acs(
#   geography = "county",
#   variables = c(
#     total_pop = "B17001_001",
#     in_poverty = "B17001_002"
#   ),
#   year = 2022,
#   survey = "acs5",
#   output = "wide"
# ) %>%
#   filter(!is.na(in_povertyE) & !is.na(total_popE)) %>%
#   filter(total_popE > 0) %>%
#   mutate(poverty_rate = in_povertyE / total_popE)
# 
# cat("County population range:", format(range(county_poverty_all$total_popE), big.mark = ","), "\n")
# cat("Number of counties:", nrow(county_poverty_all), "\n")
# 
# # National rate from county data
# national_rate_county <- sum(county_poverty_all$in_povertyE) / sum(county_poverty_all$total_popE)
# 
# # Compute funnel data
# county_funnel <- compute_funnel_data(
#   observed = county_poverty_all$in_povertyE,
#   sample_size = county_poverty_all$total_popE,
#   type = "count",
#   limits = c(2, 3)
# )
# county_funnel$name <- county_poverty_all$NAME
# county_funnel$rate <- county_funnel$observed / county_funnel$sample_size
# 
# # Create funnel bands across full range
# n_range_county <- range(county_poverty_all$total_popE)
# n_seq_county <- exp(seq(log(n_range_county[1] * 0.8), log(n_range_county[2] * 1.2), length.out = 300))
# se_seq_county <- sqrt(national_rate_county * (1 - national_rate_county) / n_seq_county)
# 
# county_bands <- data.frame(
#   n = n_seq_county,
#   lower_2sd = national_rate_county - 2 * se_seq_county,
#   upper_2sd = national_rate_county + 2 * se_seq_county,
#   lower_3sd = national_rate_county - 3 * se_seq_county,
#   upper_3sd = national_rate_county + 3 * se_seq_county
# )
# 
# # Identify extreme outliers for labeling
# extreme_counties <- county_funnel %>%
#   filter(abs(z_score) > 4) %>%
#   arrange(desc(abs(z_score))) %>%
#   head(15)
# 
# # Plot
# ggplot() +
#   geom_ribbon(
#     data = county_bands,
#     aes(x = n, ymin = pmax(lower_3sd, 0), ymax = pmin(upper_3sd, 1)),
#     fill = "#9ecae1", alpha = 0.5
#   ) +
#   geom_ribbon(
#     data = county_bands,
#     aes(x = n, ymin = pmax(lower_2sd, 0), ymax = pmin(upper_2sd, 1)),
#     fill = "#3182bd", alpha = 0.5
#   ) +
#   geom_hline(yintercept = national_rate_county, linetype = "dashed", color = "#636363") +
#   geom_point(
#     data = county_funnel,
#     aes(x = sample_size, y = rate, color = abs(z_score) > 3),
#     size = 1, alpha = 0.6
#   ) +
#   geom_text_repel(
#     data = extreme_counties,
#     aes(x = sample_size, y = rate, label = name),
#     size = 2.5, fontface = "bold", max.overlaps = 20
#   ) +
#   scale_color_manual(values = c("FALSE" = "#636363", "TRUE" = "#e6550d"), guide = "none") +
#   scale_x_log10(labels = scales::comma) +
#   scale_y_continuous(labels = scales::percent, limits = c(0, 0.7)) +
#   labs(
#     title = "Funnel Plot: County Poverty Rates (All US Counties)",
#     subtitle = "Classic funnel shape: wide bands for small populations, narrow for large",
#     x = "Population (log scale)",
#     y = "Poverty Rate",
#     caption = "Orange = |z| > 3 | Labeled = |z| > 4"
#   ) +
#   theme_minimal()

## ----tract-analysis, fig.width=9, fig.height=7, eval=run_tidycensus_examples----
# # Get tract-level data for Cook County (Chicago)
# chicago_tracts <- get_acs(
#   geography = "tract",
#   state = "IL",
#   county = "Cook",
#   variables = c(
#     total_pop = "B17001_001",
#     in_poverty = "B17001_002"
#   ),
#   year = 2022,
#   survey = "acs5",
#   geometry = TRUE,
#   output = "wide"
# )
# 
# chicago_tracts <- chicago_tracts %>%
#   filter(!is.na(in_povertyE) & !is.na(total_popE)) %>%
#   filter(total_popE > 100) %>%
#   mutate(poverty_rate = in_povertyE / total_popE)
# 
# cat("Population range:", range(chicago_tracts$total_popE), "\n")
# cat("Number of tracts:", nrow(chicago_tracts), "\n")
# 
# # Compute surprise on the poverty-rate distribution.
# # The funnel plot above is the companion check for sample-size reliability.
# chicago_surprise <- surprise(
#   chicago_tracts,
#   observed = poverty_rate,
#   models = c("uniform", "gaussian", "sampled")
# )
# 
# # Plot
# ggplot(chicago_surprise) +
#   geom_sf(aes(fill = surprise), color = NA) +
#   scale_fill_surprise(option = "magma", name = "Surprise\n(bits)") +
#   labs(
#     title = "Atypical Poverty Rates in Cook County, IL",
#     subtitle = "Census tracts far from the countywide poverty-rate distribution",
#     caption = "Source: ACS 2018-2022 5-Year Estimates"
#   ) +
#   theme_void()

## ----tract-top, eval=run_tidycensus_examples----------------------------------
# # Largest signed surprise values under the rate-distribution model space
# chicago_surprise %>%
#   st_drop_geometry() %>%
#   mutate(poverty_rate = in_povertyE / total_popE) %>%
#   arrange(desc(abs(signed_surprise))) %>%
#   select(NAME, poverty_rate, total_popE, surprise, signed_surprise) %>%
#   head(10)

## ----large-counties, fig.width=10, fig.height=6, eval=run_tidycensus_examples----
# # Get county data
# county_poverty <- get_acs(
#   geography = "county",
#   variables = c(
#     total_pop = "B17001_001",
#     in_poverty = "B17001_002"
#   ),
#   year = 2022,
#   survey = "acs5",
#   geometry = TRUE,
#   output = "wide"
# ) %>%
#   shift_geometry()
# 
# # Filter to counties with >100k population
# large_counties <- county_poverty %>%
#   filter(!is.na(in_povertyE) & !is.na(total_popE)) %>%
#   filter(total_popE > 100000) %>%
#   mutate(poverty_rate = in_povertyE / total_popE)
# 
# cat("Counties with >100k population:", nrow(large_counties), "\n")
# cat("Population range:", format(range(large_counties$total_popE), big.mark = ","), "\n")
# 
# # Compute surprise on the county poverty-rate distribution
# large_surprise <- surprise(
#   large_counties,
#   observed = poverty_rate,
#   models = c("uniform", "gaussian", "sampled")
# )

## ----large-counties-top, eval=run_tidycensus_examples-------------------------
# # Largest positive signed surprise values
# cat("=== POSITIVE SIGNED SURPRISE (large counties) ===\n")
# large_surprise %>%
#   st_drop_geometry() %>%
#   filter(signed_surprise > 0) %>%
#   arrange(desc(surprise)) %>%
#   select(NAME, poverty_rate, total_popE, surprise, signed_surprise) %>%
#   head(10)

## ----large-counties-low, eval=run_tidycensus_examples-------------------------
# cat("=== NEGATIVE SIGNED SURPRISE (large counties) ===\n")
# large_surprise %>%
#   st_drop_geometry() %>%
#   filter(signed_surprise < 0) %>%
#   arrange(desc(surprise)) %>%
#   select(NAME, poverty_rate, total_popE, surprise, signed_surprise) %>%
#   head(10)

## ----large-counties-map, fig.width=10, fig.height=6, eval=run_tidycensus_examples----
# ggplot(large_surprise) +
#   geom_sf(aes(fill = signed_surprise), color = "white", linewidth = 0.1) +
#   scale_fill_surprise_diverging(name = "Signed\nsurprise") +
#   labs(
#     title = "Atypical Poverty Rates (Counties >100k Population)",
#     subtitle = "Signed surprise from the large-county poverty-rate distribution",
#     caption = "Source: ACS 2018-2022 5-Year Estimates"
#   ) +
#   theme_void()

## ----session-info-------------------------------------------------------------
sessionInfo()