## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 8, fig.height = 6, warning = FALSE, message = FALSE, fig.alt = "US Census example visualization of Bayesian surprise or funnel plot values." ) is_pkgdown <- identical(Sys.getenv("IN_PKGDOWN"), "true") has_census_key <- nzchar(Sys.getenv("CENSUS_API_KEY")) run_tidycensus_examples <- is_pkgdown && has_census_key ## ----load-packages------------------------------------------------------------ library(bayesiansurpriser) library(tidycensus) library(tigris) library(sf) library(ggplot2) library(dplyr) # Set your Census API key (do this once) # census_api_key("YOUR_KEY_HERE", install = TRUE) ## ----state-poverty, eval=run_tidycensus_examples------------------------------ # # Get state-level poverty data # state_poverty <- get_acs( # geography = "state", # variables = c( # total_pop = "B17001_001", # in_poverty = "B17001_002" # ), # year = 2022, # survey = "acs5", # geometry = TRUE, # output = "wide" # ) # # state_poverty <- state_poverty %>% # filter(!is.na(in_povertyE) & !is.na(total_popE)) %>% # filter(total_popE > 0) %>% # mutate(poverty_rate = in_povertyE / total_popE) # # # Compute surprise on the poverty-rate distribution. # # This asks which state rates are unusual relative to the cross-state pattern. # state_surprise <- surprise( # state_poverty, # observed = poverty_rate, # models = c("uniform", "gaussian", "sampled") # ) # # # View results # cat("Surprise value range:", round(range(state_surprise$surprise), 4), "\n") ## ----state-results, eval=run_tidycensus_examples------------------------------ # # States with positive signed surprise # state_surprise %>% # st_drop_geometry() %>% # filter(signed_surprise > 0) %>% # arrange(desc(surprise)) %>% # select(NAME, poverty_rate, total_popE, surprise, signed_surprise) %>% # head(10) ## ----state-low, eval=run_tidycensus_examples---------------------------------- # # States with negative signed surprise # state_surprise %>% # st_drop_geometry() %>% # filter(signed_surprise < 0) %>% # arrange(desc(surprise)) %>% # select(NAME, poverty_rate, total_popE, surprise, signed_surprise) %>% # head(10) ## ----state-map, fig.width=10, fig.height=6, eval=run_tidycensus_examples------ # # Shift AK/HI for visualization # state_surprise_shifted <- state_surprise %>% # shift_geometry() # # ggplot(state_surprise_shifted) + # geom_sf(aes(fill = signed_surprise), color = "white", linewidth = 0.3) + # scale_fill_surprise_diverging(name = "Signed\nsurprise") + # labs( # title = "States with Atypical Poverty Rates", # subtitle = "Signed surprise from the cross-state poverty-rate distribution", # caption = "Source: ACS 2018-2022 5-Year Estimates" # ) + # theme_void() + # theme( # legend.position = "right", # plot.title = element_text(hjust = 0.5, face = "bold"), # plot.subtitle = element_text(hjust = 0.5) # ) ## ----funnel-plot, fig.width=9, fig.height=6, eval=run_tidycensus_examples----- # library(ggrepel) # # # Compute funnel data # funnel_df <- compute_funnel_data( # observed = state_poverty$in_povertyE, # sample_size = state_poverty$total_popE, # type = "count", # limits = c(2, 3) # ) # funnel_df$state <- state_poverty$NAME # funnel_df$rate <- funnel_df$observed / funnel_df$sample_size # # # National rate # national_rate <- sum(state_poverty$in_povertyE) / sum(state_poverty$total_popE) # # # Create funnel bands # n_range <- range(state_poverty$total_popE) # n_seq <- exp(seq(log(n_range[1] * 0.8), log(n_range[2] * 1.2), length.out = 200)) # se_seq <- sqrt(national_rate * (1 - national_rate) / n_seq) # # funnel_bands <- data.frame( # n = n_seq, # lower_2sd = national_rate - 2 * se_seq, # upper_2sd = national_rate + 2 * se_seq, # lower_3sd = national_rate - 3 * se_seq, # upper_3sd = national_rate + 3 * se_seq # ) # # # Plot # ggplot() + # geom_ribbon( # data = funnel_bands, # aes(x = n, ymin = lower_3sd, ymax = upper_3sd), # fill = "#9ecae1", alpha = 0.5 # ) + # geom_ribbon( # data = funnel_bands, # aes(x = n, ymin = lower_2sd, ymax = upper_2sd), # fill = "#3182bd", alpha = 0.5 # ) + # geom_hline(yintercept = national_rate, linetype = "dashed", color = "#636363") + # geom_point( # data = funnel_df, # aes(x = sample_size, y = rate, color = abs(z_score) > 2), # size = 3 # ) + # geom_text_repel( # data = funnel_df %>% filter(abs(z_score) > 2), # aes(x = sample_size, y = rate, label = state), # size = 3, fontface = "bold" # ) + # scale_color_manual(values = c("FALSE" = "#636363", "TRUE" = "#e6550d"), guide = "none") + # scale_x_log10(labels = scales::comma) + # scale_y_continuous(labels = scales::percent) + # labs( # title = "Funnel Plot: State Poverty Rates", # subtitle = "States outside bands have rates that are unusual for their population size", # x = "Population (log scale)", # y = "Poverty Rate" # ) + # theme_minimal() ## ----county-funnel, fig.width=9, fig.height=7, eval=run_tidycensus_examples---- # # Get all county poverty data (no geometry needed for funnel plot) # county_poverty_all <- get_acs( # geography = "county", # variables = c( # total_pop = "B17001_001", # in_poverty = "B17001_002" # ), # year = 2022, # survey = "acs5", # output = "wide" # ) %>% # filter(!is.na(in_povertyE) & !is.na(total_popE)) %>% # filter(total_popE > 0) %>% # mutate(poverty_rate = in_povertyE / total_popE) # # cat("County population range:", format(range(county_poverty_all$total_popE), big.mark = ","), "\n") # cat("Number of counties:", nrow(county_poverty_all), "\n") # # # National rate from county data # national_rate_county <- sum(county_poverty_all$in_povertyE) / sum(county_poverty_all$total_popE) # # # Compute funnel data # county_funnel <- compute_funnel_data( # observed = county_poverty_all$in_povertyE, # sample_size = county_poverty_all$total_popE, # type = "count", # limits = c(2, 3) # ) # county_funnel$name <- county_poverty_all$NAME # county_funnel$rate <- county_funnel$observed / county_funnel$sample_size # # # Create funnel bands across full range # n_range_county <- range(county_poverty_all$total_popE) # n_seq_county <- exp(seq(log(n_range_county[1] * 0.8), log(n_range_county[2] * 1.2), length.out = 300)) # se_seq_county <- sqrt(national_rate_county * (1 - national_rate_county) / n_seq_county) # # county_bands <- data.frame( # n = n_seq_county, # lower_2sd = national_rate_county - 2 * se_seq_county, # upper_2sd = national_rate_county + 2 * se_seq_county, # lower_3sd = national_rate_county - 3 * se_seq_county, # upper_3sd = national_rate_county + 3 * se_seq_county # ) # # # Identify extreme outliers for labeling # extreme_counties <- county_funnel %>% # filter(abs(z_score) > 4) %>% # arrange(desc(abs(z_score))) %>% # head(15) # # # Plot # ggplot() + # geom_ribbon( # data = county_bands, # aes(x = n, ymin = pmax(lower_3sd, 0), ymax = pmin(upper_3sd, 1)), # fill = "#9ecae1", alpha = 0.5 # ) + # geom_ribbon( # data = county_bands, # aes(x = n, ymin = pmax(lower_2sd, 0), ymax = pmin(upper_2sd, 1)), # fill = "#3182bd", alpha = 0.5 # ) + # geom_hline(yintercept = national_rate_county, linetype = "dashed", color = "#636363") + # geom_point( # data = county_funnel, # aes(x = sample_size, y = rate, color = abs(z_score) > 3), # size = 1, alpha = 0.6 # ) + # geom_text_repel( # data = extreme_counties, # aes(x = sample_size, y = rate, label = name), # size = 2.5, fontface = "bold", max.overlaps = 20 # ) + # scale_color_manual(values = c("FALSE" = "#636363", "TRUE" = "#e6550d"), guide = "none") + # scale_x_log10(labels = scales::comma) + # scale_y_continuous(labels = scales::percent, limits = c(0, 0.7)) + # labs( # title = "Funnel Plot: County Poverty Rates (All US Counties)", # subtitle = "Classic funnel shape: wide bands for small populations, narrow for large", # x = "Population (log scale)", # y = "Poverty Rate", # caption = "Orange = |z| > 3 | Labeled = |z| > 4" # ) + # theme_minimal() ## ----tract-analysis, fig.width=9, fig.height=7, eval=run_tidycensus_examples---- # # Get tract-level data for Cook County (Chicago) # chicago_tracts <- get_acs( # geography = "tract", # state = "IL", # county = "Cook", # variables = c( # total_pop = "B17001_001", # in_poverty = "B17001_002" # ), # year = 2022, # survey = "acs5", # geometry = TRUE, # output = "wide" # ) # # chicago_tracts <- chicago_tracts %>% # filter(!is.na(in_povertyE) & !is.na(total_popE)) %>% # filter(total_popE > 100) %>% # mutate(poverty_rate = in_povertyE / total_popE) # # cat("Population range:", range(chicago_tracts$total_popE), "\n") # cat("Number of tracts:", nrow(chicago_tracts), "\n") # # # Compute surprise on the poverty-rate distribution. # # The funnel plot above is the companion check for sample-size reliability. # chicago_surprise <- surprise( # chicago_tracts, # observed = poverty_rate, # models = c("uniform", "gaussian", "sampled") # ) # # # Plot # ggplot(chicago_surprise) + # geom_sf(aes(fill = surprise), color = NA) + # scale_fill_surprise(option = "magma", name = "Surprise\n(bits)") + # labs( # title = "Atypical Poverty Rates in Cook County, IL", # subtitle = "Census tracts far from the countywide poverty-rate distribution", # caption = "Source: ACS 2018-2022 5-Year Estimates" # ) + # theme_void() ## ----tract-top, eval=run_tidycensus_examples---------------------------------- # # Largest signed surprise values under the rate-distribution model space # chicago_surprise %>% # st_drop_geometry() %>% # mutate(poverty_rate = in_povertyE / total_popE) %>% # arrange(desc(abs(signed_surprise))) %>% # select(NAME, poverty_rate, total_popE, surprise, signed_surprise) %>% # head(10) ## ----large-counties, fig.width=10, fig.height=6, eval=run_tidycensus_examples---- # # Get county data # county_poverty <- get_acs( # geography = "county", # variables = c( # total_pop = "B17001_001", # in_poverty = "B17001_002" # ), # year = 2022, # survey = "acs5", # geometry = TRUE, # output = "wide" # ) %>% # shift_geometry() # # # Filter to counties with >100k population # large_counties <- county_poverty %>% # filter(!is.na(in_povertyE) & !is.na(total_popE)) %>% # filter(total_popE > 100000) %>% # mutate(poverty_rate = in_povertyE / total_popE) # # cat("Counties with >100k population:", nrow(large_counties), "\n") # cat("Population range:", format(range(large_counties$total_popE), big.mark = ","), "\n") # # # Compute surprise on the county poverty-rate distribution # large_surprise <- surprise( # large_counties, # observed = poverty_rate, # models = c("uniform", "gaussian", "sampled") # ) ## ----large-counties-top, eval=run_tidycensus_examples------------------------- # # Largest positive signed surprise values # cat("=== POSITIVE SIGNED SURPRISE (large counties) ===\n") # large_surprise %>% # st_drop_geometry() %>% # filter(signed_surprise > 0) %>% # arrange(desc(surprise)) %>% # select(NAME, poverty_rate, total_popE, surprise, signed_surprise) %>% # head(10) ## ----large-counties-low, eval=run_tidycensus_examples------------------------- # cat("=== NEGATIVE SIGNED SURPRISE (large counties) ===\n") # large_surprise %>% # st_drop_geometry() %>% # filter(signed_surprise < 0) %>% # arrange(desc(surprise)) %>% # select(NAME, poverty_rate, total_popE, surprise, signed_surprise) %>% # head(10) ## ----large-counties-map, fig.width=10, fig.height=6, eval=run_tidycensus_examples---- # ggplot(large_surprise) + # geom_sf(aes(fill = signed_surprise), color = "white", linewidth = 0.1) + # scale_fill_surprise_diverging(name = "Signed\nsurprise") + # labs( # title = "Atypical Poverty Rates (Counties >100k Population)", # subtitle = "Signed surprise from the large-county poverty-rate distribution", # caption = "Source: ACS 2018-2022 5-Year Estimates" # ) + # theme_void() ## ----session-info------------------------------------------------------------- sessionInfo()