## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 8, fig.height = 6, warning = FALSE, message = FALSE, fig.alt = "Canadian Census example visualization of Bayesian surprise or funnel plot values." ) is_pkgdown <- identical(Sys.getenv("IN_PKGDOWN"), "true") has_cancensus_key <- nzchar(Sys.getenv("CM_API_KEY")) || nzchar(Sys.getenv("CANCENSUS_API_KEY")) run_cancensus_examples <- is_pkgdown && has_cancensus_key ## ----load-packages------------------------------------------------------------ library(bayesiansurpriser) library(cancensus) library(sf) library(ggplot2) library(dplyr) # Set your CensusMapper API key (do this once) # set_cancensus_api_key("YOUR_KEY_HERE", install = TRUE) # Optionally set a cache path for faster subsequent queries # set_cancensus_cache_path("path/to/cache", install = TRUE) ## ----list-datasets, eval=FALSE------------------------------------------------ # # List available Census datasets # datasets <- list_census_datasets() # print(datasets %>% select(dataset, description)) ## ----list-regions, eval=FALSE------------------------------------------------- # # List regions for the 2021 Census # regions_2021 <- list_census_regions("CA21") # head(regions_2021 %>% filter(level == "CMA")) ## ----fetch-income-data, eval=run_cancensus_examples--------------------------- # # Find relevant vectors for low income # # v_CA21_1085: Total - Low-income status in 2020 # # v_CA21_1086: 0 to 17 years in low income # # v_CA21_1090: In low income # # # Get low income data at the Census Division level for all of Canada # # Using labels = "short" for cleaner column names # income_data <- get_census( # dataset = "CA21", # regions = list(C = "01"), # All of Canada # vectors = c( # total_pop = "v_CA21_1085", # Total population for low income status # low_income = "v_CA21_1090" # Population in low income # ), # level = "CD", # geo_format = "sf", # labels = "short", # quiet = TRUE # ) # # # Clean and filter # income_data <- income_data %>% # filter(!is.na(total_pop) & !is.na(low_income)) %>% # filter(total_pop > 0) %>% # mutate( # low_income_rate = low_income / total_pop # ) # # head(income_data %>% st_drop_geometry() %>% select(GeoUID, `Region Name`, total_pop, low_income, low_income_rate)) ## ----compute-income-surprise, eval=run_cancensus_examples--------------------- # # Compute surprise on the low-income-rate distribution. # income_surprise <- surprise( # income_data, # observed = low_income_rate, # models = c("uniform", "gaussian", "sampled") # ) # # print(income_surprise) # summary(income_surprise) ## ----plot-income-surprise, fig.width=10, fig.height=8, eval=run_cancensus_examples---- # # Add surprise values to data # income_data$surprise <- get_surprise(income_surprise, "surprise") # income_data$signed_surprise <- get_surprise(income_surprise, "signed") # # # Transform to Statistics Canada Lambert projection for better Canada visualization # income_data_lambert <- st_transform(income_data, "EPSG:3347") # # # Plot Canada-wide # ggplot(income_data_lambert) + # geom_sf(aes(fill = signed_surprise), color = "white", linewidth = 0.1) + # scale_fill_surprise_diverging(name = "Signed\nsurprise") + # labs( # title = "Atypical Low Income Rates by Census Division", # subtitle = "Signed surprise from the cross-division low-income-rate distribution", # caption = "Source: Statistics Canada, 2021 Census" # ) + # theme_void() + # theme( # legend.position = "right", # plot.title = element_text(hjust = 0.5, face = "bold"), # plot.subtitle = element_text(hjust = 0.5) # ) ## ----top-surprise-regions, eval=run_cancensus_examples------------------------ # # Top regions with positive signed surprise # high_income_surprise <- income_data %>% # st_drop_geometry() %>% # filter(signed_surprise > 0) %>% # arrange(desc(surprise)) %>% # select(`Region Name`, low_income_rate, total_pop, surprise) %>% # head(10) # # cat("Census Divisions with positive signed surprise:\n") # print(high_income_surprise) # # # Top regions with negative signed surprise # low_income_surprise <- income_data %>% # st_drop_geometry() %>% # filter(signed_surprise < 0) %>% # arrange(desc(surprise)) %>% # select(`Region Name`, low_income_rate, total_pop, surprise) %>% # head(10) # # cat("\nCensus Divisions with negative signed surprise:\n") # print(low_income_surprise) ## ----fetch-housing-data, eval=run_cancensus_examples-------------------------- # # Get housing tenure data for Vancouver CMA # # v_CA21_4237: Total private households by tenure # # v_CA21_4238: Owner households # # housing_data <- get_census( # dataset = "CA21", # regions = list(CMA = "59933"), # Vancouver CMA # vectors = c( # total_hh = "v_CA21_4237", # Total households # owner_hh = "v_CA21_4238" # Owner households # ), # level = "CT", # geo_format = "sf", # labels = "short", # quiet = TRUE # ) # # housing_data <- housing_data %>% # filter(!is.na(total_hh) & !is.na(owner_hh)) %>% # filter(total_hh > 50) %>% # Filter small tracts # mutate( # owner_rate = owner_hh / total_hh # ) # # cat("Number of Census Tracts:", nrow(housing_data), "\n") ## ----housing-surprise, fig.width=9, fig.height=7, eval=run_cancensus_examples---- # # Compute surprise on the home-ownership-rate distribution. # housing_surprise <- surprise( # housing_data, # observed = owner_rate, # models = c("uniform", "gaussian", "sampled") # ) # # housing_data$signed_surprise <- get_surprise(housing_surprise, "signed") # # # Plot Vancouver # ggplot(housing_data) + # geom_sf(aes(fill = signed_surprise), color = "white", linewidth = 0.02) + # scale_fill_surprise_diverging(name = "Signed\nsurprise") + # labs( # title = "Atypical Home Ownership Rates in Metro Vancouver", # subtitle = "Signed surprise from the tract-level owner-rate distribution", # caption = "Source: Statistics Canada, 2021 Census" # ) + # theme_void() + # theme( # legend.position = "right", # plot.title = element_text(hjust = 0.5, face = "bold"), # plot.subtitle = element_text(hjust = 0.5, size = 9) # ) ## ----fetch-language-data, eval=run_cancensus_examples------------------------- # # Get language data for Toronto CMA # # v_CA21_1144: Total - Knowledge of official languages # # v_CA21_1153: Neither English nor French # # language_data <- get_census( # dataset = "CA21", # regions = list(CMA = "35535"), # Toronto CMA # vectors = c( # total_pop = "v_CA21_1144", # Total population # no_official = "v_CA21_1153" # Neither official language # ), # level = "CT", # geo_format = "sf", # labels = "short", # quiet = TRUE # ) # # language_data <- language_data %>% # filter(!is.na(total_pop) & !is.na(no_official)) %>% # filter(total_pop > 100) %>% # mutate(no_official_rate = no_official / total_pop) ## ----language-surprise, fig.width=9, fig.height=7, eval=run_cancensus_examples---- # # Compute surprise on the non-official-language-rate distribution. # lang_surprise <- surprise( # language_data, # observed = no_official_rate, # models = c("uniform", "gaussian", "sampled") # ) # # language_data$signed_surprise <- get_surprise(lang_surprise, "signed") # # # Plot Toronto # ggplot(language_data) + # geom_sf(aes(fill = signed_surprise), color = "white", linewidth = 0.02) + # scale_fill_surprise_diverging(name = "Signed\nsurprise") + # labs( # title = "Atypical Non-Official Language Concentrations in Toronto", # subtitle = "Signed surprise from the tract-level non-official-language-rate distribution", # caption = "Source: Statistics Canada, 2021 Census" # ) + # theme_void() + # theme( # legend.position = "right", # plot.title = element_text(hjust = 0.5, face = "bold"), # plot.subtitle = element_text(hjust = 0.5, size = 9) # ) ## ----provincial-funnel, eval=run_cancensus_examples--------------------------- # # Get provincial data # provincial_data <- get_census( # dataset = "CA21", # regions = list(C = "01"), # vectors = c( # total_pop = "v_CA21_1085", # Total for low income # low_income = "v_CA21_1090" # In low income # ), # level = "PR", # geo_format = NA, # labels = "short", # quiet = TRUE # ) # # provincial_data <- provincial_data %>% # filter(!is.na(total_pop) & total_pop > 0) # # # Compute funnel data for points # funnel_df <- compute_funnel_data( # observed = provincial_data$low_income, # sample_size = provincial_data$total_pop, # type = "count", # limits = c(2, 3) # ) # funnel_df$province <- provincial_data$`Region Name` # funnel_df$rate <- funnel_df$observed / funnel_df$sample_size # # # National rate # national_rate <- sum(provincial_data$low_income) / sum(provincial_data$total_pop) # # # Create continuous funnel bands # n_range <- range(provincial_data$total_pop) # n_seq <- exp(seq(log(n_range[1] * 0.8), log(n_range[2] * 1.2), length.out = 200)) # se_seq <- sqrt(national_rate * (1 - national_rate) / n_seq) # # funnel_bands <- data.frame( # n = n_seq, # rate = national_rate, # lower_2sd = national_rate - 2 * se_seq, # upper_2sd = national_rate + 2 * se_seq, # lower_3sd = national_rate - 3 * se_seq, # upper_3sd = national_rate + 3 * se_seq # ) # # # Create funnel plot # ggplot() + # # 3 SD band (outer) # geom_ribbon( # data = funnel_bands, # aes(x = n, ymin = lower_3sd, ymax = upper_3sd), # fill = "#9ecae1", alpha = 0.5 # ) + # # 2 SD band (inner) # geom_ribbon( # data = funnel_bands, # aes(x = n, ymin = lower_2sd, ymax = upper_2sd), # fill = "#3182bd", alpha = 0.5 # ) + # # National average line # geom_hline( # yintercept = national_rate, # linetype = "dashed", color = "#636363", linewidth = 0.8 # ) + # # Points # geom_point( # data = funnel_df, # aes(x = sample_size, y = rate, color = abs(z_score) > 2), # size = 4 # ) + # # Labels # ggrepel::geom_text_repel( # data = funnel_df, # aes(x = sample_size, y = rate, label = province), # size = 3, fontface = "bold", # max.overlaps = 15, # segment.color = "gray60" # ) + # scale_color_manual( # values = c("FALSE" = "#636363", "TRUE" = "#e6550d"), # guide = "none" # ) + # scale_x_log10( # labels = scales::comma, # breaks = c(1e4, 1e5, 1e6, 1e7) # ) + # scale_y_continuous(labels = scales::percent) + # labs( # title = "Funnel Plot: Provincial Low Income Rates", # subtitle = "Provinces outside bands have surprising rates given population size", # x = "Population (log scale)", # y = "Low Income Rate", # caption = "Source: Statistics Canada, 2021 Census" # ) + # theme_minimal() + # theme( # plot.title = element_text(face = "bold"), # panel.grid.minor = element_blank() # ) ## ----temporal-comparison, eval=run_cancensus_examples------------------------- # # Get provincial data for 2016 and 2021 # # Note: Variable IDs differ between Census years # # # 2021 data # prov_2021 <- get_census( # dataset = "CA21", # regions = list(C = "01"), # vectors = c(total_pop = "v_CA21_1085", low_income = "v_CA21_1090"), # level = "PR", # geo_format = NA, # labels = "short", # quiet = TRUE # ) # prov_2021 <- prov_2021[!is.na(prov_2021$total_pop) & prov_2021$total_pop > 0, ] # prov_2021$low_income_rate <- prov_2021$low_income / prov_2021$total_pop # prov_2021$census_year <- 2021 # # # 2016 data (different vector IDs for same concept) # prov_2016 <- get_census( # dataset = "CA16", # regions = list(C = "01"), # vectors = c(total_pop = "v_CA16_2540", low_income = "v_CA16_2555"), # level = "PR", # geo_format = NA, # labels = "short", # quiet = TRUE # ) # prov_2016 <- prov_2016[!is.na(prov_2016$total_pop) & prov_2016$total_pop > 0, ] # prov_2016$low_income_rate <- prov_2016$low_income / prov_2016$total_pop # prov_2016$census_year <- 2016 # # # Compute surprise for each year's provincial low-income-rate distribution # surprise_2021 <- surprise( # prov_2021, # observed = low_income_rate, # models = c("uniform", "gaussian", "sampled") # ) # prov_2021$signed_surprise <- get_surprise(surprise_2021, "signed") # # surprise_2016 <- surprise( # prov_2016, # observed = low_income_rate, # models = c("uniform", "gaussian", "sampled") # ) # prov_2016$signed_surprise <- get_surprise(surprise_2016, "signed") # # # Compare changes # comparison <- merge( # prov_2016[, c("GeoUID", "Region Name", "signed_surprise")], # prov_2021[, c("GeoUID", "signed_surprise")], # by = "GeoUID", # suffixes = c("_2016", "_2021") # ) # comparison$change <- comparison$signed_surprise_2021 - comparison$signed_surprise_2016 # print(comparison[order(abs(comparison$change), decreasing = TRUE), ]) ## ----custom-models, eval=run_cancensus_examples------------------------------- # # Using the Vancouver housing data # # Create a custom model space # # # Create custom model space for rates # housing_models <- model_space( # bs_model_gaussian(), # bs_model_sampled() # ) # # print(housing_models) # # # Use surprise() with custom model space # custom_result <- surprise( # housing_data, # observed = owner_rate, # models = housing_models # ) # # housing_data$custom_surprise <- get_surprise(custom_result, "signed") # # # Summary statistics # cat("Custom model surprise summary:\n") # summary(housing_data$custom_surprise) ## ----interpret-surprise, eval=run_cancensus_examples-------------------------- # # Categorize regions by surprise level # income_data <- income_data %>% # mutate( # surprise_category = cut( # abs(signed_surprise), # breaks = c(0, 0.1, 0.3, 0.5, 1.0, Inf), # labels = c("Trivial", "Minor", "Moderate", "Substantial", "High"), # include.lowest = TRUE # ) # ) # # # Summary by category # cat("Census Divisions by surprise level:\n") # print(table(income_data$surprise_category)) ## ----session-info------------------------------------------------------------- sessionInfo()