## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  fig.align = "center",
  message = FALSE,
  warning = FALSE
)

## ----load-packages------------------------------------------------------------
# Core packages
library(datacommons)
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(scales)
library(knitr)

# Set a consistent theme for plots
theme_set(theme_minimal())

## ----api-setup, eval=FALSE----------------------------------------------------
# # Set your API key
# dc_set_api_key("YOUR_API_KEY_HERE")
#
# # Or manually set DATACOMMONS_API_KEY in your .Renviron file

## ----api-check, echo=FALSE----------------------------------------------------
# Bail out of rendering early when no API key is configured
if (!dc_has_api_key()) {
  message("This vignette requires a Data Commons API key.")
  message(
    "Get your free key at: https://docs.datacommons.org/api/#obtain-an-api-key"
  )
  knitr::knit_exit()
}

## ----national-population------------------------------------------------------
# Get U.S. population data.
# These identifiers were found on the Data Commons website:
# - Variable: "Count_Person" (from Statistical Variable Explorer)
# - Place: "country/USA" (from Place Explorer)
us_population <- dc_get_observations(
  variable_dcids = "Count_Person",
  entity_dcids = "country/USA",
  date = "all",
  return_type = "data.frame"
)

# Examine the structure
glimpse(us_population)

# The same series is reported by several sources (facets)
unique(us_population$facet_name)

## ----clean-population-data----------------------------------------------------
# Strategy 1: Use the official Census PEP (Population Estimates Program)
us_pop_clean <- us_population |>
  filter(facet_name == "USCensusPEP_Annual_Population") |>
  mutate(
    year = as.integer(date),
    population_millions = value / 1e6
  ) |>
  filter(year >= 2000) |>
  select(year, population_millions) |>
  arrange(year)

# Visualize the clean data
ggplot(us_pop_clean, aes(x = year, y = population_millions)) +
  geom_line(color = "steelblue", linewidth = 1.5) +
  geom_point(color = "steelblue", size = 3) +
  scale_y_continuous(
    labels = label_number(suffix = "M"),
    limits = c(280, 350)
  ) +
  scale_x_continuous(breaks = seq(2000, 2025, 5)) +
  labs(
    title = "U.S. Population Growth",
    subtitle = "Using Census Population Estimates Program data",
    x = "Year",
    y = "Population (millions)",
    caption = "Source: U.S. Census Bureau via Data Commons"
  )

# Average annual growth, overall and split around 2020
growth_rate <- us_pop_clean |>
  mutate(
    annual_growth = (population_millions / lag(population_millions) - 1) * 100
  ) |>
  summarize(
    avg_growth = mean(annual_growth, na.rm = TRUE),
    pre_2020_growth = mean(annual_growth[year < 2020], na.rm = TRUE),
    post_2020_growth = mean(annual_growth[year >= 2020], na.rm = TRUE)
  )

message(
  "Average annual growth rate: ", round(growth_rate$avg_growth, 2), "%\n",
  "Pre-2020 growth rate: ", round(growth_rate$pre_2020_growth, 2), "%\n",
  "Post-2020 growth rate: ", round(growth_rate$post_2020_growth, 2), "%"
)

# Note for statistics enthusiasts: We're using arithmetic mean for simplicity.
# For compound growth rates, geometric mean would be more precise, which you can
# calculate as geometric_mean_growth = (last_value/first_value)^(1/n_years) - 1.
# But given the small annual changes (~0.7%), the difference is negligible.

## ----aggregate-sources--------------------------------------------------------
# Strategy 2: Take the median across all sources for each year
us_pop_aggregated <- us_population |>
  filter(!is.na(value)) |>
  mutate(year = as.integer(date)) |>
  filter(year >= 2000) |>
  group_by(year) |>
  summarize(
    population_millions = median(value) / 1e6,
    n_sources = n_distinct(facet_name),
    .groups = "drop"
  )

# Show which years have the most sources
us_pop_aggregated |>
  slice_max(n_sources, n = 5) |>
  kable(caption = "Years with most data sources")

## ----state-analysis-----------------------------------------------------------
# Method 1: Use the parent/child relationship in observations
state_data <- dc_get_observations(
  variable_dcids = c("Count_Person", "Median_Age_Person"),
  date = "latest",
  parent_entity = "country/USA",
  entity_type = "State",
  return_type = "data.frame"
)

# The code automatically constructs the following query:
# Find all entities of type State contained in country/USA
glimpse(state_data)

# Process the data - reshape from long to wide
state_summary <- state_data |>
  filter(str_detect(facet_name, "Census")) |> # Use Census data
  select(
    entity = entity_dcid,
    state_name = entity_name,
    variable = variable_dcid,
    value
  ) |>
  group_by(entity, state_name, variable) |> # Take first if duplicates
  summarize(value = first(value), .groups = "drop") |>
  pivot_wider(names_from = variable, values_from = value) |>
  filter(
    !is.na(Count_Person),
    !is.na(Median_Age_Person),
    Count_Person > 500000 # Focus on states, not small territories
  )

# Visualize the relationship
ggplot(state_summary, aes(x = Median_Age_Person, y = Count_Person)) +
  geom_point(aes(size = Count_Person), alpha = 0.6, color = "darkblue") +
  geom_text(
    data = state_summary |> filter(Count_Person > 10e6),
    aes(label = state_name),
    vjust = -1, hjust = 0.5, size = 3
  ) +
  scale_y_log10(labels = label_comma()) +
  scale_size(range = c(3, 10), guide = "none") +
  labs(
    title = "State Demographics: Population vs. Median Age",
    x = "Median Age (years)",
    y = "Population (log scale)",
    caption = "Source: U.S. Census Bureau via Data Commons"
  )

# Find extremes
state_summary |>
  filter(
    Median_Age_Person == min(Median_Age_Person) |
      Median_Age_Person == max(Median_Age_Person)
  ) |>
  select(state_name, Median_Age_Person, Count_Person) |>
  mutate(Count_Person = label_comma()(Count_Person)) |>
  kable(caption = "States with extreme median ages")

## ----cross-dataset------------------------------------------------------------
# Get multiple variables for California counties
# Notice how we can mix variables from different sources in one query!
ca_integrated <- dc_get_observations(
  variable_dcids = c(
    "Count_Person",            # Census
    "Median_Age_Person",       # Census
    "Percent_Person_Obesity",  # CDC
    "UnemploymentRate_Person"  # BLS
  ),
  date = "latest",
  parent_entity = "geoId/06", # California
  entity_type = "County",
  return_type = "data.frame"
)

# Check which sources we're pulling from
ca_integrated |>
  group_by(variable_name, facet_name) |>
  summarize(n = n(), .groups = "drop") |>
  slice_head(n = 10) |>
  kable(caption = "Data sources by variable")

# Process the integrated data
ca_analysis <- ca_integrated |>
  # Pick one source per variable for consistency
  filter(
    (variable_dcid == "Count_Person" &
       str_detect(facet_name, "CensusACS5Year")) |
      (variable_dcid == "Median_Age_Person" &
         str_detect(facet_name, "CensusACS5Year")) |
      (variable_dcid == "Percent_Person_Obesity" &
         str_detect(facet_name, "CDC")) |
      (variable_dcid == "UnemploymentRate_Person" &
         str_detect(facet_name, "BLS"))
  ) |>
  select(
    entity = entity_dcid,
    county_name = entity_name,
    variable = variable_dcid,
    value
  ) |>
  group_by(entity, county_name, variable) |>
  summarize(value = first(value), .groups = "drop") |>
  pivot_wider(names_from = variable, values_from = value) |>
  drop_na() |>
  mutate(
    county_name = str_remove(county_name, " County$"),
    population_k = Count_Person / 1000
  )

# Explore relationships between variables
ggplot(ca_analysis, aes(x = Median_Age_Person, y = Percent_Person_Obesity)) +
  geom_point(
    aes(size = Count_Person, color = UnemploymentRate_Person),
    alpha = 0.7
  ) +
  geom_smooth(method = "lm", se = FALSE, color = "red", linetype = "dashed") +
  scale_size(range = c(2, 10), guide = "none") +
  scale_color_viridis_c(name = "Unemployment\nRate (%)") +
  labs(
    title = "California Counties: Age, Obesity, and Unemployment",
    subtitle = "Integrating Census, CDC, and BLS data through Data Commons",
    x = "Median Age (years)",
    y = "Obesity Rate (%)",
    caption = "Sources: Census ACS, CDC PLACES, BLS via Data Commons"
  ) +
  theme(legend.position = "right")

# Show correlations
ca_analysis |>
  select(Median_Age_Person, Percent_Person_Obesity, UnemploymentRate_Person) |>
  cor() |>
  round(2) |>
  kable(caption = "Correlations between demographic and health variables")

## ----time-series--------------------------------------------------------------
# Major city DCIDs (found using datacommons.org/place)
major_cities <- c(
  "geoId/3651000", # New York City
  "geoId/0644000", # Los Angeles
  "geoId/1714000", # Chicago
  "geoId/4835000", # Houston
  "geoId/0455000"  # Phoenix
)

# Get historical population data
city_populations <- dc_get_observations(
  variable_dcids = "Count_Person",
  entity_dcids = major_cities,
  date = "all",
  return_type = "data.frame"
)

# Process - use Census PEP data for consistency
city_pop_clean <- city_populations |>
  filter(
    str_detect(facet_name, "USCensusPEP"),
    !is.na(value)
  ) |>
  mutate(
    year = as.integer(date),
    population_millions = value / 1e6,
    city = str_extract(entity_name, "^[^,]+") # "New York, NY" -> "New York"
  ) |>
  filter(year >= 2000) |>
  group_by(city, year) |>
  # Average across any remaining duplicate PEP facets per city-year
  summarize(
    population_millions = mean(population_millions),
    .groups = "drop"
  )

# Visualize growth trends
ggplot(
  city_pop_clean,
  aes(x = year, y = population_millions, color = city)
) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  scale_color_brewer(palette = "Set1") +
  scale_y_continuous(labels = label_number(suffix = "M")) +
  labs(
    title = "Population Growth in Major U.S. Cities",
    x = "Year",
    y = "Population (millions)",
    color = "City",
    caption = "Source: Census Population Estimates Program via Data Commons"
  ) +
  theme(legend.position = "bottom")

# Calculate total growth between each city's first and last observed year
city_pop_clean |>
  group_by(city) |>
  filter(year == min(year) | year == max(year)) |>
  arrange(city, year) |> # guarantees first()/last() are chronological
  summarize(
    years = paste(min(year), "-", max(year)),
    start_pop = first(population_millions),
    end_pop = last(population_millions),
    total_growth_pct = round((end_pop / start_pop - 1) * 100, 1),
    .groups = "drop"
  ) |>
  arrange(desc(total_growth_pct)) |>
  kable(
    caption = "City population growth rates",
    col.names = c("City", "Period", "Start (M)", "End (M)", "Growth %")
  )

## ----understanding-facets-----------------------------------------------------
# Example: What sources are available for U.S. population?
# NOTE: sort while `value` is still numeric. Formatting with label_comma()
# first would coerce `value` to character, and arrange(desc(value)) would
# then sort lexicographically ("9,999" above "10,000") instead of by size.
us_population |>
  filter(date == "2020") |>
  select(facet_name, value) |>
  distinct() |>
  arrange(desc(value)) |>
  mutate(value = label_comma()(value)) |>
  kable(caption = "Different sources for 2020 U.S. population")