## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,        # standardized width that fits vignette text column
  fig.height = 5,       # default height, override in individual chunks as needed
  fig.dpi = 150,        # higher resolution for crisp rendering
  out.width = "100%",   # constrain to page width, prevents overflow
  warning = FALSE,
  message = FALSE
)

## ----setup--------------------------------------------------------------------
library(mongolstats)
library(dplyr)
library(ggplot2)
library(sf)
library(tidyr)
library(purrr)

nso_options(mongolstats.lang = "en")

# Global theme with proper margins to prevent text cutoff
theme_set(
  theme_minimal(base_size = 11) +
    theme(
      plot.margin = margin(10, 10, 10, 10),
      plot.title = element_text(size = 13, face = "bold"),
      plot.subtitle = element_text(size = 10, color = "grey40"),
      legend.text = element_text(size = 9),
      legend.title = element_text(size = 10)
    )
)

## ----helpers------------------------------------------------------------------
# 1. Define UB District Names (Map vs Data standard)
# Map names (transliterated) -> Data names (English standard)
ub_name_map <- c(
  "Baganuur" = "Baganuur",
  "Bagaxangai" = "Bagakhangai",
  "Bayangol" = "Bayangol",
  "Bayanzu'rx" = "Bayanzurkh",
  "Chingeltei" = "Chingeltei",
  "Nalaix" = "Nalaikh",
  "Songinoxairxan" = "Songinokhairkhan",
  "Su'xbaatar" = "Sukhbaatar",
  "Xan-Uul" = "Khan-Uul"
)

# 2. Get Spatial Boundaries
# We use a cached version or fetch if needed
get_ub_shapes <- function() {
  # Get all soums and UB boundary
  all_soums <- mn_boundaries(level = "ADM2")
  aimags <- mn_boundaries(level = "ADM1")
  ub_boundary <- aimags |> filter(shapeName == "Ulaanbaatar")

  # Spatial filter (turning off S2 for robustness)
  sf_use_s2(FALSE)
  ub_districts <- all_soums |>
    filter(shapeName %in% names(ub_name_map)) |>
    st_filter(ub_boundary)

  # Add standardized name column
  ub_districts |>
    mutate(District = ub_name_map[shapeName])
}

ub_shapes <- get_ub_shapes()

# 3. Helper to fetch and filter NSO data for UB
get_ub_data <- function(tbl_id, ...) {
  # Fetch data for all regions
  df <- nso_data(tbl_id, labels = "en", ...)

  # Filter for UB districts using the "511" prefix code pattern
  df |>
    filter(grepl("^511", Region)) |>
    mutate(District = trimws(Region_en)) |> # Standardize name
    filter(District %in% ub_name_map) |>
    select(-Region_en, -Region) # Keep only standardized District
}

## ----demog-density------------------------------------------------------------
# Fetch Population Data (DT_NSO_0300_002V4)
pop_data <- get_ub_data("DT_NSO_0300_002V4",
  selections = list(Year = "2024")
)

# Join population data with district boundaries
ub_pop_map <- ub_shapes |>
  left_join(pop_data, by = "District") |>
  # Project to UTM Zone 48N for accurate area calculation in meters (not degrees)
  st_transform(32648) |>
  mutate(
    Area_km2 = as.numeric(st_area(geometry)) / 1e6, # Convert m² to km²
    Density = value / Area_km2 # People per km²
  ) |>
  st_transform(4326) # Back to WGS84 for plotting

# Map population density
p_dens <- ggplot(ub_pop_map) +
  geom_sf(aes(fill = Density), color = "white", size = 0.2) +
  scale_fill_viridis_c(
    option = "rocket",
    direction = -1,
    trans = "log10",  # log scale because densities span 4 orders of magnitude
    name = "Population Density\n(People/km²)",
    labels = scales::comma
  ) +
  labs(
    title = "Population Density by District (2024)",
    subtitle = "People per square kilometer (Logarithmic Scale)",
    caption = "Source: NSO Mongolia (DT_NSO_0300_002V4)\nNote: Log scale used to visualize wide range of densities."
  ) +
  theme_void() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(color = "grey40"),
    legend.position = "bottom",          # bottom legend maximizes map width
    legend.key.width = unit(1.5, "cm")
  )

p_dens  # print static ggplot

## ----demog-trend--------------------------------------------------------------
# Fetch historical population data (last 10 years)
pop_trend <- get_ub_data("DT_NSO_0300_002V4",
  selections = list(Year = as.character(2013:2024))
)

p_trend <- pop_trend |>
  mutate(Year = as.numeric(Year_en)) |>
  arrange(Year) |>
  ggplot(aes(x = Year, y = value, color = District, group = District)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(breaks = 2013:2024) +
  labs(
    title = "Population Growth Trends (2013-2024)",
    subtitle = "Annual resident population",
    y = "Population", x = NULL, color = "District",
    caption = "Source: NSO Mongolia"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "bottom"
  )

p_trend  # print static ggplot

## ----socio-edu----------------------------------------------------------------
# Fetch Schools (DT_NSO_2001_002V2) and Students (DT_NSO_2002_057V2)
schools <- get_ub_data("DT_NSO_2001_002V2", selections = list(Year = "2024")) |>
  rename(Schools = value)
students <- get_ub_data("DT_NSO_2002_057V2", selections = list(Year = "2024")) |>
  rename(Students = value)

edu_load <- left_join(schools, students, by = c("District", "Year")) |>
  mutate(Students_Per_School = Students / Schools)

p_edu <- ggplot(edu_load, aes(
  x = reorder(District, Students_Per_School),
  y = Students_Per_School, fill = District
)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = round(Students_Per_School)), hjust = -0.1, size = 3) +  # inline labels replace tooltips
  coord_flip() +
  scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Educational Load: Students per School (2024)",
    subtitle = "Average number of students per general educational school",
    y = "Students per School", x = NULL,
    caption = "Source: NSO Mongolia"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    panel.grid.major.y = element_blank()  # horizontal gridlines clutter bar charts
  )

p_edu  # print static ggplot

## ----health-resp--------------------------------------------------------------
# 1. Get Respiratory Deaths (Code found in discovery phase)
# We need to find the code dynamically or use known one.
# For vignette stability, we'll search for it.
resp_meta <- nso_dim_values("DT_NSO_2100_027V1", "Indicator", labels = "en")
resp_code <- resp_meta |>
  filter(grepl("respiratory", label_en, ignore.case = TRUE)) |>
  pull(code)

resp_deaths <- get_ub_data("DT_NSO_2100_027V1",
  selections = list(Indicator = resp_code, Year = "2024")
) |>
  rename(Deaths = value)

# 2. Calculate Rates using Population data from Demographics section
health_metrics <- resp_deaths |>
  left_join(pop_data, by = c("District")) |>
  rename(Population = value) |>
  mutate(Resp_Death_Rate = (Deaths / Population) * 10000)

p_resp <- ggplot(health_metrics, aes(
  x = reorder(District, Resp_Death_Rate),
  y = Resp_Death_Rate, fill = Resp_Death_Rate
)) +
  geom_col() +
  geom_text(aes(label = round(Resp_Death_Rate, 1)), hjust = -0.1, size = 3) +  # inline labels
  scale_fill_viridis_c(option = "inferno", direction = -1, name = "Rate") +
  coord_flip() +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +  # room for labels
  labs(
    title = "Respiratory Mortality Rate (2024)",
    subtitle = "Deaths per 10,000 population",
    y = "Rate per 10,000", x = NULL,
    caption = "Source: NSO Mongolia"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "none"  # color already encoded in bar height
  )

p_resp  # print static ggplot

## ----health-imr---------------------------------------------------------------
# Fetch IMR directly (DT_NSO_2100_015V2 is Rate per 1000)
# Using 2023 as 2024 data appears incomplete for some districts
imr_data <- get_ub_data("DT_NSO_2100_015V2",
  selections = list(Year = "2023")
)

p_imr <- ggplot(imr_data, aes(x = reorder(District, value), y = value)) +
  geom_segment(aes(xend = District, yend = 0), color = "grey") +  # lollipop stem
  geom_point(size = 4, color = "steelblue") +  # lollipop head
  geom_text(aes(label = round(value, 1)), hjust = -0.3, size = 3) +  # inline labels
  coord_flip() +
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(
    title = "Infant Mortality Rate (2023)",
    subtitle = "Deaths per 1,000 live births",
    y = "IMR (per 1,000)", x = NULL,
    caption = "Source: NSO Mongolia"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold"),
    panel.grid.major.y = element_blank()
  )

p_imr  # print static ggplot

## ----multi-var----------------------------------------------------------------
# Combine all metrics into one analytical dataset
# We use inner_join to ensure we only have districts with all data
analysis_df <- health_metrics |>
  select(District, Population, Resp_Death_Rate) |>
  left_join(imr_data |> select(District, IMR = value), by = "District") |>
  left_join(edu_load |> select(District, Students_Per_School), by = "District") |>
  left_join(ub_pop_map |> st_drop_geometry() |> select(District, Density), by = "District")

# Correlation Matrix
cor_mat <- analysis_df |>
  select(Density,
    `Students/School` = Students_Per_School,
    `Resp. Rate` = Resp_Death_Rate, IMR
  ) |>
  cor(use = "complete.obs")

# Reshape for ggplot
cor_long <- as.data.frame(cor_mat) |>
  mutate(Var1 = rownames(cor_mat)) |>
  pivot_longer(-Var1, names_to = "Var2", values_to = "Correlation") |>
  filter(Var1 != Var2) # Remove diagonal

p_cor <- ggplot(cor_long, aes(x = Var1, y = Var2, fill = Correlation)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limit = c(-1, 1)
  ) +
  geom_text(aes(
    label = round(Correlation, 2),
    color = ifelse(abs(Correlation) > 0.5, "white", "black")  # contrast for readability
  )) +
  scale_color_identity() +
  labs(
    title = "Correlation of District Indicators",
    subtitle = "Pearson correlation coefficients",
    x = NULL, y = NULL
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(face = "bold")
  )

p_cor  # print static ggplot

## ----aq-station-map-----------------------------------------------------------
# Define station-to-district mapping with accurate coordinates
# Sources: WAQI API (api.waqi.info) and user-provided coordinates
ub_aq_stations <- tribble(
  ~Station, ~District, ~Latitude, ~Longitude, ~Source,
  "Misheel-Expo center", "Khan-Uul", 47.89428, 106.8825, "WAQI",
  "West crossroad", "Bayangol", 47.91576, 106.8940, "WAQI",
  "1st micro district", "Songinokhairkhan", 47.91798, 106.8481, "WAQI",
  "13th micro district", "Bayanzurkh", 47.91761, 106.9374, "WAQI",
  "32nd Toirog", "Sukhbaatar", 47.939333, 106.914333, "User",
  "Ofitseruudiin ordon", "Bayanzurkh", 47.916611, 106.972556, "User",
  "Kharkhorin market", "Songinokhairkhan", 47.910583, 106.837111, "User",
  "Urgakh naran micro district", "Bayanzurkh", 47.873778, 107.115028, "User",
  "Dambdarjaa", "Sukhbaatar", 47.988250, 106.951167, "User",
  "Khailaast", "Chingeltei", 47.963528, 106.893889, "User",
  "Tolgoit", "Songinokhairkhan", 47.920278, 106.799250, "User",
  "Zuragt", "Chingeltei", 47.92973, 106.8886, "WAQI",
  "Amgalan", "Bayanzurkh", 47.906250, 107.016556, "User",
  "Nisekh", "Khan-Uul", 47.86394, 106.7791, "WAQI",
  "Tavan buudal", "Sukhbaatar", 47.954500, 106.914833, "User",
  "Bayankhoshuu", "Songinokhairkhan", 47.945528, 106.822917, "User",
  "Sharkhad", "Bayanzurkh", 47.93375, 107.0103, "WAQI",
  "100 ail", "Sukhbaatar", 47.93291, 106.9214, "WAQI"
)

# Convert to sf object
ub_stations_sf <- ub_aq_stations |>
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326) |>
  st_transform(32648)

# Station count by district
station_count <- ub_aq_stations |>
  count(District, name = "Stations") |>
  arrange(desc(Stations))

knitr::kable(station_count, caption = "Air Quality Monitoring Stations by District")

## ----aq-network-map, fig.height=6---------------------------------------------
# Create map with district boundaries and station locations
p_map <- ggplot() +
  geom_sf(data = ub_shapes, fill = "grey90", color = "white", linewidth = 0.5) +
  geom_sf(data = ub_stations_sf, color = "#e74c3c", size = 3, alpha = 0.8) +  # red dots for stations
  geom_sf_text(data = ub_shapes, aes(label = District), size = 2.5, color = "grey40") +  # district labels
  labs(
    title = "Air Quality Monitoring Network in Ulaanbaatar",
    subtitle = "18 stations distributed across 6 districts",
    caption = "Red points = monitoring stations"
  ) +
  theme_void() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 11),
    plot.caption = element_text(hjust = 0, size = 9)
  )

p_map  # print static ggplot

## ----station-coverage---------------------------------------------------------
# Join station counts with population data for coverage analysis
station_coverage <- station_count |>
  left_join(pop_data |> select(District, Population = value), by = "District") |>
  mutate(
    Pop_per_Station = Population / Stations,
    Coverage_Score = case_when(
      Pop_per_Station < 100000 ~ "High Coverage",
      Pop_per_Station < 150000 ~ "Moderate Coverage",
      TRUE ~ "Low Coverage"
    )
  )

p_coverage <- ggplot(
  station_coverage,
  aes(
    x = reorder(District, Pop_per_Station),
    y = Pop_per_Station / 1000,
    fill = Coverage_Score
  )
) +
  geom_col() +
  geom_text(aes(label = scales::comma(Pop_per_Station)), hjust = -0.1, size = 3) +  # inline labels
  coord_flip() +
  scale_fill_manual(values = c(
    "High Coverage" = "#27ae60",     # green = good
    "Moderate Coverage" = "#f39c12", # yellow = caution
    "Low Coverage" = "#e74c3c"       # red = concern
  )) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.2))) +
  labs(
    title = "Air Quality Monitoring Coverage by District",
    subtitle = "Population per monitoring station (thousands)",
    x = NULL,
    y = "Population per Station (thousands)",
    fill = "Coverage Level"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "bottom",
    panel.grid.major.y = element_blank()
  )

p_coverage  # print static ggplot