## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, # standardized width that fits vignette text column fig.height = 5, # default height, override in individual chunks as needed fig.dpi = 150, # higher resolution for crisp rendering out.width = "100%", # constrain to page width, prevents overflow warning = FALSE, message = FALSE ) ## ----setup-------------------------------------------------------------------- library(mongolstats) library(dplyr) library(ggplot2) library(tidyr) library(stringr) nso_options(mongolstats.lang = "en") # Global theme with proper margins to prevent text cutoff theme_set( theme_minimal(base_size = 11) + theme( plot.margin = margin(10, 10, 10, 10), plot.title = element_text(size = 13, face = "bold"), plot.subtitle = element_text(size = 10, color = "grey40"), legend.text = element_text(size = 9), legend.title = element_text(size = 10) ) ) ## ----air-quality-data--------------------------------------------------------- # 1. Identify Top 10 Polluted Stations in UB (using 2024 annual data) air_annual <- nso_data( "DT_NSO_2024_135V01", selections = list(Year = "2024"), labels = "en" ) |> filter(!is.na(value)) |> mutate( Station = str_trim(`Station location_en`), Pollutant = str_trim(Indicator_en) ) # Define UB Stations ub_stations <- c( "Misheel-Expo center", "West crossroad", "1st micro district", "13th micro district", "32nd Toirog", "Ofitseruudiin ordon", "Kharkhorin market", "Urgakh naran micro district", "Dambdarjaa", "Khailaast", "Tolgoit", "Zuragt", "Amgalan", "Nisekh", "Tavan buudal", "Bayankhoshuu", "Sharkhad", "100 ail" ) # Rank UB stations by PM2.5 levels top_10_stations <- air_annual |> filter(Station %in% ub_stations) |> filter(str_detect(Pollutant, "PM2.5")) |> arrange(desc(value)) |> head(10) |> pull(Station) # 2. Fetch Monthly Data for Detailed Trends (2023-2025) air_monthly <- nso_data( "DT_NSO_2400_015V2", selections = list( Year = as.character(2023:2025) ), labels = "en" ) |> filter(!is.na(value)) |> mutate( Station = str_trim(Location_en), Pollutant_Raw = str_trim(`Indicator of air pollution_en`), Date = as.Date(paste0(Month_en, "-01")) ) |> # Explicit date filter to ensure only 2023-2025 data (Year selection may not work) filter(Date >= as.Date("2021-01-01"), Date <= as.Date("2025-12-31")) |> filter(Station %in% top_10_stations) # Clean and Reshape Data for Ribbons # We need Average, Min, and Max for PM2.5 and SO2 air_trends <- air_monthly |> mutate( Pollutant_Type = case_when( str_detect(Pollutant_Raw, "PM2.5") ~ "PM2.5", str_detect(Pollutant_Raw, "Sulphur dioxide") ~ "SO2", TRUE ~ NA_character_ ), Measure = case_when( str_detect(Pollutant_Raw, "Average") ~ "Average", str_detect(Pollutant_Raw, "Maximum") ~ "Maximum", str_detect(Pollutant_Raw, "Minimum") ~ "Minimum", TRUE ~ "Other" ) ) |> filter(!is.na(Pollutant_Type), Measure %in% c("Average", "Maximum")) |> # Use Average for the trend line and Maximum for the upper ribbon bound # Outlier Removal: Group by Station and Pollutant to detect anomalies # Using IQR method (3 * IQR) to remove extreme outliers while keeping natural spikes group_by(Station, Pollutant_Type, Measure) |> mutate( Q1 = quantile(value, 0.25, na.rm = TRUE), Q3 = quantile(value, 0.75, na.rm = TRUE), IQR = Q3 - Q1, Lower = Q1 - 3 * IQR, Upper = Q3 + 3 * IQR, Is_Outlier = value < Lower | value > Upper ) |> filter(!Is_Outlier) |> ungroup() |> select(Station, Date, Pollutant_Type, Measure, value) |> pivot_wider(names_from = Measure, values_from = value) # Note: If "Minimum" isn't explicitly available, we'll just show Average and Max range ## ----pm25-trends, fig.height=8, fig.alt="Line plot showing monthly average PM2.5 concentrations for 10 stations from 2021 to 2025. All stations show peaks exceeding the 0.025 mg/m³ dashed regulatory limit line during winter months."---- # MAC annual average for PM2.5 mac_pm25 <- 0.025 p_pm <- air_trends |> filter(Pollutant_Type == "PM2.5") |> ggplot(aes(x = Date, group = Station)) + # MAC reference line - regulatory threshold for health protection geom_hline(yintercept = mac_pm25, linetype = "dashed", color = "darkred", linewidth = 1) + # Ribbon shows range from average to maximum - captures daily variability geom_ribbon(aes(ymin = Average, ymax = Maximum), fill = "red", alpha = 0.2) + geom_line(aes(y = Average), color = "firebrick", linewidth = 0.8) + facet_wrap(~Station, ncol = 2, scales = "free_y") + # each station on its own scale scale_x_date(date_breaks = "6 months", date_labels = "%b %Y") + labs( title = "PM2.5 Trends in Top 10 Polluted UB Stations (2021-2025)", subtitle = "Monthly Average (Line), Maximum (Ribbon), and MAC Limit (Dashed Line)", x = NULL, y = "Concentration (mg/m³)" ) + scale_y_continuous(limits = c(0, NA)) + # y-axis starts at 0 for honest comparison theme_minimal(base_size = 12) + theme( plot.title = element_text(face = "bold"), strip.text = element_text(face = "bold"), panel.grid.minor = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1) ) p_pm # print static ggplot ## ----so2-trends, fig.height=8, fig.alt="Line plot showing monthly average SO2 concentrations for 10 stations from 2021 to 2025. Peaks significantly exceed the 0.020 mg/m³ limit during winters, dropping to near zero in summers."---- # MAC annual average for SO2 mac_so2 <- 0.020 p_so2 <- air_trends |> filter(Pollutant_Type == "SO2") |> ggplot(aes(x = Date, group = Station)) + # MAC reference line - SO2 threshold for respiratory health geom_hline(yintercept = mac_so2, linetype = "dashed", color = "darkorange3", linewidth = 1) + geom_ribbon(aes(ymin = Average, ymax = Maximum), fill = "orange", alpha = 0.2) + geom_line(aes(y = Average), color = "darkorange", linewidth = 0.8) + facet_wrap(~Station, ncol = 2, scales = "free_y") + scale_x_date(date_breaks = "6 months", date_labels = "%b %Y") + labs( title = "SO₂ Trends in Top 10 Polluted UB Stations (2021-2025)", subtitle = "Monthly Average (Line), Maximum (Ribbon), and MAC Limit (Dashed Line)", x = NULL, y = "Concentration (mg/m³)" ) + theme_minimal(base_size = 12) + theme( plot.title = element_text(face = "bold"), strip.text = element_text(face = "bold"), panel.grid.minor = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1) ) p_so2 # print static ggplot ## ----compliance-check, fig.alt="Bar chart showing exceedance factors for PM2.5, SO2, and NO2. PM2.5 bars are mostly red (non-compliant) reaching 2.5x times the limit. SO2 bars are all red, reaching over 3x the limit."---- # 1. Get MAC Standards mac_so2 <- 0.020 # Annual average mg/m3 mac_no2 <- 0.030 # Annual average mg/m3 mac_pm25 <- 0.025 # Annual average mg/m3 (WHO/National target approximation) # 2. Calculate Exceedance for 2024 compliance <- air_annual |> filter(Station %in% top_10_stations) |> mutate( Pollutant_Type = case_when( str_detect(Pollutant, "PM2.5") ~ "PM2.5", str_detect(Pollutant, "Sulphur dioxide") ~ "SO2", str_detect(Pollutant, "Nitrogen dioxide") ~ "NO2", TRUE ~ NA_character_ ), MAC = case_when( Pollutant_Type == "PM2.5" ~ mac_pm25, Pollutant_Type == "SO2" ~ mac_so2, Pollutant_Type == "NO2" ~ mac_no2, TRUE ~ NA_real_ ) ) |> filter(!is.na(Pollutant_Type)) |> mutate( Exceedance_Factor = value / MAC, Status = ifelse(Exceedance_Factor > 1, "Non-Compliant", "Compliant") ) # 3. Plot regulatory compliance - exceedance factor shows how many times over the limit p_comp <- compliance |> ggplot(aes(x = reorder(Station, Exceedance_Factor), y = Exceedance_Factor, fill = Status)) + geom_col() + geom_hline(yintercept = 1, linetype = "dashed", color = "black", linewidth = 1) + # compliance threshold geom_text(aes(label = round(Exceedance_Factor, 1)), hjust = -0.1, size = 2.5) + # inline labels facet_wrap(~Pollutant_Type, scales = "free_x") + coord_flip() + scale_fill_manual(values = c("Compliant" = "#27ae60", "Non-Compliant" = "#c0392b")) + scale_y_continuous(expand = expansion(mult = c(0, 0.15))) + labs( title = "Regulatory Compliance: Exceedance Factors (2024)", subtitle = "Ratio of Annual Average to Maximum Allowable Concentration (MAC)", y = "Times above Safety Limit", x = NULL ) + theme_minimal(base_size = 12) + theme( plot.title = element_text(face = "bold"), legend.position = "bottom", panel.grid.major.y = element_blank() ) p_comp # print static ggplot ## ----water-quality, fig.alt="Bar chart of water conductivity. Uvs Lake and Khyargas Lake show extremely high values (>10,000), while river stations like Khangal and Tuul show elevated values compared to typical freshwater."---- # Fetch water quality data water <- nso_data( "DT_NSO_2300_005V12", selections = list(), labels = "en" ) |> filter(!is.na(value)) |> mutate( Station = str_trim(`Station location_en`), Indicator = str_trim(Indicator_en) ) # Focus on electrical conductivity & Top 15 conductivity <- water |> filter(str_detect(Indicator, "Electrical conductivity")) |> slice_max(order_by = value, n = 15) # Plot water quality - top stations by conductivity p <- conductivity |> mutate(Station = reorder(Station, value)) |> ggplot(aes(x = Station, y = value, fill = value)) + geom_col(show.legend = FALSE) + geom_text(aes(label = scales::comma(value)), hjust = -0.1, size = 3) + # inline labels coord_flip() + scale_y_continuous(labels = scales::comma, expand = expansion(mult = c(0, 0.15))) + scale_fill_viridis_c(option = "viridis") + labs( title = "Top 15 Water Stations by Conductivity (2023)", subtitle = "Higher values may indicate increased mineralization or pollution", x = NULL, y = "Conductivity (μS/cm)" ) + theme_minimal(base_size = 12) + theme( plot.title = element_text(face = "bold"), panel.grid.major.y = element_blank(), panel.grid.minor = element_blank() ) p # print static ggplot ## ----dust-days, fig.alt="Bar chart showing annual dust days for 2024. Sainshand leads with 30 days, followed by Dalanzadgad with 21 days."---- # Fetch dust day data dust <- nso_data( "DT_NSO_2400_028V1", selections = list(), labels = "en" ) |> filter(!is.na(value)) |> mutate( Station = str_trim(`Station location_en`), Month_date = Month_en ) # Parse dates and aggregate to annual dust_annual <- dust |> mutate( Year = as.numeric(str_sub(Month_date, 1, 4)) ) |> filter(Year == 2024) |> group_by(Station) |> summarize( Total_dust_days = sum(value, na.rm = TRUE), .groups = "drop" ) |> slice_max(Total_dust_days, n = 10) # Plot dust exposure by station p <- dust_annual |> mutate(Station = reorder(Station, Total_dust_days)) |> ggplot(aes(x = Station, y = Total_dust_days, fill = Total_dust_days)) + geom_col(show.legend = FALSE) + geom_text(aes(label = Total_dust_days), hjust = -0.2, size = 3.5) + # inline labels coord_flip() + scale_fill_viridis_c(option = "magma", direction = -1) + scale_y_continuous(expand = expansion(mult = c(0, 0.15))) + labs( title = "Top 10 Stations for Dust Exposure (2024)", subtitle = "Total annual dust days", x = NULL, y = "Days", caption = "Source: NSO Mongolia" ) + theme_minimal(base_size = 12) + theme( plot.title = element_text(face = "bold"), panel.grid.major.y = element_blank() ) p # print static ggplot ## ----spatial-example, eval=FALSE---------------------------------------------- # # Hypothetical station metadata table (user would compile this) # station_metadata <- tribble( # ~Station_name, ~Latitude, ~Longitude, ~Aimag, # "Ulaanbaatar", 47.9221, 106.9155, "Ulaanbaatar", # "Darkhan", 49.4869, 105.9228, "Darkhan-Uul", # "Erdenet", 49.0333, 104.0833, "Orkhon" # # ... add all stations # ) # # # Join with environmental data # air_with_location <- air_quality |> # left_join(station_metadata, by = c("Station" = "Station_name")) # # # Now you can create maps, calculate distances, etc. ## ----env-tables, echo=FALSE--------------------------------------------------- tibble::tribble( ~Indicator, ~Table_ID, ~Geographic_Level, "Air Pollutant Concentrations", "DT_NSO_2024_135V01", "Station", "Maximum Allowable Concentrations", "DT_NSO_2400_015V1", "Standard", "Water Quality", "DT_NSO_2300_005V12", "Station", "Dust Days", "DT_NSO_2400_028V1", "Station", "Forest Fires", "DT_NSO_2400_005V1", "Aimag" ) |> knitr::kable()