---
title: "Discovering Public Health Data"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Discovering Public Health Data}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
# Chunk defaults applied to every code block in this vignette.
knitr::opts_chunk$set(
  # Text output: merge source and results, prefix results with "#>"
  collapse = TRUE,
  comment = "#>",
  # Keep warnings and messages out of the rendered page
  message = FALSE,
  warning = FALSE,
  # Figures: 7 x 5 at 150 dpi, displayed at the full width of the text column
  # (individual chunks may override the size)
  fig.width = 7,
  fig.height = 5,
  fig.dpi = 150,
  out.width = "100%"
)
```

```{r setup, message=FALSE, warning=FALSE}
# Attach the packages used throughout the vignette
library(mongolstats)
library(dplyr)
library(ggplot2)

# Set the mongolstats language option to "en"
nso_options(mongolstats.lang = "en")

# Session-wide ggplot2 theme: minimal base with uniform 10-unit margins on
# every side so plot text is not cut off at the edges
theme_set(
  theme_minimal(base_size = 11) +
    theme(
      plot.title = element_text(face = "bold", size = 13),
      plot.subtitle = element_text(color = "grey40", size = 10),
      legend.title = element_text(size = 10),
      legend.text = element_text(size = 9),
      plot.margin = margin(t = 10, r = 10, b = 10, l = 10)
    )
)
```

## Overview

Mongolia's National Statistics Office maintains comprehensive public health surveillance data. This guide demonstrates how to discover and access epidemiological data for research and policy analysis.

## Finding Health Tables

### Search by Keyword

Finding the right data is the first step in any analysis. The `nso_itms_search()` function allows you to query the entire NSO catalog using simple keywords:

```{r search-examples, eval=TRUE}
# Each search returns catalog rows; only the table id and English title are
# shown here.

# Infant and maternal health -- first ten matches only
mortality <- nso_itms_search("mortality")
head(select(mortality, tbl_id, tbl_eng_nm), n = 10)

# Cancer surveillance
cancer <- nso_itms_search("cancer")
select(cancer, tbl_id, tbl_eng_nm)

# Communicable diseases, using tuberculosis as the keyword
infectious <- nso_itms_search("tuberculosis")
select(infectious, tbl_id, tbl_eng_nm)
```

### Browse by Sector

Health and education statistics are grouped together:

```{r sectors, eval=TRUE}
# List the catalog sectors
sectors <- nso_sectors()
sectors

# Keep the sectors whose text mentions "health" (case-insensitive). When at
# least one matches, show the first few subsectors of the first match;
# otherwise nothing is printed.
health_sector <- filter(sectors, grepl("health", text, ignore.case = TRUE))
if (nrow(health_sector) > 0) {
  subsectors <- nso_subsectors(health_sector$id[1])
  head(subsectors)
}
```

## Case Study: Cancer Epidemiology

### Exploring Cancer Incidence Data

Cancer burden is shifting in Mongolia. To understand these changes, we first inspect the incidence table's dimensions and time coverage, and then analyze incidence trends over the ten most recent years:

```{r cancer-metadata, eval=TRUE}
# Cancer incidence table: new cases per 10,000 population
cancer_tbl <- "DT_NSO_2100_012V1"

# Inspect the dimensions the table offers
meta <- nso_table_meta(cancer_tbl)
meta

# First ten values of the cancer-type dimension, with English labels
cancer_types <- nso_dim_values(
  cancer_tbl,
  "Type malignant neoplasms",
  labels = "en"
)
head(cancer_types, n = 10)

# Time coverage. The "Annual" dimension is addressed by internal codes, so
# both codes and labels are requested; the English labels hold the years.
annual_meta <- nso_dim_values(cancer_tbl, "Annual", labels = "both")
years <- annual_meta[["label_en"]]
years
```

### Fetching and Visualizing Cancer Trends

```{r cancer-analysis, eval=TRUE}
# Plot incidence for four major cancer types (Lung, Liver, Stomach, Cervix)
# over the ten most recent years to show recent trends.

# Step 1: sort the year labels ascending and keep the codes of the last ten
annual_sorted <- arrange(annual_meta, label_en)
recent_years <- tail(annual_sorted$code, n = 10)

# Step 2: fetch those years for the selected cancer types
major_types <- c("1", "2", "3", "4") # Lung, Liver, Stomach, Cervix
cancer_data <- nso_data(
  tbl_id = cancer_tbl,
  selections = list(
    "Type malignant neoplasms" = major_types,
    "Annual" = recent_years
  ),
  labels = "en"
)

# One axis break per whole year inside the plotted range
whole_year_breaks <- function(limits) {
  seq(ceiling(min(limits)), floor(max(limits)), by = 1)
}

# Step 3: static line chart, one coloured series per cancer type
p <- ggplot(
  cancer_data,
  aes(
    x = as.integer(Annual_en),
    y = value,
    color = `Type malignant neoplasms_en`,
    group = `Type malignant neoplasms_en`
  )
) +
  geom_line(linewidth = 1.2) +
  # Hollow markers stay readable on top of the lines
  geom_point(size = 3, shape = 21, fill = "white", stroke = 1.5) +
  # Colorblind-friendly discrete palette
  scale_color_viridis_d(option = "plasma", end = 0.9) +
  scale_x_continuous(breaks = whole_year_breaks) +
  labs(
    title = "Cancer Incidence Trends in Mongolia",
    subtitle = "New cases per 10,000 population (Recent Trends)",
    caption = "Source: NSO Mongolia",
    color = "Cancer Type",
    x = NULL,
    y = "Incidence Rate (per 10,000)"
  ) +
  # theme_minimal() is a complete theme, so it takes the place of the global
  # theme set in the setup chunk for this plot
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(color = "grey40", margin = margin(b = 10)),
    axis.text = element_text(color = "grey30"),
    legend.position = "top",
    panel.grid.minor = element_blank(),
    # Vertical gridlines clutter multi-line plots
    panel.grid.major.x = element_blank()
  )

p # print static ggplot
```

### Regional Disparities

```{r infant-mortality, eval=TRUE}
# Infant mortality by aimag
imr_tbl <- "DT_NSO_2100_015V1" # IMR per 1,000 live births (Monthly)

# Values of the "Month" dimension with English labels. Only this lookup is
# needed here; the table-level metadata was never used, so it is not fetched.
months <- nso_dim_values(imr_tbl, "Month", labels = "en")

# Codes of every month whose label contains "2024" (matched literally);
# these months are averaged per region below
months_2024 <- months |>
  filter(grepl("2024", label_en, fixed = TRUE)) |>
  pull(code)

# Fetch the 2024 months for every region code, then reduce to one mean value
# per region
imr_data <- nso_data(
  tbl_id = imr_tbl,
  selections = list(
    "Region" = nso_dim_values(imr_tbl, "Region")$code,
    "Month" = months_2024
  ),
  labels = "en"
) |>
  filter(nchar(Region) == 3) |> # Keep only Aimags and Ulaanbaatar
  mutate(
    # Strip surrounding whitespace so the spelling lookup below can match
    Region_en = trimws(Region_en),
    # Replace selected region spellings; unlisted names pass through as-is.
    # NOTE(review): "Hovsgel" does not follow the diacritic pattern of the
    # other replacements -- confirm this spelling is intentional.
    Region_en = dplyr::case_match(
      Region_en,
      "Bayan-Ulgii" ~ "Bayan-Ölgii",
      "Uvurkhangai" ~ "Övörkhangai",
      "Khuvsgul" ~ "Hovsgel",
      "Umnugovi" ~ "Ömnögovi",
      "Tuv" ~ "Töv",
      "Sukhbaatar" ~ "Sükhbaatar",
      .default = Region_en
    )
  ) |>
  group_by(Region_en) |>
  # Mean of the monthly values, ignoring missing months
  summarise(value = mean(value, na.rm = TRUE), .groups = "drop")

# Find regions with highest IMR (top ten, highest first)
imr_data |>
  arrange(desc(value)) |>
  select(Region_en, value) |>
  head(10)
```

### Time Trend Analysis

```{r imr-trends, eval=TRUE}
# National monthly series: region code "0" is the national total
imr_national <- nso_data(
  tbl_id = imr_tbl,
  selections = list(Region = "0", Month = months$code),
  labels = "en"
)

# Month_en is formatted "YYYY-MM"; append a day so it parses as a Date, then
# keep the 2019-2024 window for a clear view of recent trends
window_start <- as.Date("2019-01-01")
window_end <- as.Date("2024-12-31")
imr_recent <- imr_national |>
  mutate(date = as.Date(paste0(Month_en, "-01"))) |>
  filter(date >= window_start, date <= window_end)

# Raw monthly values are drawn faintly so the LOESS smoother carries the trend
imr_blue <- "#2980b9"
ggplot(imr_recent, aes(x = date, y = value, group = 1)) +
  geom_line(color = imr_blue, linewidth = 1, alpha = 0.3) +
  geom_point(
    color = imr_blue,
    size = 2,
    shape = 21,
    fill = "white",
    stroke = 1,
    alpha = 0.5
  ) +
  geom_smooth(
    method = "loess",
    se = TRUE,
    color = imr_blue,
    fill = imr_blue,
    alpha = 0.2,
    linewidth = 1.5
  ) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  # Start the y-axis at 0 to avoid exaggerating changes
  scale_y_continuous(
    limits = c(0, NA),
    expand = expansion(mult = c(0, 0.2))
  ) +
  labs(
    title = "Infant Mortality Rate Trend",
    subtitle = "Monthly Deaths per 1,000 live births (2019-2024)",
    caption = "Source: NSO Mongolia",
    x = NULL,
    y = "IMR (per 1,000 live births)"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(color = "grey40"),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )
```

## Case Study: Tuberculosis Burden

Let's use monthly data to analyze tuberculosis case counts, looking at both seasonal variation and the longer-term trend.

```{r tb-data, eval=TRUE}
# Monthly table: CASES OF COMMUNICABLE DISEASES, by type of selected diseases
# and by month
tb_tbl <- "DT_NSO_2100_035V1"

# The table's dimensions are "Indicators" and "Month". Look up the indicator
# code(s) whose English label mentions tuberculosis (case-insensitive).
# NOTE(review): the plot below draws a single series, so this presumably
# matches exactly one indicator -- confirm against the table.
indicators <- nso_dim_values(tb_tbl, "Indicators", labels = "en")
is_tb <- grepl("Tuberculosis", indicators$label_en, ignore.case = TRUE)
tb_code <- indicators$code[is_tb]

# Fetch every available month for the tuberculosis indicator
tb_months <- nso_dim_values(tb_tbl, "Month")
tb_data <- nso_data(
  tbl_id = tb_tbl,
  selections = list(Indicators = tb_code, Month = tb_months$code),
  labels = "en"
)

# Parse the month labels into dates and drop months without a value
tb_monthly <- tb_data |>
  mutate(date = as.Date(paste0(Month_en, "-01"))) |>
  filter(!is.na(value))

# Faint raw series underneath, LOESS trend line on top
tb_raw_color <- "#2c3e50"
tb_trend_color <- "#e74c3c"
p <- ggplot(tb_monthly, aes(x = date, y = value, group = 1)) +
  geom_line(color = tb_raw_color, linewidth = 1, alpha = 0.3) +
  geom_point(color = tb_raw_color, size = 2, alpha = 0.3) +
  geom_smooth(
    method = "loess",
    se = TRUE,
    color = tb_trend_color,
    fill = tb_trend_color,
    alpha = 0.2,
    linewidth = 1.5
  ) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  # Start the y-axis at 0
  scale_y_continuous(
    limits = c(0, NA),
    expand = expansion(mult = c(0, 0.2))
  ) +
  labs(
    title = "Tuberculosis Cases in Mongolia",
    subtitle = "Monthly reported cases",
    caption = "Source: NSO Mongolia (DT_NSO_2100_035V1)",
    x = NULL,
    y = "Number of Cases (Monthly)"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(color = "grey40"),
    panel.grid.minor = element_blank()
  )

p # print static ggplot
```

> **Biostatistical Note:** This plot shows the *number* of reported cases, not the incidence *rate*. Trends should be interpreted with caution, as an increase in cases could be due to population growth or improved detection, rather than an increase in disease risk.

## Tips for Epidemiological Research

1. **Always check time coverage**: Use `nso_table_periods()` to verify data availability
2. **Use labels for clarity**: Set `labels = "en"` to get readable dimension names
3. **Join multiple indicators**: Combine tables to calculate derived metrics (e.g., case-fatality rates)
4. **Account for denominator data**: Link disease counts with population data for rate calculations
5. **Regional analysis**: Most health tables include breakdowns by aimag and soum for geographic analysis

## Next Steps

- **Mapping Health Outcomes**: See the [Mapping Guide](mapping.html) for spatial epidemiology
- **Reference Documentation**: Explore all available functions in the [Reference](https://temuulene.github.io/mongolstats/reference/index.html)