---
title: "Discovering Public Health Data"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Discovering Public Health Data}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
# Chunk defaults applied to every code block in this vignette.
knitr::opts_chunk$set(
  # Output appearance: fold results into the source block, prefix with "#>"
  collapse = TRUE,
  comment = "#>",
  # Hide warnings and messages in the rendered page
  warning = FALSE,
  message = FALSE,
  # Figure geometry; individual chunks may override these
  fig.width = 7,      # inches, sized for the vignette text column
  fig.height = 5,     # inches, default height
  fig.dpi = 150,      # above the default resolution for sharper figures
  out.width = "100%"  # never wider than the page, so figures do not overflow
)
```

```{r setup, message=FALSE, warning=FALSE}
library(mongolstats)
library(dplyr)
library(ggplot2)

# Request English labels from the NSO API for the rest of the session
nso_options(mongolstats.lang = "en")

# Session-wide plot theme: start from theme_minimal(), then layer on margins
# and text sizes so titles and legends are not clipped.
theme_set(theme_minimal(base_size = 11))
theme_update(
  plot.margin = margin(10, 10, 10, 10),
  plot.title = element_text(size = 13, face = "bold"),
  plot.subtitle = element_text(size = 10, color = "grey40"),
  legend.text = element_text(size = 9),
  legend.title = element_text(size = 10)
)
```

## Overview

Mongolia's National Statistics Office maintains comprehensive public health surveillance data. This guide demonstrates how to discover and access epidemiological data for research and policy analysis.

## Finding Health Tables

### Search by Keyword

Finding the right data is the first step in any analysis.
The `nso_itms_search()` function allows you to query the entire NSO catalog using simple keywords:

```{r search-examples, eval=TRUE}
# Each search returns a table of matches; only the table id and its English
# name are shown here.

# Infant and maternal health
mortality <- nso_itms_search("mortality")
head(mortality, 10) |>
  select(tbl_id, tbl_eng_nm)

# Cancer surveillance
cancer <- nso_itms_search("cancer")
select(cancer, tbl_id, tbl_eng_nm)

# Communicable diseases
infectious <- nso_itms_search("tuberculosis")
select(infectious, tbl_id, tbl_eng_nm)
```

### Browse by Sector

Health and education statistics are grouped together:

```{r sectors, eval=TRUE}
# Full sector listing
sectors <- nso_sectors()
sectors

# Sectors whose label mentions "health" (case-insensitive)
health_sector <- filter(sectors, grepl("health", text, ignore.case = TRUE))

# Drill into the first match, if there is one
if (nrow(health_sector) > 0) {
  subsectors <- nso_subsectors(health_sector$id[1])
  head(subsectors)
}
```

## Case Study: Cancer Epidemiology

### Exploring Cancer Incidence Data

Cancer burden is shifting in Mongolia. To understand these changes, we can analyze incidence trends over the last decade:

```{r cancer-metadata, eval=TRUE}
# Cancer incidence table
cancer_tbl <- "DT_NSO_2100_012V1" # New cases per 10,000 population

# Table metadata, including the available dimensions
meta <- nso_table_meta(cancer_tbl)
meta

# First ten cancer types with English labels
cancer_types <- nso_dim_values(
  cancer_tbl,
  "Type malignant neoplasms",
  labels = "en"
)
head(cancer_types, 10)

# Time coverage. The "Annual" dimension is addressed by internal codes, so
# both the codes and the year labels are requested to map between them.
annual_meta <- nso_dim_values(cancer_tbl, "Annual", labels = "both")
years <- pull(annual_meta, label_en)
years
```

### Fetching and Visualizing Cancer Trends

```{r cancer-analysis, eval=TRUE}
# Goal: incidence for 4 major cancer types (Lung, Liver, Stomach, Cervix)
# over the 10 most recent years, to show recent trends.

# Step 1: Sort by year label and keep the codes of the last 10 entries
recent_years <- annual_meta |>
  arrange(label_en) |>
  pull(code) |>
  tail(10)

# Step 2: Fetch data for major cancer types
cancer_data <- nso_data(
  tbl_id = cancer_tbl,
  selections = list(
    # Codes "1"-"4" are expected to be Lung, Liver, Stomach, Cervix; the
    # code/label pairs can be checked against `cancer_types`.
    "Type malignant neoplasms" = c("1", "2", "3", "4"),
    "Annual" = recent_years
  ),
  labels = "en"
)

# Visualize cancer incidence trends as static plot: one line per cancer type
p <- cancer_data |>
  ggplot(aes(
    x = as.integer(Annual_en), # year labels arrive as text
    y = value,
    color = `Type malignant neoplasms_en`,
    group = `Type malignant neoplasms_en`
  )) +
  geom_line(linewidth = 1.2) +
  # hollow points stand out on lines
  geom_point(size = 3, shape = 21, fill = "white", stroke = 1.5) +
  # colorblind-friendly discrete palette
  scale_color_viridis_d(option = "plasma", end = 0.9) +
  # one axis break per whole year across the plotted range
  scale_x_continuous(
    breaks = function(x) seq(ceiling(min(x)), floor(max(x)), by = 1)
  ) +
  labs(
    title = "Cancer Incidence Trends in Mongolia",
    subtitle = "New cases per 10,000 population (Recent Trends)",
    x = NULL,
    y = "Incidence Rate (per 10,000)",
    color = "Cancer Type",
    caption = "Source: NSO Mongolia"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "top",
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(color = "grey40", margin = margin(b = 10)),
    panel.grid.minor = element_blank(),
    # vertical gridlines clutter multi-line plots
    panel.grid.major.x = element_blank(),
    axis.text = element_text(color = "grey30")
  )

p # print static ggplot
```

### Regional Disparities

```{r infant-mortality, eval=TRUE}
# Infant mortality by aimag
imr_tbl <- "DT_NSO_2100_015V1" # IMR per 1,000 live births (Monthly)

# Month dimension (codes plus English labels); reused by the trend chunk below
months <- nso_dim_values(imr_tbl, "Month", labels = "en")

# Codes of every month whose label mentions 2024; these are averaged below
months_2024 <- months |>
  filter(grepl("2024", label_en)) |>
  pull(code)

# Fetch 2024 monthly values for all regions, then average per region
imr_data <- nso_data(
  tbl_id = imr_tbl,
  selections = list(
    "Region" = nso_dim_values(imr_tbl, "Region")$code,
    "Month" = months_2024
  ),
  labels = "en"
) |>
  filter(nchar(Region) == 3) |> # Keep only Aimags and Ulaanbaatar
  mutate(
    Region_en = trimws(Region_en),
    # Map the romanizations returned by the API to the spellings with
    # diacritics; names not listed pass through unchanged. These names are
    # only displayed here. If you join to a boundary layer, match that
    # layer's spelling instead.
    Region_en = dplyr::case_match(
      Region_en,
      "Bayan-Ulgii" ~ "Bayan-Ölgii",
      "Uvurkhangai" ~ "Övörkhangai",
      "Khuvsgul" ~ "Khövsgöl",
      "Umnugovi" ~ "Ömnögovi",
      "Tuv" ~ "Töv",
      "Sukhbaatar" ~ "Sükhbaatar",
      .default = Region_en
    )
  ) |>
  group_by(Region_en) |>
  summarise(value = mean(value, na.rm = TRUE), .groups = "drop")

# Find regions with highest IMR
imr_data |>
  arrange(desc(value)) |>
  select(Region_en, value) |>
  head(10)
```

### Time Trend Analysis

```{r imr-trends, eval=TRUE}
# Analyze national trend (Monthly)
imr_national <- nso_data(
  tbl_id = imr_tbl,
  selections = list(
    "Region" = "0", # National total
    "Month" = months$code
  ),
  labels = "en"
)

# Analyze national infant mortality trend using monthly data
# Convert Month_en column (format: "YYYY-MM") to proper dates
# Filter to 2019-2024 period for clear recent trends
imr_national |>
  mutate(date = as.Date(paste0(Month_en, "-01"))) |>
  filter(date >= as.Date("2019-01-01") & date <= as.Date("2024-12-31")) |>
  ggplot(aes(x = date, y = value, group = 1)) +
  # dim raw data so trend stands out
  geom_line(color = "#2980b9", linewidth = 1, alpha = 0.3) +
  geom_point(
    color = "#2980b9", size = 2, shape = 21,
    fill = "white", stroke = 1, alpha = 0.5
  ) +
  # LOESS smoother reveals underlying trend
  geom_smooth(
    method = "loess", se = TRUE,
    color = "#2980b9", fill = "#2980b9", alpha = 0.2, linewidth = 1.5
  ) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  # y-axis starts at 0 to avoid exaggerating changes
  scale_y_continuous(
    limits = c(0, NA),
    expand = expansion(mult = c(0, 0.2))
  ) +
  labs(
    title = "Infant Mortality Rate Trend",
    subtitle = "Monthly Deaths per 1,000 live births (2019-2024)",
    x = NULL,
    y = "IMR (per 1,000 live births)",
    caption = "Source: NSO Mongolia"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(color = "grey40"),
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank()
  )
```

## Case Study: Tuberculosis Burden

Let's analyze the seasonal trends of Tuberculosis using
monthly data.

```{r tb-data, eval=TRUE}
# TB cases (Monthly)
# Table title: CASES OF COMMUNICABLE DISEASES, by type of selected diseases
# and by month
tb_tbl <- "DT_NSO_2100_035V1"

# The table's dimensions are "Indicators" and "Month"; look up the indicator
# code(s) whose English label mentions tuberculosis.
indicators <- nso_dim_values(tb_tbl, "Indicators", labels = "en")
tb_code <- indicators |>
  filter(grepl("Tuberculosis", label_en, ignore.case = TRUE)) |>
  pull(code)

# Request every available month for the matched indicator(s)
tb_month_codes <- nso_dim_values(tb_tbl, "Month")$code
tb_data <- nso_data(
  tbl_id = tb_tbl,
  selections = list(
    "Indicators" = tb_code,
    "Month" = tb_month_codes
  ),
  labels = "en"
)

# Plot-ready series: month labels turned into first-of-month dates, rows
# without a value dropped
tb_monthly <- tb_data |>
  mutate(date = as.Date(paste0(Month_en, "-01"))) |>
  filter(!is.na(value))

# Monthly tuberculosis trend: faded raw series under a LOESS trend line
p <- ggplot(tb_monthly, aes(x = date, y = value, group = 1)) +
  # raw data, dimmed
  geom_line(color = "#2c3e50", linewidth = 1, alpha = 0.3) +
  geom_point(color = "#2c3e50", size = 2, alpha = 0.3) +
  # trend line
  geom_smooth(
    method = "loess", se = TRUE,
    color = "#e74c3c", fill = "#e74c3c", alpha = 0.2, linewidth = 1.5
  ) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  # y-axis anchored at 0
  scale_y_continuous(
    limits = c(0, NA),
    expand = expansion(mult = c(0, 0.2))
  ) +
  labs(
    title = "Tuberculosis Cases in Mongolia",
    subtitle = "Monthly reported cases",
    x = NULL,
    y = "Number of Cases (Monthly)",
    caption = "Source: NSO Mongolia (DT_NSO_2100_035V1)"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 16),
    plot.subtitle = element_text(color = "grey40"),
    panel.grid.minor = element_blank()
  )

p # print static ggplot
```

> **Biostatistical Note:** This plot shows the *number* of reported cases, not the incidence *rate*. Trends should be interpreted with caution, as an increase in cases could be due to population growth or improved detection, rather than an increase in disease risk.

## Tips for Epidemiological Research

1. **Always check time coverage**: Use `nso_table_periods()` to verify data availability
2. **Use labels for clarity**: Set `labels = "en"` to get readable dimension names
3. **Join multiple indicators**: Combine tables to calculate derived metrics (e.g., case-fatality rates)
4. **Account for denominator data**: Link disease counts with population data for rate calculations
5. **Regional analysis**: Most health tables include breakdowns by aimag and soum for geographic analysis

## Next Steps

- **Mapping Health Outcomes**: See the [Mapping Guide](mapping.html) for spatial epidemiology
- **Reference Documentation**: Explore all available functions in the [Reference](https://temuulene.github.io/mongolstats/reference/index.html)