## ----include = FALSE----------------------------------------------------------
# Global chunk options for the rendered document: collapse = TRUE and "#>" as
# the prefix for printed output.
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----setup--------------------------------------------------------------------
library(wdiexplorer)

## ----get-data-----------------------------------------------------------------
# Retrieve the "LO.PISA.MAT" indicator via get_wdi_data(), with verbose
# reporting switched on.
pisa_data <- get_wdi_data(
  indicator = "LO.PISA.MAT",
  verbose = TRUE
)

## -----------------------------------------------------------------------------
# Column-wise overview of the returned data.
dplyr::glimpse(pisa_data)

## ----missingness-plot, fig.width=7.5, fig.height=11.5, fig.cap="Missingness plot, providing information about years and countries with missing entries and the overall percentages of missing and present data. It also shows that no data points are available across all countries during the years 1960 to 1999 and 2019 to 2024. It also shows that data are collected triennially."----
# Missingness plot, with income as the grouping variable.
plot_missing(
  wdi_data = pisa_data,
  group_var = "income"
)

## ----missingness--------------------------------------------------------------
# Name of the indicator column whose missing values are summarised.
index <- "LO.PISA.MAT"

# Missing-value summary of the indicator for each income/country pair, sorted
# so the countries with the most missing entries come first.
# desc() is namespace-qualified because dplyr is never attached in this script;
# all other dplyr verbs here are called with an explicit `dplyr::` prefix too.
pisa_data |>
  dplyr::select(country, income, year, tidyselect::all_of(index)) |>
  dplyr::group_by(income, country) |>
  naniar::miss_var_summary() |>
  dplyr::filter(variable == index) |>
  dplyr::arrange(dplyr::desc(n_miss))

## ----checks-function----------------------------------------------------------
# Run the package's get_valid_data() check on the data, with verbose reporting.
get_valid_data(
  pisa_data,
  verbose = TRUE
)

## ----variation----------------------------------------------------------------
# Dissimilarity matrix computed from the PISA data.
pisa_diss_mat <- compute_dissimilarity(pisa_data)

# Variation metrics derived from that matrix, with income as the grouping
# variable.
pisa_variation <- compute_variation(
  pisa_data,
  diss_matrix = pisa_diss_mat,
  group_var = "income"
)

## ----dissimilarities----------------------------------------------------------
# Three rows with the largest country_avg_dist.
# desc() is namespace-qualified because dplyr is never attached in this script.
pisa_variation |>
  dplyr::arrange(dplyr::desc(country_avg_dist)) |>
  dplyr::slice_head(n = 3)

## ----trend-shape--------------------------------------------------------------
# Trend and shape features computed from the PISA data.
pisa_trend_shape <- compute_trend_shape_features(pisa_data)

## ----trend-strength-----------------------------------------------------------
# Three rows with the highest trend_strength.
# desc() is namespace-qualified because dplyr is never attached in this script.
pisa_trend_shape |>
  dplyr::arrange(dplyr::desc(trend_strength)) |>
  dplyr::slice_head(n = 3)

## ----temporal-----------------------------------------------------------------
# Temporal features computed from the PISA data.
pisa_temporal <- compute_temporal_features(pisa_data)

## ----flat-spot----------------------------------------------------------------
# Sort by flat_spot (largest first), then keep the first three and the last
# three rows, i.e. both extremes of the ordering.
# desc() is namespace-qualified because dplyr is never attached in this script.
pisa_temporal |>
  dplyr::arrange(dplyr::desc(flat_spot)) |>
  dplyr::slice(c(1:3, (dplyr::n() - 2):dplyr::n()))

## ----diagnostic-metrics-------------------------------------------------------
# All diagnostic indices in one call, with income as the grouping variable.
pisa_diagnostic_metrics <- compute_diagnostic_indices(
  pisa_data,
  group_var = "income"
)

## ----add-group----------------------------------------------------------------
# Attach group information from the data to the metric summary.
pisa_diagnostic_metrics_group <- add_group_info(
  metric_summary = pisa_diagnostic_metrics,
  pisa_data
)

## ----distribution-plot1, fig.height=5, fig.cap="Distribution of diagnostic indices where each panel represents a different metric. It shows the spread of the metric values across countries, with each dot representing a country and coloured by income."----
# Distribution of every metric, ungrouped; points coloured by income.
plot_metric_distribution(
  metric_summary = pisa_diagnostic_metrics_group,
  colour_var = "income"
)

## ----distribution-plot2, fig.height=5, fig.cap="Distribution of diagnostic indices grouped by income. Each panel displays a metric, with countries organised by income to facilitate within and between group comparisons. The plot reveals income-specific patterns and outliers. High income and low income groups show wider spread across all metrics."----
# Distribution of every metric, grouped and coloured by income.
plot_metric_distribution(
  metric_summary = pisa_diagnostic_metrics_group,
  colour_var = "income",
  group_var = "income"
)

## ----trend-strength-dist, fig.height=5, fig.cap="Distribution of the trend strength metric coloured by income."----
# Ungrouped distribution restricted to the trend_strength metric.
plot_metric_distribution(
  metric_summary = pisa_diagnostic_metrics_group,
  metric_var = "trend_strength",
  colour_var = "income"
)

## ----linearity-curvature-dist, fig.height=5, fig.cap="Distribution of the linearity and curvature metrics coloured by income and grouped by income."----
# Grouped distribution restricted to the linearity and curvature metrics.
plot_metric_distribution(
  metric_summary = pisa_diagnostic_metrics_group,
  metric_var = c("linearity", "curvature"),
  colour_var = "income",
  group_var = "income"
)

## ----partition plot, fig.width=7, fig.height=9, fig.cap = "Country silhouette widths, grouped by income, with the average silhouette width for each income underlaid beneath the country bars. The majority of the countries in high income group exhibit positive silhouette widths. Across all the groups, they exhibit both positive and negative silhouette widths."----
# Partition plot of the sil_width metric, grouped by income.
plot_metric_partition(
  metric_summary = pisa_diagnostic_metrics_group,
  metric_var = "sil_width",
  group_var = "income"
)

## ----trajectories-plot1, fig.height=3, fig.cap="The country line plots of PISA mathematics average scores dataset. Hovering over each line displays the corresponding country name."----
# Data trajectories in a single panel (no grouping).
plot_data_trajectories(
  pisa_data
)


## ----trajectories-plot2, fig.height=5, fig.cap="The PISA mathematics average scores data trajectories faceted by income."----
# Data trajectories grouped by income.
plot_data_trajectories(
  pisa_data,
  group_var = "income"
)

## ----metric-trajectories-plot1, fig.height=3, fig.cap="The PISA mathematics average scores data trajectories. Countries with average dissimilarity distance values below or at the 95th percentile are displayed in grey, while countries with the top 5% average dissimilarity between itself and other countries are highlighted using a colour gradient. Kyrgyz Republic, China, Dominican Republic and Singapore are the only highlighted countries."----
# Ungrouped trajectories, using the country_avg_dist metric from the
# variation summary.
plot_data_trajectories(
  pisa_data,
  metric_summary = pisa_variation,
  metric_var = "country_avg_dist"
)

## ----metric-trajectories-plot2, fig.height=5, fig.cap="The PISA mathematics average scores data trajectories faceted by income groupings, using group-based thresholds, with countries highlighted based on the within-group average dissimilarity metric values. Countries with within-group average dissimilarity values below or at the 96th percentile are displayed in grey, while countries within the top 4% of within-group average dissimilarity values are displayed using a colour gradient."----
# Grouped version: first attach group information to the variation summary.
pisa_variation_group <- add_group_info(
  metric_summary = pisa_variation,
  pisa_data
)

# Trajectories grouped by income, using the within_group_avg_dist metric.
plot_data_trajectories(
  pisa_data,
  metric_summary = pisa_variation_group,
  metric_var = "within_group_avg_dist",
  group_var = "income"
)

## ----parallel-plot1, fig.height=5, fig.cap = "The static version of the parallel coordinate plot displaying the metric values across all the diagnostic indices. The metric values are normalised to a scale of 0 to 1."----
# Parallel coordinate plot of the diagnostic summary, coloured by income.
plot_parallel_coords(
  diagnostic_summary = pisa_diagnostic_metrics_group,
  colour_var = "income"
)

## ----parallel-plot2, fig.height=5, fig.cap="The static version of the parallel coordinate plot displaying the metric values across all diagnostic indices grouped by income. The metric values are normalised to a scale of 0 to 1 within each group. Countries in upper middle income, shown in blue, display a wide spread across most diagnostic indices."----
# Parallel coordinate plot of the diagnostic summary, coloured and grouped by
# income.
plot_parallel_coords(
  diagnostic_summary = pisa_diagnostic_metrics_group,
  colour_var = "income",
  group_var = "income"
)

## ----link-view1, fig.height=3.5, fig.cap="The static version of the interactive link-based plot showing the relationship between linearity and curvature metrics across all countries. Each point in the scatterplot represents a country, and hovering a point reveals its corresponding data series."----
# Ungrouped link view of the linearity and curvature metrics.
plot_metric_linkview(
  pisa_data,
  metric_summary = pisa_diagnostic_metrics,
  metric_var = c("linearity", "curvature")
)

## ----link-view2, fig.height=5, fig.cap="The static version of the grouped link-based plot showing the relationship between linearity and curvature metrics across all countries faceted by income. Each point in the scatterplot represents a country, and hovering a point reveals its corresponding data series in its panel."----
# Link view of the linearity and curvature metrics, grouped by income; uses the
# metric summary that carries group information.
plot_metric_linkview(
  pisa_data,
  metric_summary = pisa_diagnostic_metrics_group,
  metric_var = c("linearity", "curvature"),
  group_var = "income"
)