---
title: "Healthcare Data Quality Example"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Healthcare Data Quality Example}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
# Global chunk options: merge source and output, prefix output lines with
# "#>", and use a common figure size for every plot in the vignette.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)
```

## Introduction

This vignette demonstrates using `autoFlagR` for data quality auditing in a healthcare context. We'll work through a complete example using simulated Electronic Health Records (EHR) data.

## Load Required Packages

```{r}
library(autoFlagR)
library(dplyr)
library(ggplot2)
```

## Create Example Healthcare Dataset

```{r}
# Fixed seed so the simulated data (and everything derived from it) is
# reproducible between builds.
set.seed(123)

# Simulate healthcare data
n_patients <- 500

healthcare_data <- data.frame(
  patient_id = 1:n_patients,
  age = round(rnorm(n_patients, 55, 15)),
  systolic_bp = round(rnorm(n_patients, 120, 15)),
  diastolic_bp = round(rnorm(n_patients, 80, 10)),
  cholesterol = round(rnorm(n_patients, 200, 40)),
  glucose = round(rnorm(n_patients, 100, 20)),
  bmi = round(rnorm(n_patients, 28, 5), 1),
  gender = sample(c("Male", "Female"), n_patients, replace = TRUE),
  diagnosis = sample(
    c("Hypertension", "Diabetes", "Normal"),
    n_patients,
    replace = TRUE,
    prob = c(0.3, 0.2, 0.5)
  )
)

# Introduce known anomalies by overwriting values in the first 25 rows.
# Each group of rows gets one corrupted column.
# Impossible ages
healthcare_data$age[1:10] <-
  c(250, 180, 200, 190, 185, 175, 170, 165, 160, 155)
# Extreme blood pressure
healthcare_data$systolic_bp[11:15] <- c(300, 280, 290, 275, 285)
# Very high cholesterol
healthcare_data$cholesterol[16:20] <- c(600, 580, 590, 570, 585)
# Unrealistically low glucose
healthcare_data$glucose[21:25] <- c(5, 3, 4, 2, 6)

# Create ground truth labels and add to data.
# Rows 1-25 are exactly the rows modified above.
healthcare_data$is_anomaly_truth <- rep(FALSE, n_patients)
healthcare_data$is_anomaly_truth[1:25] <- TRUE

head(healthcare_data)
```

## Preprocess Data

```{r}
# The ground-truth label is an evaluation target, not a measurement, so it is
# removed before previewing the preprocessing step. The scoring step below
# receives the label separately through `ground_truth_col`.
feature_data <- healthcare_data[
  ,
  setdiff(names(healthcare_data), "is_anomaly_truth"),
  drop = FALSE
]

# Prepare data for anomaly detection
prepared <- prep_for_anomaly(
  feature_data,
  id_cols = "patient_id",
  scale_method = "mad"
)

# View preprocessing metadata
str(attr(prepared, "metadata"))
```

## Score Anomalies

```{r}
# Score anomalies using Isolation Forest.
# The full data set is passed here; the identifier and label columns are
# named explicitly via `id_cols` and `ground_truth_col`.
scored_data <- score_anomaly(
  healthcare_data,
  method = "iforest",
  contamination = 0.05,
  ground_truth_col = "is_anomaly_truth",
  id_cols = "patient_id"
)

# View summary statistics
summary(scored_data$anomaly_score)
```

## Flag Top Anomalies

```{r}
# Flag top anomalies, using the same contamination value as for scoring
flagged_data <- flag_top_anomalies(
  scored_data,
  contamination = 0.05
)

# Count anomalies
cat("Total anomalies flagged:", sum(flagged_data$is_anomaly), "\n")
cat("Anomaly rate:", mean(flagged_data$is_anomaly) * 100, "%\n")
```

## Visualize Results

```{r}
# Plot anomaly score distribution.
# The dashed red line marks the value stored on `flagged_data` in its
# "anomaly_threshold" attribute.
ggplot(flagged_data, aes(x = anomaly_score)) +
  geom_histogram(
    bins = 50,
    fill = "steelblue",
    alpha = 0.7,
    color = "black"
  ) +
  geom_vline(
    xintercept = attr(flagged_data, "anomaly_threshold"),
    color = "red",
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Anomaly Scores",
    x = "Anomaly Score",
    y = "Frequency"
  ) +
  theme_minimal()
```

## Extract Top Anomalies

```{r}
# Get top 10 anomalies
top_anomalies <- get_top_anomalies(flagged_data, n = 10)

# View top anomalies, restricted to the columns relevant for review
top_anomalies[, c(
  "patient_id", "age", "systolic_bp", "cholesterol", "glucose",
  "anomaly_score", "is_anomaly"
)]
```

## Benchmarking (if ground truth available)

```{r}
# Extract benchmark metrics.
# Only runs when `scored_data` carries a "benchmark_metrics" attribute.
if (!is.null(attr(scored_data, "benchmark_metrics"))) {
  metrics <- extract_benchmark_metrics(scored_data)

  cat("AUC-ROC:", metrics$auc_roc, "\n")
  cat("AUC-PR:", metrics$auc_pr, "\n")
  cat("Top-10 Recall:", metrics$top_k_recall$top_10, "\n")
  cat("Top-50 Recall:", metrics$top_k_recall$top_50, "\n")
}
```

## Generate Comprehensive Report

```{r eval=FALSE}
# Generate PDF audit report (saves to tempdir() by default).
# This chunk is not evaluated when the vignette is built (eval=FALSE).
generate_audit_report(
  healthcare_data,
  filename = "healthcare_audit_report",
  output_dir = tempdir(),
  output_format = "pdf",
  method = "iforest",
  contamination = 0.05,
  ground_truth_col = "is_anomaly_truth",
  id_cols = "patient_id"
)
```

The report will include:

- Executive summary with key metrics
- Anomaly score distribution
- Prioritized audit listing (heatmap)
- Bivariate visualizations
- Distribution comparisons
- Benchmarking results (if ground truth provided)

## Summary

This example demonstrated:

1. Creating and preprocessing healthcare data
2. Scoring anomalies using Isolation Forest
3. Flagging top anomalies for review
4. Visualizing results
5. Extracting benchmark metrics
6. Generating professional audit reports

For more details, see the [Function Reference](https://vikrant31.github.io/autoFlagR/reference/index.html).