---
title: "Healthcare Data Quality Example"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Healthcare Data Quality Example}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
# Default chunk options applied to the code chunks in this vignette.
knitr::opts_chunk$set(
  # Figure size options
  fig.height = 5,
  fig.width = 7,
  # Output formatting options
  comment = "#>",
  collapse = TRUE
)
```

## Introduction

This vignette demonstrates using `autoFlagR` for data quality auditing in a healthcare context. We'll work through a complete example using simulated Electronic Health Records (EHR) data.

## Load Required Packages

```{r}
library(autoFlagR)
library(dplyr)
library(ggplot2)
```

## Create Example Healthcare Dataset

```{r}
# Fix the RNG seed so the simulated data set is reproducible.
set.seed(123)

# Simulate healthcare data: one row per patient.
# The column order is kept as-is so the same seed keeps producing the same
# simulated values.
n_patients <- 500
healthcare_data <- data.frame(
  patient_id = seq_len(n_patients),
  age = round(rnorm(n_patients, 55, 15)),
  systolic_bp = round(rnorm(n_patients, 120, 15)),
  diastolic_bp = round(rnorm(n_patients, 80, 10)),
  cholesterol = round(rnorm(n_patients, 200, 40)),
  glucose = round(rnorm(n_patients, 100, 20)),
  bmi = round(rnorm(n_patients, 28, 5), 1),
  gender = sample(c("Male", "Female"), n_patients, replace = TRUE),
  diagnosis = sample(
    c("Hypertension", "Diabetes", "Normal"),
    n_patients,
    replace = TRUE,
    prob = c(0.3, 0.2, 0.5)
  )
)

# Introduce known anomalies. The affected rows are named once so that the
# ground truth labels are derived from exactly the rows that were modified.
age_rows <- 1:10
bp_rows <- 11:15
cholesterol_rows <- 16:20
glucose_rows <- 21:25

# Impossible ages
healthcare_data$age[age_rows] <- c(
  250, 180, 200, 190, 185, 175, 170, 165, 160, 155
)
# Extreme blood pressure
healthcare_data$systolic_bp[bp_rows] <- c(300, 280, 290, 275, 285)
# Very high cholesterol
healthcare_data$cholesterol[cholesterol_rows] <- c(600, 580, 590, 570, 585)
# Unrealistically low glucose
healthcare_data$glucose[glucose_rows] <- c(5, 3, 4, 2, 6)

# Create ground truth labels and add to data: TRUE for every row that
# received an injected anomaly, FALSE for all other rows.
anomalous_rows <- c(age_rows, bp_rows, cholesterol_rows, glucose_rows)
healthcare_data$is_anomaly_truth <- seq_len(n_patients) %in% anomalous_rows

head(healthcare_data)
```

## Preprocess Data

```{r}
# Run the preprocessing step on its own, treating patient_id as an
# identifier column and using the "mad" scaling method.
# `prepared` is only inspected in this chunk; the scoring step later in the
# vignette is passed `healthcare_data` directly.
prepared <- prep_for_anomaly(
  healthcare_data,
  scale_method = "mad",
  id_cols = "patient_id"
)

# Show the structure of the metadata attribute attached to the result
prep_metadata <- attr(prepared, "metadata")
str(prep_metadata)
```

## Score Anomalies

```{r}
# Score every record with the "iforest" (Isolation Forest) method.
# The ground truth column is supplied so benchmark metrics can be read back
# from the result later in the vignette.
scored_data <- score_anomaly(
  healthcare_data,
  id_cols = "patient_id",
  ground_truth_col = "is_anomaly_truth",
  method = "iforest",
  contamination = 0.05
)

# Five-number summary (plus mean) of the resulting anomaly scores
anomaly_scores <- scored_data[["anomaly_score"]]
summary(anomaly_scores)
```

## Flag Top Anomalies

```{r}
# Mark the highest-scoring records as anomalies, using the same
# contamination value as the scoring step
flagged_data <- flag_top_anomalies(scored_data, contamination = 0.05)

# Report how many records were flagged and what share of the data that is
anomaly_flags <- flagged_data$is_anomaly
n_flagged <- sum(anomaly_flags)
flagged_pct <- mean(anomaly_flags) * 100

cat("Total anomalies flagged:", n_flagged, "\n")
cat("Anomaly rate:", flagged_pct, "%\n")
```

## Visualize Results

```{r}
# Value of the "anomaly_threshold" attribute of the flagged data, drawn as a
# dashed reference line on the histogram
score_cutoff <- attr(flagged_data, "anomaly_threshold")

# Histogram of anomaly scores with the threshold overlaid
score_histogram <- ggplot(flagged_data, aes(x = anomaly_score)) +
  geom_histogram(
    bins = 50,
    fill = "steelblue",
    color = "black",
    alpha = 0.7
  ) +
  geom_vline(
    xintercept = score_cutoff,
    color = "red",
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Anomaly Scores",
    x = "Anomaly Score",
    y = "Frequency"
  ) +
  theme_minimal()

score_histogram
```

## Extract Top Anomalies

```{r}
# Pull the 10 top-ranked records from the flagged data
top_anomalies <- get_top_anomalies(flagged_data, n = 10)

# Show only the identifier, the measurements that were tampered with in the
# simulation, and the scoring outputs
display_cols <- c(
  "patient_id", "age", "systolic_bp", "cholesterol",
  "glucose", "anomaly_score", "is_anomaly"
)
top_anomalies[, display_cols]
```

## Benchmarking (if ground truth available)

```{r}
# Benchmark metrics are only reported when the scored data carries a
# "benchmark_metrics" attribute
has_benchmark <- !is.null(attr(scored_data, "benchmark_metrics"))

if (has_benchmark) {
  metrics <- extract_benchmark_metrics(scored_data)
  recall_at_k <- metrics$top_k_recall

  # Area-under-curve metrics
  cat("AUC-ROC:", metrics$auc_roc, "\n")
  cat("AUC-PR:", metrics$auc_pr, "\n")

  # Recall among the top-ranked records
  cat("Top-10 Recall:", recall_at_k$top_10, "\n")
  cat("Top-50 Recall:", recall_at_k$top_50, "\n")
}
```

## Generate Comprehensive Report

```{r eval=FALSE}
# Generate PDF audit report (saves to tempdir() by default; the directory is
# also passed explicitly here)
report_dir <- tempdir()

generate_audit_report(
  healthcare_data,
  # Where and in which format the report is written
  output_dir = report_dir,
  filename = "healthcare_audit_report",
  output_format = "pdf",
  # Scoring settings, matching the score_anomaly() call earlier in the vignette
  method = "iforest",
  contamination = 0.05,
  id_cols = "patient_id",
  ground_truth_col = "is_anomaly_truth"
)
```

The report will include:

- Executive summary with key metrics
- Anomaly score distribution
- Prioritized audit listing (heatmap)
- Bivariate visualizations
- Distribution comparisons
- Benchmarking results (if ground truth provided)

## Summary

This example demonstrated:

1. Creating and preprocessing healthcare data
2. Scoring anomalies using Isolation Forest
3. Flagging top anomalies for review
4. Visualizing results
5. Extracting benchmark metrics
6. Generating professional audit reports

For more details, see the [Function Reference](https://vikrant31.github.io/autoFlagR/reference/index.html).