## ----include = FALSE----------------------------------------------------------
# Chunk-level knitr defaults: collapsed output, "#>" prefix, 7 x 5 figures.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)

## -----------------------------------------------------------------------------
library(autoFlagR)
library(dplyr)
library(ggplot2)

## -----------------------------------------------------------------------------
# Fixed seed so the simulated cohort is reproducible.
set.seed(123)

# Simulate healthcare data
n_patients <- 500

# NOTE: the columns are evaluated in the order written, and each random column
# consumes draws from the RNG stream. Reordering them changes the data.
healthcare_data <- data.frame(
  patient_id = seq_len(n_patients),
  age = round(rnorm(n_patients, mean = 55, sd = 15)),
  systolic_bp = round(rnorm(n_patients, mean = 120, sd = 15)),
  diastolic_bp = round(rnorm(n_patients, mean = 80, sd = 10)),
  cholesterol = round(rnorm(n_patients, mean = 200, sd = 40)),
  glucose = round(rnorm(n_patients, mean = 100, sd = 20)),
  bmi = round(rnorm(n_patients, mean = 28, sd = 5), digits = 1),
  gender = sample(c("Male", "Female"), size = n_patients, replace = TRUE),
  diagnosis = sample(
    c("Hypertension", "Diabetes", "Normal"),
    size = n_patients,
    replace = TRUE,
    prob = c(0.3, 0.2, 0.5)
  )
)

# Introduce known anomalies by overwriting rows 1-25 with implausible values.
# Impossible ages
healthcare_data[1:10, "age"] <- c(
  250, 180, 200, 190, 185,
  175, 170, 165, 160, 155
)
# Extreme blood pressure
healthcare_data[11:15, "systolic_bp"] <- c(300, 280, 290, 275, 285)
# Very high cholesterol
healthcare_data[16:20, "cholesterol"] <- c(600, 580, 590, 570, 585)
# Unrealistically low glucose
healthcare_data[21:25, "glucose"] <- c(5, 3, 4, 2, 6)

# Create ground truth labels and add to data: TRUE for the first 25 rows (the
# ones modified above), FALSE for everyone else.
healthcare_data$is_anomaly_truth <- seq_len(n_patients) <= 25

head(healthcare_data)

## -----------------------------------------------------------------------------
# Prepare data for anomaly detection
prepared <- prep_for_anomaly(
  healthcare_data,
  id_cols = "patient_id",
  scale_method = "mad"
)

# View preprocessing metadata stored as an attribute on the prepared object
str(attr(prepared, "metadata"))

## -----------------------------------------------------------------------------
# Score anomalies using Isolation Forest; the ground-truth column is passed so
# that benchmark metrics can be read back from the result further below.
scored_data <- score_anomaly(
  healthcare_data,
  method = "iforest",
  contamination = 0.05,
  ground_truth_col = "is_anomaly_truth",
  id_cols = "patient_id"
)

# View summary statistics
summary(scored_data$anomaly_score)

## -----------------------------------------------------------------------------
# Flag top anomalies
flagged_data <- flag_top_anomalies(
  scored_data,
  contamination = 0.05
)

# Count anomalies
cat("Total anomalies flagged:", sum(flagged_data$is_anomaly), "\n")
cat("Anomaly rate:", mean(flagged_data$is_anomaly) * 100, "%\n")

## -----------------------------------------------------------------------------
# Plot anomaly score distribution; the dashed red line marks the threshold
# read from the "anomaly_threshold" attribute of the flagged data.
ggplot(flagged_data, aes(x = anomaly_score)) +
  geom_histogram(
    bins = 50,
    fill = "steelblue",
    alpha = 0.7,
    color = "black"
  ) +
  geom_vline(
    xintercept = attr(flagged_data, "anomaly_threshold"),
    color = "red",
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Anomaly Scores",
    x = "Anomaly Score",
    y = "Frequency"
  ) +
  theme_minimal()

## -----------------------------------------------------------------------------
# Get top 10 anomalies
top_anomalies <- get_top_anomalies(flagged_data, n = 10)

# View top anomalies (identifier, the four tampered measures, score and flag)
top_anomalies[
  ,
  c(
    "patient_id", "age", "systolic_bp", "cholesterol", "glucose",
    "anomaly_score", "is_anomaly"
  )
]

## -----------------------------------------------------------------------------
# Extract benchmark metrics; skipped when the scored data carries no
# "benchmark_metrics" attribute.
if (!is.null(attr(scored_data, "benchmark_metrics"))) {
  metrics <- extract_benchmark_metrics(scored_data)
  cat("AUC-ROC:", metrics$auc_roc, "\n")
  cat("AUC-PR:", metrics$auc_pr, "\n")
  cat("Top-10 Recall:", metrics$top_k_recall$top_10, "\n")
  cat("Top-50 Recall:", metrics$top_k_recall$top_50, "\n")
}

## ----eval=FALSE---------------------------------------------------------------
# # Generate PDF audit report (saves to tempdir() by default)
# generate_audit_report(
#   healthcare_data,
#   filename = "healthcare_audit_report",
#   output_dir = tempdir(),
#   output_format = "pdf",
#   method = "iforest",
#   contamination = 0.05,
#   ground_truth_col = "is_anomaly_truth",
#   id_cols = "patient_id"
# )