## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 4, message = FALSE, warning = FALSE ) library(datadriftR) ## ----drift-types-viz, fig.height=5, fig.cap="Common drift patterns", echo=FALSE---- set.seed(42) n <- 400 par(mfrow = c(2, 2), mar = c(4, 4, 3, 1)) # Abrupt plot(c(rnorm(200, 0, 1), rnorm(200, 3, 1)), type = "l", col = "#457B9D", main = "Abrupt Drift", xlab = "Time", ylab = "Value") abline(v = 200, col = "#E63946", lty = 2, lwd = 2) # Gradual gradual <- sapply(1:n, function(i) rnorm(1, mean = min(3, max(0, (i-150)/80)), sd = 1)) plot(gradual, type = "l", col = "#457B9D", main = "Gradual Drift", xlab = "Time", ylab = "Value") rect(150, -4, 230, 6, col = rgb(0.9, 0.2, 0.2, 0.15), border = NA) # Incremental plot(rnorm(n, mean = (1:n)/100, sd = 1), type = "l", col = "#457B9D", main = "Incremental Drift", xlab = "Time", ylab = "Value") # Recurring plot(sin((1:n)/25) * 2 + rnorm(n, 0, 0.3), type = "l", col = "#457B9D", main = "Recurring Drift", xlab = "Time", ylab = "Value") par(mfrow = c(1, 1)) ## ----quick-start-------------------------------------------------------------- set.seed(1) x <- c(rnorm(300, 0, 1), rnorm(200, 3, 1)) detect_drift(x, method = "page_hinkley", delta = 0.05, threshold = 50) ## ----error-rate-setup--------------------------------------------------------- set.seed(123) n_good <- 500 n_bad <- 500 error_stream <- c( rbinom(n_good, 1, prob = 0.05), rbinom(n_bad, 1, prob = 0.30) ) true_drift_error <- n_good + 1 ## ----error-methods-comparison------------------------------------------------- error_methods <- c("ddm", "eddm", "hddm_a", "hddm_w") first_index <- function(res, type) { idx <- res$index[res$type == type] if (length(idx) == 0) NA_integer_ else idx[1] } error_results <- do.call(rbind, lapply(error_methods, function(m) { res <- detect_drift(error_stream, method = m, include_warnings = TRUE) warning_idx <- first_index(res, "warning") drift_idx <- first_index(res, "drift") data.frame( Method = gsub("_", "-", toupper(m)), Warning = warning_idx, Drift = drift_idx, DriftDelay = if (!is.na(drift_idx)) drift_idx - true_drift_error else NA, stringsAsFactors = FALSE ) })) error_results ## ----error-rate-plot, fig.height=4, echo=FALSE-------------------------------- window <- 50 error_rate <- sapply(seq_along(error_stream), function(i) { mean(error_stream[max(1, i-window+1):i]) }) plot(error_rate, type = "l", col = "gray50", lwd = 2, xlab = "Observation", ylab = paste0("Error Rate (", window, "-obs window)"), main = "Error-Rate Method Comparison") abline(v = true_drift_error, col = "black", lty = 2, lwd = 2) colors <- c("#E63946", "#F4A261", "#2A9D8F", "#9B5DE5") for (i in seq_len(nrow(error_results))) { if (!is.na(error_results$Warning[i])) { abline(v = error_results$Warning[i], col = colors[i], lwd = 2, lty = 3) } if (!is.na(error_results$Drift[i])) { abline(v = error_results$Drift[i], col = colors[i], lwd = 2) } } legend("topleft", c("True drift", error_results$Method), col = c("black", colors), lty = c(2, rep(1, nrow(error_results))), lwd = 2, cex = 0.8) legend("bottomright", c("Warning", "Drift"), lty = c(3, 1), col = "gray30", lwd = 2, bty = "n", cex = 0.8) ## ----online-error-rate-------------------------------------------------------- ddm <- DDM$new() drifts <- c() for (i in seq_along(error_stream)) { ddm$add_element(error_stream[i]) if (ddm$change_detected) { drifts <- c(drifts, i) ddm$reset() } } data.frame(Method = "DDM", True = true_drift_error, Detected = drifts) ## ----online-error-rate-detect-drift------------------------------------------- ddm_res <- detect_drift(error_stream, method = "ddm", include_warnings = FALSE) ddm_res ## ----continuous-setup--------------------------------------------------------- set.seed(456) n_normal <- 300 n_faulty <- 200 sensor_stream <- c( rnorm(n_normal, mean = 20, sd = 1), rnorm(n_faulty, mean = 28, sd = 2) ) true_drift_sensor <- n_normal + 1 ## ----dist-methods-comparison-------------------------------------------------- dist_methods <- c("kswin", "adwin", "page_hinkley") dist_results <- do.call(rbind, lapply(dist_methods, function(m) { res <- detect_drift(sensor_stream, method = m) data.frame( Method = gsub("_", "-", toupper(m)), Detected = if (nrow(res) > 0) res$index[1] else NA, Delay = if (nrow(res) > 0) res$index[1] - true_drift_sensor else NA, stringsAsFactors = FALSE ) })) dist_results ## ----dist-plot, fig.height=4, echo=FALSE-------------------------------------- plot(sensor_stream, type = "l", col = "gray50", xlab = "Time", ylab = "Temperature (°C)", main = "Distribution Method Comparison") abline(v = true_drift_sensor, col = "black", lty = 2, lwd = 2) colors <- c("#E63946", "#2A9D8F", "#9B5DE5") for (i in seq_along(dist_results$Detected)) { if (!is.na(dist_results$Detected[i])) abline(v = dist_results$Detected[i], col = colors[i], lwd = 2) } legend("topleft", c("True drift", dist_results$Method), col = c("black", colors), lty = c(2, rep(1, 3)), lwd = 2, cex = 0.8) ## ----kld-setup---------------------------------------------------------------- set.seed(789) n_ref <- 400 n_shift <- 400 latency_ms <- c( rlnorm(n_ref, meanlog = log(100), sdlog = 0.25), rlnorm(n_shift, meanlog = log(180), sdlog = 0.30) ) true_drift_kld <- n_ref + 1 ## ----kld-detect--------------------------------------------------------------- window <- 200 kld <- KLDivergence$new(bins = 30, drift_level = 0.15) kld$set_initial_distribution(latency_ms[1:window]) kl <- rep(NA_real_, length(latency_ms)) for (t in (window + 1):length(latency_ms)) { current <- latency_ms[(t - window + 1):t] kld$add_distribution(current) kl[t] <- kld$get_kl_result() } detected_kld <- which(kl > kld$drift_level)[1] data.frame(True = true_drift_kld, Detected = detected_kld, Threshold = kld$drift_level) ## ----kld-plot, fig.height=4, echo=FALSE--------------------------------------- plot(kl, type = "l", col = "gray50", lwd = 2, xlab = "Time", ylab = "KL divergence", main = "KL Divergence vs. Reference Window") abline(v = true_drift_kld, col = "black", lty = 2, lwd = 2) abline(h = kld$drift_level, col = "#9B5DE5", lty = 2, lwd = 2) if (!is.na(detected_kld)) abline(v = detected_kld, col = "#E63946", lwd = 2) legend("topright", c("True drift", "Detected", "Threshold"), col = c("black", "#E63946", "#9B5DE5"), lty = c(2, 1, 2), lwd = 2, cex = 0.8) ## ----elec2-setup, eval=requireNamespace("dynaTree", quietly = TRUE) && requireNamespace("ranger", quietly = TRUE)---- library(dynaTree) library(ranger) elec2_env <- new.env(parent = emptyenv()) data("elec2", package = "dynaTree", envir = elec2_env) elec2_df <- get("elec2", envir = elec2_env) stopifnot(is.data.frame(elec2_df)) names(elec2_df) <- c("nswprice", "nswdemand", "vicprice", "vicdemand", "class_raw") elec2_df$class <- factor(elec2_df$class_raw, levels = c(1, 2), labels = c("DOWN", "UP")) elec2_df$class_raw <- NULL split_idx <- floor(nrow(elec2_df) / 2) period1_data <- elec2_df[1:split_idx, ] period2_data <- elec2_df[(split_idx + 1):nrow(elec2_df), ] n_train <- min(2000, nrow(period1_data), nrow(period2_data)) period1_train <- period1_data[1:n_train, ] period2_train <- period2_data[1:n_train, ] rf1 <- ranger(class ~ nswprice + nswdemand + vicprice + vicdemand, data = period1_train, probability = TRUE, num.trees = 200, seed = 1) rf2 <- ranger(class ~ nswprice + nswdemand + vicprice + vicdemand, data = period2_train, probability = TRUE, num.trees = 200, seed = 1) compute_pdp_rf <- function(model, data, var, grid) { preds <- sapply(grid, function(val) { newdata <- data newdata[[var]] <- val mean(predict(model, newdata)$predictions[, "UP"]) }) list(x = grid, y = preds) } demand_grid <- seq(min(elec2_df$nswdemand), max(elec2_df$nswdemand), length.out = 50) pdp1 <- compute_pdp_rf(rf1, period1_train, "nswdemand", demand_grid) pdp2 <- compute_pdp_rf(rf2, period2_train, "nswdemand", demand_grid) ## ----elec2-plot, fig.height=4, eval=requireNamespace("dynaTree", quietly = TRUE) && requireNamespace("ranger", quietly = TRUE), echo=FALSE---- plot(pdp1$x, pdp1$y, type = "l", lwd = 2, col = "#2A9D8F", xlab = "NSW Demand", ylab = "P(Price UP)", main = "PDP Drift: NSW Demand Effect on Price", ylim = range(c(pdp1$y, pdp2$y))) lines(pdp2$x, pdp2$y, lwd = 2, col = "#E63946") legend("topright", c("Period 1", "Period 2"), col = c("#2A9D8F", "#E63946"), lwd = 2) ## ----elec2-methods, eval=requireNamespace("dynaTree", quietly = TRUE) && requireNamespace("ranger", quietly = TRUE)---- # PDI (Profile Disparity Index) pd_pdi <- ProfileDifference$new(method = "pdi", deriv = "gold") pd_pdi$set_profiles(pdp1, pdp2) res_pdi <- pd_pdi$calculate_difference() # L2 norm pd_l2 <- ProfileDifference$new(method = "L2") pd_l2$set_profiles(pdp1, pdp2) res_l2 <- pd_l2$calculate_difference() # L2 derivative pd_l2d <- ProfileDifference$new(method = "L2_derivative") pd_l2d$set_profiles(pdp1, pdp2) res_l2d <- pd_l2d$calculate_difference() data.frame( Method = c("PDI", "L2", "L2_derivative"), Distance = round(c(res_pdi$distance, res_l2$distance, res_l2d$distance), 4) )