## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup, eval=FALSE--------------------------------------------------------
# library(starburst)

## ----data, eval=FALSE---------------------------------------------------------
# set.seed(2024)
# 
# # Generate features
# n_samples <- 10000
# n_features <- 20
# 
# X <- matrix(rnorm(n_samples * n_features), nrow = n_samples)
# colnames(X) <- paste0("feature_", seq_len(n_features))
# 
# # Generate target with non-linear relationship
# true_coef <- rnorm(n_features)
# linear_pred <- X %*% true_coef
# prob <- 1 / (1 + exp(-linear_pred))
# y <- rbinom(n_samples, 1, prob)
# 
# # Create train/test split (70% train, 30% test)
# train_idx <- sample(n_samples, 0.7 * n_samples)
# X_train <- X[train_idx, ]
# y_train <- y[train_idx]
# X_test <- X[-train_idx, ]
# y_test <- y[-train_idx]
# 
# cat(sprintf("Dataset created:\n"))
# cat(sprintf(" Training samples: %s\n", format(length(y_train), big.mark = ",")))
# cat(sprintf(" Test samples: %s\n", format(length(y_test), big.mark = ",")))
# cat(sprintf(" Features: %d\n", n_features))
# cat(sprintf(" Class balance: %.1f%% / %.1f%%\n",
#             mean(y_train) * 100, (1 - mean(y_train)) * 100))

## ----grid, eval=FALSE---------------------------------------------------------
# # Define parameter space
# param_grid <- expand.grid(
#   learning_rate = c(0.01, 0.05, 0.1),
#   max_depth = c(3, 5, 7),
#   subsample = c(0.6, 0.8, 1.0),
#   min_child_weight = c(1, 3, 5),
#   stringsAsFactors = FALSE
# )
# 
# cat(sprintf("Grid search space:\n"))
# cat(sprintf(" Total parameter combinations: %d\n", nrow(param_grid)))
# cat(sprintf(" With 5-fold CV: %d model fits\n\n", nrow(param_grid) * 5))

## ----train-fn, eval=FALSE-----------------------------------------------------
# # Simple gradient boosting implementation (for demonstration)
# # In practice, use xgboost, lightgbm, or other optimized libraries
# train_gbm <- function(X, y, params, n_trees = 50) {
#   # Simplified GBM simulation
#   # This is a mock implementation - in real use, call xgboost, etc.
# 
#   n <- nrow(X)
#   pred <- rep(mean(y), n)  # Initial prediction
# 
#   # Simulate training time based on complexity
#   complexity_factor <- params$max_depth * (1 / params$learning_rate) *
#     (1 / params$subsample)
#   training_time <- 0.001 * complexity_factor * n_trees
# 
#   Sys.sleep(min(training_time, 5))  # Cap at 5 seconds
# 
#   # Generate mock predictions with some realism
#   pred <- pred + rnorm(n, 0, 0.1)
#   pred <- pmin(pmax(pred, 0), 1)  # Bound to [0, 1]
# 
#   list(predictions = pred, params = params)
# }
# 
# # Cross-validation function: run n_folds-fold CV for one parameter row
# # and return the per-fold and aggregate (mock) accuracy scores.
# cv_evaluate <- function(param_row, X_data, y_data, n_folds = 5) {
#   params <- as.list(param_row)
# 
#   # Create folds: assign every row to one of n_folds folds, shuffled
#   n <- nrow(X_data)
#   fold_indices <- sample(rep(seq_len(n_folds), length.out = n))
# 
#   # Perform cross-validation
#   cv_scores <- numeric(n_folds)
# 
#   for (fold in seq_len(n_folds)) {
#     # Split data
#     val_idx <- which(fold_indices == fold)
#     train_idx <- which(fold_indices != fold)
# 
#     X_fold_train <- X_data[train_idx, , drop = FALSE]
#     y_fold_train <- y_data[train_idx]
#     X_fold_val <- X_data[val_idx, , drop = FALSE]
#     y_fold_val <- y_data[val_idx]
# 
#     # Train model
#     model <- train_gbm(X_fold_train, y_fold_train, params)
# 
#     # Predict and evaluate (mock evaluation)
#     # In practice, compute actual predictions and metrics
#     baseline_accuracy <- mean(y_fold_val == round(mean(y_fold_train)))
# 
#     # Simulate performance improvement based on good parameters
#     param_quality <- (params$learning_rate >= 0.05) * 0.02 +
#       (params$max_depth >= 5) * 0.02 +
#       (params$subsample >= 0.8) * 0.01 +
#       rnorm(1, 0, 0.02)
# 
#     accuracy <- min(baseline_accuracy + param_quality, 1.0)
#     cv_scores[fold] <- accuracy
#   }
# 
#   # Return results
#   list(
#     params = params,
#     mean_cv_score = mean(cv_scores),
#     std_cv_score = sd(cv_scores),
#     cv_scores = cv_scores
#   )
# }

## ----local, eval=FALSE--------------------------------------------------------
# # Test with 10 parameter combinations
# set.seed(999)
# sample_params <- param_grid[sample(nrow(param_grid), 10), ]
# 
# cat(sprintf("Running local benchmark (%d parameter combinations)...\n",
#             nrow(sample_params)))
# local_start <- Sys.time()
# 
# local_results <- lapply(seq_len(nrow(sample_params)), function(i) {
#   cv_evaluate(sample_params[i, ], X_train, y_train, n_folds = 5)
# })
# 
# local_time <- as.numeric(difftime(Sys.time(), local_start, units = "mins"))
# 
# cat(sprintf("✓ Completed in %.2f minutes\n", local_time))
# cat(sprintf(" Average time per combination: %.1f seconds\n",
#             local_time * 60 / nrow(sample_params)))
# cat(sprintf(" Estimated time for full grid (%d combinations): %.1f minutes\n",
#             nrow(param_grid), local_time * nrow(param_grid) / nrow(sample_params)))

## ----cloud, eval=FALSE--------------------------------------------------------
# n_workers <- 27  # Process ~3 parameter combinations per worker
# 
# cat(sprintf("Running grid search (%d combinations) on %d workers...\n",
#             nrow(param_grid), n_workers))
# 
# cloud_start <- Sys.time()
# 
# results <- starburst_map(
#   seq_len(nrow(param_grid)),
#   function(i) cv_evaluate(param_grid[i, ], X_train, y_train, n_folds = 5),
#   workers = n_workers,
#   cpu = 2,
#   memory = "4GB"
# )
# 
# cloud_time <- as.numeric(difftime(Sys.time(), cloud_start, units = "mins"))
# 
# cat(sprintf("\n✓ Completed in %.2f minutes\n", cloud_time))

## ----analysis, eval=FALSE-----------------------------------------------------
# # Extract results (vapply: type-stable scalar extraction)
# cv_scores <- vapply(results, function(x) x$mean_cv_score, numeric(1))
# cv_stds <- vapply(results, function(x) x$std_cv_score, numeric(1))
# 
# # Combine with parameters
# results_df <- cbind(param_grid,
#                     mean_score = cv_scores,
#                     std_score = cv_stds)
# 
# # Sort by performance (best first)
# results_df <- results_df[order(-results_df$mean_score), ]
# 
# cat("\n=== Grid Search Results ===\n\n")
# cat(sprintf("Total combinations evaluated: %d\n", nrow(results_df)))
# cat(sprintf("Best CV score: %.4f (± %.4f)\n",
#             results_df$mean_score[1], results_df$std_score[1]))
# 
# cat("\n=== Best Hyperparameters ===\n")
# cat(sprintf(" Learning rate: %.3f\n", results_df$learning_rate[1]))
# cat(sprintf(" Max depth: %d\n", results_df$max_depth[1]))
# cat(sprintf(" Subsample: %.2f\n", results_df$subsample[1]))
# cat(sprintf(" Min child weight: %d\n", results_df$min_child_weight[1]))
# 
# cat("\n=== Top 5 Parameter Combinations ===\n")
# # Guard against fewer than 5 evaluated combinations
# for (i in seq_len(min(5, nrow(results_df)))) {
#   cat(sprintf("\n%d. Score: %.4f (± %.4f)\n", i,
#               results_df$mean_score[i], results_df$std_score[i]))
#   cat(sprintf(" lr=%.3f, depth=%d, subsample=%.2f, min_child=%d\n",
#               results_df$learning_rate[i],
#               results_df$max_depth[i],
#               results_df$subsample[i],
#               results_df$min_child_weight[i]))
# }
# 
# # Parameter importance analysis
# cat("\n=== Parameter Impact Analysis ===\n")
# for (param in c("learning_rate", "max_depth", "subsample", "min_child_weight")) {
#   # Build the grouping formula programmatically; reformulate() avoids
#   # get() inside a formula and keeps the grouping column's real name
#   param_means <- aggregate(reformulate(param, response = "mean_score"),
#                            data = results_df, FUN = mean)
#   names(param_means)[1] <- param
# 
#   cat(sprintf("\n%s:\n", param))
#   for (i in seq_len(nrow(param_means))) {
#     cat(sprintf(" %s: %.4f\n",
#                 param_means[i, 1],
#                 param_means[i, 2]))
#   }
# }
# 
# # Visualize results (if in interactive session)
# if (interactive()) {
#   # Score distribution
#   hist(results_df$mean_score,
#        breaks = 20,
#        main = "Distribution of Cross-Validation Scores",
#        xlab = "Mean CV Score",
#        col = "lightblue",
#        border = "white")
#   abline(v = results_df$mean_score[1], col = "red", lwd = 2, lty = 2)
# 
#   # Learning rate effect
#   boxplot(mean_score ~ learning_rate, data = results_df,
#           main = "Learning Rate Impact",
#           xlab = "Learning Rate",
#           ylab = "CV Score",
#           col = "lightgreen")
# }

## ----random-search, eval=FALSE------------------------------------------------
# # Generate random parameter combinations
# n_random <- 100
# 
# random_params <- data.frame(
#   learning_rate = runif(n_random, 0.001, 0.3),
#   max_depth = sample(2:10, n_random, replace = TRUE),
#   subsample = runif(n_random, 0.5, 1.0),
#   min_child_weight = sample(1:10, n_random, replace = TRUE),
#   stringsAsFactors = FALSE
# )
# 
# cat(sprintf("Running random search (%d combinations)...\n", n_random))
# 
# random_results <- starburst_map(
#   seq_len(nrow(random_params)),
#   function(i) cv_evaluate(random_params[i, ], X_train, y_train, n_folds = 5),
#   workers = 33,
#   cpu = 2,
#   memory = "4GB"
# )
# 
# # Find best parameters
# random_scores <- vapply(random_results, function(x) x$mean_cv_score, numeric(1))
# best_idx <- which.max(random_scores)
# 
# cat("\nBest random search result:\n")
# cat(sprintf(" Score: %.4f\n", random_scores[best_idx]))
# cat(sprintf(" Learning rate: %.4f\n", random_params$learning_rate[best_idx]))
# cat(sprintf(" Max depth: %d\n", random_params$max_depth[best_idx]))

## ----bayesian, eval=FALSE-----------------------------------------------------
# # Bayesian optimization would involve:
# # 1. Evaluate a small initial set (e.g., 10 combinations)
# # 2. Fit a Gaussian process to predict performance
# # 3. Use acquisition function to select next promising points
# # 4. Evaluate new points in parallel
# # 5. Repeat until convergence
# 
# # This requires specialized packages like mlrMBO or rBayesianOptimization
# # but can be parallelized with starburst for the evaluation step

## ----eval=FALSE---------------------------------------------------------------
# system.file("examples/grid-search.R", package = "starburst")

## ----eval=FALSE---------------------------------------------------------------
# source(system.file("examples/grid-search.R", package = "starburst"))