## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)

# Seed the RNG so the draws below (e.g. the rnorm() batch shifts) repeat
# across runs.
set.seed(1)

## ----dataGen------------------------------------------------------------------
library(ggplot2)
library(batchmix)

# Data dimensions. K sizes the per-group vectors and B the per-batch vectors
# below; N and P are passed straight to generateBatchData()
# (presumably the number of items and of measurements per item -- confirm
# against the batchmix documentation).
N <- 600
P <- 4
K <- 5
B <- 7

# Generating model parameters
mean_dist <- 2.25
batch_dist <- 0.3

# Group means are evenly spaced multiples of mean_dist.
group_means <- seq_len(K) * mean_dist

# One random shift per batch, centred on batch_dist.
batch_shift <- rnorm(B, mean = batch_dist, sd = batch_dist)

std_dev <- rep(2, K)
batch_var <- rep(1.2, B)

# Equal weights across groups and across batches.
group_weights <- rep(1 / K, K)
batch_weights <- rep(1 / B, B)

# One degrees-of-freedom value per group, passed as `group_dfs`.
dfs <- c(4, 7, 15, 60, 120)

my_data <- generateBatchData(
  N,
  P,
  group_means,
  std_dev,
  batch_shift,
  batch_var,
  group_weights,
  batch_weights,
  type = "MVT",
  group_dfs = dfs
)

## ----dataClean----------------------------------------------------------------
# Pull out the pieces of the generated data used by the sampler and the
# later evaluation.
X <- my_data$observed_data
true_labels <- my_data$group_IDs
fixed <- my_data$fixed
batch_vec <- my_data$batch_IDs

alpha <- 1
initial_labels <- generateInitialLabels(alpha, K, fixed, true_labels)

## ----runMCMCChains------------------------------------------------------------
# Sampling parameters
R <- 1000
thin <- 50
n_chains <- 4

# Density choice
type <- "MVT"

# MCMC samples and BIC vector
mcmc_output <- runMCMCChains(
  X,
  n_chains,
  R,
  thin,
  batch_vec,
  type,
  initial_labels = initial_labels,
  fixed = fixed
)

## ----plotAcceptanceRatesEarly-------------------------------------------------
plotAcceptanceRates(mcmc_output)

## ----likelihood---------------------------------------------------------------
plotLikelihoods(mcmc_output)

## ----continueChains-----------------------------------------------------------
R_new <- 9000

# Extend the existing chains, starting from the state held in mcmc_output and
# keeping the samples already drawn.
new_output <- continueChains(
  mcmc_output,
  X,
  fixed,
  batch_vec,
  R_new,
  keep_old_samples = TRUE
)

## ----continuedLikelihood------------------------------------------------------
plotLikelihoods(new_output)

## ----plotAcceptanceRates------------------------------------------------------
plotAcceptanceRates(new_output)

## ----processChains------------------------------------------------------------
# Burn in
burn <- 5000

# Process the MCMC samples
processed_samples <- processMCMCChains(new_output, burn)

## ----pca----------------------------------------------------------------------
# Only the first processed chain is inspected below.
chain_used <- processed_samples[[1]]

# PCA of the observed data (scaled) and of the dataset inferred by the chain.
# NOTE(review): the inferred dataset is not scaled while X is -- confirm this
# asymmetry is intended.
pc <- prcomp(X, scale. = TRUE)
pc_batch_corrected <- prcomp(chain_used$inferred_dataset)

# First two principal components of each PCA, alongside the labels, the
# chain's `prob` values and the batch of every item.
plot_df <- data.frame(
  PC1 = pc$x[, 1],
  PC2 = pc$x[, 2],
  PC1_bf = pc_batch_corrected$x[, 1],
  PC2_bf = pc_batch_corrected$x[, 2],
  pred_labels = factor(chain_used$pred),
  true_labels = factor(true_labels),
  prob = chain_used$prob,
  batch = factor(batch_vec)
)

# Observed data, coloured by the true labels.
plot_df |>
  ggplot(aes(
    x = PC1,
    y = PC2,
    colour = true_labels,
    alpha = prob
  )) +
  geom_point()

# Inferred dataset, coloured by the predicted labels.
plot_df |>
  ggplot(aes(
    x = PC1_bf,
    y = PC2_bf,
    colour = pred_labels,
    alpha = prob
  )) +
  geom_point()

# Proportion of items with fixed == 0 (presumably those whose labels were not
# observed by the sampler -- confirm) whose predicted label matches the truth.
test_inds <- which(fixed == 0)
mean(true_labels[test_inds] == chain_used$pred[test_inds])