# Import the TIP library
library(tip)
# Import the US Arrests dataset
data(USArrests)
# Convert the data to a matrix
X <- data.matrix(USArrests)
# Compute the distance matrix
distance_matrix <- data.matrix(dist(X))
# Compute the temperature parameter estiamte
temperature <- 1/median(distance_matrix[upper.tri(distance_matrix)])
# For each subject, compute the point estimate for the number of similar
# subjects using univariate multiple change point detection (i.e.)
init_num_neighbors = get_cpt_neighbors(.distance_matrix = distance_matrix)
#> Warning in BINSEG(sumstat, pen = pen.value, cost_func = costfunc, minseglen
#> = minseglen, : The number of changepoints identified is Q, it is advised to
#> increase Q to make sure changepoints have not been missed.
# Set the number of burn-in iterations in the Gibbs samlper
# RECOMENDATION: *** burn >= 1000 ***
burn <- 10
# Set the number of sampling iterations in the Gibbs sampler
# RECOMENDATION: *** samples >= 1000 ***
samples <- 10
# Extract the state names
names_subjects <- rownames(USArrests)
# Run TIP clustering using only the prior
# --> That is, the likelihood function is constant
tip1 <- tip(.data = data.matrix(X),
.burn = burn,
.samples = samples,
.similarity_matrix = exp(-1.0*temperature*distance_matrix),
.init_num_neighbors = init_num_neighbors,
.likelihood_model = "NIW",
.subject_names = names_subjects,
.num_cores = 1)
#> Bayesian Clustering: Table Invitation Prior Gibbs Sampler
#> burn-in: 10
#> samples: 10
#> Likelihood Model: NIW
#>
|
| | 0%
|
|==== | 6%
|
|======== | 11%
|
|============ | 17%
|
|================ | 22%
|
|=================== | 28%
|
|======================= | 33%
|
|=========================== | 39%
|
|=============================== | 44%
|
|=================================== | 50%
|
|======================================= | 56%
|
|=========================================== | 61%
|
|=============================================== | 67%
|
|=================================================== | 72%
|
|====================================================== | 78%
|
|========================================================== | 83%
|
|============================================================== | 89%
|
|================================================================== | 94%
|
|======================================================================| 100%# Produce plots for the Bayesian Clustering Model
tip_plots <- plot(tip1)# View the posterior distribution of the number of clusters
tip_plots$trace_plot_posterior_number_of_clusters# View the trace plot with respect to the posterior number of clusters
tip_plots$trace_plot_posterior_number_of_clusters# Extract posterior cluster assignments using the Posterior Expected Adjusted Rand (PEAR) index
cluster_assignments <- mcclust::maxpear(psm = tip1@posterior_similarity_matrix)$cl
# Create a list where each element stores the cluster assignments
cluster_assignment_list <- list()
for(k in 1:length(unique(cluster_assignments))){
cluster_assignment_list[[k]] <- names_subjects[which(cluster_assignments == k)]
}
cluster_assignment_list
#> [[1]]
#> [1] "Alabama" "Alaska" "California" "Colorado" "Illinois"
#> [6] "Louisiana" "Michigan" "Mississippi" "Missouri" "Nevada"
#> [11] "New Jersey" "Oregon" "Texas" "Washington"
#>
#> [[2]]
#> [1] "Arizona" "Connecticut" "Delaware" "Hawaii"
#> [5] "Maryland" "New Mexico" "North Carolina" "Ohio"
#> [9] "Pennsylvania" "South Carolina" "Tennessee"
#>
#> [[3]]
#> [1] "Arkansas" "Florida" "Idaho" "Iowa"
#> [5] "Kentucky" "Maine" "Montana" "New Hampshire"
#> [9] "New York" "North Dakota" "Rhode Island" "South Dakota"
#> [13] "Utah" "Vermont" "West Virginia" "Wisconsin"
#> [17] "Wyoming"
#>
#> [[4]]
#> [1] "Georgia" "Massachusetts"
#>
#> [[5]]
#> [1] "Indiana" "Virginia"
#>
#> [[6]]
#> [1] "Kansas" "Nebraska"
#>
#> [[7]]
#> [1] "Minnesota" "Oklahoma"# Create the one component graph with minimum entropy
partition_list <- partition_undirected_graph(.graph_matrix = tip1@posterior_similarity_matrix,
.num_components = 1,
.step_size = 0.001)# View the state names
# names_subjects
# Create a vector of true region labels to see if there is a pattern
true_region <- c("Southeast", "West", "Southwest", "Southeast", "West", "West",
"Northeast", "Northeast", "Southeast", "Southeast", "West", "West",
"Midwest", "Midwest", "Midwest", "Midwest", "Southeast", "Southeast",
"Northeast", "Northeast", "Northeast", "Midwest", "Midwest", "Southeast",
"Midwest", "West", "Midwest", "West", "Northeast", "Northeast",
"Southwest", "Northeast", "Southeast", "Midwest", "Midwest", "Southwest",
"West", "Northeast", "Northeast", "Southeast", "Midwest", "Southeast",
"Southwest", "West", "Northeast", "Southeast", "West", "Southeast",
"Midwest", "West")
names_subjects
#> [1] "Alabama" "Alaska" "Arizona" "Arkansas"
#> [5] "California" "Colorado" "Connecticut" "Delaware"
#> [9] "Florida" "Georgia" "Hawaii" "Idaho"
#> [13] "Illinois" "Indiana" "Iowa" "Kansas"
#> [17] "Kentucky" "Louisiana" "Maine" "Maryland"
#> [21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
#> [25] "Missouri" "Montana" "Nebraska" "Nevada"
#> [29] "New Hampshire" "New Jersey" "New Mexico" "New York"
#> [33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
#> [37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
#> [41] "South Dakota" "Tennessee" "Texas" "Utah"
#> [45] "Vermont" "Virginia" "Washington" "West Virginia"
#> [49] "Wisconsin" "Wyoming"
# Associate class labels and colors for the plot
class_palette_colors <- c("Northeast" = "blue",
"Southeast" = "red",
"Midwest" = "purple",
"Southwest" = "orange",
"West" = "green")
# Associate class labels and shapes for the plot
class_palette_shapes <- c("Northeast" = 15,
"Southeast" = 16,
"Midwest" = 17,
"Southwest" = 18,
"West" = 19)
# Visualize the posterior similarity matrix by constructing a graph plot of
# the one-cluster graph. The true labels are used here (below they are not).
ggnet2_network_plot(.matrix_graph = partition_list$partitioned_graph_matrix,
.subject_names = names_subjects,
.subject_class_names = true_region,
.class_colors = class_palette_colors,
.class_shapes = class_palette_shapes,
.node_size = 2,
.add_node_labels = TRUE)
#> Warning: Duplicated override.aes is ignored.# Visualize the posterior similarity matrix by constructing a graph plot of
# the one-cluster graph. The true labels are used here (below they are not).
# Remove the subject names with .add_node_labels = FALSE
ggnet2_network_plot(.matrix_graph = partition_list$partitioned_graph_matrix,
.subject_names = names_subjects,
.subject_class_names = true_region,
.class_colors = class_palette_colors,
.class_shapes = class_palette_shapes,
.node_size = 2,
.add_node_labels = FALSE)
#> Warning: Duplicated override.aes is ignored.# Construct a network plot without class labels
# Note: Subject labels may be suppressed using .add_node_labels = FALSE.
ggnet2_network_plot(.matrix_graph = partition_list$partitioned_graph_matrix,
.subject_names = names_subjects,
.node_size = 2,
.add_node_labels = TRUE)