## ----include=FALSE------------------------------------------------------------ library(stabm) ## ----------------------------------------------------------------------------- listStabilityMeasures() ## ----------------------------------------------------------------------------- feats = list(1:3, 1:4, c(1:3, 5:7)) stabilityJaccard(features = feats) stabilityNogueira(features = feats, p = 10) ## ----------------------------------------------------------------------------- mat = 0.92 ^ abs(outer(1:10, 1:10, "-")) set.seed(1) stabilityIntersectionCount(features = feats, sim.mat = mat, N = 100) ## ---- fig.width=4.5, fig.height=3, fig.align="center", message=FALSE---------- plotFeatures(feats) ## ----------------------------------------------------------------------------- library(rpart) # for classification trees data("BostonHousing2", package = "mlbench") # remove feature that is a version of the target variable dataset = subset(BostonHousing2, select = -cmedv) ## ----------------------------------------------------------------------------- fit_tree = function(target = "medv", data = dataset, ratio = 0.67, cp = 0.01) { n = nrow(data) i = sample(n, n * ratio) formula = as.formula(paste(target, "~ .")) model = rpart::rpart(formula = formula, data = data, subset = i, control = rpart.control(maxsurrogate = 0, cp = cp)) names(model$variable.importance) } set.seed(1) fit_tree() ## ----------------------------------------------------------------------------- set.seed(1) selected_features = replicate(30, fit_tree(), simplify = FALSE) ## ----------------------------------------------------------------------------- # Selected in each repetition: Reduce(intersect, selected_features) # Sorted selection frequency across all 30 repetitions: sort(table(unlist(selected_features)), decreasing = TRUE) ## ----------------------------------------------------------------------------- plotFeatures(selected_features) ## ----------------------------------------------------------------------------- stabilityJaccard(selected_features) ## ----------------------------------------------------------------------------- set.seed(1) selected_features2 = replicate(30, fit_tree(cp = 0.02), simplify = FALSE) stabilityJaccard(selected_features2) plotFeatures(selected_features2) ## ----------------------------------------------------------------------------- dataset2 = subset(BostonHousing2, select = -town) dataset2$chas = as.numeric(dataset2$chas) set.seed(1) selected_features3 = replicate(30, fit_tree(target = "rm", data = dataset2, cp = 0.075), simplify = FALSE) ## ----------------------------------------------------------------------------- # similarity matrix sim.mat = abs(cor(subset(dataset2, select = -rm))) sel.feats = unique(unlist(selected_features3)) sim.mat[sel.feats, sel.feats] ## ----------------------------------------------------------------------------- plotFeatures(selected_features3, sim.mat = sim.mat) ## ----------------------------------------------------------------------------- stabilityIntersectionCount(selected_features3, sim.mat = sim.mat, N = 100) ## ----------------------------------------------------------------------------- no.sim.mat = diag(nrow(sim.mat)) colnames(no.sim.mat) = row.names(no.sim.mat) = colnames(sim.mat) stabilityIntersectionCount(selected_features3, sim.mat = no.sim.mat) ## ----------------------------------------------------------------------------- set.seed(1) # select a subset of instances for visualization purposes inds = sample(nrow(dataset2), 50) dataset.cluster = dataset2[inds, ] # run k-means clustering with k = 3 30 times km = replicate(30, kmeans(dataset.cluster, centers = 3), simplify = FALSE) # change cluster names for comparability best = which.min(sapply(km, function(x) x$tot.withinss)) best.centers = km[[best]]$centers km.clusters = lapply(km, function(kmi) { dst = as.matrix(dist(rbind(best.centers, kmi$centers)))[4:6, 1:3] rownames(dst) = colnames(dst) = 1:3 # greedy choice of best matches of clusters new.cluster.names = numeric(3) while(nrow(dst) > 0) { min.dst = which.min(dst) row = (min.dst - 1) %% nrow(dst) + 1 row.o = as.numeric(rownames(dst)[row]) col = ceiling(min.dst / nrow(dst)) col.o = as.numeric(colnames(dst)[col]) new.cluster.names[row.o] = col.o dst = dst[-row, -col, drop = FALSE] } new.cluster.names[kmi$cluster] }) # for each cluster, create a list containing the instances # belonging to this cluster over the 30 repetitions clusters = lapply(1:3, function(i) { lapply(km.clusters, function(kmc) { which(kmc == i) }) }) ## ----------------------------------------------------------------------------- stab.cl = sapply(clusters, stabilityJaccard) stab.cl ## ----------------------------------------------------------------------------- w = sapply(clusters, function(cl) { mean(lengths(cl)) }) sum(stab.cl * w) / sum(w)