## ---- include = FALSE--------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup-------------------------------------------------------------------- library(etree) ## ----------------------------------------------------------------------------- # Set seed set.seed(123) # Number of observation n_obs <- 100 # Response variable for regression resp_reg <- c(rnorm(n_obs / 4, mean = 0, sd = 1), rnorm(n_obs / 4, mean = 2, sd = 1), rnorm(n_obs / 4, mean = 4, sd = 1), rnorm(n_obs / 4, mean = 6, sd = 1)) ## ----------------------------------------------------------------------------- # Response variable for classification resp_cls <- factor(c(sample(x = 1:4, size = n_obs / 4, replace = TRUE, prob = c(0.85, 0.05, 0.05, 0.05)), sample(x = 1:4, size = n_obs / 4, replace = TRUE, prob = c(0.05, 0.85, 0.05, 0.05)), sample(x = 1:4, size = n_obs / 4, replace = TRUE, prob = c(0.05, 0.05, 0.85, 0.05)), sample(x = 1:4, size = n_obs / 4, replace = TRUE, prob = c(0.05, 0.05, 0.05, 0.85)))) ## ----------------------------------------------------------------------------- # Numeric x1 <- c(runif(n_obs / 4, min = 0, max = 0.55), runif(n_obs / 4 * 3, min = 0.45, max = 1)) # Nominal x2 <- factor(c(rbinom(n_obs / 4, 1, 0.1), rbinom(n_obs / 4, 1, 0.9), rbinom(n_obs / 2, 1, 0.1))) # Functions x3 <- c(fda.usc::rproc2fdata(n_obs / 2, seq(0, 1, len = 100), sigma = 1), fda.usc::rproc2fdata(n_obs / 4, seq(0, 1, len = 100), sigma = 1, mu = rep(1, 100)), fda.usc::rproc2fdata(n_obs / 4, seq(0, 1, len = 100), sigma = 1)) # Graphs x4 <- c(lapply(1:(n_obs / 4 * 3), function(j) igraph::sample_gnp(100, 0.1)), lapply(1:(n_obs / 4), function(j) igraph::sample_gnp(100, 0.9))) # Covariates list cov_list <- list(X1 = x1, X2 = x2, X3 = x3, X4 = x4) ## ----------------------------------------------------------------------------- # Quick fit etree_fit <- etree(response = resp_reg, covariates = cov_list) ## ---- fig.dim = c(7.5, 6)----------------------------------------------------- # Plot plot(etree_fit) ## ---- fig.dim = c(9, 6)------------------------------------------------------- # Fit with smaller minbucket etree_fit1 <- etree(response = resp_reg, covariates = cov_list, minbucket = 1) # Plot plot(etree_fit1) ## ---- fig.dim = c(12, 7)------------------------------------------------------ # Fit with larger alpha etree_fit2 <- etree(response = resp_reg, covariates = cov_list, alpha = 1) # Plot plot(etree_fit2) ## ---- fig.dim = c(7.5, 5)----------------------------------------------------- # Fit with 'cluster' etree_fit3 <- etree(response = resp_reg, covariates = cov_list, split_type = 'cluster') # Plot plot(etree_fit3) ## ----------------------------------------------------------------------------- # Print the fitted tree print(etree_fit) ## ----------------------------------------------------------------------------- # Print a subtree print(etree_fit[5]) ## ----------------------------------------------------------------------------- # Equivalent method to print a subtree print(etree_fit[[5]]) ## ---- fig.dim = c(7, 4)------------------------------------------------------- # Plot a subtree plot(etree_fit[5]) ## ----------------------------------------------------------------------------- # Get all nodes' ids nodeids(etree_fit) ## ----------------------------------------------------------------------------- # Get all terminal nodes' ids nodeids(etree_fit, terminal = TRUE) ## ----------------------------------------------------------------------------- # Get nodes' ids of a subtree nodeids(etree_fit, from = 5) ## ----------------------------------------------------------------------------- # Select inner nodes' ids tnodes <- nodeids(etree_fit, terminal = TRUE) nodes <- 1:max(tnodes) inodes <- nodes[-tnodes] # Retrieve pvalues nodeapply(etree_fit, ids = inodes, FUN = function (n) n$info$pvalue) ## ----------------------------------------------------------------------------- # Total number of nodes length(etree_fit) # Depth depth(etree_fit) # Width width(etree_fit) ## ----------------------------------------------------------------------------- # Predictions from the fitted object pred <- predict(etree_fit) print(pred) ## ----------------------------------------------------------------------------- # MSE between the response and its average mean((resp_reg - mean(resp_reg)) ^ 2) # MSE between the response and predictions from the fitted tree mean((resp_reg - pred) ^ 2) ## ----------------------------------------------------------------------------- # New set of covariates n_obs <- 40 x1n <- c(runif(n_obs / 4, min = 0, max = 0.55), runif(n_obs / 4 * 3, min = 0.45, max = 1)) x2n <- factor(c(rbinom(n_obs / 4, 1, 0.1), rbinom(n_obs / 4, 1, 0.9), rbinom(n_obs / 2, 1, 0.1))) x3n <- c(fda.usc::rproc2fdata(n_obs / 2, seq(0, 1, len = 100), sigma = 1), fda.usc::rproc2fdata(n_obs / 4, seq(0, 1, len = 100), sigma = 1, mu = rep(1, 100)), fda.usc::rproc2fdata(n_obs / 4, seq(0, 1, len = 100), sigma = 1)) x4n <- c(lapply(1:(n_obs / 4 * 3), function(j) igraph::sample_gnp(100, 0.1)), lapply(1:(n_obs / 4), function(j) igraph::sample_gnp(100, 0.9))) new_cov_list <- list(X1 = x1n, X2 = x2n, X3 = x3n, X4 = x4n) # New response new_resp_reg <- c(rnorm(n_obs / 4, mean = 0, sd = 1), rnorm(n_obs / 4, mean = 2, sd = 1), rnorm(n_obs / 4, mean = 4, sd = 1), rnorm(n_obs / 4, mean = 6, sd = 1)) # Predictions new_pred <- predict(etree_fit, newdata = new_cov_list) print(new_pred) ## ----------------------------------------------------------------------------- # MSE between the new response and its average mean((new_resp_reg - mean(new_resp_reg)) ^ 2) # MSE between the new response and predictions with the new set of covariates mean((new_resp_reg - new_pred) ^ 2) ## ---- fig.dim = c(7, 6)------------------------------------------------------- # Quick fit etree_fit <- etree(response = resp_cls, covariates = cov_list) # Plot plot(etree_fit) ## ----------------------------------------------------------------------------- # Predictions from the fitted object pred <- predict(etree_fit) print(pred) ## ----------------------------------------------------------------------------- # Confusion matrix between the response and predictions from the fitted tree table(pred, resp_cls, dnn = c('Predicted', 'True')) # Misclassification error for predictions from the fitted tree sum(pred != resp_cls) / length(resp_cls) ## ----------------------------------------------------------------------------- # New response new_resp_cls <- factor(c(sample(x = 1:4, size = n_obs / 4, replace = TRUE, prob = c(0.85, 0.05, 0.05, 0.05)), sample(x = 1:4, size = n_obs / 4, replace = TRUE, prob = c(0.05, 0.85, 0.05, 0.05)), sample(x = 1:4, size = n_obs / 4, replace = TRUE, prob = c(0.05, 0.05, 0.85, 0.05)), sample(x = 1:4, size = n_obs / 4, replace = TRUE, prob = c(0.05, 0.05, 0.05, 0.85)))) # Predictions new_pred <- predict(etree_fit, newdata = new_cov_list) print(new_pred) # Confusion matrix between the new response and predictions from the fitted tree table(new_pred, new_resp_cls, dnn = c('Predicted', 'True')) # Misclassification error for predictions on the new set of covariates sum(new_pred != new_resp_cls) / length(new_resp_cls)