## ----message=FALSE, echo=FALSE------------------------------------------------
library("gbm3")

## -----------------------------------------------------------------------------
# Create some binomial response data - N observations.
# The success probability depends non-linearly on X1 and X2, and on the
# level of the factor X3 through the group means in mu.
N <- 1000
X1 <- runif(N)
X2 <- runif(N)
X3 <- factor(sample(letters[1:4], N, replace = TRUE))
mu <- c(-1, 0, 1, 2)[as.numeric(X3)]
p <- 1 / (1 + exp(-(sin(3 * X1) - 4 * X2 + mu)))
Y <- rbinom(N, 1, p) # response

# Normalized weights for each observation (they sum to N, mean weight 1)
w <- rexp(N)
w <- N * w / sum(w)

# Offset is set to 0
offset <- rep(0, N)

# Collate data
data <- data.frame(Y = Y, X1 = X1, X2 = X2, X3 = X3)

## -----------------------------------------------------------------------------
available_distributions()

## -----------------------------------------------------------------------------
# Create a default Bernoulli distribution object
bern_dist <- gbm_dist("Bernoulli")

## -----------------------------------------------------------------------------
# Creating an appropriate training parameters object
train_params <- training_params(
  num_trees = 2000,
  interaction_depth = 3,
  num_train = nrow(data),
  num_features = 3
)

## -----------------------------------------------------------------------------
# Example of a gbmParallel with more threads
par_detail <- gbmParallel(num_threads = 2) # Pass to par_details in gbmt

## ----message=FALSE------------------------------------------------------------
# Create a gbm fit
fit <- gbmt(
  Y ~ X1 + X2 + X3,
  distribution = bern_dist,
  data = data,
  weights = w,
  offset = offset,
  train_params = train_params,
  cv_folds = 5,
  keep_gbm_data = TRUE
)

## ----message=FALSE------------------------------------------------------------
# A default fit to gaussian data

# Create data
N <- 1000
X1 <- runif(N)
X2 <- 2 * runif(N)
X3 <- factor(sample(letters[1:4], N, replace = TRUE))
X4 <- ordered(sample(letters[1:6], N, replace = TRUE))
X5 <- factor(sample(letters[1:3], N, replace = TRUE))
X6 <- 3 * runif(N)
mu <- c(-1, 0, 1, 2)[as.numeric(X3)]

SNR <- 10 # signal-to-noise ratio
Y <- X1^1.5 + 2 * (X2^0.5) + mu
# Scale the noise so that var(signal) / var(noise) equals SNR
sigma <- sqrt(var(Y) / SNR)
Y <- Y + rnorm(N, 0, sigma)

# Create a bunch of missing values
X1[sample(seq_len(N), size = 100)] <- NA
X3[sample(seq_len(N), size = 300)] <- NA

# Gaussian data
gauss_data <- data.frame(Y = Y, X1 = X1, X2 = X2, X3 = X3,
                         X4 = X4, X5 = X5, X6 = X6)

# Perform a cross-validated fit
gauss_fit <- gbmt(Y ~ X1 + X2 + X3 + X4 + X5 + X6,
                  data = gauss_data, cv_folds = 5, keep_gbm_data = TRUE)

## ----message=FALSE------------------------------------------------------------
# Use gaussian fit and determine the performance
# red line is testing error
# green line is cv error
# blue line is OOB error
best_iter_test <- gbmt_performance(gauss_fit, method = "test")
best_iter_cv <- gbmt_performance(gauss_fit, method = "cv")
best_iter_oob <- gbmt_performance(gauss_fit, method = "OOB")

## ----message=FALSE------------------------------------------------------------
# Fit additional trees to gaussian fit and recalculate optimal number of
# iterations. This is a large number of extra trees!
gauss_fit_more <- gbm_more(gauss_fit, num_new_trees = 5000)
best_iter_test_more <- gbmt_performance(gauss_fit_more, method = "test")

## ----message=FALSE------------------------------------------------------------
# Given additional data - of the same shape as our gaussian example.
# NOTE(review): this reuses N, mu formula, and sigma from the earlier
# Gaussian simulation, so it mirrors the original data-generating process.
X1 <- runif(N)
X2 <- 2 * runif(N)
X3 <- factor(sample(letters[1:4], N, replace = TRUE))
X4 <- ordered(sample(letters[1:6], N, replace = TRUE))
X5 <- factor(sample(letters[1:3], N, replace = TRUE))
X6 <- 3 * runif(N)
mu <- c(-1, 0, 1, 2)[as.numeric(X3)]
Y <- X1^1.5 + 2 * (X2^0.5) + mu
Y <- Y + rnorm(N, 0, sigma)
data2 <- data.frame(Y = Y, X1 = X1, X2 = X2, X3 = X3,
                    X4 = X4, X5 = X5, X6 = X6)

# Then predictions can be made, using the optimal iteration count
# found after growing the extra trees
preds <- predict(gauss_fit_more, newdata = data2,
                 n.trees = best_iter_test_more)

## -----------------------------------------------------------------------------
# Relative influence of predictors - both examples
print(relative_influence(gauss_fit_more, best_iter_test_more))
print(relative_influence(fit, gbmt_performance(fit, method = "cv")))

## -----------------------------------------------------------------------------
# Examine the marginal effects of the first two variables - gaussian example
# Shows the fitted model predictions as a function of 1st two predictors
# with all the others integrated out
plot(gauss_fit_more, var_index = 1:2)