## ----knitrOpts---------------------------------------------------------------- library(knitr) suggested_packages <- c("geosphere", "nycflights13", "dplyr", "ggplot2", "microbenchmark") opts_chunk$set(eval = all(vapply(suggested_packages, requireNamespace, quietly = TRUE, FUN.VALUE = FALSE))) ## ----loadPackages------------------------------------------------------------- tryCatch({ library(geosphere) library(nycflights13) library(dplyr, warn.conflicts = FALSE) library(ggplot2) library(microbenchmark) library(data.table, warn.conflicts = FALSE) library(magrittr) library(hutils, warn.conflicts = FALSE) }, # requireNamespace does not detect errors like # package ‘dplyr’ was installed by an R version with different internals; it needs to be reinstalled for use with this R version error = function(e) { opts_chunk$set(eval = FALSE) }) options(digits = 4) ## ----aliases------------------------------------------------------------------ OR(OR(TRUE, stop("Never happens")), ## short-circuits AND(FALSE, stop("Never happens"))) ## ----compare_if_else---------------------------------------------------------- my_check <- function(values) { all(vapply(values[-1], function(x) identical(values[[1]], x), logical(1))) } set.seed(2) cnd <- sample(c(TRUE, FALSE, NA), size = 100e3, replace = TRUE) yes <- sample(letters, size = 100e3, replace = TRUE) no <- sample(letters, size = 100e3, replace = TRUE) na <- sample(letters, size = 100e3, replace = TRUE) microbenchmark(dplyr = dplyr::if_else(cnd, yes, no, na), hutils = hutils::if_else(cnd, yes, no, na), check = my_check) %>% print cnd <- sample(c(TRUE, FALSE, NA), size = 100e3, replace = TRUE) yes <- sample(letters, size = 1, replace = TRUE) no <- sample(letters, size = 100e3, replace = TRUE) na <- sample(letters, size = 1, replace = TRUE) microbenchmark(dplyr = dplyr::if_else(cnd, yes, no, na), hutils = hutils::if_else(cnd, yes, no, na), check = my_check) %>% print ## ----compare_coalesce--------------------------------------------------------- x <- sample(c(letters, NA), size = 100e3, replace = TRUE) A <- sample(c(letters, NA), size = 100e3, replace = TRUE) B <- sample(c(letters, NA), size = 100e3, replace = TRUE) C <- sample(c(letters, NA), size = 100e3, replace = TRUE) microbenchmark(dplyr = dplyr::coalesce(x, A, B, C), hutils = hutils::coalesce(x, A, B, C), check = my_check) %>% print ## ----compare_coalesce_short_circuit_x----------------------------------------- x <- sample(c(letters), size = 100e3, replace = TRUE) microbenchmark(dplyr = dplyr::coalesce(x, A, B, C), hutils = hutils::coalesce(x, A, B, C), check = my_check) %>% print ## ----compare_coalesce_short_circuit_A----------------------------------------- x <- sample(c(letters, NA), size = 100e3, replace = TRUE) A <- sample(c(letters), size = 100e3, replace = TRUE) microbenchmark(dplyr = dplyr::coalesce(x, A, B, C), hutils = hutils::coalesce(x, A, B, C), check = my_check) %>% print ## ----canonical_drop_DT-------------------------------------------------------- DT <- data.table(A = 1:5, B = 1:5, C = 1:5) DT[, A := NULL] ## ----drop_col_hutils---------------------------------------------------------- DT <- data.table(A = 1:5, B = 1:5, C = 1:5) DT %>% drop_col("A") %>% drop_col("B") # or DT <- data.table(A = 1:5, B = 1:5, C = 1:5) DT %>% drop_cols(c("A", "B")) ## ----drop_colr---------------------------------------------------------------- flights <- as.data.table(flights) flights %>% drop_colr("time") %>% drop_colr("arr(?!_delay)", perl = TRUE) ## ----drop_constant_cols------------------------------------------------------- flights %>% .[origin == "JFK"] %>% drop_constant_cols ## ----drop_empty_cols---------------------------------------------------------- planes %>% as.data.table %>% .[!complete.cases(.)] planes %>% as.data.table %>% .[!complete.cases(.)] %>% # drops speed drop_empty_cols ## ----duplicated_rows---------------------------------------------------------- flights %>% # only the 'second' of the duplicates is returned .[duplicated(., by = c("origin", "dest"))] flights %>% # Both rows are returned and (by default) # duplicates are presented adjacently duplicated_rows(by = c("origin", "dest")) ## ----haversine_distance------------------------------------------------------- DT1 <- data.table(lat_orig = runif(1e5, -80, 80), lon_orig = runif(1e5, -179, 179), lat_dest = runif(1e5, -80, 80), lon_dest = runif(1e5, -179, 179)) DT2 <- copy(DT1) microbenchmark(DT1[, distance := haversine_distance(lat_orig, lon_orig, lat_dest, lon_dest)], DT2[, distance := distHaversine(cbind(lon_orig, lat_orig), cbind(lon_orig, lat_orig))]) rm(DT1, DT2) ## ----mutate-other, results='asis'--------------------------------------------- set.seed(1) DT <- data.table(Fruit = sample(c("apple", "pear", "orange", "tomato", "eggplant"), size = 20, prob = c(0.45, 0.25, 0.15, 0.1, 0.05), replace = TRUE), Price = rpois(20, 10)) kable(mutate_other(DT, "Fruit", n = 3)[]) ## ----iris-veriscolor---------------------------------------------------------- iris <- as.data.table(iris) iris[Species %in% c("setosa", "versicolour")] %$% mean(Sepal.Length / Sepal.Width) ## ----iris-versicolor, error=TRUE---------------------------------------------- iris <- as.data.table(iris) iris[Species %ein% c("setosa", "versicolour")] %$% mean(Sepal.Length / Sepal.Width) ## ----pin---------------------------------------------------------------------- identical(iris[grep("v", Species)], iris[Species %pin% "v"]) ## ----pin-multi---------------------------------------------------------------- iris[Species %pin% c("ver", "vir")] %>% head ## ----------------------------------------------------------------------------- DT <- data.table(x = 1:5, y = letters[1:5], AB = c(NA, TRUE, FALSE, TRUE, FALSE)) select_which(DT, anyNA, .and.dots = "y") ## ----------------------------------------------------------------------------- dt <- data.table(y = !sample(0:1, size = 100, replace = TRUE), x = runif(100)) dt[, pred := predict(lm(y ~ x, data = .SD), newdata = .SD)] dt[, auc(y, pred)] ## ----select_grep-------------------------------------------------------------- flights %>% select_grep("arr") ## ----select_grep-and---------------------------------------------------------- flights %>% select_grep("arr", .and = "year", .but.not = "arr_time") ## ----------------------------------------------------------------------------- RQ(dplyr, "dplyr must be installed") RQ("dplyr", "dplyr needs installing", "dplyr installed.") ## ----ahull-1------------------------------------------------------------------ if (!identical(Sys.info()[["sysname"]], "Darwin")) ggplot(data.table(x = c(0, 1, 2, 3, 4), y = c(0, 1, 2, 0.1, 0))) + geom_area(aes(x, y)) + geom_rect(data = ahull(, c(0, 1, 2, 3, 4), c(0, 1, 2, 0.1, 0)), aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax), color = "red") ## ----ahull-demos, fig.width = 8, fig.height = 6------------------------------- set.seed(101) ahull_dt <- data.table(x = c(0:100) / 100, y = cumsum(rnorm(101, 0.05))) if (!identical(Sys.info()[["sysname"]], "Darwin")) ggplot(ahull_dt) + geom_area(aes(x, y)) + geom_rect(data = ahull(ahull_dt), aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax), color = "red") + geom_rect(data = ahull(ahull_dt, incl_negative = TRUE), aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax), color = "blue") + geom_rect(data = ahull(ahull_dt, incl_negative = TRUE, minH = 4), aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax), color = "green") + geom_rect(data = ahull(ahull_dt, incl_negative = TRUE, minW = 0.25), aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax), color = "white", fill = NA) ## ----weighted_quantile-ex----------------------------------------------------- x <- 1:10 w <- c(rep(1, 5), rep(2, 5)) quantile(x, prob = c(0.25, 0.75), names = FALSE) weighted_quantile(x, w, p = c(0.25, 0.75)) ## ----mutate_ntile-ex---------------------------------------------------------- flights %>% as.data.table %>% .[, .(year, month, day, origin, dest, distance)] %>% mutate_ntile(distance, n = 5L) ## ----mutate_ntile-ex-charonly------------------------------------------------- flights %>% as.data.table %>% .[, .(year, month, day, origin, dest, distance)] %>% mutate_ntile(distance, n = 5L) ## ----mutate_ntile-ex-2-------------------------------------------------------- flights %>% as.data.table %>% mutate_ntile("distance", n = 5L, character.only = TRUE) %>% .[, dep_delay := coalesce(dep_delay, 0)] %>% .[, .(avgDelay = mean(dep_delay)), keyby = "distanceQuintile"] ## ----longest-affix------------------------------------------------------------ trim_common_affixes(c("CurrentHousingCosts(weekly)", "CurrentFuelCosts(weekly)")) ## ----swap--------------------------------------------------------------------- a <- 1 b <- 2 a %<->% b identical(c(a, b), c(2, 1)) ## ----average-bearing---------------------------------------------------------- average_bearing(0, 270) # NW mean(c(0, 270)) # SE (i.e. wrong) ## ----Mode-eg------------------------------------------------------------------ Mode(c(1, 1, 1, 2, 3)) ## ----samp-eg------------------------------------------------------------------ DT <- data.table(x = c(5, 2, 3), y = c(5, 3, 4)) DT[, .(Base = sample(.BY[["x"]]:.BY[["y"]])), keyby = .(x, y)] DT[, .(Base = samp(.BY[["x"]]:.BY[["y"]])), keyby = .(x, y)]