## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", warning = FALSE, message = FALSE, cache = FALSE ) ## ----eval=FALSE--------------------------------------------------------------- # install.packages("mrddGlobal") ## ----------------------------------------------------------------------------- library(`mrddGlobal`) ## ----fig.width=6, fig.height=5, fig.align="center"---------------------------- library(sf) library(ggplot2) # Loading in shape file data nc <- st_read(system.file("shape/nc.shp", package = "sf")) nc <- st_union(nc) # Generating points in and out of NC bbox <- st_bbox(nc) n <- 2000 pts <- data.frame( x = runif(n, bbox["xmin"], bbox["xmax"]), y = runif(n, bbox["ymin"], bbox["ymax"]) ) knitr::kable(head(pts)) ## ----fig.width=6, fig.height=5, fig.align="center"---------------------------- pts_sf <- st_as_sf(pts, coords = c("x", "y"), crs = st_crs(nc)) inside <- st_within(pts_sf, nc, sparse = FALSE)[,1] pts_sf$status <- ifelse(inside, "Inside NC", "Outside NC") ggplot() + geom_sf(data = nc, fill = NA, color = "black", linewidth = 0.6) + geom_sf(data = pts_sf, aes(color = status), alpha = 0.6, size = 1) + coord_sf() + scale_color_manual(values = c("Inside NC" = "blue", "Outside NC" = "red")) + theme_minimal() + labs( title = "Random Points Inside and Outside North Carolina", color = "" ) ## ----fig.width=6, fig.height=5, fig.align="center"---------------------------- closest_points_all <- get_closest_points_shp(X = pts, shp_file = nc) knitr::kable(head(closest_points_all)) ## ----fig.width=6, fig.height=5, fig.align="center"---------------------------- closest_points_geo <- st_as_sf(as.data.frame(closest_points_all), coords = c("X", "Y"), crs = 4326) ggplot() + geom_sf(data = nc, fill = NA, color = "black", linewidth = 0.6) + geom_sf(data = closest_points_geo, alpha = 0.6, size = 1, color = "purple") + coord_sf() + theme_minimal() + labs( title = "Closest Boundary Points for North Carolina Observations", color = "" ) ## ----echo=FALSE, out.width="80%", fig.align="center"-------------------------- knitr::include_graphics("images/wrinkled_sphere.png") ## ----------------------------------------------------------------------------- # First, I will create a simulated dataframe set.seed(1) n <- 2000 df <- matrix(runif(n * 3, -1.2, 1.2), ncol = 3) # Then I will define the function that determines the boundary bndry_func <- function(X) { rowSums(X^2) + 0.5 * sin(rowSums(X^4)) - 1 } # Finally, I will get just the point cloud from the make_cloud function cloud <- make_cloud(X = df, bndry_func = bndry_func, num_pts_discr = 2e4, step_size = 1e-1, tol = 1e-2, verbose = FALSE) knitr::kable(head(cloud)) ## ----------------------------------------------------------------------------- treated <- ifelse(bndry_func(df) <= 0, 1, 0) df_bndry <- get_closest_points_bbox(cloud = cloud, X = df, t = treated) closest_points_all <- df_bndry[["closest_points"]] knitr::kable(head(closest_points_all)) ## ----------------------------------------------------------------------------- # Closest boundary points are found analytically since the boundary is simple closest_boundary_point <- function(df) { # Separating individual coordinates and initializing output matrix x1 <- df[, 1] x2 <- df[, 2] result <- matrix(0, nrow = nrow(df), ncol = 2) # Case 1: both coordinates are negative # Closest boundary point is the origin mask1 <- x1 < 0 & x2 < 0 result[mask1, ] <- cbind(0, 0) # Case 2: first coordinate is positive and larger than the second coordinate # Closest boundary point is either directly below or above on the x axis mask2 <- x1 >= 0 & x1 > x2 & !mask1 result[mask2, ] <- cbind(x1[mask2], 0) # Case 3: second coordinate is positive and larger than the first coordinate # Closest boundary point is either directly left or right on the y axis mask3 <- x2 >= 0 & x1 < x2 & !mask1 result[mask3, ] <- cbind(0, x2[mask3]) return(result) } sim_data <- gen_data_cef(1, n = 1000, seed = 1) X <- as.matrix(sim_data[,1:2]) Y <- as.matrix(sim_data[,3]) t <- ifelse(X[,1] >= 0 & X[,2] >= 0, 1, 0) closest_points_all <- closest_boundary_point(X) ## ----------------------------------------------------------------------------- results <- cef_disc_test(X, Y, t, closest_points_all, verbose = FALSE, num.trees = 5000, enable.ll.split = TRUE, bwselect = "diff") results ## ----------------------------------------------------------------------------- results <- cef_disc_test(X, Y, t, closest_points_all, num_splits = 10, verbose = FALSE, num.trees = 5000, enable.ll.split = TRUE, bwselect = "diff") results ## ----------------------------------------------------------------------------- summary(results) ## ----------------------------------------------------------------------------- results <- cef_disc_test(X, Y, t, closest_points_all, num_splits = 15, verbose = FALSE, num.trees = 5000, enable.ll.split = TRUE, bwselect = "diff") results ## ----fig.width=6, fig.height=6, fig.align="center", echo = FALSE-------------- library(patchwork) library(dplyr) library(ggplot2) set.seed(0) theta <- seq(0, 2 * pi, length.out = 400) boundary <- data.frame( x = 0.5 + 0.5*cos(theta), y = 0.5 + 0.5*sin(theta) ) split_box <- function(box, depth, max_depth) { if (depth == max_depth) return(list(box)) axis <- sample(c("x", "y"), 1) if (axis == "x") { cut <- (box$xmin + box$xmax) / 2 left <- list( xmin = box$xmin, xmax = cut, ymin = box$ymin, ymax = box$ymax ) right <- list( xmin = cut, xmax = box$xmax, ymin = box$ymin, ymax = box$ymax ) } else { cut <- (box$ymin + box$ymax) / 2 left <- list( xmin = box$xmin, xmax = box$xmax, ymin = box$ymin, ymax = cut ) right <- list( xmin = box$xmin, xmax = box$xmax, ymin = cut, ymax = box$ymax ) } c( split_box(left, depth + 1, max_depth), split_box(right, depth + 1, max_depth) ) } bbox <- list(xmin = 0, xmax = 1, ymin = 0, ymax = 1) leaves <- split_box(bbox, depth = 0, max_depth = 3) leaves_df <- bind_rows(lapply(leaves, as.data.frame)) p_partition <- ggplot() + geom_polygon( data = boundary, aes(x, y), fill = "grey80", color = "grey40", alpha = 0.8 ) + geom_rect( data = leaves_df, aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax), fill = NA, color = "black", linewidth = 0.6 ) + geom_point(aes(x = 0.146, y = 0.854), color = "red", size = 3) + coord_equal() + labs(title = "Recursive Partition with Irregular Support") + theme_minimal() + labs(x = "x1", y = "x2") p_partition ## ----fig.width=6, fig.height=6, fig.align="center", echo = FALSE-------------- top_left_leaf <- leaves_df %>% filter(ymax == max(ymax)) %>% filter(xmin == min(xmin)) set.seed(123) n_mc <- 500 mc <- data.frame( x = runif(n_mc, top_left_leaf$xmin, top_left_leaf$xmax), y = runif(n_mc, top_left_leaf$ymin, top_left_leaf$ymax) ) mc[["inside"]] <- ifelse((mc$x - 0.5)^2 + (mc$y - 0.5)^2 <= 0.5^2, TRUE, FALSE) ggplot() + geom_polygon( data = boundary, aes(x, y), fill = "grey85", color = "grey50", alpha = 0.6 ) + geom_rect( data = top_left_leaf, aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax), fill = NA, color = "black", linewidth = 1.2 ) + geom_point( data = mc, aes(x, y, color = inside), size = 1.5 ) + scale_color_manual(values = c("grey70","steelblue")) + coord_equal() + labs(title = "Monte Carlo Points in Top-Left Leaf", color = "Inside Support") + theme_minimal() + labs(x = "x1", y = "x2") ## ----------------------------------------------------------------------------- nc <- st_read(system.file("shape/nc.shp", package = "sf")) nc <- st_union(nc) bbox <- st_bbox(nc) bbox ## ----fig.width=6, fig.height=6, fig.align="center"---------------------------- set.seed(123) n <- 1000 x1 <- runif(n) x2 <- runif(n) x <- cbind(x1, x2) bndry_func <- function(X) { return((X[,1] - 0.5)^2 + (X[,2] - 0.5)^2 - 0.5^2) } cloud <- make_cloud(X = x, bndry_func = bndry_func, start_pt = c(1, 0), num_pts_discr = 1e4, tol = 1e-2, verbose = FALSE) plot(cloud, xlab = "x1", ylab = "x2", main = "Circle Boundary Point Cloud") ## ----------------------------------------------------------------------------- inside <- ifelse((x[,1] - 0.5)^2 + (x[,2] - 0.5)^2 <= 0.5^2, 1, 0) df_bndry <- get_closest_points_bbox(cloud, x, inside) lower_bounds0 <- df_bndry[["lower_bounds0"]] lower_bounds1 <- df_bndry[["lower_bounds1"]] upper_bounds0 <- df_bndry[["upper_bounds0"]] upper_bounds1 <- df_bndry[["upper_bounds1"]] cat("lower_bounds0:\n") print(lower_bounds0) cat("\nlower_bounds1:\n") print(lower_bounds1) cat("\nupper_bounds0:\n") print(upper_bounds0) cat("\nupper_bounds1:\n") print(upper_bounds1) ## ----------------------------------------------------------------------------- in_region1 <- function(X) { (X[1] - 0.5)^2 + (X[2] - 0.5)^2 <= 0.5^2 } in_region0 <- function(X) { (X[1] - 0.5)^2 + (X[2] - 0.5)^2 > 0.5^2 } ## ----fig.width=6, fig.height=6, fig.align="center"---------------------------- X <- gen_data_density(1, seed = 12345) bndry_func <- function(X) { # To ensure that each running variable remains between -1 and 1, I will add a penalty if (any(abs(X) > 1)) { return(999) } else { return(min(X)) } } in_region1 <- function(x) { x[1] >= 0 && x[2] >= 0 && x[1] <= 1 && x[2] <= 1 } in_region0 <- function(x) { !(x[1] >= 0 && x[2] >= 0) && abs(x[1]) <= 1 && abs(x[2]) <= 1 } cloud <- make_cloud(X, bndry_func = bndry_func, num_pts_discr = 2e4, step_size = 1e-1, tol = 1e-3, verbose = FALSE) plot(cloud, xlab = "x1", ylab = "x2", main = "Min Boundary Point Cloud") ## ----------------------------------------------------------------------------- inside <- ifelse(X[,1] >= 0 & X[,2] >= 0, 1, 0) df_bndry <- get_closest_points_bbox(cloud, X, inside) results_gen <- density_disc_test(X, inside, df_bndry[["closest_points"]], in_region0, in_region1, df_bndry[["lower_bounds0"]], df_bndry[["lower_bounds1"]], df_bndry[["upper_bounds0"]], df_bndry[["upper_bounds1"]], verbose = FALSE, rand_mid_split = TRUE, num_splits = 1, mc_samples = 500) results_gen ## ----------------------------------------------------------------------------- closest_points <- closest_boundary_point(X) t_R <- system.time( results_gen <- density_disc_test(X, inside, closest_points, in_region0, in_region1, c(-1, -1), c(0, 0), c(1, 1), c(1, 1), verbose = FALSE, num_splits = 1, mc_samples = 500) ) results_gen ## ----------------------------------------------------------------------------- library(Rcpp) cppFunction( 'SEXP region1_to_xptr() { typedef bool (*region_ptr)(const NumericVector&); return XPtr(new region_ptr(in_region1_cpp), true); }', includes = ' // C++ version of in_region1 bool in_region1_cpp(const NumericVector& x) { return (x[0] >= 0.0 && x[1] >= 0.0 && x[0] <= 1.0 && x[1] <= 1.0); }' ) cppFunction( 'SEXP region0_to_xptr() { typedef bool (*region_ptr)(const NumericVector&); return XPtr(new region_ptr(in_region0_cpp), true); }', includes = ' // C++ version of in_region0 bool in_region0_cpp(const NumericVector& x) { bool not_in_first_quadrant = !(x[0] >= 0.0 && x[1] >= 0.0); bool within_box = (std::abs(x[0]) <= 1.0 && std::abs(x[1]) <= 1.0); return not_in_first_quadrant && within_box; }' ) ptr1 <- region1_to_xptr() ptr0 <- region0_to_xptr() t_cpp <- system.time( results_gen <- density_disc_test(X, inside, closest_points, ptr0, ptr1, c(-1, -1), c(0, 0), c(1, 1), c(1, 1), verbose = FALSE, num_splits = 1, mc_samples = 500) ) cat("R Runtime:\n") print(t_R) cat("\n") cat("C++ Runtime:\n") print(t_cpp)