--- title: "Plots for our `surveyCV` *Stat* paper based on SDSS presentation" author: "Cole Guerin, Thomas McMahon, Jerzy Wieczorek" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{plots-for-Stat-paper} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- Writing cleaner code for **only** the simulations needed for [our submission to *Stat*](https://github.com/ColbyStatSvyRsch/surveyCV/blob/master/data-raw/Stat_SDSS_submission.pdf) (the special issue for SDSS 2021 talks). Based on the earlier sims (now on GitHub in [`data-raw/plot_generation.R`](https://github.com/ColbyStatSvyRsch/surveyCV/blob/master/data-raw/plot_generation.R) and [`data-raw/early-results.html`](https://github.com/ColbyStatSvyRsch/surveyCV/blob/master/data-raw/early-results.html)) -- but instead of having similar functions repeatedly defined separately as in that file, use our functions such as `cv.svy()` directly from the package. ```{r Packages, message=FALSE} library(survey) library(ggplot2) library(splines) library(magrittr) ``` ```{r LoadSurveyCV} library(surveyCV) rerunSims <- FALSE # when we *do* rerun sims, remember to reinstall package & restart R knitr::opts_chunk$set(dpi=300, out.extra ='WIDTH="95%"') ``` ## Generate the artificial population ```{r ArtifPop} # Generate a cubic-ish artificial population with 1000 units set.seed(47) N <- 1000 nrStrata <- 10 nrClusters <- 100 clusSizes <- N/nrClusters x1 = stats::runif(1:(N/2), min = 26, max = 38) y1 = (x1-29)^3 - 13*(x1-29)^2 + 0*(x1-29) + 900 x2 = stats::runif(1:(N/2), min = 38, max = 50) y2 = (x2-36)^3 - 10*(x2-36)^2 + 2*(x2-36) + 600 z1 = jitter(y1, 15000) z2 = jitter(y2, 15000) ds1 <- data.frame(Response = z1, Predictor = x1) ds2 <- data.frame(Response = z2, Predictor = x2) ds12 <- rbind(ds1, ds2) b12 <- data.frame(ID = c(1:1000)) splinepop.df <- cbind(b12, ds12) splinepop.df <- splinepop.df %>% dplyr::arrange(Predictor) %>% # Create Cluster and Stratum variables so that # we can (separately) simulate both kinds of sampling dplyr::mutate(Stratum = dplyr::row_number(), Cluster = dplyr::row_number()) %>% dplyr::mutate(Stratum = cut(Stratum, nrStrata, 1:nrStrata), Cluster = cut(Cluster, nrClusters, 1:nrClusters)) %>% # Create fpc variables for each sampling type: # full pop has 1000 units; 100 units/stratum; 100 clusters (of 10 units each); # & if we added a combined Strat+Clus sim, we'd need different fpc's for that dplyr::mutate(fpcSRS = N, fpcStratum = N/nrStrata, fpcCluster = nrClusters) %>% dplyr::arrange(ID) # Create unequal sampling weights that are intentionally # NOT well matched to the true shape of the population, # so we can see how the use of weights impacts # model-training step vs test-error estimation step # in a case where naively ignoring weights will confidently pick wrong model lm_quad <- stats::lm(Response ~ Predictor + I(Predictor^2), data = splinepop.df) splinepop.df$samp_prob_quad <- (1/(abs(lm_quad$residuals))) / sum(1/(abs(lm_quad$residuals))) splinepop.df$samp_wt_quad <- 1/splinepop.df$samp_prob_quad stopifnot(all.equal(sum(1/splinepop.df$samp_wt_quad), 1)) ``` ```{r PopulationPlot, fig.width=8.3, fig.height=3} n <- 100 srs.df <- dplyr::sample_n(splinepop.df, n) stratcounts <- rep(n/nrStrata, nrStrata) names(stratcounts) <- 1:nrStrata s <- stratsample(splinepop.df$Stratum, stratcounts) strat.df <- splinepop.df[s,] c <- unique(splinepop.df[["Cluster"]]) clus.df <- splinepop.df[splinepop.df[["Cluster"]] %in% sample(c, n/clusSizes),] a <- ggplot(splinepop.df, aes(x = Predictor, y = Response)) + 
```{r PopulationPlot, fig.width=8.3, fig.height=3}
n <- 100

srs.df <- dplyr::sample_n(splinepop.df, n)

stratcounts <- rep(n/nrStrata, nrStrata)
names(stratcounts) <- 1:nrStrata
s <- stratsample(splinepop.df$Stratum, stratcounts)
strat.df <- splinepop.df[s,]

c <- unique(splinepop.df[["Cluster"]])
clus.df <- splinepop.df[splinepop.df[["Cluster"]] %in% sample(c, n/clusSizes),]

a <- ggplot(splinepop.df, aes(x = Predictor, y = Response)) +
  geom_point(shape = 1) +
  labs(title = "Artificial Pop.", x = "Predictor", y = "Response") +
  ylim(100, 1650) + xlim(25, 51)
b <- ggplot(srs.df, aes(x = Predictor, y = Response)) +
  geom_point(shape = 8, color = "darkgreen") +
  labs(title = "SRS", x = "Predictor", y = "Response") +
  ylim(100, 1650) + xlim(25, 51)
c <- ggplot(strat.df, aes(x = Predictor, y = Response)) +
  geom_point(shape = 3, color = "darkred") +
  labs(title = "Stratified Sample", x = "Predictor", y = "Response") +
  ylim(100, 1650) + xlim(25, 51)
d <- ggplot(clus.df, aes(x = Predictor, y = Response)) +
  geom_point(shape = 4, color = "steelblue") +
  labs(title = "Cluster Sample", x = "Predictor", y = "Response") +
  ylim(100, 1650) + xlim(25, 51)
gridExtra::grid.arrange(a, b, c, d, ncol = 4,
                        top = "Simulated Data and Examples of How It Was Sampled")
```

## Sims: Use of surveyCV folds with SRS, clustered, or stratified samples

```{r FoldsSimsSetup}
# TODO: redo with different n's?
n <- 100
loops <- 500
```

```{r FoldsSimsResults, eval = rerunSims}
# Use SRS samples and make SRS folds
srssrsds <- data.frame(df = c(), MSE = c())
# Use SRS samples and evaluate on full pop
srspopds <- data.frame(df = c(), MSE = c())
# Use Cluster samples and make Cluster folds
clusclusds <- data.frame(df = c(), MSE = c())
# Use Cluster samples and make SRS folds
clussrsds <- data.frame(df = c(), MSE = c())
# Use Cluster samples and evaluate on full pop
cluspopds <- data.frame(df = c(), MSE = c())
# Use Strat samples and make Strat folds
stratstratds <- data.frame(df = c(), MSE = c())
# Use Strat samples and make SRS folds
stratsrsds <- data.frame(df = c(), MSE = c())
# Use Strat samples and evaluate on full pop
stratpopds <- data.frame(df = c(), MSE = c())

modelsToFit <- c("Response~splines::ns(Predictor, df=1)",
                 "Response~splines::ns(Predictor, df=2)",
                 "Response~splines::ns(Predictor, df=3)",
                 "Response~splines::ns(Predictor, df=4)",
                 "Response~splines::ns(Predictor, df=5)",
                 "Response~splines::ns(Predictor, df=6)")

# Making as many simple random samples as we specify for 'loops'
for (i in 1:loops) {
  # Take an SRS
  sim.srs <- dplyr::sample_n(splinepop.df, n)

  # Run cv.svy() with SRS folds on the SRS sample
  srs.CV.out <- cv.svy(sim.srs, modelsToFit,
                       nfolds = 5, fpcID = "fpcSRS") %>%
    as.vector()
  srssrsds2 <- data.frame(df = 1:6, MSE = srs.CV.out)
  srssrsds <- rbind(srssrsds, srssrsds2)

  # Eval an 80% subsample of the SRS on the full pop
  sub.srs <- dplyr::sample_n(sim.srs, n*.8)
  srs.des <- svydesign(ids = ~1, fpc = ~fpcSRS, data = sub.srs)
  srs.pop.out <- sapply(1:6, function(ii) {
    yhat <- predict(svyglm(modelsToFit[ii], srs.des), newdata = splinepop.df)
    return(mean((yhat - splinepop.df$Response)^2))
  })
  srspopds2 <- data.frame(df = 1:6, MSE = srs.pop.out)
  srspopds <- rbind(srspopds, srspopds2)
}

# Making as many cluster samples as we specify for 'loops'
for (i in 1:loops) {
  # Take a Cluster sample
  c <- unique(splinepop.df[["Cluster"]])
  sim.clus <- splinepop.df[splinepop.df[["Cluster"]] %in% sample(c, n/clusSizes),]

  # Run cv.svy() with Cluster folds on the Cluster sample
  clus.CV.out <- cv.svy(sim.clus, modelsToFit, clusterID = "Cluster",
                        nfolds = 5, fpcID = "fpcCluster") %>%
    as.vector()
  clusclusds2 <- data.frame(df = 1:6, MSE = clus.CV.out)
  clusclusds <- rbind(clusclusds, clusclusds2)

  # Run cv.svy() with SRS folds on the Cluster sample (ignoring the clustering)
  srs.CV.out <- cv.svy(sim.clus, modelsToFit,
                       nfolds = 5, fpcID = "fpcSRS") %>%
    as.vector()
  clussrsds2 <- data.frame(df = 1:6, MSE = srs.CV.out)
  clussrsds <- rbind(clussrsds, clussrsds2)

  # Eval an 80% Cluster subsample on the full pop
  sub.c <- unique(sim.clus[["Cluster"]])
sim.clus[sim.clus[["Cluster"]] %in% sample(sub.c, (n/clusSizes)*.8),] clus.des <- svydesign(ids = ~Cluster, fpc = ~fpcCluster, data = sub.clus) clus.pop.out <- sapply(1:6, function(ii) { yhat <- predict(svyglm(modelsToFit[ii], clus.des), newdata = splinepop.df) return(mean((yhat - splinepop.df$Response)^2)) }) cluspopds2 <- data.frame(df = 1:6, MSE = clus.pop.out) cluspopds <- rbind(cluspopds, cluspopds2) } # Making as many stratified samples as we specify for 'loops' for (i in 1:loops) { # Take a Strat sample stratcounts <- rep(n/nrStrata, nrStrata) names(stratcounts) <- 1:nrStrata s <- stratsample(splinepop.df$Stratum, stratcounts) sim.strat <- splinepop.df[s,] # Using our Strat function on Strat samples strat.CV.out <- cv.svy(sim.strat, modelsToFit, strataID = "Stratum", nfolds = 5, fpcID = "fpcStratum") %>% as.vector() stratstratds2 <- data.frame(df = 1:6, MSE = strat.CV.out) stratstratds <- rbind(stratstratds, stratstratds2) # Using our SRS function on Strat samples srs.CV.out <- cv.svy(sim.strat, modelsToFit, nfolds = 5, fpcID = "fpcSRS") %>% as.vector() stratsrsds2 <- data.frame(df = 1:6, MSE = srs.CV.out) stratsrsds <- rbind(stratsrsds, stratsrsds2) # Eval 80% Strat samples on full pop sub.stratcounts <- rep((n/nrStrata)*.8, nrStrata) names(sub.stratcounts) <- 1:nrStrata sub.s <- survey::stratsample(sim.strat$Stratum, sub.stratcounts) sub.strat <- sim.strat[sub.s,] strat.des <- svydesign(ids = ~1, strata = ~Stratum, fpc = ~fpcStratum, data = sub.strat) strat.pop.out <- sapply(1:6, function(ii) { yhat <- predict(svyglm(modelsToFit[ii], strat.des), newdata = splinepop.df) return(mean((yhat - splinepop.df$Response)^2)) }) stratpopds2 <- data.frame(df = 1:6, MSE = strat.pop.out) stratpopds <- rbind(stratpopds, stratpopds2) } # Making the degrees of freedom variable a factor variable srssrsds$df <- as.factor(srssrsds$df) srspopds$df <- as.factor(srspopds$df) clusclusds$df <- as.factor(clusclusds$df) clussrsds$df <- as.factor(clussrsds$df) cluspopds$df <- as.factor(cluspopds$df) stratstratds$df <- as.factor(stratstratds$df) stratsrsds$df <- as.factor(stratsrsds$df) stratpopds$df <- as.factor(stratpopds$df) usethis::use_data(srssrsds, srspopds, clusclusds, clussrsds, cluspopds, stratstratds, stratsrsds, stratpopds, internal = FALSE, overwrite = TRUE) # TODO: for now setting internal=FALSE # so that we don't overwrite the sysdata.Rda for intro.Rmd; # but eventually when we're ready to replace that old vignette, # we can return to internal=TRUE here # if that's indeed better for R packages # ... 
# OK, now we've written those dataframes,
# AND the 4 below, all into R/sysdata.Rda together
```

```{r FoldsSimsPlots, fig.width=7, fig.height=4.7}
if(!rerunSims) {
  srssrsds <- surveyCV:::srssrsds
  clusclusds <- surveyCV:::clusclusds
  clussrsds <- surveyCV:::clussrsds
  stratstratds <- surveyCV:::stratstratds
  stratsrsds <- surveyCV:::stratsrsds
  srspopds <- surveyCV:::srspopds
  cluspopds <- surveyCV:::cluspopds
  stratpopds <- surveyCV:::stratpopds
}

# Find the y-ranges of MSEs
yminMSE <- min(min(srssrsds$MSE),
               min(clusclusds$MSE), min(clussrsds$MSE),
               min(stratstratds$MSE), min(stratsrsds$MSE))
ymaxMSE <- max(max(srssrsds$MSE),
               max(clusclusds$MSE), max(clussrsds$MSE),
               max(stratstratds$MSE), max(stratsrsds$MSE))

# Average the population-MSE benchmarks over loops,
# so they can be overlaid as a line on each CV boxplot below
srspopds <- dplyr::group_by(srspopds, df) %>%
  dplyr::summarise(MSE = mean(MSE)) %>%
  dplyr::mutate(df = as.numeric(df))
cluspopds <- dplyr::group_by(cluspopds, df) %>%
  dplyr::summarise(MSE = mean(MSE)) %>%
  dplyr::mutate(df = as.numeric(df))
stratpopds <- dplyr::group_by(stratpopds, df) %>%
  dplyr::summarise(MSE = mean(MSE)) %>%
  dplyr::mutate(df = as.numeric(df))

# Making ggplot objects for the MSEs
plot.srssrs <- ggplot(data = srssrsds, mapping = aes(x = df, y = MSE/1e4)) +
  geom_boxplot() +
  geom_line(data = srspopds, mapping = aes(x = df, y = MSE/1e4)) +
  ggtitle("CV: SRS folds,\nSRS sample") +
  scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
plot.blank <- grid::rectGrob(width = 0, height = 0) # empty spot here -- no corresponding sim
plot.clusclus <- ggplot(data = clusclusds, mapping = aes(x = df, y = MSE/1e4)) +
  geom_boxplot() +
  geom_line(data = cluspopds) +
  ggtitle("CV: Cluster folds,\nCluster sample") +
  scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
plot.clussrs <- ggplot(data = clussrsds, mapping = aes(x = df, y = MSE/1e4)) +
  geom_boxplot() +
  geom_line(data = cluspopds) +
  ggtitle("CV: SRS folds,\nCluster sample") +
  scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
plot.stratstrat <- ggplot(data = stratstratds, mapping = aes(x = df, y = MSE/1e4)) +
  geom_boxplot() +
  geom_line(data = stratpopds) +
  ggtitle("CV: Strat folds,\nStrat sample") +
  scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
plot.stratsrs <- ggplot(data = stratsrsds, mapping = aes(x = df, y = MSE/1e4)) +
  geom_boxplot() +
  geom_line(data = stratpopds) +
  ggtitle("CV: SRS folds,\nStrat sample") +
  scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)

gridExtra::grid.arrange(plot.srssrs, plot.clussrs, plot.stratsrs,
                        plot.blank, plot.clusclus, plot.stratstrat,
                        ncol = 3,
                        top = paste0("Simulated Data (Sample Sizes = ", n,
                                     ", Clusters = ", n/clusSizes,
                                     " or Strata = ", nrStrata,
                                     ", Loops = ", loops, ", Folds = 5)"))
```

## Sims: Use of sampling weights (in training model-fits vs in test loss-estimates)

```{r QuadWeightsPlot, fig.width=7, fig.height=3.5}
ggplot(splinepop.df, aes(x = Predictor, y = Response)) +
  geom_point(aes(size = samp_prob_quad), alpha = 0.1) +
  stat_smooth(method = "lm", formula = y ~ x + I(x^2), se = FALSE) +
  labs(title = "Artificial Population with Sampling Probabilities",
       subtitle = "Sampling prob. is higher for points nearer the solid curve",
       x = "Predictor", y = "Response", size = "Sampling Probability")
```

```{r WeightsSimsSetup}
# TODO: redo with different n's?
n <- 100
loops <- 500
weights <- "samp_wt_quad"
```

```{r WeightsSimsResults, eval = rerunSims}
# In all the following cases,
# we are only checking the effect of using sampling **weights**
# during model-fitting on training sets, and testing on test sets;
# we did not use cluster or stratified sampling,
# so all CV folds will (sensibly) be SRS (regardless of samp weights)

# Use weighted models, and calculate MSEs using weighted design
AllW <- data.frame(df = c(), MSE = c())
# Use SRS models, and calculate MSEs using SRS design
NoW <- data.frame(df = c(), MSE = c())
# Use weighted models, and calculate MSEs using SRS design
ModW <- data.frame(df = c(), MSE = c())
# Use SRS models, and calculate MSEs using weighted design
MSEW <- data.frame(df = c(), MSE = c())

for (i in 1:loops) {
  # Take a sample of size n, using the sampling probabilities instead of SRS
  # (using 1/samp_wt as the samp_prob per unit,
  #  or n/samp_wt as overall samp_prob to get a sample of size n)
  in.sample <- sampling::UPtille(n / splinepop.df[[weights]])
  splinesamp.df <- splinepop.df[in.sample > 0, ]

  modelsToFit <- c("Response~ns(Predictor, df=1)",
                   "Response~ns(Predictor, df=2)",
                   "Response~ns(Predictor, df=3)",
                   "Response~ns(Predictor, df=4)",
                   "Response~ns(Predictor, df=5)",
                   "Response~ns(Predictor, df=6)")

  AllWdat <- cv.svy(splinesamp.df, modelsToFit, nfolds = 5,
                    weightsID = weights) %>%
    as.vector()
  NoWdat <- cv.svy(splinesamp.df, modelsToFit, nfolds = 5) %>%
    as.vector()
  ModWdat <- cv.svy(splinesamp.df, modelsToFit, nfolds = 5,
                    weightsID = weights, useSvyForLoss = FALSE) %>%
    as.vector()
  MSEWdat <- cv.svy(splinesamp.df, modelsToFit, nfolds = 5,
                    weightsID = weights, useSvyForFits = FALSE) %>%
    as.vector()

  # compiling one data frame
  AllW2 <- data.frame(df = 1:6, MSE = AllWdat, sample = rep(i, 6))
  AllW <- rbind(AllW, AllW2)
  NoW2 <- data.frame(df = 1:6, MSE = NoWdat, sample = rep(i, 6))
  NoW <- rbind(NoW, NoW2)
  ModW2 <- data.frame(df = 1:6, MSE = ModWdat, sample = rep(i, 6))
  ModW <- rbind(ModW, ModW2)
  MSEW2 <- data.frame(df = 1:6, MSE = MSEWdat, sample = rep(i, 6))
  MSEW <- rbind(MSEW, MSEW2)
}

# Making the degrees of freedom variable a factor variable
AllW$df <- as.factor(AllW$df)
NoW$df <- as.factor(NoW$df)
MSEW$df <- as.factor(MSEW$df)
ModW$df <- as.factor(ModW$df)

usethis::use_data(AllW, NoW, MSEW, ModW,
                  internal = FALSE, overwrite = TRUE)
# TODO: as above, for now setting internal=FALSE
# so that we don't overwrite the sysdata.Rda for intro.Rmd;
# but eventually when we're ready to replace that old vignette,
# we can return to internal=TRUE here
# if that's indeed better for R packages
# ...
# OK, now we've written those dataframes,
# AND the 8 above, all into R/sysdata.Rda together
```

```{r WeightsSimsResultsAndPlots, fig.width=7, fig.height=7}
if(!rerunSims) {
  AllW <- surveyCV:::AllW
  NoW <- surveyCV:::NoW
  MSEW <- surveyCV:::MSEW
  ModW <- surveyCV:::ModW
}

# Find the y-range of MSEs
ymin <- min(min(AllW$MSE), min(NoW$MSE), min(MSEW$MSE), min(ModW$MSE))
ymax <- max(max(AllW$MSE), max(NoW$MSE), max(MSEW$MSE), max(ModW$MSE))

# Making a ggplot object for the MSEs collected when using
# weights both for training the models and for estimating the test MSE
pAllW <- ggplot(data = AllW, mapping = aes(x = df, y = MSE/1e5)) +
  ggtitle("Weights for both training and testing") +
  scale_y_log10(limits = c(ymin, ymax)/1e5) +
  geom_boxplot()

# Making a ggplot object for the MSEs collected when using
# no weights at all
pNoW <- ggplot(data = NoW, mapping = aes(x = df, y = MSE/1e5)) +
  ggtitle("No weights") +
  scale_y_log10(limits = c(ymin, ymax)/1e5) +
  geom_boxplot()

# Making a ggplot object for the MSEs collected when using
# weights only when training the models
pModW <- ggplot(data = ModW, mapping = aes(x = df, y = MSE/1e5)) +
  ggtitle("Weights when training models") +
  scale_y_log10(limits = c(ymin, ymax)/1e5) +
  geom_boxplot()

# Making a ggplot object for the MSEs collected when using
# weights only when estimating the test MSE
pMSEW <- ggplot(data = MSEW, mapping = aes(x = df, y = MSE/1e5)) +
  ggtitle("Weights when estimating test MSE") +
  scale_y_log10(limits = c(ymin, ymax)/1e5) +
  geom_boxplot()

# Making a grid display of the four plot objects above
lay <- matrix(c(NA, 1, 1, NA,
                NA, 1, 1, NA,
                 2, 2, 3, 3,
                 2, 2, 3, 3,
                NA, 4, 4, NA,
                NA, 4, 4, NA),
              byrow = TRUE, ncol = 4)
gridExtra::grid.arrange(pAllW, pModW, pMSEW, pNoW,
                        ncol = 2,
                        top = paste0("Simulated Data (Sample Sizes = ", n,
                                     ", Loops = ", loops,
                                     ", 5 Folds, Samp. Wts from Quad. Fit)"),
                        layout_matrix = lay)
```
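
For readers skimming the panels above, here is a condensed view of the four weighting configurations being compared, written as single `cv.svy()` calls on one weighted sample rather than the full simulation loop. This is a minimal, unevaluated sketch: `one.sample` and `mods` are placeholder names used only for illustration, and the argument settings simply restate the ones used in the `WeightsSimsResults` chunk above.

```{r WeightsCallSketch, eval=FALSE}
# One weighted sample of size n, drawn with the same unequal
# inclusion probabilities used in the sims above
one.sample <- splinepop.df[sampling::UPtille(n / splinepop.df$samp_wt_quad) > 0, ]
mods <- "Response~splines::ns(Predictor, df=3)"

# Weights used for both training and testing (as in pAllW)
cv.svy(one.sample, mods, nfolds = 5, weightsID = "samp_wt_quad")
# No weights anywhere (as in pNoW)
cv.svy(one.sample, mods, nfolds = 5)
# Weights only when training the models (as in pModW)
cv.svy(one.sample, mods, nfolds = 5, weightsID = "samp_wt_quad",
       useSvyForLoss = FALSE)
# Weights only when estimating the test MSE (as in pMSEW)
cv.svy(one.sample, mods, nfolds = 5, weightsID = "samp_wt_quad",
       useSvyForFits = FALSE)
```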