---
title: "Plots for our `surveyCV` *Stat* paper based on SDSS presentation"
author: "Cole Guerin, Thomas McMahon, Jerzy Wieczorek"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{plots-for-Stat-paper}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

Writing cleaner code for **only** the simulations needed for [our submission to *Stat*](https://github.com/ColbyStatSvyRsch/surveyCV/blob/master/data-raw/Stat_SDSS_submission.pdf) (the special issue for SDSS 2021 talks).

Based on the earlier sims (now on GitHub in [`data-raw/plot_generation.R`](https://github.com/ColbyStatSvyRsch/surveyCV/blob/master/data-raw/plot_generation.R) and [`data-raw/early-results.html`](https://github.com/ColbyStatSvyRsch/surveyCV/blob/master/data-raw/early-results.html)) -- but instead of having similar functions repeatedly defined separately as in that file, use our functions such as `cv.svy()` directly from the package.

```{r Packages, message=FALSE}
library(survey)
library(ggplot2)
library(splines)
library(magrittr)
```
```{r LoadSurveyCV}
library(surveyCV)
rerunSims <- FALSE  # when we *do* rerun sims, remember to reinstall package & restart R
knitr::opts_chunk$set(dpi=300, out.extra ='WIDTH="95%"')
```



## Generate the artificial population

```{r ArtifPop}
# Generate a cubic-ish artificial population with 1000 units
set.seed(47)

N <- 1000
nrStrata <- 10
nrClusters <- 100
clusSizes <- N/nrClusters
  
x1 = stats::runif(1:(N/2), min = 26, max = 38)
y1 = (x1-29)^3 - 13*(x1-29)^2 + 0*(x1-29) + 900

x2 = stats::runif(1:(N/2), min = 38, max = 50)
y2 = (x2-36)^3 - 10*(x2-36)^2 + 2*(x2-36) + 600

z1 = jitter(y1, 15000)
z2 = jitter(y2, 15000)

ds1 <- data.frame(Response = z1, Predictor = x1)
ds2 <- data.frame(Response = z2, Predictor = x2)
ds12 <- rbind(ds1, ds2)

b12 <- data.frame(ID = c(1:1000))
splinepop.df <- cbind(b12, ds12)
splinepop.df <- splinepop.df %>%
  dplyr::arrange(Predictor) %>%
  # Create Cluster and Stratum variables so that
  # we can (separately) simulate both kinds of sampling
  dplyr::mutate(Stratum = dplyr::row_number(),
                Cluster = dplyr::row_number()) %>% 
  dplyr::mutate(Stratum = cut(Stratum, nrStrata, 1:nrStrata),
                Cluster = cut(Cluster, nrClusters, 1:nrClusters)) %>%
  # Create fpc variables for each sampling type:
  # full pop has 1000 units; 100 units/stratum; 100 clusters (of 10 units each);
  # & if we added a combined Strat+Clus sim, we'd need different fpc's for that
  dplyr::mutate(fpcSRS = N,
                fpcStratum = N/nrStrata,
                fpcCluster = nrClusters) %>%
  dplyr::arrange(ID)

# Create unequal sampling weights that are intentionally 
# NOT well matched to the true shape of the population,
# so we can see how the use of weights impacts
# model-training step vs test-error estimation step
# in a case where naively ignoring weights will confidently pick wrong model
lm_quad <- stats::lm(Response ~ Predictor + I(Predictor^2),
                     data = splinepop.df)
splinepop.df$samp_prob_quad <-
  (1/(abs(lm_quad$residuals))) / sum(1/(abs(lm_quad$residuals)))
splinepop.df$samp_wt_quad <- 1/splinepop.df$samp_prob_quad

stopifnot(all.equal(sum(1/splinepop.df$samp_wt_quad), 1))
```


```{r PopulationPlot, fig.width=8.3, fig.height=3}
n <- 100

srs.df <- dplyr::sample_n(splinepop.df, n)

stratcounts <- rep(n/nrStrata, nrStrata)
names(stratcounts) <- 1:nrStrata
s <- stratsample(splinepop.df$Stratum, stratcounts)
strat.df <- splinepop.df[s,]

c <- unique(splinepop.df[["Cluster"]]) 
clus.df <- splinepop.df[splinepop.df[["Cluster"]] %in% sample(c, n/clusSizes),]

a <- ggplot(splinepop.df, aes(x = Predictor, y = Response)) + 
  geom_point(shape = 1) +
  labs(title = "Artificial Pop.", x = "Predictor", y = "Response") + 
  ylim(100,1650) + xlim(25, 51)
b <- ggplot(srs.df, aes(x = Predictor, y = Response)) + 
  geom_point(shape = 8, color = "darkgreen") +
  labs(title = "SRS", x = "Predictor", y = "Response") + 
  ylim(100,1650) + xlim(25, 51)
c <- ggplot(strat.df, aes(x = Predictor, y = Response)) +
  geom_point(shape = 3, color = "darkred") +
  labs(title = "Stratified Sample", x = "Predictor", y = "Response") + 
  ylim(100,1650) + xlim(25, 51)
d <- ggplot(clus.df, aes(x = Predictor, y = Response)) +
  geom_point(shape = 4, color = "steelblue") +
  labs(title = "Cluster Sample", x = "Predictor", y = "Response") + 
  ylim(100,1650) + xlim(25, 51)
gridExtra::grid.arrange(a, b, c, d, ncol = 4,
                        top = "Simulated Data and Examples of How It Was Sampled")
```







## Sims: Use of surveyCV folds with SRS, clustered, or stratified samples

```{r FoldsSimsSetup}
# TODO: redo with different n's?
n <- 100
loops <- 500
```

```{r FoldsSimsResults, eval = rerunSims}
# Use SRS samples and make SRS folds
srssrsds <- data.frame(df = c(), MSE = c())
# Use SRS samples and evaluate on full pop
srspopds <- data.frame(df = c(), MSE = c())

# Use Cluster samples and make Cluster folds
clusclusds <- data.frame(df = c(), MSE = c())
# Use Cluster samples and make SRS folds
clussrsds <- data.frame(df = c(), MSE = c())
# Use Cluster samples and evaluate on full pop
cluspopds <- data.frame(df = c(), MSE = c())

# Use Strat samples and make Strat folds
stratstratds <- data.frame(df = c(), MSE = c())
# Use Strat samples and make SRS folds
stratsrsds <- data.frame(df = c(), MSE = c())
# Use Strat samples and evaluate on full pop
stratpopds <- data.frame(df = c(), MSE = c())

modelsToFit <- c("Response~splines::ns(Predictor, df=1)",
                 "Response~splines::ns(Predictor, df=2)",
                 "Response~splines::ns(Predictor, df=3)",
                 "Response~splines::ns(Predictor, df=4)",
                 "Response~splines::ns(Predictor, df=5)",
                 "Response~splines::ns(Predictor, df=6)")

# Making as many simple random samples as we specify for 'loops'
for (i in 1:loops) {
  # Take a SRS
  sim.srs <- dplyr::sample_n(splinepop.df, n)

  # Using our SRS function on SRS samples
  srs.CV.out <- cv.svy(sim.srs, modelsToFit,
                       nfolds = 5, fpcID = "fpcSRS") %>% as.vector()
  srssrsds2 <- data.frame(df = 1:6, MSE = srs.CV.out)
  srssrsds <- rbind(srssrsds, srssrsds2)

  # Eval SRS samples on full pop
  sub.srs <- dplyr::sample_n(sim.srs, n*.8)
  srs.des <- svydesign(ids = ~1, fpc = ~fpcSRS, data = sub.srs)
  srs.pop.out <- sapply(1:6,
    function(ii) {
        yhat <- predict(svyglm(modelsToFit[ii], srs.des),
                        newdata = splinepop.df)
        return(mean((yhat - splinepop.df$Response)^2))
      })
  srspopds2 <- data.frame(df = 1:6, MSE = srs.pop.out)
  srspopds <- rbind(srspopds, srspopds2)
}

# Making as many cluster samples as we specify for 'loops'
for (i in 1:loops) {
  # Take a Cluster sample
  c <- unique(splinepop.df[["Cluster"]])
  sim.clus <- splinepop.df[splinepop.df[["Cluster"]] %in% sample(c, n/clusSizes),]
  
  # Using our Cluster function on Cluster samples
  clus.CV.out <- cv.svy(sim.clus, modelsToFit,
                        clusterID = "Cluster", nfolds = 5, fpcID = "fpcCluster") %>% as.vector()
  clusclusds2 <- data.frame(df = 1:6, MSE = clus.CV.out)
  clusclusds <- rbind(clusclusds, clusclusds2)

  # Using our SRS function on Cluster samples
  srs.CV.out <- cv.svy(sim.clus, modelsToFit,
                       nfolds = 5, fpcID = "fpcSRS") %>% as.vector()
  clussrsds2 <- data.frame(df = 1:6, MSE = srs.CV.out)
  clussrsds <- rbind(clussrsds, clussrsds2)
  
  # Eval 80% Cluster samples on full pop
  sub.c <- unique(sim.clus[["Cluster"]])
  sub.clus <- sim.clus[sim.clus[["Cluster"]] %in% sample(sub.c, (n/clusSizes)*.8),]

  clus.des <- svydesign(ids = ~Cluster, fpc = ~fpcCluster, data = sub.clus)
  clus.pop.out <- sapply(1:6,
    function(ii) {
        yhat <- predict(svyglm(modelsToFit[ii], clus.des),
                        newdata = splinepop.df)
        return(mean((yhat - splinepop.df$Response)^2))
      })
  cluspopds2 <- data.frame(df = 1:6, MSE = clus.pop.out)
  cluspopds <- rbind(cluspopds, cluspopds2)
}

# Making as many stratified samples as we specify for 'loops'
for (i in 1:loops) {
  # Take a Strat sample
  stratcounts <- rep(n/nrStrata, nrStrata)
  names(stratcounts) <- 1:nrStrata
  s <- stratsample(splinepop.df$Stratum, stratcounts)
  sim.strat <- splinepop.df[s,]
  
  # Using our Strat function on Strat samples
  strat.CV.out <- cv.svy(sim.strat, modelsToFit,
                         strataID = "Stratum", nfolds = 5, fpcID = "fpcStratum") %>% as.vector()
  stratstratds2 <- data.frame(df = 1:6, MSE = strat.CV.out)
  stratstratds <- rbind(stratstratds, stratstratds2)
  
  # Using our SRS function on Strat samples
  srs.CV.out <- cv.svy(sim.strat, modelsToFit,
                       nfolds = 5, fpcID = "fpcSRS") %>% as.vector()
  stratsrsds2 <- data.frame(df = 1:6, MSE = srs.CV.out)
  stratsrsds <- rbind(stratsrsds, stratsrsds2)
  
  # Eval 80% Strat samples on full pop
  sub.stratcounts <- rep((n/nrStrata)*.8, nrStrata)
  names(sub.stratcounts) <- 1:nrStrata
  sub.s <- survey::stratsample(sim.strat$Stratum, sub.stratcounts)
  sub.strat <- sim.strat[sub.s,]

  strat.des <- svydesign(ids = ~1, strata = ~Stratum, fpc = ~fpcStratum, data = sub.strat)
  strat.pop.out <- sapply(1:6,
    function(ii) {
        yhat <- predict(svyglm(modelsToFit[ii], strat.des),
                        newdata = splinepop.df)
        return(mean((yhat - splinepop.df$Response)^2))
      })
  stratpopds2 <- data.frame(df = 1:6, MSE = strat.pop.out)
  stratpopds <- rbind(stratpopds, stratpopds2)
}

# Making the degrees of freedom variable a factor variable
srssrsds$df <- as.factor(srssrsds$df)
srspopds$df <- as.factor(srspopds$df)
clusclusds$df <- as.factor(clusclusds$df)
clussrsds$df <- as.factor(clussrsds$df)
cluspopds$df <- as.factor(cluspopds$df)
stratstratds$df <- as.factor(stratstratds$df)
stratsrsds$df <- as.factor(stratsrsds$df)
stratpopds$df <- as.factor(stratpopds$df)

usethis::use_data(srssrsds, srspopds,
                  clusclusds, clussrsds, cluspopds,
                  stratstratds, stratsrsds, stratpopds,
                  internal = FALSE, overwrite = TRUE)
# TODO: for now setting internal=FALSE
# so that we don't overwrite the sysdata.Rda for intro.Rmd;
# but eventually when we're ready to replace that old vignette,
# we can return to internal=TRUE here
# if that's indeed better for R packages
# ...
# OK, now we've written those dataframes,
#   AND the 4 below, all into R/sysdata.Rda together
```


```{r FoldsSimsPlots, fig.width=7, fig.height=4.7}
if(!rerunSims) {
  srssrsds <- surveyCV:::srssrsds
  clusclusds <- surveyCV:::clusclusds
  clussrsds <- surveyCV:::clussrsds
  stratstratds <- surveyCV:::stratstratds
  stratsrsds <- surveyCV:::stratsrsds
  srspopds <- surveyCV:::srspopds
  cluspopds <- surveyCV:::cluspopds
  stratpopds <- surveyCV:::stratpopds
}

# Find the y-ranges of MSEs
yminMSE <- min(min(srssrsds$MSE),
               min(clusclusds$MSE), min(clussrsds$MSE),
               min(stratstratds$MSE), min(stratsrsds$MSE))
ymaxMSE <- max(max(srssrsds$MSE),
               max(clusclusds$MSE), max(clussrsds$MSE),
               max(stratstratds$MSE), max(stratsrsds$MSE))

srspopds <- dplyr::group_by(srspopds, df) %>%
  dplyr::summarise(MSE = mean(MSE)) %>% 
  dplyr::mutate(df = as.numeric(df))
cluspopds <- dplyr::group_by(cluspopds, df) %>%
  dplyr::summarise(MSE = mean(MSE)) %>% 
  dplyr::mutate(df = as.numeric(df))
stratpopds <- dplyr::group_by(stratpopds, df) %>%
  dplyr::summarise(MSE = mean(MSE)) %>% 
  dplyr::mutate(df = as.numeric(df))


# Making ggplot objects for the MSEs and AICs
plot.srssrs <- ggplot(data = srssrsds, mapping = aes(x = df, y = MSE/1e4)) +
  geom_boxplot() +
  geom_line(data = srspopds, mapping = aes(x = df, y = MSE/1e4)) +
  ggtitle("CV: SRS folds,\nSRS sample") +
  scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
plot.blank <- rectGrob(width = 0, height = 0) # empty spot here -- no corresponding sim
plot.clusclus <- ggplot(data = clusclusds, mapping = aes(x = df, y = MSE/1e4)) +
  geom_boxplot() +
  geom_line(data = cluspopds) +
  ggtitle("CV: Cluster folds,\nCluster sample") +
  scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
plot.clussrs <- ggplot(data = clussrsds, mapping = aes(x = df, y = MSE/1e4)) +
  geom_boxplot() +
  geom_line(data = cluspopds) +
  ggtitle("CV: SRS folds,\nCluster sample") +
  scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
plot.stratstrat <- ggplot(data = stratstratds, mapping = aes(x = df, y = MSE/1e4)) +
  geom_boxplot() +
  geom_line(data = stratpopds) +
  ggtitle("CV: Strat folds,\nStrat sample") +
  scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
plot.stratsrs <- ggplot(data = stratsrsds, mapping = aes(x = df, y = MSE/1e4)) +
  geom_boxplot() +
  geom_line(data = stratpopds) +
  ggtitle("CV: SRS folds,\nStrat sample") +
  scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)

gridExtra::grid.arrange(plot.srssrs, plot.clussrs, plot.stratsrs, 
                        plot.blank, plot.clusclus, plot.stratstrat,
             ncol = 3,
             top = paste0("Simulated Data (Sample Sizes = ", n,
                          ", Clusters = ", n/clusSizes,
                          " or Strata = ", nrStrata,
                          ", Loops = ", loops, ", Folds = 5)"))
```





## Sims: Use of sampling weights (in training model-fits vs in test loss-estimates)



```{r QuadWeightsPlot, fig.width=7, fig.height=3.5}
ggplot(splinepop.df,
  aes(x = Predictor, y = Response)) +
  geom_point(aes(size = samp_prob_quad), alpha = 0.1) +
  stat_smooth(method = "lm", formula = y ~ x + I(x^2), se = FALSE) +
  labs(title = "Artificial Population with Sampling Probabilities",
       subtitle = "Sampling prob. is higher for points nearer the solid curve",
       x = "Predictor", y = "Response",
       size = "Sampling Probability")
```



```{r WeightsSimsSetup}
# TODO: redo with different n's?
n <- 100
loops <- 500
weights <- "samp_wt_quad"
```

```{r WeightsSimsResults, eval = rerunSims}
# In all the following cases,
# we are only checking the effect of using sampling **weights**
# during model-fitting on training sets, and testing on test sets;
# we did not use cluster or stratified sampling,
# so all CV folds will (sensibly) be SRS (regardless of samp weights)

# Use weighted models, and calculate MSEs using weighted design
AllW <- data.frame(df = c(), MSE = c())
# Use SRS models, and calculate MSEs using SRS design
NoW <- data.frame(df = c(), MSE = c())
# Use weighted models, and calculate MSEs using SRS design
ModW <- data.frame(df = c(), MSE = c())
# Use SRS models, and calculate MSEs using weighted design
MSEW <- data.frame(df = c(), MSE = c())

for (i in 1:loops) {
  # Take a sample of size n, using the sampling probabilities instead of SRS
  # (using 1/samp_wt as the samp_prob per unit,
  #  or n/samp_wt as overall samp_prob to get a sample of size n)
  in.sample <- sampling::UPtille(n / splinepop.df[[weights]])
  splinesamp.df <- splinepop.df[in.sample > 0, ]
  modelsToFit <- c("Response~ns(Predictor, df=1)", "Response~ns(Predictor, df=2)",
                   "Response~ns(Predictor, df=3)", "Response~ns(Predictor, df=4)",
                   "Response~ns(Predictor, df=5)", "Response~ns(Predictor, df=6)")

  AllWdat <- cv.svy(splinesamp.df, modelsToFit,
                    nfolds = 5, weightsID = weights) %>% as.vector()
  NoWdat <- cv.svy(splinesamp.df, modelsToFit,
                   nfolds = 5) %>% as.vector()
  ModWdat <- cv.svy(splinesamp.df, modelsToFit,
                    nfolds = 5, weightsID = weights,
                    useSvyForLoss = FALSE) %>% as.vector()
  MSEWdat <- cv.svy(splinesamp.df, modelsToFit,
                    nfolds = 5, weightsID = weights,
                    useSvyForFits = FALSE) %>% as.vector()

  # compiling one data frame
  AllW2 <- data.frame(df = 1:6, MSE = AllWdat, sample = rep(i, 6))
  AllW <- rbind(AllW, AllW2)
  NoW2 <- data.frame(df = 1:6, MSE = NoWdat, sample = rep(i, 6))
  NoW <- rbind(NoW, NoW2)
  ModW2 <- data.frame(df = 1:6, MSE = ModWdat, sample = rep(i, 6))
  ModW <- rbind(ModW, ModW2)
  MSEW2 <- data.frame(df = 1:6, MSE = MSEWdat, sample = rep(i, 6))
  MSEW <- rbind(MSEW, MSEW2)
}

# Making the degrees of freedom variable a factor variable
AllW$df <- as.factor(AllW$df)
NoW$df <- as.factor(NoW$df)
MSEW$df <- as.factor(MSEW$df)
ModW$df <- as.factor(ModW$df)


usethis::use_data(AllW, NoW,
                  MSEW, ModW,
                  internal = FALSE, overwrite = TRUE)
# TODO: as above, for now setting internal=FALSE
# so that we don't overwrite the sysdata.Rda for intro.Rmd;
# but eventually when we're ready to replace that old vignette,
# we can return to internal=TRUE here
# if that's indeed better for R packages
# ...
# OK, now we've written those dataframes,
#   AND the 8 above, all into R/sysdata.Rda together
```

```{r WeightsSimsResultsAndPlots, fig.width=7, fig.height=7}
if(!rerunSims) {
  AllW <- surveyCV:::AllW
  NoW <- surveyCV:::NoW
  MSEW <- surveyCV:::MSEW
  ModW <- surveyCV:::ModW
}

# Find the y-range of MSEs
ymin <- min(min(AllW$MSE), min(NoW$MSE), min(MSEW$MSE), min(ModW$MSE))
ymax <- max(max(AllW$MSE), max(NoW$MSE), max(MSEW$MSE), max(ModW$MSE))
# Making a ggplot object for the MSEs collected when using
# SRS folds, SRS models, and SRS error calculations
pAllW <- ggplot(data = AllW, mapping = aes(x = df, y = MSE/1e5)) +
  ggtitle("Weights for both training and testing") +
  scale_y_log10(limits = c(ymin, ymax)/1e5) + geom_boxplot()
# Making a ggplot object for the MSEs collected when using
# SRS folds, Clus models, and SRS error calculations
pNoW <- ggplot(data = NoW, mapping = aes(x = df, y = MSE/1e5)) +
  ggtitle("No weights") +
  scale_y_log10(limits = c(ymin, ymax)/1e5) + geom_boxplot()
# Making a ggplot object for the MSEs collected when using
# Clus folds, Clus models, and Clus error calculations
pModW <- ggplot(data = ModW, mapping = aes(x = df, y = MSE/1e5)) +
  ggtitle("Weights when training models") +
  scale_y_log10(limits = c(ymin, ymax)/1e5) + geom_boxplot()
# Making a ggplot object for the MSEs collected when using
# Clus folds, SRS models, and Clus error calculations
pMSEW <- ggplot(data = MSEW, mapping = aes(x = df, y = MSE/1e5)) +
  ggtitle("Weights when estimating test MSE") +
  scale_y_log10(limits = c(ymin, ymax)/1e5) + geom_boxplot()

# Making a grid display of the four plot objects above
lay <- matrix(c(NA, 1, 1, NA,
                NA, 1, 1, NA,
                2, 2, 3, 3,
                2, 2, 3, 3,
                NA, 4, 4, NA,
                NA, 4, 4, NA), byrow = TRUE, ncol = 4)
gridExtra::grid.arrange(pAllW, pModW, pMSEW, pNoW, ncol = 2,
             top = paste0("Simulated Data (Sample Sizes = ", n,
                          ", Loops = ", loops, ", 5 Folds, Samp. Wts from Quad. Fit)"),
             layout_matrix = lay)
```