## ---- SETTINGS-knitr, include=FALSE------------------------------------------- stopifnot(require(knitr)) opts_chunk$set( comment=NA, message = FALSE, warning = FALSE, eval = identical(Sys.getenv("NOT_CRAN"), "true"), dev = "png", dpi = 150, fig.asp = 0.618, fig.width = 5, out.width = "60%", fig.align = "center" ) ## ----packages-1, message=FALSE------------------------------------------------ library(rstanarm) library(ggplot2) library(bayesplot) theme_set(bayesplot::theme_default()) # options(mc.cores = 4) ## ----packages-2, eval=FALSE, message=FALSE------------------------------------ # library(dplyr) # library(tidyr) ## ---- include=FALSE, collapse=TRUE-------------------------------------------- simulate_mrp_data <- function(n) { J <- c(2, 3, 7, 3, 50) # male or not, eth, age, income level, state poststrat <- as.data.frame(array(NA, c(prod(J), length(J)+1))) # Columns of post-strat matrix, plus one for size colnames(poststrat) <- c("male", "eth", "age","income", "state",'N') count <- 0 for (i1 in 1:J[1]){ for (i2 in 1:J[2]){ for (i3 in 1:J[3]){ for (i4 in 1:J[4]){ for (i5 in 1:J[5]){ count <- count + 1 # Fill them in so we know what category we are referring to poststrat[count, 1:5] <- c(i1-1, i2, i3,i4,i5) } } } } } # Proportion in each sample in the population p_male <- c(0.52, 0.48) p_eth <- c(0.5, 0.2, 0.3) p_age <- c(0.2,.1,0.2,0.2, 0.10, 0.1, 0.1) p_income<-c(.50,.35,.15) p_state_tmp<-runif(50,10,20) p_state<-p_state_tmp/sum(p_state_tmp) poststrat$N<-0 for (j in 1:prod(J)){ poststrat$N[j] <- round(250e6 * p_male[poststrat[j,1]+1] * p_eth[poststrat[j,2]] * p_age[poststrat[j,3]]*p_income[poststrat[j,4]]*p_state[poststrat[j,5]]) #Adjust the N to be the number observed in each category in each group } # Now let's adjust for the probability of response p_response_baseline <- 0.01 p_response_male <- c(2, 0.8) / 2.8 p_response_eth <- c(1, 1.2, 2.5) / 4.7 p_response_age <- c(1, 0.4, 1, 1.5, 3, 5, 7) / 18.9 p_response_inc <- c(1, 0.9, 0.8) / 2.7 p_response_state <- rbeta(50, 1, 1) p_response_state <- p_response_state / sum(p_response_state) p_response <- rep(NA, prod(J)) for (j in 1:prod(J)) { p_response[j] <- p_response_baseline * p_response_male[poststrat[j, 1] + 1] * p_response_eth[poststrat[j, 2]] * p_response_age[poststrat[j, 3]] * p_response_inc[poststrat[j, 4]] * p_response_state[poststrat[j, 5]] } people <- sample(prod(J), n, replace = TRUE, prob = poststrat$N * p_response) ## For respondent i, people[i] is that person's poststrat cell, ## some number between 1 and 32 n_cell <- rep(NA, prod(J)) for (j in 1:prod(J)) { n_cell[j] <- sum(people == j) } coef_male <- c(0,-0.3) coef_eth <- c(0, 0.6, 0.9) coef_age <- c(0,-0.2,-0.3, 0.4, 0.5, 0.7, 0.8, 0.9) coef_income <- c(0,-0.2, 0.6) coef_state <- c(0, round(rnorm(49, 0, 1), 1)) coef_age_male <- t(cbind(c(0, .1, .23, .3, .43, .5, .6), c(0, -.1, -.23, -.5, -.43, -.5, -.6))) true_popn <- data.frame(poststrat[, 1:5], cat_pref = rep(NA, prod(J))) for (j in 1:prod(J)) { true_popn$cat_pref[j] <- plogis( coef_male[poststrat[j, 1] + 1] + coef_eth[poststrat[j, 2]] + coef_age[poststrat[j, 3]] + coef_income[poststrat[j, 4]] + coef_state[poststrat[j, 5]] + coef_age_male[poststrat[j, 1] + 1, poststrat[j, 3]] ) } #male or not, eth, age, income level, state, city y <- rbinom(n, 1, true_popn$cat_pref[people]) male <- poststrat[people, 1] eth <- poststrat[people, 2] age <- poststrat[people, 3] income <- poststrat[people, 4] state <- poststrat[people, 5] sample <- data.frame(cat_pref = y, male, age, eth, income, state, id = 1:length(people)) #Make all numeric: for (i in 1:ncol(poststrat)) { poststrat[, i] <- as.numeric(poststrat[, i]) } for (i in 1:ncol(true_popn)) { true_popn[, i] <- as.numeric(true_popn[, i]) } for (i in 1:ncol(sample)) { sample[, i] <- as.numeric(sample[, i]) } list( sample = sample, poststrat = poststrat, true_popn = true_popn ) } ## ----include=FALSE, eval=FALSE------------------------------------------------ # mrp_sim <- simulate_mrp_data(n=1200) # save(mrp_sim, file = "mrp-files/mrp_sim.rda", version = 2) ## ----eval=FALSE--------------------------------------------------------------- # mrp_sim <- simulate_mrp_data(n=1200) # str(mrp_sim) ## ---- echo=FALSE-------------------------------------------------------------- load("mrp-files/mrp_sim.rda") str(mrp_sim) ## ---- message=FALSE----------------------------------------------------------- sample <- mrp_sim[["sample"]] rbind(head(sample), tail(sample)) ## ----message=FALSE------------------------------------------------------------ poststrat <- mrp_sim[["poststrat"]] rbind(head(poststrat), tail(poststrat)) ## ----message=FALSE------------------------------------------------------------ true_popn <- mrp_sim[["true_popn"]] rbind(head(true_popn), tail(true_popn)) ## ----order-states------------------------------------------------------------- sample$state <- factor(sample$state, levels=1:50) sample$state <- with(sample, factor(state, levels=order(table(state)))) true_popn$state <- factor(true_popn$state,levels = levels(sample$state)) poststrat$state <- factor(poststrat$state,levels = levels(sample$state)) ## ----state-and-pop-data-for-plots, eval=FALSE, include=FALSE------------------ # # not evaluated to avoid tidyverse dependency # income_popn <- poststrat %>% # group_by(income) %>% # summarize(Num=sum(N)) %>% # mutate(PROP=Num/sum(Num),TYPE='Popn',VAR='Income',CAT=income) %>% # ungroup() # income_data <- sample %>% # group_by(income) %>% # summarise(Num=n()) %>% # mutate(PROP=Num/sum(Num),TYPE='Sample',VAR='Income',CAT=income) %>% # ungroup() # income<-rbind(income_data[,2:6],income_popn[,2:6]) # # age_popn <- poststrat%>% # group_by(age)%>% # summarize(Num=sum(N))%>% # mutate(PROP=Num/sum(Num),TYPE='Popn',VAR='Age',CAT=age)%>% # ungroup() # age_data <- sample%>% # group_by(age)%>% # summarise(Num=n())%>% # mutate(PROP=Num/sum(Num),TYPE='Sample',VAR='Age',CAT=age)%>% # ungroup() # age <- rbind(age_data[,2:6],age_popn[,2:6] ) # # eth_popn <- poststrat%>% # group_by(eth)%>% # summarize(Num=sum(N))%>% # mutate(PROP=Num/sum(Num),TYPE='Popn',VAR='Ethnicity',CAT=eth)%>% # ungroup() # eth_data <- sample%>% # group_by(eth)%>% # summarise(Num=n())%>% # mutate(PROP=Num/sum(Num),TYPE='Sample',VAR='Ethnicity',CAT=eth)%>% # ungroup() # eth<-rbind(eth_data[,2:6],eth_popn[,2:6]) # # male_popn <- poststrat%>% # group_by(male)%>% # summarize(Num=sum(N))%>% # mutate(PROP=Num/sum(Num),TYPE='Popn',VAR='Male',CAT=male)%>% # ungroup() # male_data <- sample%>% # group_by(male)%>% # summarise(Num=n())%>% # mutate(PROP=Num/sum(Num),TYPE='Sample',VAR='Male',CAT=male)%>% # ungroup() # male <- rbind(male_data[,2:6],male_popn[,2:6]) # # state_popn <- poststrat%>% # group_by(state)%>% # summarize(Num=sum(N))%>% # mutate(PROP=Num/sum(poststrat$N),TYPE='Popn',VAR='State',CAT=state)%>% # ungroup() # # state_plot_data <- sample%>% # group_by(state)%>% # summarise(Num=n())%>% # mutate(PROP=Num/nrow(sample),TYPE='Sample',VAR='State',CAT=state)%>% # ungroup() # # state_plot_data <- rbind(state_plot_data[,2:6],state_popn[,2:6]) # state_plot_data$TYPE <- factor(state_plot_data$TYPE, levels = c("Sample","Popn")) # # plot_data <- rbind(male,eth,age,income) # plot_data$TYPE <- factor(plot_data$TYPE, levels = c("Sample","Popn")) # # save(state_plot_data, file = "mrp-files/state_plot_data.rda", version = 2) # save(plot_data, file = "mrp-files/plot_data.rda", version = 2) ## ----plot-data, echo=FALSE, fig.height = 4, fig.width = 7, fig.align = "center"---- load("mrp-files/plot_data.rda") # created in previous chunk ggplot(data=plot_data, aes(x=as.factor(CAT), y=PROP, group=as.factor(TYPE), linetype=as.factor(TYPE))) + geom_point(stat="identity",colour='black')+ geom_line()+ facet_wrap( ~ VAR, scales = "free",nrow=1,ncol=5)+ theme_bw()+ scale_fill_manual(values=c('#1f78b4','#33a02c', '#e31a1c','#ff7f00','#8856a7'),guide=FALSE)+ scale_y_continuous(breaks=c(0,.25,.5,.75,1), labels=c('0%','25%',"50%","75%","100%"))+ scale_alpha_manual(values=c(1, .3))+ ylab('Proportion')+ labs(alpha='')+ theme(legend.position="bottom", axis.title.y=element_blank(), axis.title.x=element_blank(), legend.title=element_blank(), legend.text=element_text(size=10), axis.text=element_text(size=10), strip.text=element_text(size=10), strip.background = element_rect(fill='grey92')) load("mrp-files/state_plot_data.rda") # created in previous chunk ggplot(data=state_plot_data, aes(x=as.factor(CAT), y=PROP, group=as.factor(TYPE), linetype=as.factor(TYPE))) + geom_point(stat="identity",colour='black')+ geom_line()+ facet_wrap( ~ VAR)+ theme_bw()+ scale_fill_manual(values=c('#1f78b4','#33a02c', '#e31a1c','#ff7f00','#8856a7'),guide=FALSE)+ scale_y_continuous(breaks=c(0,.025,.05,1), labels=c('0%','2.5%',"5%","100%"),expand=c(0,0),limits=c(0,.06))+ scale_alpha_manual(values=c(1, .3))+ ylab('Proportion')+ labs(alpha='')+ theme(legend.position="bottom", axis.title.y=element_blank(), axis.title.x=element_blank(), legend.title=element_blank(), legend.text=element_text(size=10), axis.text.y=element_text(size=10), axis.text.x=element_text(size=8,angle=90), strip.text=element_text(size=10), strip.background = element_rect(fill='grey92')) ## ---- eval=FALSE, echo=FALSE-------------------------------------------------- # # not evaluated to avoid dependency on tidyverse # # #Summarise # summary_by_poststrat_var <- sample %>% # gather(variable,category,c("income","eth","age","male")) %>% # group_by(variable,category) %>% # #Wald confidence interval # summarise(y_mean=mean(cat_pref),y_sd=sqrt(mean(cat_pref)*(1-mean(cat_pref))/n())) %>% # ungroup() # summary_by_poststrat_var$variable <- as.factor(summary_by_poststrat_var$variable) # levels(summary_by_poststrat_var$variable) <- list('Age'='age','Ethnicity'='eth','Income'='income','Male'='male') # # save(summary_by_poststrat_var, file = "mrp-files/summary_by_poststrat_var.rda", # version = 2) ## ----plot-summary-by-poststrat-var, echo=FALSE, fig.height = 4, fig.width = 7, fig.align = "center"---- load("mrp-files/summary_by_poststrat_var.rda") # created in previous chunk ggplot(data=summary_by_poststrat_var, aes(x=as.factor(category), y=y_mean,group=1)) + geom_errorbar(aes(ymin=y_mean-y_sd, ymax=y_mean+y_sd), width=0)+ geom_line()+ geom_point()+ scale_colour_manual(values=c('#1f78b4','#33a02c','#e31a1c','#ff7f00', '#8856a7'))+theme_bw()+ facet_wrap(~variable,scales = "free_x",nrow=1,ncol=5)+ scale_y_continuous(breaks=c(.5,.75,1), labels=c("50%","75%", "100%"), limits=c(0.4-.4*.05,.9),expand = c(0,0))+ labs(x="",y="Cat preference")+ theme(legend.position="none", axis.title.y=element_text(size=10), axis.title.x=element_blank(), axis.text=element_text(size=10), strip.text=element_text(size=10), strip.background = element_rect(fill='grey92')) ## ----interaction-summary, eval=FALSE, echo=FALSE------------------------------ # # not evaluated to avoid dependency on tidyverse # # #Summarise # interaction <- sample %>% # gather(variable, category, c("age", "eth")) %>% # group_by(variable, category, male) %>% # summarise(y_mean = mean(cat_pref), # y_sd = sqrt(mean(cat_pref) * (1 - mean(cat_pref)) / n())) %>% # ungroup() # # #Tidy for nice facet labels # interaction$variable <- as.factor(interaction$variable) # levels(interaction$variable) <- list('Ethnicity' = 'eth', 'Age' = 'age') # save(interaction, file = "mrp-files/interaction.rda", version = 2) ## ----plot-interaction, echo=FALSE, fig.height = 4, fig.width = 7, fig.align = "center"---- load("mrp-files/interaction.rda") # created in previous chunk ggplot(data=interaction, aes(x=as.factor(category), y=y_mean, colour=as.factor(male),group=as.factor(male))) + geom_errorbar(aes(ymin=y_mean-y_sd, ymax=y_mean+y_sd),width=0 )+ geom_line(aes(x=as.factor(category), y=y_mean,colour=as.factor(male)))+ geom_point()+ facet_wrap(~variable,scales = "free_x",nrow=1,ncol=2)+ labs(x="",y="Cat preference",colour='Gender')+ scale_y_continuous(breaks=c(0,.25,.5,.75,1), labels=c("0%",'25%',"50%","75%", "100%"), limits=c(0,1),expand=c(0,0))+ scale_colour_manual(values=c('#4575b4','#d73027'))+theme_bw()+ theme(axis.title=element_text(size=10), axis.text=element_text(size=10), legend.position='none', strip.text=element_text(size=10), strip.background = element_rect(fill='grey92')) ## ---- eval=FALSE, echo=FALSE-------------------------------------------------- # # not evaluated to avoid dependency on tidyverse # # #Summarise by state # preference_by_state <- sample %>% # group_by(state) %>% # summarise(y_mean = mean(cat_pref), # y_sd = sqrt(mean(cat_pref) * (1 - mean(cat_pref)) / n())) %>% # ungroup() # # save(preference_by_state, file = "mrp-files/preference_by_state.rda", version = 2) ## ---- echo=FALSE, fig.height = 4, fig.width = 8, fig.align = "center"--------- load("mrp-files/preference_by_state.rda") compare <- ggplot(data=preference_by_state, aes(x=state, y=y_mean,group=1)) + geom_ribbon(aes(ymin=y_mean-y_sd,ymax=y_mean+y_sd,x=state),fill='lightgrey',alpha=.7)+ geom_line(aes(x=state, y=y_mean))+ geom_point()+ scale_y_continuous(breaks=c(0,.25,.5,.75,1), labels=c("0%","25%","50%","75%","100%"), limits=c(0,1), expand=c(0,0))+ scale_x_discrete(drop=FALSE)+ scale_colour_manual(values=c('#1f78b4','#33a02c','#e31a1c','#ff7f00', '#8856a7'))+ theme_bw()+ labs(x="States",y="Cat preference")+ theme(legend.position="none", axis.title=element_text(size=10), axis.text.y=element_text(size=10), axis.text.x=element_text(angle=90,size=8), legend.title=element_text(size=10), legend.text=element_text(size=10)) compare2 <- ggplot()+ geom_hline(yintercept = mean(sample$cat_pref),size=.8)+ geom_text(aes(x = 5.2, y = mean(sample$cat_pref)+.025, label = "Sample"))+ scale_y_continuous(breaks=c(0,.25,.5,.75,1), labels=c("0%","25%","50%","75%","100%"), limits=c(-0.25,1.25),expand=c(0,0))+ theme_bw()+ labs(x="Popn",y="")+ theme(legend.position="none", axis.title.y=element_blank(), axis.title.x=element_text(size=10), axis.text=element_blank(), axis.ticks=element_blank(), legend.title=element_text(size=10), legend.text=element_text(size=10)) bayesplot_grid(compare,compare2, grid_args = list(nrow=1, widths = c(8,1))) ## ---- message=FALSE, warning=FALSE, results='hide'---------------------------- fit <- stan_glmer( cat_pref ~ factor(male) + factor(male) * factor(age) + (1 | state) + (1 | age) + (1 | eth) + (1 | income), family = binomial(link = "logit"), data = sample ) ## ----------------------------------------------------------------------------- print(fit) ## ---- message=FALSE----------------------------------------------------------- posterior_prob <- posterior_linpred(fit, transform = TRUE, newdata = poststrat) poststrat_prob <- posterior_prob %*% poststrat$N / sum(poststrat$N) model_popn_pref <- c(mean = mean(poststrat_prob), sd = sd(poststrat_prob)) round(model_popn_pref, 3) ## ---- message=FALSE----------------------------------------------------------- sample_popn_pref <- mean(sample$cat_pref) round(sample_popn_pref, 3) ## ---- message=FALSE,fig.height = 4, fig.width = 8, fig.align = "center"------- compare2 <- compare2 + geom_hline(yintercept = model_popn_pref[1], colour = '#2ca25f', size = 1) + geom_text(aes(x = 5.2, y = model_popn_pref[1] + .025), label = "MRP", colour = '#2ca25f') bayesplot_grid(compare, compare2, grid_args = list(nrow = 1, widths = c(8, 1))) ## ---- message=FALSE----------------------------------------------------------- true_popn_pref <- sum(true_popn$cat_pref * poststrat$N) / sum(poststrat$N) round(true_popn_pref, 3) ## ---- echo=FALSE, message=FALSE,fig.height = 4, fig.width = 8, fig.align = "center"---- compare2 <- compare2 + geom_hline(yintercept = mean(true_popn_pref), linetype = 'dashed', size = .8) + geom_text(aes(x = 5.2, y = mean(true_popn_pref) - .025), label = "True") bayesplot_grid(compare, compare2, grid_args = list(nrow = 1, widths = c(8, 1))) ## ---- message=FALSE----------------------------------------------------------- state_df <- data.frame( State = 1:50, model_state_sd = rep(-1, 50), model_state_pref = rep(-1, 50), sample_state_pref = rep(-1, 50), true_state_pref = rep(-1, 50), N = rep(-1, 50) ) for(i in 1:length(levels(as.factor(poststrat$state)))) { poststrat_state <- poststrat[poststrat$state == i, ] posterior_prob_state <- posterior_linpred( fit, transform = TRUE, draws = 1000, newdata = as.data.frame(poststrat_state) ) poststrat_prob_state <- (posterior_prob_state %*% poststrat_state$N) / sum(poststrat_state$N) #This is the estimate for popn in state: state_df$model_state_pref[i] <- round(mean(poststrat_prob_state), 4) state_df$model_state_sd[i] <- round(sd(poststrat_prob_state), 4) #This is the estimate for sample state_df$sample_state_pref[i] <- round(mean(sample$cat_pref[sample$state == i]), 4) #And what is the actual popn? state_df$true_state_pref[i] <- round(sum(true_popn$cat_pref[true_popn$state == i] * poststrat_state$N) / sum(poststrat_state$N), digits = 4) state_df$N[i] <- length(sample$cat_pref[sample$state == i]) } state_df[c(1,3:6)] state_df$State <- factor(state_df$State, levels = levels(sample$state)) ## ----------------------------------------------------------------------------- round(100 * c( mean = mean(abs(state_df$sample_state_pref-state_df$true_state_pref), na.rm = TRUE), max = max(abs(state_df$sample_state_pref-state_df$true_state_pref), na.rm = TRUE) )) ## ----------------------------------------------------------------------------- round(100 * c( mean = mean(abs(state_df$model_state_pref-state_df$true_state_pref)), max = max(abs(state_df$model_state_pref-state_df$true_state_pref)) )) ## ---- message=FALSE, echo=FALSE, fig.height = 4, fig.width = 8, fig.align = "center",warning=FALSE, fig.align = "center"---- #Summarise by state compare <- compare + geom_point(data=state_df, mapping=aes(x=State, y=model_state_pref), inherit.aes=TRUE,colour='#238b45')+ geom_line(data=state_df, mapping=aes(x=State, y=model_state_pref,group=1), inherit.aes=TRUE,colour='#238b45')+ geom_ribbon(data=state_df,mapping=aes(x=State,ymin=model_state_pref-model_state_sd, ymax=model_state_pref+model_state_sd,group=1), inherit.aes=FALSE,fill='#2ca25f',alpha=.3)+ geom_point(data=state_df, mapping=aes(x=State, y=true_state_pref), alpha=.5,inherit.aes=TRUE)+ geom_line(data=state_df, mapping=aes(x=State, y=true_state_pref), inherit.aes = TRUE,linetype='dashed') bayesplot_grid(compare, compare2, grid_args = list(nrow = 1, widths = c(8, 1))) ## ---- eval=FALSE-------------------------------------------------------------- # # not evaluated to avoid dependency on tidyverse # sample_alt <- sample %>% # group_by(male, age, income, state, eth) %>% # summarise(N_cat_pref = sum(cat_pref), N = n()) %>% # ungroup() ## ---- include=FALSE----------------------------------------------------------- load("mrp-files/sample_alt.rda") ## ---- message=FALSE, warning=FALSE, results='hide'---------------------------- fit2 <- stan_glmer( cbind(N_cat_pref, N - N_cat_pref) ~ factor(male) + factor(male) * factor(age) + (1 | state) + (1 | age) + (1 | eth) + (1 | income), family = binomial("logit"), data = sample_alt, refresh = 0 ) ## ----------------------------------------------------------------------------- print(fit2) ## ---- message=FALSE----------------------------------------------------------- posterior_prob_alt <- posterior_linpred(fit2, transform = TRUE, newdata = poststrat) poststrat_prob_alt <- posterior_prob_alt %*% poststrat$N / sum(poststrat$N) model_popn_pref_alt <- c(mean = mean(poststrat_prob_alt), sd = sd(poststrat_prob_alt)) round(model_popn_pref_alt, 3) ## ----------------------------------------------------------------------------- print(simulate_mrp_data)