## ---- echo = FALSE---------------------------------------- options(width = 60) knitr::opts_chunk$set( collapse = TRUE, comment = "#>" # tidy.opts = list(width.cutoff = 60, # args.newline = TRUE), # tidy = T ) ## ----installation, eval = F------------------------------- # install.packages("riskCommunicator") ## ----setup------------------------------------------------ library(riskCommunicator) library(tidyverse) library(printr) library(formatR) library(sandwich) library(stringr) library(ggpubr) ## ---- printr.help.sections = c('usage','arguments')------- ?gComp ## --------------------------------------------------------- data(cvdd) ## ----binary_outcome, paged.print = FALSE, cache=T--------- ## Specify the regression formula cvdd.formula <- cvd_dth ~ DIABETES + AGE + SEX + BMI + CURSMOKE + PREVHYP ## For reproducibility, we should always set the seed since the g-computation uses ## random resampling of the data to calculate confidence intervals and random ## sampling of the distribution when predicting outcomes. set.seed(1298) ## Call the gComp function binary.res <- gComp(data = cvdd, formula = cvdd.formula, outcome.type = "binary", R = 200) binary.res ## ----gComp_class_explaination----------------------------- class(binary.res) ## The names of the different items in the list: names(binary.res) ## Sample size of the original data: binary.res$n ## Contrast being compared in the analysis: binary.res$contrast ## --------------------------------------------------------- summary(binary.res) ## ----binary_outcome_plot, fig.width = 12, fig.height = 9, out.width = "100%"---- plot(binary.res) ## ----std-regression-odds-ratio, echo = F------------------ std.reg.or = glm(formula = cvd_dth ~ DIABETES + AGE + SEX + BMI + CURSMOKE + PREVHYP, data = cvdd, family=binomial(link='logit')) std.reg.or df.std.reg.or = data.frame(exp(cbind(OR = coef(std.reg.or), confint.default(std.reg.or, level = 0.95)))) %>% # and we pull out only the estimate we are concerned with for the predictor DIABETES filter(rownames(.) == "DIABETES1") %>% rename(LL = `X2.5..`, UL = `X97.5..`) ## ----std-regression-risk-ratio, echo = F------------------ std.reg.rr = glm(formula = cvd_dth ~ DIABETES + AGE + SEX + BMI + CURSMOKE + PREVHYP, ## to use Poisson regression, we need to change DIABETES from a factor to a numeric (0,1) variable data = cvdd %>% mutate(cvd_dth = ifelse(cvd_dth == "0", 0, ifelse(cvd_dth == "1", 1, NA))), family="poisson") std.reg.rr ## calculate robust Standard errors cov.std.reg.rr = sandwich::vcovHC(std.reg.rr) std.err <- sqrt(diag(cov.std.reg.rr)) df.std.reg.rr = data.frame(Estimate = exp(coef(std.reg.rr)[2]), Robust_SE = exp(std.err[2]), LL = exp(coef(std.reg.rr)[2] - 1.96 * std.err[2]), UL = exp(coef(std.reg.rr)[2] + 1.96 * std.err[2])) ## ----std-regression-risk-difference, eval = F------------- # std.reg.rd = glm(formula = cvd_dth ~ DIABETES + AGE + SEX + BMI + CURSMOKE + PREVHYP, # data = cvdd %>% # ## To use linear regression, we need to change DIABETES from a # ## factor to a numeric (0,1) variable # mutate(cvd_dth = ifelse(cvd_dth == "0", 0, # ifelse(cvd_dth == "1", 1, NA))), # family=gaussian(link = 'log')) ## ----table 2---------------------------------------------- # combine standard regression results std.regression.res = df.std.reg.or %>% mutate(Parameter = "Odds Ratio", Std_regression = paste0(round(OR, 2), " (", round(LL, 2), ", ", round(UL, 2), ")")) %>% bind_rows(df.std.reg.rr %>% mutate(Parameter = "Risk Ratio", Std_regression = paste0(round(Estimate, 2), " (", round(LL, 2), ", ", round(UL, 2), ")"))) %>% select(Parameter, Std_regression) %>% rename(`Standard regression` = Std_regression) rownames(std.regression.res) = NULL (table2 = binary.res$results.df %>% mutate(riskCommunicator = paste0(format(round(Estimate, 2), 2), " (", format(round(`2.5% CL`, 2), 2), ", ", format(round(`97.5% CL`, 2), 2), ")"), riskCommunicator = ifelse(Parameter == "Number needed to treat/harm", round(Estimate, 2), riskCommunicator)) %>% select(Parameter, riskCommunicator) %>% left_join(std.regression.res, by = "Parameter")) ## ----rate_outcome, cache = T------------------------------ ## Modify the dataset to change the variable cvd_dth from a factor ## to a numeric variable since the outcome for Poisson ## regression must be numeric. cvdd.t <- cvdd %>% dplyr::mutate(cvd_dth = as.numeric(as.character(cvd_dth)), timeout = as.numeric(timeout)) set.seed(6534) rate.res <- gComp(data = cvdd.t, Y = "cvd_dth", X = "DIABETES", Z = c("AGE", "SEX", "BMI", "CURSMOKE", "PREVHYP"), outcome.type = "rate", rate.multiplier = 365.25*100, offset = "timeout", R = 200) rate.res ## ----rate_outcome_subgroup, cache = T--------------------- rate.res.subgroup <- gComp(data = cvdd.t, Y = "cvd_dth", X = "DIABETES", Z = c("AGE", "SEX", "BMI", "CURSMOKE", "PREVHYP"), subgroup = "SEX", outcome.type = "rate", rate.multiplier = 365.25*100, offset = "timeout", R = 200) rate.res.subgroup ## ----plot-rate-outcome-results, warning = F--------------- # Saving the output and modifying the labels of the SEX variable to specify # male/female instead of 0/1 # Also adding a new variable to show the line indicating no difference found # (0 for rate difference, 1 for rate ratio) df = rate.res.subgroup$results.df %>% mutate(Subgroup = ifelse(Subgroup == "SEX0", "Male", "Female"), hline = ifelse(Parameter == "Incidence Rate Ratio", 1, 0)) ggplot(df, aes(x = Subgroup, y = Estimate)) + geom_point(size = 2) + geom_errorbar(aes(ymin = `2.5% CL`, ymax = `97.5% CL`), width = 0.2) + facet_wrap(~Parameter) + theme_bw() + labs(x = "", color = "") + geom_hline(aes(yintercept = hline), color = "red", linetype = "dashed", alpha = 0.3) ## ----std-regression-models-rate--------------------------- # Standard Poisson regression (spr) for the rate question, across both sexes spr.rate = glm( formula = cvd_dth ~ DIABETES + AGE + SEX + BMI + CURSMOKE + PREVHYP + offset(log(timeout+0.001)), data = cvdd.t, family = "poisson" ) # get the parameter estimate and CI from the model object df.spr.rate = as.data.frame( exp(cbind(Estimate = coef(spr.rate), confint.default(spr.rate, level = 0.95))) ) %>% rename(`2.5% CL` = `2.5 %`, `97.5% CL` = `97.5 %`) %>% filter(rownames(.) == "DIABETES1") %>% mutate(Subgroup = "All") # Standard Poisson regression (spr) for the rate question, by subgroup (SEX) spr.rate.subgroup = glm( formula = cvd_dth ~ DIABETES + AGE + SEX + BMI + CURSMOKE + PREVHYP + DIABETES*SEX + offset(log(timeout+0.001)), data = cvdd.t, family = "poisson" ) # Get the variance-covariance matrix so we can calculate standard errors se.subgroup = vcov(spr.rate.subgroup) # Get estimates and CIs df.spr.rate.subgroup = data.frame( Subgroup = c("Male", "Female"), raw.est = c(coef(spr.rate.subgroup)[2], coef(spr.rate.subgroup)[2] + coef(spr.rate.subgroup)[8]), SE = c(sqrt(se.subgroup[2,2]), sqrt(se.subgroup[2,2] + se.subgroup[8,8] + 2*se.subgroup[2,8]))) %>% mutate(Estimate = exp(raw.est), `2.5% CL` = exp(raw.est - 1.96 * SE), `97.5% CL` = exp(raw.est + 1.96 * SE)) # Combine the results from the subgroup and full model into a single data.frame combined.std.reg = df.spr.rate %>% bind_rows(df.spr.rate.subgroup %>% select(Subgroup, Estimate:`97.5% CL`)) %>% mutate(Parameter = "Incidence Rate Ratio", model = "Standard Poisson Regression") ## ----plot-rate-outcome-results-v-std-regression, warning = F, fig.width = 8, fig.height = 4---- # Combine the riskCommunicator results with the standard Poisson regression results df.combined = rate.res$results.df %>% mutate(Subgroup = "All") %>% bind_rows(df) %>% mutate(model = "riskCommunicator") %>% select(-Outcome, -Comparison) %>% bind_rows(combined.std.reg) %>% mutate(hline = ifelse(Parameter == "Incidence Rate Ratio", 1, 0)) rate.diff = ggplot(df.combined %>% filter(Parameter == "Incidence Rate Difference"), aes(x = Subgroup, y = Estimate, color = model)) + geom_point(size = 2, position = position_dodge(width = .5)) + geom_errorbar(aes(ymin = `2.5% CL`, ymax = `97.5% CL`), width = 0.2, position = position_dodge(width = .5)) + scale_color_manual(values = c("#481567FF", "#3CBB75FF")) + theme_bw() + labs(x = "", y = str_wrap('Incidence rate difference of cardiovascular disease or death (cases/100 person-years)', width = 32), color = "") + geom_hline(aes(yintercept = hline), color = "red", linetype = "dashed", alpha = 0.3) + theme(legend.position = "none") rate.ratio = ggplot(df.combined %>% filter(Parameter == "Incidence Rate Ratio"), aes(x = Subgroup, y = Estimate, color = model)) + geom_point(size = 2, position = position_dodge(width = .5)) + geom_errorbar(aes(ymin = `2.5% CL`, ymax = `97.5% CL`), width = 0.2, position = position_dodge(width = .5)) + scale_y_continuous(trans = "log2") + scale_color_manual(values = c("#481567FF", "#3CBB75FF")) + theme_bw() + labs(x = "", y = str_wrap('Incidence rate ratio of cardiovascular disease or death (shown on natural log scale)', width = 32), color = "") + geom_hline(aes(yintercept = hline), color = "red", linetype = "dashed", alpha = 0.3) + theme(legend.position = "bottom") ggarrange(rate.ratio, rate.diff, ncol = 2, common.legend = T, legend = "bottom", labels = c("A", "B"), widths = c(1, 1)) ## ----save-Fig1, include = F, eval = F--------------------- # ggsave(filename = "~/Dropbox/Framingham teaching data set/Manuscript drafts/Fig1.tiff", device = "tiff", width = 10, height = 6) ## --------------------------------------------------------- sessionInfo()