## ---- include = FALSE--------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", warning = FALSE, message = FALSE ) # save built-in output hook hook_output = knitr::knit_hooks$get("output") # set a new output hook to cut long output texts knitr::knit_hooks$set(output = function(x, options) { if (!is.null(n <- options$out.lines)) { x = xfun::split_lines(x) if (length(x) > n) { # cut x = c(head(x, n), '....\n') } x = paste(x, collapse = '\n') } hook_output(x, options) }) ## ----installation, eval=FALSE------------------------------------------------- # install.packages('devtools') # devtools::install_github('microfan1/hdcate') ## ----load package------------------------------------------------------------- library(hdcate) ## ----message=TRUE------------------------------------------------------------- set.seed(1) # set an arbitrary random seed to ensure the reproducibility data <- HDCATE.get_sim_data(n_obs=500, n_var=100, n_rel_var=4, intercept=10) # generate data ## ----data, out.lines=6-------------------------------------------------------- data ## ----x_formula---------------------------------------------------------------- x_formula <- "X1+X2+X3+X4+X5+X6" x_formula ## ----model-------------------------------------------------------------------- model <- HDCATE(data=data, y_name='Y', d_name='D', x_formula=x_formula) ## ----set conditional variable------------------------------------------------- HDCATE.set_condition_var(model, 'X2', min=-1, max=1, step=0.01) ## ----fit---------------------------------------------------------------------- HDCATE.fit(model) ## ----get fitted ATE----------------------------------------------------------- # ATE model$ate ## ----get fitted CATE, out.lines=6--------------------------------------------- # CATE model$hdcate ## ----inferences--------------------------------------------------------------- HDCATE.inference(model) ## ----two-sided test, out.lines=6---------------------------------------------- # two-sided bands model$i_two_side ## ----left-sided test, out.lines=6--------------------------------------------- # left-sided bands model$i_left_side ## ----right-sided test, out.lines=6-------------------------------------------- # right-sided bands model$i_right_side ## ----plot, fig.show='hold' , fig.width=6, fig.height=6, fig.align='center', fig.cap='HDCATE Estimation (propensity function is misspecified)', dpi=200, out.width="80%"---- HDCATE.plot(model) # Alternatively, if the model is not inferenced, use the code below: # HDCATE.plot(model, include_band=FALSE) # Alternatively, we may want to save the full plot as a high-definition PDF file: # HDCATE.plot(model, output_pdf=TRUE, pdf_name='hdcate_plot.pdf', # include_band=TRUE, test_side='both', y_axis_min='auto', y_axis_max='auto', # display.hdcate='HDCATEF', display.ate='ATE', display.siglevel='sig_level' # ) ## ----change formula, fig.show='hold' , fig.width=6, fig.height=6, fig.align='center', fig.cap='HDCATE Estimation (expectation function is misspecified)', dpi=200, out.width="80%"---- x_formula = paste(paste0('X', c(90:100)), collapse ='+') # x_formula="X90+...+X100" model <- HDCATE(data=data, y_name='Y', d_name='D', x_formula=x_formula) HDCATE.set_condition_var(model, 'X2', min=-1, max=1, step=0.01) HDCATE.fit(model) HDCATE.inference(model) HDCATE.plot(model) ## ----cross-fitting------------------------------------------------------------ # create the model first, or use an existing `model` and skip this line model <- HDCATE(data=data, y_name='Y', d_name='D', x_formula=x_formula) # suppose we want to use the 5-fold cross-fitting approach: HDCATE.use_cross_fitting(model, k_fold=5) ## ----manual fold-------------------------------------------------------------- # In this case, the param k_fold is auto detected, you can pass any value to it. HDCATE.use_cross_fitting(model, k_fold=1, folds=list(c(1:250), c(251:500))) ## ----change to full sample---------------------------------------------------- HDCATE.use_full_sample(model) ## ----temp function------------------------------------------------------------ my_method_to_fit_expectation <- function(df) { # fit the input data.frame (df), where the first column is outcome value (Y) # and the rest are covariates in x_formula # ... # return a fitted object } my_method_to_predict_expectation <- function(fitted_model, df) { # use the fitted object (returned by your function above) to predict the conditional expectation # fucntion on the input data.frame (df) # ... # return a vector of predicted values } my_method_to_fit_propensity <- function(df) { # fit the input data.frame (df), where the first column is dummy treatment # value (D) and the rest are covariates in x_formula # ... # return a fitted object } my_method_to_predict_propensity <- function(fitted_model, df) { # use the fitted object (returned by your function above) to predict the propensity score # fucntion on the input data.frame (df) # ... # return a vector of predicted values } ## ----lasso example------------------------------------------------------------ x_formula <- "X1+X2+X3+X4+X5+X6" # propensity function is misspecified # create a model model <- HDCATE(data=data, y_name='Y', d_name='D', x_formula=x_formula) # using the template above, we can write: my_method_to_fit_expectation <- function(df) { # fit the input data.frame (df), where the first column is outcome value (Y) # and the rest are covariates in x_formula hdm::rlasso(as.formula(paste0('Y', "~", x_formula)), df) # return a fitted object } my_method_to_predict_expectation <- function(fitted_model, df) { # use the fitted object (fitted_model) to predict the conditional expectation # fucntion on the input data.frame (df) predict(fitted_model, df) # return a vector of predicted values } my_method_to_fit_propensity <- function(df) { # fit the input data.frame (df), where the first column is dummy treatment # variable (D) and the rest are covariates in x_formula hdm::rlassologit(as.formula(paste0('D', "~", x_formula)), df) # return a fitted object } my_method_to_predict_propensity <- function(fitted_model, df) { # use the fitted object (fitted_model) to predict the propensity score # fucntion on the input data.frame (df) predict(fitted_model, df, type="response") # return a vector of predicted values } ## ----reset1, include=FALSE---------------------------------------------------- model <- HDCATE(data=data, y_name='Y', d_name='D', x_formula=x_formula) ## ----plug in------------------------------------------------------------------ HDCATE.set_first_stage( model, fit.treated=my_method_to_fit_expectation, fit.untreated=my_method_to_fit_expectation, fit.propensity=my_method_to_fit_propensity, predict.treated=my_method_to_predict_expectation, predict.untreated=my_method_to_predict_expectation, predict.propensity=my_method_to_predict_propensity ) ## ----re estimate 1, fig.show='hold' , fig.width=6, fig.height=6, fig.align='center', fig.cap='HDCATE Estimation (using user-defined LASSO on the first stage)', dpi=200, out.width="80%"---- HDCATE.set_condition_var(model, 'X2', min=-1, max=1, step=0.01) HDCATE.fit(model) HDCATE.inference(model) HDCATE.plot(model) ## ----recover------------------------------------------------------------------ HDCATE.unset_first_stage(model) ## ----reset before rf, include=FALSE------------------------------------------- model <- HDCATE(data=data, y_name='Y', d_name='D', x_formula=x_formula) ## ----ramdom forest example, fig.show='hold' , fig.width=6, fig.height=6, fig.align='center', fig.cap='HDCATE Estimation (using user-defined Random Forest model on the first stage)', dpi=200, out.width="80%"---- # propensity function is misspecified, for example x_formula <- "X1+X2+X3+X4+X5+X6" # create a model model <- HDCATE(data=data, y_name='Y', d_name='D', x_formula=x_formula) # =========== User-defined ML approach start =========== # using the template above, we can write: my_method_to_fit_expectation <- function(df) { # fit the input data.frame (df), where the first column is outcome value (Y) # and the rest are covariates in x_formula randomForest::randomForest(as.formula(paste0('Y', "~", x_formula)), data = df) # return a fitted object } my_method_to_predict_expectation <- function(fitted_model, df) { # use the fitted object (fitted_model) to predict the conditional expectation # fucntion on the input data.frame (df) predict(fitted_model, df) # return a vector of predicted values } my_method_to_fit_propensity <- function(df) { # fit the input data.frame (df), where the first column is dummy treatment # variable (D) and the rest are covariates in x_formula randomForest::randomForest(as.formula(paste0('D', "~", x_formula)), data = df) # return a fitted object } my_method_to_predict_propensity <- function(fitted_model, df) { # use the fitted object (fitted_model) to predict the propensity score # fucntion on the input data.frame (df) predict(fitted_model, df) # return a vector of predicted values } HDCATE.set_first_stage( model, fit.treated=my_method_to_fit_expectation, fit.untreated=my_method_to_fit_expectation, fit.propensity=my_method_to_fit_propensity, predict.treated=my_method_to_predict_expectation, predict.untreated=my_method_to_predict_expectation, predict.propensity=my_method_to_predict_propensity ) # =========== User-defined ML approach end =========== # set conditional variable in CATE, same as above HDCATE.set_condition_var(model, 'X2', min=-1, max=1, step=0.01) # fit, inference and plot HDCATE.fit(model) HDCATE.inference(model) HDCATE.plot(model) ## ----var importance in rf, fig.show='hold' , fig.width=6, fig.height=6, fig.align='center', fig.cap='Variable Importance Extracted from the User-defined Random Forest Model'---- # the list have only one element since we use full-sample estimator rf_model <- model$treated_cond_exp_model_list[[1]] # then, we can use the fitted object `rf_model` for futher analysis. # the rest of codes are omitted since they are unconcerned for this package. # ... ## ----plot rf, echo=FALSE, fig.align='center', fig.cap='Variable Importance Extracted from the User-defined Random Forest Model', fig.height=6, fig.width=6, dpi=200, fig.show='hold', out.width="80%"---- # the code below uses randomForest and ggplot2 to plot out variable importance library('ggplot2') # visualization library('ggthemes') # visualization library('dplyr') # data manipulation importance <- randomForest::importance(rf_model) # varImportance <- data.frame(Variables = row.names(importance), Importance = round(importance[ ,'MeanDecreaseGini'],2)) # for classification problem varImportance <- data.frame(Variables = row.names(importance), Importance = round(importance[ ,'IncNodePurity'],2)) # for regression problem rankImportance <- varImportance %>% mutate(Rank = paste0('#',dense_rank(desc(Importance)))) ggplot(rankImportance, aes(x = reorder(Variables, Importance), y = Importance, fill = Importance)) + geom_bar(stat='identity') + # geom_text(aes(x = Variables, y = 0.5, label = Rank), # hjust=0, vjust=0.55, size = 4, colour = 'red') + labs(x = 'Variables', y= 'Importance (measured in squared OOB residuals)') + coord_flip() + theme_few() ## ----user-defined inf--------------------------------------------------------- HDCATE.inference(model, sig_level=0.05, n_rep_boot=3000) ## ----bw----------------------------------------------------------------------- HDCATE.set_bw(model, 0.15) # for example, set bandwidth=0.15 ## ----recover bw--------------------------------------------------------------- HDCATE.set_bw(model) ## ----help, eval=FALSE--------------------------------------------------------- # help(HDCATE) # help(HDCATE.set_condition_var) # help(HDCATE.fit) # help(HDCATE.inference) # help(HDCATE.plot) # help(HDCATE.use_cross_fitting) # help(HDCATE.use_full_sample) # help(HDCATE.set_first_stage) # help(HDCATE.unset_first_stage) # help(HDCATE.set_bw)