\documentclass[11pt]{article} %\VignetteEngine{knitr::knitr} %\VignetteIndexEntry{Using qbld} %\VignettePackage{qbld} \usepackage[left=1.2in,top=.8in,right=1.2in,nohead,bottom=1.2in]{geometry} \usepackage{amsmath} \usepackage{amsfonts} \title{An Introduction to Bayesian Quantile Regression for Binary Longutudinal Data with R Package \texttt{qbld}} \author{Ayush Agarwal} \begin{document} \maketitle \setlength\parindent{0pt} \tableofcontents \break \bigskip \section{Introduction} The R package \texttt{qbld} implements the Bayesian quantile regression model for binary longitudinal data (QBLD) developed in \textbf{Rahman and Vossmeyer (2019)}. The model handles both fixed and random effects and implements both a blocked and an unblocked Gibbs sampler for posterior inference. \bigskip \section{Quantile Regression for Binary Longitudinal Data} %\subsection{The Model} Let $y_{it}$ be the response variable at time $t$ for the $i^{th}$ case, , and $z_{it}$ be an unobserved latent variable. The \textbf{QBLD} model can be conveniently expressed in the latent variable formulation (Albert \& Chib, 1993) as follows: \begin{equation}\label{eq1} \begin{gathered} z_{it}=x_{it}^{'}\beta+s_{it}^{'}\alpha_{i}+\epsilon_{it},\hspace{30pt}\forall i = 1,...,n; t = 1,...,T_{i}\\y_{it}=\begin{cases}\text{1} &\quad {if\,z_{it}> 0} \\\text{0} &\quad\text{otherwise,}\\\end{cases}\\\epsilon_{it} = w_{it}\theta + \tau\sqrt{w_{it}}u_{it} \hspace{30pt} \forall i = 1,...,n; t = 1,...,T_{i}\,, \,\end{gathered}\end{equation} %$y_{it} =$ response variable $y$ at $t^{th}$ time period for the $i^{th}$ case, %$z_{it} =$ unobserved latent variable $z$ at $t^{th}$ time period for the $i^{th}$ case, where $x_{it}$ is a $k \times 1$ vector of fixed-effects covariates, $\beta$ is a $k \times 1$ vector of fixed-effects parameters, $s_{it}$ is an $l \times 1$ vector of covariates that have case-specific effects, $\alpha_{i}$ is an $l \times 1$ vector of case-specific parameters, and $\epsilon_{it} \overset{\mathrm{iid}}{\sim} AL(0, 1, p)$ are the error terms where AL refers to an asymmteric Laplace distribution with location, $\mu = 0$, scale $\sigma = 1$, and skew parameter $p$. The error term is decomposed into a normal-exponential mixture representation of the AL distribution, presented in Kozumi and Kobayashi (2011). Here, $u_{it} \sim N(0,1)$, is mutually independent of $w_{it} \sim \text{Exp}(1)$, where $\text{Exp}(.)$ is the exponential distribution. Random samples from the AL distribution are generated using \texttt{raldmix} function. (See Appendix) %\bigskip %\subsection{Model priors} %Longitudinal data models often involve a moderately large amount of data, so we stack the model for each case $i$. \medskip Define, $z_{i} = (z_{i1},...,z_{iT_{i}})^{'}$, $X_{i} = (x_{i1},...,x_{iT_{i}})$, $S_{i} = (s_{i1},...,s_{iT_{i}})$, $w_{i} = (w_{i1},...,w_{iT_{i}})^{'}$, $D_{\tau\sqrt{w_{i}}} = \text{diag}(\tau\sqrt{w_{i1}},...,\tau\sqrt{w_{iT_{i}}})^{'}$, and $u_{i} = (u_{i1},...,u_{iT_{i}})^{'}$. Further, define: \[ \theta = \frac{1-2p}{p(1-p)} \text{ and } \tau = \sqrt{\frac{2}{p(1-p)}}. \] Building on Eq.(\ref{eq1}), the following priors are assumed on the model: \begin{equation}\label{eq3} \begin{gathered} %z_{i} = X_{i}\beta+S_{i}\alpha_{i}+w_{i}\theta + D_{\tau\sqrt{w_{i}}}u_{i}\\y_{it}=\begin{cases}\text{1} &\quad {if\,z_{it}> 0} \\\text{0} &\quad\text{otherwise,}\\\end{cases}\\ \alpha_{i}|\varphi^{2}\sim N_{l}(0,\varphi^{2}I_{l}), w_{it} \sim \text{Exp}(1), u_{it} \sim N(0,1)\\\beta\sim N_{k}(\beta_{0},B_{0}), \varphi^{2}\sim IG(c_1/2, d_1/2)\end{gathered}\end{equation} $IG(.)$ refers to the Inverse-Gamma distribution, $\text{Exp}(.)$ refers to the exponential distribution. The starting values for the sampler are sampled from the respective assumed priors, however, one is free to tweak $\beta_{0}$, $B_{0}$, $c_1$, and $d_1$ values. \bigskip The resulting posterior of $(\alpha, \beta, \varphi^2, w)$ is intractable and two Gibbs samplers are available to sample from it. The \textbf{unblocked} Gibbs sampler is faster, but can demonstrate poor mixing properties due to correlation between the covariates. We recommend using \texttt{Unblock} for larger datasets. See Appendix for details of the sampler \smallskip To avoid potential slow mixing, an alternative \textbf{blocked} algorithm is presented. This algorithm is computationally involved but exhibits superior mixing properites. We would recommend using \texttt{Block} for smaller datasets. See Appendix for details of the sampler %\newpage \section{Using \texttt{qbld} package} Let us examine the dataset we will use to demonstrate the sample usage of the package. %<>= %library(knitr) %opts_chunk$set(comment = NA,background='white') %opts_knit$set(out.format = "latex") %knit_theme$set("seashell") %@ \subsection{Dataset:- Airpollution} This example datset is a subset of data from Six Cities study, a longitudinal study of the health effects of air pollution. The dataset contains complete records on 537 children from Ohio, each child was examined annually at ages 7 through 10. The repeated binary response is the wheezing status (1 = ``yes", 0 = ``no") of a child at each occasion. Each mother's smoking pattern was also recorded at the time of the study. Although mother's smoking status could vary with time, it was determined in the first interview and was treated as a time-independent covariate. Maternal smoking was categorized as 1 if the mother smoked regularly and 0 otherwise. <>= set.seed(10) library(qbld) data(airpollution) str(airpollution) @ \subsection{\texttt{model.qbld}: Running the QBLD sampler} \texttt{model.qbld} estimates the QBLD model as described in Section~1, and outputs a \texttt{qbld} class object. In this example, we will model the wheezing status (\texttt{wheeze}) in terms of \texttt{age} and \texttt{smoking}. We will not treat \texttt{counts} as a covariate of interest, and allow intercepts for both fixed and random effects. <>= ##modelling the output :- Blocked #no burn, no verbose, no summary output.block <- model.qbld(fixed_formula = wheeze~smoking+I(age^2), data = airpollution, id="id", random_formula = ~1, p=0.25, nsim=1000, method="block", burn=0, summarize=FALSE, verbose=FALSE) @ \begin{itemize} \item{\texttt{fixed\char`_formula:} A description of the model to be fitted of the form \texttt{$response\sim fixed$} effects predictors i.e $X_{i}$ in the model (\ref{eq3}). Response variable is mandatory, and empty formula will throw error. In this example, \texttt{wheeze $\sim$ smoking+I(age$^2$)} translates to response variable, $y_{i}$ = \texttt{wheeze}, and $x_{i}$ as \texttt{smoking, age$^2$,} and \texttt{Intercept}.} \item{\texttt{id:} An identifier variable in the dataset that specifies individual profile. Every row needs to contain an id value that maps the data point to the individual. By default, id = ``id", and hence, data is expected to contain an id variable. Note that this is not a covariate, and is omitted while modelling. } \item{\texttt{data:} Data are contained in a \texttt{data.frame}. Each element of the data argument must be identifiable by a name. All subjects need to be observed at the same number of time points. Using datasets with different time points is not allowed. NAs are not allowed and should throw errors. All factor variables are auto-converted to numeric levels.} %Two datasets, \texttt{airpollution} and \texttt{locust} are built into the package.} \item{\texttt{random\char`_formula:} A description of the model to be fitted of the form \texttt{response $\sim$ random} effects predictors i.e $S_{i}$ in the model. Response variable is not required, and is ignored. This defaults to $S_{i}$ being only an intercept. In this example, \texttt{$\sim 1$} translates to $s_{i}$ as \texttt{Intercept}.} \item{\texttt{p:} Quantile for the AL distribution on the error term, $p = 0.25$ by default. For very low $(\leq 0.025)$ or very high $(\geq 0.970)$ values of $p$, sampler forces to unblock version to avoid errors in the block procedure. } \item{\texttt{nsim:} No. of simulations to run the sampler.} \item{\texttt{b0, B0:} Prior model parameters for Beta as in the model (\ref{eq3}). These are defaulted to 0 vector, and Identity matrix of appropriate dimensions. Full Gibbs Sampler is not affected by starting values, and need not be specified. } \item{\texttt{c1, d1:} Prior model parameters for Varphi2 as in the model (\ref{eq3}). These are defaulted to 9, 10 (arbitrary) respectively. Full Gibbs Sampler is not affected by starting values, and need not be specified. } \item{\texttt{method:} Choose between the``Block" vs ``Unblock" sampler, Block is slower, but produces lower correlation. Check section 3 for a detailed comparsion. I would recommend using ``Unblock" for larger datasets. The code uses regex and is impervious to alphabet case related errors. } \item{\texttt{burn:} Burn in percentage, number between (0,1). Burn-in values are discarded while outputting and are not used for summary statistical calculations. No. of simulations are adjusted for burn-in before ESS calculations.} \item{\texttt{summarize:} False by default. Outputs a summary table (same as \texttt{summary(output)}). In addition to this, also prints Model fit diagonstics such as AIC, BIC, and Log-likelihood values. This is a bit unusual for a Bayesian analysis; however, useful to check alignment with the classical models or choose among quantile $p$ values.} \item{\texttt{verbose:} False by default. If True, spits out progress reports while the sampler is running. This will print simulation progress for 10 times. i.e prints every 100th simulation if \texttt{nsim} = 1000.} \end{itemize} \subsection{\texttt{qbld} class object} The output of \texttt{model.qbld} function is a \texttt{qbld} class object. <>= str(output.block) @ \texttt{qbld} class object contains the following attributes: \begin{itemize} \item{\texttt{Beta:} Matrix of MCMC samples of fixed-effects parameters.} \item{\texttt{Alpha:} 3-dimensional matrix (of the form $\mathbb{R}^{k \times l \times m}$) of MCMC samples of random-effects parameters.} \item{\texttt{Varphi2:} Matrix of MCMC samples for $\varphi^2$.} \item{\texttt{nsim:} numeric; No. of simulations of MCMC.} \item{\texttt{burn:} logical; Whether or not burn-in used.} \item{\texttt{which:} Attribute; \texttt{block} or \texttt{unblock} sampler used} \end{itemize} \bigskip \subsection{\texttt{summary.qbld:} Summarizing the \texttt{qbld} output} One way of summarizing the model is to use the \texttt{summarize} argument. Continuing with the example in the previous subsection, let us have a look at the unblocked sampler and understand the output. <>= ##modelling the output :- Unblocked #Using burn, no verbose, and summary # p = 0.50 i.e 50th quantile output.unblock <- model.qbld(fixed_formula = wheeze~smoking+I(age^2)+age, data = airpollution, id="id", random_formula = ~1, p=0.50, nsim=5000, method="Unblock", burn=0.2, summarize=TRUE, verbose=FALSE) @ \textbf{Note:} that we are missing significance stars on the \texttt{Multi Gelman-Rubin} level as described in the output above. This is indicative of a lack of enough samples for MCMC. We will increase \texttt{nsim} to 20000 for the next run and try to achieve the significance level. \bigskip Let us also explore the second way of summarizing a \texttt{qbld} object through \texttt{summmary} S3 method, which produces a \texttt{qbld.summmary} class object. <>= ##modelling the output :- Unblocked #Using burn, no verbose, and summary output.unblock2 <- model.qbld(fixed_formula = wheeze~smoking+I(age^2)+age, data = airpollution, id="id", random_formula = ~1, p=0.50, nsim=20000, method="Unblock", burn=0.2, summarize=FALSE, verbose=FALSE) @ <>= summary.unblock2 = summary(output.unblock2, quantiles = c(0.025, 0.25, 0.5, 0.75, 0.975), epsilon=0.10) print(summary.unblock2) str(summary.unblock2) @ Note that the \texttt{foo} attribute is now TRUE, which means the significance on the \texttt{Multi Gelman-Rubin} level have been reached. Note that, in such a case, the summary table for this run will contain the stars unlike the last run. The \texttt{summary} function has the following arguments: \begin{itemize} \item{\texttt{quantiles:} Vector of quantiles for summary of the covariates, defaulted to c(0.025, 0.25, 0.5, 0.75, 0.975)} \item{\texttt{epsilon:} 0.05 by default. Epsilon value is used for calculating the target ESS and Gelman-Rubin criteria. The default value is recommended for producing reliable estimates of posterior quantities.} \end{itemize} \bigskip \texttt{qbld.summary} class object contains the following attributes: \begin{itemize} \item{\texttt{statistics:} Contains the mean, sd, MC std error, ess and Gelman-Rubin diagnostic} \item{\texttt{quantiles:} Contains quantile estimates for each variable} \item{\texttt{nsim:} No. of simulations run, adjusted for burn-in} \item{\texttt{burn:} Burn-in used or not} \item{\texttt{which:} Block, or Unblock version of sampler} \item{\texttt{p:} quantile for the AL distribution on the error term} \item{\texttt{multiess:} multiess value for the sample} \item{\texttt{multigelman:} multivariate version of Gelman-Rubin} \end{itemize} \bigskip \subsection{\texttt{plot.qbld:} Creating plots} Let us now try and create some diagnostic plots to understand the density spread of the covariate, as well as trace of the MCMC run. <>= par(mfrow=c(4,2)) plot(output.block, trace = TRUE, density = TRUE, auto.layout = FALSE, ask = NULL) @ Plot function has the following arguments: \begin{itemize} \item{\texttt{trace:} Whether or not to plot trace plots for covariates, TRUE by default} \item{\texttt{density:} Whether or not to plot density for covariates, TRUE by default.} \item{\texttt{auto.layout:} Auto set layout or not, TRUE as default. Plots according to the local settings if false.} \end{itemize} \appendix \section{Appendix} \bigskip \subsection{Asymmetric Laplace Distribution} The error term as described in (\ref{eq1}) is a random variable from the AL distribution. For the sake of completeness, random generation and a few other AL functions have been made available to the user. For help using the functions, use \texttt{?aldmix}. The asymmetric Laplace distribution (ALD), has the following pdf: \begin{equation}\label{ald} f(x;\mu,\sigma,p) = \frac{p(1-p)}{\sigma}\exp\{-\frac{(x-\mu)}{\sigma}(p-I(x \le \mu))\}\end{equation} where $\mu$ is the location paramter, $\sigma$ is the scale parameter, and $p$ is the skew paramter. <>= #generate 1e4 samples ald.sample <- raldmix(n = 5e4, mu = 0, sigma = 1, p = 0.5) plot(density(ald.sample), main="AL(0,1,0.5)") ## additional functions ald.density <- daldmix(c(4,5),mu = 0,sigma = 1,p = 0.5) ald.cdf <- paldmix(c(1,4),mu = 0,sigma = 1,p = 0.5,lower.tail=TRUE) ald.quantile <- qaldmix(0.5,mu = 0,sigma = 1,p = 0.5,lower.tail=TRUE) @ \bigskip \subsection{Generalized Inverse Gaussian Distribution} Gibbs sampler used in the model requires random sampling from Generalized Inverse Gaussian(GIG) distribution. For the sake of completeness, the random generation function \texttt{rgig}, and the density function, \texttt{dgig} are made available to the user. For help using the functions, use \texttt{?gig}. The Generalised Inverse Gaussian distrubtion(GIG), which has the following pdf: \begin{equation}\label{gig} f(a,b,p)= \frac{(a/b)^{p/2}}{2K_{p}(\sqrt{ab})}\exp\{-\frac{ax + b/x}{2}\},\hspace{10 pt} x > 0 \end{equation} where $a,b>0$ and $p\in\mathbb{R}$ are the parameters, and $K_{p}$ is a modified Bessel function of the second kind. <>= # random generation gig.sample <- rgig(n = 5e4, lambda = 0.5, a = 1, b = 2) plot(density(gig.sample),main="GIG(1,2,0.5)") # density gig.density <- dgig(x = 1, a = 1, b = 2, p = 0.5, log_density = FALSE) @ \subsection{Blocked Sampling} \bigskip \begin{itemize} \item{Sample $(\beta,z_{i})$ in one block. These are sampled in following two substeps.} \begin{itemize} \item{Sample $\beta$ \begin{equation}\begin{gathered}\label{eq4}\\\beta|z,w,\varphi^{2}\sim N(\tilde{\beta},\tilde{B}),\\where,\hspace{10pt}\tilde{B}^{-1}=(\sum_{i=1}^{n}X_{i}^{'}\Omega_{i}^{-1}X_{i}+B_{0}^{-1}),\\\tilde{\beta}=\tilde{B}(\sum_{i=1}^{n}X_{i}^{'}\Omega_{i}^{-1}(z_{i}-w_{i}\theta)+B_{0}^{-1}\beta_{0}),\\\Omega_{i}=(\varphi^{2}S_{i}S_{i}^{'}+D^{2}_{\tau\sqrt{w_{i}}}).\end{gathered}\end{equation} } \item{Sample the vector $z_{i}|y_{i},\beta,w_{i},\varphi^{2}\sim TMVN_{B_{i}}(X_{i}\beta+w_{i}\theta,\Omega_{i})$ for all $i=1,...,n$, where $B_{i}=(B_{i1}*B_{i2}*...*B_{iT_{i}})$ and $B_{it}$ are interval $(0,\infty)$ if $y_{it}=1$, and the interval $(-\infty,0]$ if $y_{it}=0$. This is done by sampling $z_{i}$ at the $j^{th}$ pass of the MCMC iteration using a series of conditional posteriors: \begin{equation}\begin{gathered}\label{eq5}z_{it}^{j}|z_{i1}^{j},...z_{i(t-1)}^{j},z_{i(t+1)}^{j-1},...,z_{iT_{i}}^{j-1}\sim TN_{B_{i}}(\mu_{t|-t},\Sigma_{t|-t}),\hspace{20pt}t=1,...,T_{i}.\\where,\hspace{10pt}\mu_{t|-t}=x_{it}^{'}\beta+w_{it}\theta + \Sigma_{t,-t}\Sigma_{-t,-t}^{-1}(z_{i,-t}^{j}-(X_{i}\beta+w_{i}\theta)_{-t}),\\\Sigma_{t|-t}=\Sigma_{t,t}-\Sigma_{t,-t}\Sigma_{-t,-t}^{-1}\Sigma_{-t,t},\end{gathered}\end{equation} where $z_{i,-t}^{j}=(z_{i1}^{j},...z_{i(t-1)}^{j},z_{i(t+1)}^{j-1},...,z_{iT_{i}}^{j-1})$, $(X_{i}\beta+w_{i}\theta)_{-t}$ is column vector with $t^{th}$ element removed, $\Sigma_{t,t},\Sigma_{t,-t},\Sigma_{-t,-t}$ are $(t,t)^{th}$ element, $t^{th}$ row with $t^{th}$ element removed, and $t^{th}$ row and column removed respectively. } \end{itemize} \item{Sample $\alpha$ \begin{equation}\begin{gathered}\label{eq6}\alpha_{i}|z,\beta,w,\varphi^{2}\sim N(\tilde{a},\tilde{A}),\hspace{10pt} \forall i = 1,...,n\\where,\hspace{10pt}\tilde{A^{-1}}=(S_{i}^{'}D^{-2}_{\tau\sqrt{w_{i}}}S_{i}+\frac{1}{\varphi^{2}}I_{l}),\\\tilde{a}=\tilde{A}(S_{i}^{'}D^{-2}_{\tau\sqrt{w_{i}}}(z_{i}-X_{i}\beta-w_{i}\theta)).\end{gathered}\end{equation} } \item{Sample $w$ \begin{equation}\begin{gathered}\label{eq7}w_{it}|z_{it},\beta,\alpha_{i}\sim GIG(0.5,\tilde{\lambda_{it}},\tilde{\eta})\hspace{10pt} \forall i = 1,...,n; t = 1,...,T_{i},\\where,\hspace{10pt} \tilde{\lambda_{it}}=(\frac{z_{it}-x_{it}^{'}\beta-s_{it}^{'}\alpha_{i}}{\tau})^{2}\\\tilde{\eta}=(\frac{\theta^{2}}{\tau^{2}}+2).\end{gathered}\end{equation} } \item{Sample $\varphi^{2}$ \begin{equation}\begin{gathered}\label{eq8}\varphi^{2}|\alpha\sim IG(\tilde{c_{1}}/2,\tilde{d_{1}}/2),\\where, \hspace{10pt}\tilde{c_{1}}=(nl+c_{1}),\\\tilde{d_{1}}=(\sum_{i=1}^{n}\alpha_{i}^{'}\alpha_{i} + d_{1}).\end{gathered}\end{equation}. } \end{itemize} \subsection{Unblocked Sampling} \bigskip \begin{itemize} \item{Sample $\beta$ \begin{equation}\begin{gathered}\label{eq9}\\\beta|z,w,\varphi^{2}\sim N(\tilde{\beta},\tilde{B}),\\where,\hspace{10pt}\tilde{B}^{-1}=(\sum_{i=1}^{n}X_{i}^{'}\Psi_{i}^{-1}X_{i}+B_{0}^{-1}),\\\tilde{\beta}=\tilde{B}(\sum_{i=1}^{n}X_{i}^{'}\Psi_{i}^{-1}(z_{i}-w_{i}\theta-S_{i}\alpha_{i})+B_{0}^{-1}\beta_{0}),\\\Psi_{i}=D^{2}_{\tau\sqrt{w_{i}}}.\end{gathered}\end{equation} } \item{Sample $\alpha$ as in (\ref{eq6}). } \item{Sample $w$ as in (\ref{eq7}). } \item{Sample $\varphi^{2}$ as in (\ref{eq8}). } \item{ Sample $z|y,\alpha,w \hspace{5pt}\forall i = 1,...,n; t = 1,...,T_{i}$, from univariate truncated normal as: \begin{equation}\label{eq10}z_{it}|y,\beta,w=\begin{cases}{TN_{(-\infty,0]}(x_{it}^{'}\beta+s_{it}^{'}\alpha_{i}+w_{it}\theta,\tau^{2}w_{it})}&\quad{if\,y_{it}=0}\\{TN_{(0,\infty)}(x_{it}^{'}\beta+s_{it}^{'}\alpha_{i}+w_{it}\theta,\tau^{2}w_{it})}&\quad {if\,y_{it}=1}\\\end{cases}\end{equation} } \end{itemize} \bigskip \section{References} \begin{itemize} \item{Rahman, Mohammad Arshad and Angela Vossmeyer, “Estimation and Applications of Quantile Regression for Binary Longitudinal Data,”, Advances in Econometrics, 40B, 157-191, 2019.} \item{Vats, Dootika and Christina Knudson. “Revisiting the Gelman-Rubin Diagnostic.” arXiv} \item{Keming Yu \& Jin Zhang (2005) A Three-Parameter Asymmetric Laplace Distribution and Its Extension, Communications in Statistics - Theory and Methods.} \item{Kobayashi, Genya. (2011). Gibbs Sampling Methods for Bayesian Quantile Regression. J Stat Comput Simul.} \item{Devroye, L. Random variate generation for the generalized inverse Gaussian distribution. Stat Comput 24, 239–246 (2014).} \item{Wolfgang Hörmann and Josef Leydold (2013). Generating generalized inverse Gaussian random variates, Statistics and Computing.} \item{J. S. Dagpunar (1989). An easily implemented generalised inverse Gaussian generator, Comm. Statist. B – Simulation Comput. 18, 703–710.} \end{itemize} \end{document}