% \VignetteIndexEntry{Generalized linear models with clustering}
% \VignetteKeyword{regression}
\documentclass[a4paper,11pt]{article}

\usepackage[utf8]{inputenc}

\usepackage{amsmath}
\usepackage{amssymb}
%\usepackage{array}
%\usepackage{verbatim}
\usepackage{graphicx}
%\usepackage{endfloat}
%\renewcommand{\efloatseparator}{\mbox{}}
\usepackage[british]{babel}


\author{G{\"o}ran Brostr{\"o}m and Henrik Holmberg \\
Department of Statistics \\
      Umeå University \\
      SE--901\,87 Umeå, Sweden}

\date{glmmML Version 0.81-8, March 21, 2011}
\usepackage[longnamesfirst,round]{natbib}
%\usepackage{harvard}
%\bibliographystyle{agsm}

\bibliographystyle{apalike}

\newtheorem{theorem}{Theorem}
\newcommand{\bx}{\ensuremath{\mathbf{x}}}
\newcommand{\by}{\ensuremath{\mathbf{y}}}
\newcommand{\bu}{\ensuremath{\mathbf{u}}}
\newcommand{\bb}{\ensuremath{\boldsymbol{\beta}}}
\newcommand{\bg}{\ensuremath{\boldsymbol{\gamma}}}
\newcommand{\btheta}{\ensuremath{\boldsymbol{\theta}}}
\newcommand{\R}{{\bf R}}
\newcommand{\vect}[1]{\ensuremath{\mathbf{#1}}}

\newcommand{\bs}{\ensuremath{\backslash}}

\newcommand{\logit}{\ensuremath{\text{logit}}}
\newcommand{\ylogist}{
\ensuremath{
\frac
{e^{(\bb \bx_{ij} + u_i) y_{ij}}}
{1 + e^{\bb \bx_{ij} + u_i}}
}
}

\newcommand{\ylogistnorm}{
\ensuremath{
\frac
{e^{(\bb \bx_{ij} + u_i \sigma) y_{ij}}}
{1 + e^{\bb \bx_{ij} + u_i \sigma}}
}
}

\newcommand{\dnorm}{\ensuremath{\frac{1}{\sqrt{2\pi} \sigma}
e^{-\frac{u_i^2}{2\sigma^2}}}} 

\newcommand{\snorm}{\ensuremath{\frac{1}{\sqrt{2\pi}}
e^{-\frac{u_i^2}{2}}}} 

\newcommand{\dnormnorm}{\ensuremath{\frac{1}{\sqrt{2\pi}}
e^{-\frac{u_i^2}{2}}}} 

\newcommand{\pPy}{\ensuremath{P\big(\bb\bx_{ij} + \gamma_i(\bb), \;
    y_{ij}\big)}} 

\newcommand{\py}{\ensuremath{P(\bb\bx_{ij} + u_i, y_{ij})}}
\newcommand{\pyg}{\ensuremath{P(\bb\bx_{ij} + \gamma_i, y_{ij})}}

\newcommand{\Py}{\ensuremath{P(\bb\bx_{ij} + \sigma u, \; y_{ij})}}

\newcommand{\Gy}{\ensuremath{G(\bb\bx_{ij} + \sigma u, \; y_{ij})}}

\newcommand{\dGy}{\ensuremath{H(\bb\bx_{ij} + \sigma u, \; y_{ij})}}

\newcommand{\Iy}{\ensuremath{I(\bb\bx_{ij} + \sigma u, \; y_{ij})}}

\newcommand{\Ky}{\ensuremath{K(\bb\bx_{ij} + \sigma u, \; y_{ij})}}

\newcommand{\Hy}{\ensuremath{H\big(\bb\bx_{ij} + \gamma_i, \;
    y_{ij}\big)}} 

\newcommand{\pGy}{\ensuremath{G\big(\bb\bx_{ij} + \gamma_i(\bb), \;
    y_{ij}\big)}} 

\newcommand{\pHy}{\ensuremath{H\big(\bb\bx_{ij} + \gamma_i(\bb), \;
    y_{ij}\big)}} 

\newcommand{\gGy}{\ensuremath{G\big(\bb\bx_{ij} + \gamma_i, \;
    y_{ij}\big)}} 

\newcommand{\gHy}{\ensuremath{H\big(\bb\bx_{ij} + \gamma_i, \;
    y_{ij}\big)}} 

\newcommand{\dgdbm}{\ensuremath{\frac{\partial\gamma_i(\bb)}{\partial\beta_m}}}
\newcommand{\dgdbs}{\ensuremath{\frac{\partial\gamma_i(\bb)}{\partial\beta_s}}}

\newcommand{\be}{\begin{equation}}
\newcommand{\ee}{\end{equation}}

%\journal{Computational Statistics \& Data Analysis}
%\bibliographystyle{Elsevier/elsarticle-harv}

\begin{document}


%\begin{frontmatter}

\title{Generalised Linear Models with Clustered Data: 
Fixed and random effects models with glmmML}

%\author{Göran Broström \\
%gb@stat.umu.se \\
%Version 0.81-8 and later}
%Department of Statistics \\
%Umeå University \\
%SE--901\,87 Umeå, Sweden} %\corref{cor1}}
%\ead{gb@stat.umu.se}
%\cortext[cor1]{Corresponding address: Department of Statistics, Umeå University, SE-90187 Umeå, Sweden}
%\author{Henrik Holmberg\corref{cor2}}
%\ead{henrik.holmberg@stat.umu.se}
%\address{Department of Statistics, Umeå University, Umeå, Sweden}
%\date{Version 0.81-3 and later}
\maketitle

\begin{abstract}

In situations where a large data set is partitioned into many relatively
small clusters, and where members within clusters have something in common, the
number of parameters tend to increase with sample size, if a fixed-effects
model is applied. This fact causes
the standard assumptions underlying asymptotic results to be violated.
The standard solution to this problem is to apply a random
intercepts model, where each cluster has its own intercept. The cluster
intercepts are usually taken to be a random sample from a Normal
distribution with  mean zero and unknown variance. In the statistical
computing environment {\bf R}, there are a few packages, most notably {\tt
lme4}, that estimates models of this kind. For binary and Poisson data,
{\tt lme4} is a de facto standard for analyzing generalized linear mixed
models (\emph{GLMM}). It also generalises from the random intercepts
model to include random slopes as well as nested clustering.
The package {\tt glmmML} generalises in other directions. First, it is only
implemented for the simple random intercepts model and the Binomial and
Poisson distributions, but it allows for other
distributions than the Normal for the random intercepts. Test of the null
hypothesis of no clustering is performed by a modified likelihood ratio
test and, on request, by bootstrapping. Second, it allows 
for estimating a fixed effects model, assuming that all cluster intercepts
are distinct fixed parameters, and, as a replacement for asymptotics, a
bootstrapping tecnique is implemented. 
The random intercepts model is fitted through maximum likelihood with
adaptive Gauss-Hermite and Laplace quadrature approximations of the likelihood
function. The fixed effects model is fitted through a profiling approach,
which is necessary when the number of clusters is large, because the
standard function {\tt glm} in {\bf R} will choke on a huge design matrix.
In a simulation study the two approaches are compared regarding two
aspects. The first aspect is test of grouping effect, and the second is
performance of regression parameter estimates. The main result is that the
fixed effects model has severe bias when the mixed effects variance is
positive and the number of clusters is large. It is also shown that the
Laplace approximation 
works fairly well when cluster sizes are not too small.

\end{abstract}

%\begin{keyword}
%Bernoulli distribution \sep Gauss-Hermite quadrature \sep Laplace
%approximation \sep Implicit derivation \sep Profiling \sep Poisson distribution. 
%\end{keyword}

%\end{frontmatter}

%\maketitle


\section{Introduction}

Data sets with many small groups where there is within-group correlation
and between-group variation are commonly encountered in many fields of
application, 
for example in medical studies with repeated observations of patients, and
in demographic investigations, where it is realistic to assume that members
of a family have common characteristics. In this paper we investigate the
properties of mixed effects models for these situations. We concentrate
on binary and count data responses, i.e., the binomial and
Poisson distributions in the framework of generalised linear models.

The generalised linear model with random intercept is a well-known model with
some implementations in standard software. It is available in 
{\bf SAS}, {\bf Stata}, and in {\bf R} \cite{R}. There are also (at
least) five {\bf R} packages available, the {\tt lme4} package \cite{lme4}
includes the {\tt lmer} function, the {\tt MASS}
package \cite{mass} includes Ripley's {\tt glmmPQL} function, and, adding to 
that, the presently discussed {\tt glmmML} package \citep{glmmML}. 

So what is the motivation for the package {\tt glmmML}? Because
it is different from the
others (except \texttt{lme4}) in that it, as the name implies, fits the
model via a direct maximum 
likelihood approach. The (marginal) likelihood function is a multi-dimensional
integral in the general case. Furthermore, fixed effects models can be
estimated efficiently through a profiling approach. This means that even
with a huge number of clusters, the estimation procedure is fast and exact.

The random effects version of {\tt glmmML}
only fits models with random intercepts. This means that the multiple
integral can be expanded into several one-dimensional integrals, and the
numerical integration of the log likelihood function by the Laplace or
Gauss-Hermite approximations is fast and accurate. Maximisation of the log
likelihood function is done using the \emph{optim} function {\tt vmmin} in
C code version. For 
this purpose we use the {\sc Linpack} 
Fortran subroutines {\tt dpoco}, {\tt dposl}, and {\tt dpodi},
combined with {\tt blas} routines. All are found within {\bf R}.

In the fixed effects model, testing is performed via a simple bootstrap. 
Under the null hypothesis of no cluster effect, the grouping factor can be
randomly permuted without changing the probability distribution. This is
the basic idea in estimating the $p$-value by simulation.

In Section 2 we define the likelihood function for the distributions we
consider, i.e., the binomial and the Poisson. We then consider fixed group
effects in Section~3, introducing the profile approach. In Section~4 we
introduce the random effects model with a symmetric mixture distribution. We
show how to construct the log-likelihood 
function and we derive the first and second partial derivatives. 

For comparisons of choice of random or fixed effects
of clustering, see the forthcoming paper by \cite{gbhh11}.

\section{The likelihood function}

Assume that there are $n$ clusters in our data, of sizes $n_i, i = 1,
\ldots, n$. For each cluster we observe
responses $(y_{i1}, \ldots, y_{in_i})$ and vectors of explanatory variables
$(\bx_{i1}, \ldots, \bx_{in_i})$, where $\bx_{ij}$ is a $p$-dimensional
vector with the first element identically equal to unity, corresponding to
the mean value of the random intercept. The random part, $u_i$, of the
intercept is 
assumed to follow a distribution with density 
\be
h(u; \sigma) = \frac{1}{\sigma}p\biggl(\frac{u}{\sigma}\biggr), \quad -\infty
< u < \infty,\; \sigma > 0,
\ee 
i.e., with location zero and scale $\sigma$. It is assumed that $u_1, \ldots, u_n$
are independent.

The conditional distribution of the response, given the random intercepts
$\beta_1 + u_i, i = 1, \ldots, n$, is assumed to follow a multivariate
distribution according to
\begin{multline}
\mbox{Pr}(Y_{ij} = y_{ij} \mid u_i; \bx) = \py, \\ 
\quad y_{ij} = 0, 1, \ldots; \; j = 1, \ldots, n_i, \; i = 1, \ldots, n.
\end{multline}
For instance, with the Bernoulli distribution and the
\emph{logit link}, we get
\begin{equation*}
P(x, y) = \frac{e^{xy}}{1 + e^x}, \quad y = 0, 1; \; -\infty < x < \infty,
\end{equation*}
and with the \emph{cloglog} link we have
\begin{equation*}
P(x, y) = \bigl(1 - \exp(-e^x)\bigr)^y \exp\bigl(-(1-y)e^x\bigr),
 \quad y = 0, 1; \; -\infty < x < \infty,
\end{equation*}
The Poisson distribution with \emph{log link} gives rise to
\begin{equation}\label{eq:poisson}
P(x, y) = \frac{e^{xy}}{y!} e^{-e^x}, \quad y = 0, 1, 2, \ldots; \; -\infty < x
< \infty
\end{equation}
These are in fact all the possibilities that are available in {\tt glmmML}
and will be considered in this paper. 

In the fixed effects model (and in the random effects model, if we
condition on these effects), the likelihood function becomes
\begin{equation}\label{eq:lik0}
L\big((\bb, \bg); \by, \bx\big) = \prod_{i = 1}^n
\prod_{j = 1}^{n_i} \pyg.
\end{equation}
 
The log likelihood function becomes
\begin{equation}\label{eq:loglik0}
\ell\big((\bb, \bg); \by, \bx\big) 
= \sum_{i = 1}^n \sum_{j=1}^{n_i}\log \pyg ,
%\frac{1}{\sigma}\phi(\frac{u_i}{\sigma})
\end{equation}

\section{Fixed group effects}

\subsection{Without profiling}

The partial derivatives with respect to $\beta_m, \; m = 1; \ldots, p$, of the log
likelihood function~\eqref{eq:loglik0} are: 
\begin{equation}\label{eq:parb3}
\begin{split}
U_m(\bb, \bg) &= 
\frac{\partial}{\partial \beta_m} 
\ell\big((\bb, \bg); \by, \bx\big)\\
&=
\sum_{i=1}^n \sum_{j=1}^{n_i} x_{ijm}\gGy, \quad m = 1, \ldots, p. 
\end{split}
\end{equation}
where
\begin{equation*}
G(x, y) = \frac{\partial}{\partial x} \log P(x, y) = \frac{
\frac{\partial}{\partial x}P(x, y)}{P(x, y)}
\end{equation*}
The partial derivatives with respect to $\gamma_i, \;i  = 1, \ldots, n$ of
~\eqref{eq:loglik0} are:
\begin{equation}\label{eq:paro2}
\begin{split}
U_{p + i}(\bb, \bg) &= 
\frac{\partial}{\partial \gamma_i} 
\ell\big((\bb, \bg); \by, \bx\big)\\
&=
\sum_{j=1}^{n_i} \gGy, \quad i = 1, \ldots, n.
\end{split}
\end{equation}

The Hessian, or minus the observed information, $-I(\bb, \bg)$ has entries
\begin{equation}\label{eq:hess1}
\begin{split}
-I_{ms}(\bb, \bg) &= \frac{\partial}{\partial\beta_s}U_m(\bb, \bg) \\
&= \sum_{i=1}^n\sum_{j=1}^{n_i} x_{ijm}x_{ijs} \Hy, \quad m, s = 1, \ldots, p,
\end{split}
\end{equation}
\begin{equation}\label{eq:hess12}
\begin{split}
-I_{(p+i)s}(\bb, \bg) &= \frac{\partial}{\partial\beta_s}U_{p+i}(\bb, \bg) \\
&= \sum_{j=1}^{n_i}x_{ijs} \Hy, \quad s = 1, \ldots, p;\; i =
1, \ldots, n.
\end{split}
\end{equation}
\begin{equation}\label{eq:hess2}
\begin{split}
-I_{(p+i)(p+i)}(\bb, \bg) &= \frac{\partial}{\partial\gamma_i}U_{p+i}(\bb,
\bg) \\ 
&= \sum_{j=1}^{n_i}\Hy, \quad i = 1, \ldots, n.
\end{split}
\end{equation}
\begin{equation*}
-I_{(p+i)(p+k)}(\bb, \bg) = 0, \quad k \ne i; \; k, i = 1, \ldots, n.
\end{equation*}
where
\begin{equation*}
H(x, y) = \frac{\partial}{\partial x} G(x, y)
\end{equation*}
When $n$ is large, we may utilise the fact that $I$ is partially diagonal.

\subsection{With profiling}
Setting \eqref{eq:paro2} equal to zero defines $\bg$ implicitly as
functions of $\bb$, $\gamma_i = \gamma_i(\bb), \; i = 1, \ldots, n$:
\begin{equation}\label{eq:impl}
F\big(\bb, \gamma_i(\bb)\big) = \sum_{j=1}^{n_i} \pGy = 0, \quad i = 1,
\ldots, n.
\end{equation}
Generally, we get no explicit form for $\gamma_i(\bb)$, but we can
calculate its partial derivatives via implicit derivation. From
\begin{equation*}
\frac{\partial}{\partial\beta_m}F\big(\bb, \gamma_i(\bb)\big) =
\frac{\partial\gamma_i}{\partial \beta_m} \frac{\partial F}{\partial
  \gamma_i} + \frac{\partial F}{\partial\beta_m} = 0
\end{equation*}
we get
\begin{equation}\label{eq:implscore}
\begin{split}
\dgdbm &= -\frac{\frac{\partial F}{\partial\beta_m}}
{\frac{\partial F}{\partial \gamma_i}} \\
&= -\frac{\sum_{j=i}^{n_i} x_{ijm} \Hy}{\sum_{j=1}^{n_i} \Hy}, \quad i = 1,
\ldots, n; \; m = 1, \ldots, p.  
\end{split}
\end{equation}


Replacing $\bg$ by $\bg(\bb)$ in \eqref{eq:loglik0} gives the profile log
likelihood $\ell^{(p)}$:
\begin{equation}\label{eq:profil}
\ell^{(p)}\big(\bb; \by, \bx\big) 
= \sum_{i = 1}^n \sum_{j=1}^{n_i}\log \pPy ,
%\frac{1}{\sigma}\phi(\frac{u_i}{\sigma})
\end{equation}


\subsubsection{Profile partial derivatives}
The partial derivatives with respect to $\beta_m, \; m = 1; \ldots, p$, of the log
profile likelihood function~\eqref{eq:profil} becomes: 
\begin{equation}\label{eq:pparb3}
\begin{split}
U_m^{(p)}(\bb) &= \frac{\partial}{\partial \beta_m} 
\ell^{(p)}(\bb; \by, \bx)\\
& =\sum_{i=1}^n \sum_{j=1}^{n_i} \bigg(x_{ijm} +
\frac{\partial\gamma_i(\bb)}{\partial\beta_m}\bigg)\pGy \\
&=  U_m\big(\bb, \bg(\bb)\big) +
%\sum_{i=1}^n \sum_{j=1}^{n_i} x_{ijm} \pGy \\
%&\quad +
\sum_{i=1}^n\frac{\partial\gamma_i(\bb)}{\partial\beta_m}
\sum_{j=1}^{n_i}\pGy \\
&=   U_m(\bb, \bg(\bb)),
\end{split}
\end{equation}
where the last equality follows from \eqref{eq:impl}.
 Thus we get
back the unprofiled partial derivatives \eqref{eq:parb3}.  
\subsubsection{The profile Hessian}

From \eqref{eq:pparb3} we get the Hessian, or minus the information matrix
\begin{equation}\label{eq:parhess}
\begin{split}
-I_{ms}^{(p)}(\bb) &= \frac{\partial}{\partial\beta_s} 
U_m\big(\bb, \bg(\bb)\big) \\
&= \sum_{i=1}^{n}\sum_{j=1}^{n_i} x_{ijm}\bigg(x_{ijs} + \dgdbs\bigg)\pHy
\\
&= \sum_{i=1}^n\sum_{j=1}^{n_i} x_{ijm} x_{ijs} H_{ij} \\
&\quad -\sum_{i=1}^n \frac{\sum_{j=1}^{n_i}x_{ijm} H_{ij} 
\sum_{j=1}^{n_i}x_{ijs}H_{ij}}
{\sum_{j=1}^{n_i}H_{ij}}, \\
&\qquad m, s = 1, \ldots, p.
\end{split}
\end{equation}  
where
\begin{equation*}
H_{ij} = \pHy, \quad j = 1, \ldots n_i; \; i = 1, \ldots, n.
\end{equation*} 

\subsubsection{At the maximum}
The following theorem by \citet{patefield} justifies the use of the
profile likelihood for statistical inference.

\begin{theorem}[Patefield] The inverse Hessians from the full likelihood 
and from the profile likelihood for $\bb$ are identical when 
\begin{equation*}
(\bg, \bb) = (\hat{\bg}, \hat{\bb}).
\end{equation*}

\end{theorem}

\subsection{Optimisation considerations}

There are a few practical things to note in the optimisation by
profiling. First, a new iteration step starts by solving the $n$ equations
given by setting \eqref{eq:paro2} equal to zero. The left-hand sides are
simple, monotone functions of one 
variable and easy and fast to solve numerically, but see below. This gives
$(\gamma_1, \ldots, \gamma_n)$, which 
are plugged into \eqref{eq:implscore}, \eqref{eq:pparb3}, and \eqref{eq:parhess}. 

Second, it is easy to see that clusters where all the responses are zero
can be removed from the calculations, after noting that the corresponding
$\gamma = -\infty$. Correspondingly, in the binomial case, clusters with
all responses equal to one can be removed and the corresponding $\gamma =
+\infty$. These extreme cases correspond to probabilities equal to zero and
one, respectively. 

In order to illustrate the performance boost of the profile approach over
using the standard {\tt glm} function in {\bf R}, consider the following
numerical example with 1000 clusters and five individuals in each, one
covariate:
%\begin{Schunk}
%\begin{Sinput}

\begin{verbatim}

> dat <- data.frame(y = rbinom(5000, size = 1, prob = 0.5), 
+     x = rnorm(5000), group = rep(1:1000, each = 5))
> system.time(fit1 <- glm(y ~ factor(group) + x, data = dat, 
+     family = binomial))
\end{verbatim}
%\end{Sinput}
%\begin{Soutput}
\begin{verbatim}
   user  system elapsed 
 86.430   0.840  87.546
\end{verbatim} 
%\end{Soutput}
%\begin{Sinput}
\begin{verbatim}
> library(glmmML)
> system.time(fit2 <- glmmboot(y ~ x, cluster = group, 
+     data = dat))
\end{verbatim}
%\end{Sinput}
%\begin{Soutput}
\begin{verbatim}
   user  system elapsed 
  0.080   0.010   0.097 
\end{verbatim}
The huge difference in computing time is not due to lack of enough computer
memory; the test was performed on a machine with 64 GB RAM. The resulting
parameter estimates, standard errors and $p$-values are indistinguishable.

\subsubsection{Profiling with the Poisson distribution}

When data are Poisson, the profiling can be made explicit. The log
likelihood function is, from \eqref{eq:poisson} and \eqref{eq:loglik0},
\begin{equation}\label{eq:poislog}
\ell \propto \sum_{i=1}^n\sum_{j=1}^{n_i}\bigl\{y_{ij}(\bb\bx_{ij} + \gamma_i) -
\exp(\bb\bx_{ij} + \gamma_i)\bigr\}, 
\end{equation}
and equations \eqref{eq:impl} become
\begin{equation*}
\sum_{j=1}^{n_i} \bigl\{y_{ij} - \exp(\bb\bx_{ij} + \gamma_i)\bigr\} = 0,
\quad i = 1, \ldots, n.
\end{equation*}
with solutions
\begin{equation}\label{eq:profsol}
\gamma_i = \log\biggl(\sum_{j=1}^{n_i}y_{ij}\biggr) - \log\biggl(\sum_{j=1}^{n_i}
\exp(\bb\bx_{ij})\biggr), \quad i = 1, \ldots, n.
\end{equation}
Inserting \eqref{eq:profsol} into \eqref{eq:poislog} and simplifying
results in the profile likelihood 
\begin{equation*}%\label{eq:profpois}
\ell^{(p)} \propto \sum_{i=1}^n\sum_{j=1}^{n_i} y_{ij} \biggl\{\bb\bx_{ij} -
\log\biggl(\sum_{j=1}^{n_i} \exp(\bb\bx_{ij})\biggr)\biggr\}
\end{equation*}
This is also recognized as a \emph{partial likelihood} \citep{cox75}. In
fact, when the responses $y_{ij}$ are indicators, zero or one, and the
clusters are interpreted as \emph{risk sets} at times when events occur,
the profile likelihood is identical to the partial likelihood in Cox
regression (with Breslow's approximation for ties). This was observed by
\cite{joh83}. 

\section{Random group effects}
We assume conditional independence, and, conditionally on \bu ($= \bg$), an ordinary
logistic regression model with \emph{offsets} $\vect{u} = (u_1, \ldots, u_n)$. 
Since $\vect{u}$ is unobserved, the unconditional likelihood function is of
greater interest, and we get it by ``integrating out'' \vect{u}:
\begin{equation*}
L\big((\bb, \sigma); \by, \bx\big) = \idotsint_{R^n} \prod_{i = 1}^n
\prod_{j = 1}^{n_i} \py \frac{1}{\sigma}p\biggl(\frac{u_i}{\sigma}\biggr) du_1\,\cdots\,du_n.
\end{equation*}
Due to independence, this $n$-dimensional integral can be written as a
product of $n$ simple integrals:
\begin{equation}\label{eq:lik}
L\big((\bb, \sigma); \by, \bx\big) = \prod_{i = 1}^n \int_{-\infty}^{\infty} 
p(u) \prod_{j = 1}^{n_i} \Py  du,
%\frac{1}{\sigma}\phi(\frac{u_i}{\sigma})
\end{equation}
where a variable substitution ($u \rightarrow \sigma u$) has
taken place.
 
The log likelihood function thus becomes
\begin{equation}\label{eq:loglik}
\ell\big((\bb, \sigma); \by, \bx\big) 
= \sum_{i = 1}^n \log 
\int_{-\infty}^{\infty} 
p(u) \prod_{j = 1}^{n_i} \Py  du,
%\frac{1}{\sigma}\phi(\frac{u_i}{\sigma})
\end{equation}
and the remaining part of this section is devoted to the problem of
maximising~\eqref{eq:loglik} with respect to $(\bb, \sigma)$. For this we will need the
score vector, 
and for estimation of the
variance-covariance matrix of the parameter estimates we will use the
Hessian of \eqref{eq:loglik}, or rather an approximation thereof.

For the numerical evaluation of integrals we use quadrature methods,
briefly described in Subsections 4.3 and 4.4. For more detail, consult,
e.g., \citet{gray02}. There are two ways to proceed: (i) Calculate the
analytic partial first and second order derivatives of the log likelihood
function \eqref{eq:loglik} and make numerical approximations of them, and
(ii) from the approximation of \eqref{eq:loglik}, calculate the analytic
partial first and second order derivatives. We follow the latter route. 

In the next subsection we introduce the Laplace transform of this problem,
and after that the Gauss-Hermite approximation is introduced. The latter is a
generalisation of the former. In both cases, a fully adaptive version is
implemented.  

\subsection{The Laplace approximation}

The integrals will be evaluated by Laplace approximation. We follow the
approach of approximating only the log-likelihood function and from there
calculate all the necessary derivatives.

\subsubsection{The log-likelihood function}

We look at one
group, i.e., a fixed $i$ for the moment. Let
\begin{equation*}
p(u) \prod_{j = 1}^{n_i} \Py = \exp\{g(u, \btheta)\},
\end{equation*}
with $\btheta = (\bb, \sigma)$. Then the integral in~\eqref{eq:loglik} can
be written, using the Laplace approximation, as
\begin{equation*}
\int_{-\infty}^{\infty} \exp\{g(u, \btheta)\} du \approx \sqrt{2
  \pi}\hat{\omega} \exp\{g(\hat{u}, \btheta)\},
\end{equation*}
where $\hat{\omega}$ and $\hat{u}$ are defined below. For that we need
some partial derivatives of $g$.
\begin{equation}\label{eq:gder}
\begin{split}
g(u, \btheta) &= \log\{p(u)\} + \sum_{j=1}^{n_i} \log
P(\bb x_{ij} + \sigma u, y_{ij}), \\
g_u(u, \btheta) &= \frac{\partial}{\partial u}\log\{p(u)\}
 + \sigma \sum _{j=1}^{n_i} 
G(\bb x_{ij} + \sigma u, y_{ij}), \\
g_{uu}(u, \btheta) &= \frac{\partial^2}{\partial u^2}\log\{p(u)\} + \sigma^2 \sum _{j=1}^{n_i} 
H(\bb x_{ij} + \sigma u, y_{ij}), \\
\end{split}
\end{equation}
where 
\be
\begin{split}
G(x, y) &= \frac{\partial}{\partial x} \log P(x, y) \\
H(x, y) &= \frac{\partial}{\partial x} G(x, y) \\
%I(x, y) &= \frac{\partial}{\partial x} H(x, y) \\
%K(x, y) &= \frac{\partial}{\partial x} IP(x, y) \\
\end{split}
\ee
Now, $\hat{u}$ is defined as 
\begin{equation}\label{eq:equhat}
\hat{u} = \hat{u}(\btheta) = \text{argmax}_u g(u, \btheta),
\end{equation}
(where we emphasise that $\hat{u}$ is a function of $\btheta$), i.e., we
also have 
\begin{equation}\label{eq:uhat}
g_u(\hat{u}, \btheta) = 0.
\end{equation}
Then $\hat{\omega}$ is defined as
\begin{equation}\label{eq:sigmahat}
\begin{split}
\hat{\omega} = \hat{\omega}(\btheta) &= \sqrt{-\frac{1}{g_{uu}(\hat{u},
    \btheta)}} \\ 
&= \biggl(\frac{d^2}{du^2} \log\{p(u)\}\mid_{u=\hat{u}} - \sigma^2 \sum _{j=1}^{n_i}
H(\bb x_{ij} + \sigma \hat{u}, y_{ij})\biggr)^{-\frac{1}{2}},
\end{split}
\end{equation}
which also gives the relation
\be
g_{uu}(\hat{u}, \btheta) = - \frac{1}{\hat{\omega}^2(\btheta)}
\ee
Thus, the contribution to the log likelihood from the $i$th group is
approximated by
\begin{equation}\label{eq:laplik}
\begin{split}
\ell_i(\btheta) &\approx 0.5 \log(2 \pi) + \log\{\hat{\omega}(\btheta)\} +
g(\hat{u}(\btheta), \btheta) \\
&= 0.5 \log(2 \pi) -0.5 \log\{g_{uu}(\hat{u}(\btheta), \btheta)\} +
g(\hat{u}(\btheta), \btheta) \\
\end{split} 
\end{equation}

\subsubsection{The score vector}

In the maximisation of 
\begin{equation*}
\ell(\btheta) = \sum_{i=1}^n \ell_i(\btheta),
\end{equation*}
we will make use of the score vector. For that purpose we will need the partial
derivatives of $\hat{u}(\btheta)$ and $\hat{\omega}(\btheta)$. From \eqref{eq:uhat} we get, by implicit
differentiation, 
\be\label{eq:uhatprim}
\begin{split}
\frac{\partial}{\partial \btheta} g_u(\hat{u}(\btheta), \btheta) &=
g_{uu}(\hat{u}, \btheta) \frac{\partial \hat{u}}{\partial \btheta}  +
\frac{\partial g_u}{\partial \btheta}\\
 &= \frac{\partial \hat{u}}{\partial \btheta}  
g_{uu}(\hat{u}, \btheta) +
g_{u\btheta}(\hat{u}, \btheta) \\
&= \hat{u}_{\btheta}(\btheta) g_{uu}(\hat{u}, \btheta) + g_{u\btheta}(\hat{u},
\btheta) = 0  
\end{split}
\ee
 which gives
\be
\hat{u}_{\btheta}(\btheta) = \frac{\partial \hat{u}(\btheta)}{\partial
  \btheta} =  
-\frac{g_{u\btheta}(\hat{u}, \btheta)}{g_{uu}(\hat{u}, \btheta)}
= \hat{\omega}^2(\btheta) g_{u\btheta}(\hat{u}, \btheta) = 
\hat{\omega}^2(\btheta) g_{u\btheta}(\hat{u}(\btheta), \btheta) 
\ee

In calculating the partial first order derivatives, we utilise the formula
\be\label{eq:imp}
\begin{split}
\frac{\partial}{\partial \btheta} \ell(\hat{u}(\btheta), \btheta) &=
\frac{\partial}{\partial \btheta}\log\hat{\omega}(\btheta) +
\hat{u}_{\btheta} g_u  + g_{\btheta}\\
&= \frac{\hat\omega_{\btheta}}{\hat{\omega}} +
\hat{u}_{\btheta} g_u  + g_{\btheta}.
\end{split}
\ee 
So, we will need
\be
\begin{split}
\frac{\partial}{\partial \btheta} \hat{\omega}({\hat{u}}(\btheta), \btheta) = \hat{\omega}_{\btheta} &= 
\frac{1}{2} (-g_{uu})^{-\frac{3}{2}}\{\hat{u}_{\btheta}g_{uuu} +
g_{uu\btheta}\} \\
&= \frac{1}{2} \hat{\omega}^3 \{\hat{u}_{\btheta}g_{uuu} +
g_{uu\btheta}\}  
\end{split}
\ee
\subsubsection{The Hessian}

The Hessian will be needed for
variance estimation. For that purpose we will need the partial
derivatives of second order of $\hat{u}(\btheta)$. Therefore, we continue by
calculating the partial derivatives of \eqref{eq:uhatprim} 
 with respect to $\btheta^{\prime}$.
\begin{equation*}
\begin{split}
\frac{\partial^2}{\partial \btheta \partial \btheta^{\prime}}
g_u(\hat{u}(\btheta), \btheta) &= \hat{u}_{\btheta\btheta^{\prime}} g_{uu} +
\hat{u}_{\btheta} (\hat{u}_{\btheta^{\prime}}g_{uuu} +
g_{uu\btheta^{\prime}})\\ 
& \quad \; + \hat{u}_{\btheta^{\prime}} g_{uu\btheta} +
g_{u\btheta\btheta^{\prime}} \\ 
&= 0.
\end{split}
\end{equation*}
Solving for $\hat{u}_{\btheta\btheta^{\prime}}$ gives
\begin{equation*}
\hat{u}_{\btheta\btheta^{\prime}} = \frac{\partial}{\partial
  \btheta^{\prime}} \hat{u}_{\btheta}(\btheta) = \hat{\omega}^2 (\hat{u}_{\btheta}\hat{u}_{\btheta^{\prime}} g_{uuu} + 
\hat{u}_{\theta} g_{uu\btheta^{\prime}} +
\hat{u}_{\theta^{\prime}} g_{uu\btheta} + g_{u\btheta\btheta^{\prime}}).
\end{equation*}

We will also need 
\begin{equation*}
\begin{split}
\frac{\partial^2}{\partial \btheta \btheta^{\prime}} \hat{\omega}(\btheta)
&= \frac{3}{4} 
\hat{\omega}^5 (\hat{u}_{\btheta^{\prime}}g_{uuu} + g_{uu\btheta^{\prime}})
( \hat{u}_{\btheta} g_{uuu} + g_{uu\btheta}) \\
& + \frac{1}{2} \hat{\omega}^3 (
\hat{u}_{\btheta\btheta^{\prime}} g_{uuu} + \hat{u}_{\btheta}
\hat{u}_{\btheta^{\prime}} g_{uuuu} + \hat{u}_{\btheta}
g_{uuu\btheta^{\prime}} + \hat{u}_{\btheta\btheta^{\prime}}
  g_{uuu\btheta} + g_{uu\btheta\btheta^{\prime}}).
\end{split}
\end{equation*}
From \eqref{eq:imp}, with the same formula, we get
\begin{equation*}
\begin{split}
\frac{\partial^2}{\partial \btheta \partial \btheta^{\prime}}
\ell(\hat{u}(\btheta), \btheta) &= 
\frac{\partial}{\partial \btheta^{\prime}}\biggl\{
\frac{\hat{\omega_{\btheta}}}{\hat{\omega}} + 
\hat{u}_{\btheta} g_u  + g_{\btheta}\biggr\}
\\
&= \frac{\hat{\omega}_{\btheta\btheta^{\prime}} \hat{\omega} -
  \hat{\omega}_{\btheta}\hat{\omega}_{\btheta^{\prime}}}{\hat{\omega}^2} + 
\hat{u}_{\btheta\btheta^{\prime}}g_u \\
& + \hat{u}_{\btheta}(
\hat{u}_{\btheta^{\prime}} g_{uu} + g_{u\btheta^{\prime}}) + 
\hat{u}_{\btheta^{\prime}} g_{u\btheta} +
g_{\btheta\btheta^{\prime}}. 
\end{split}
\end{equation*}

All the necessary derivatives can be found in~\ref{sec:appendix}. 

\subsection{The Gauss-Hermite approximation}

The Gauss-Hermite approximation can be viewed upon as a generalisation of
the Laplace approximation; instead of using just a single point, the
approximation is built around approximations in several points. More
specifically, the formula is 
\begin{equation}\label{eq:gh}
\int_{-\infty}^{\infty} \exp\{g(u, \btheta)\} du \approx
\sqrt{2\pi}\hat{\omega}\sum_{i=1}^nh_i \exp\{g(\hat{u} + \sqrt{2\pi}\hat{\omega}
x_i, \btheta) + x_i^2\}, 
\end{equation}
where $x_1, \ldots, x_n$ and $h_1, \ldots, h_n$ are the \emph{abscissas}
and \emph{weights} of the transform, and $n$ is the number of quadrature
points. When $n = 1$, this coincides with the Laplace approximation.

The constants (but functions of $\theta$) $\hat{u}$ and $\hat{\omega}$ are
the same as in the Laplace transform of Section 4.3. The calculations of
the approximations of the first and second order partial derivatives (or
vice versa) are straightforward (cf. Section 4.3), and we omit them here. 
They are implemented in the {\bf R} package {\tt glmmML}.

%%\end{document}

% \section{Comparisons}

% \subsection{Preliminaries}

% In the simulation study, two questions are asked; (i) How should the presence
% or non-presence of a clustering effect be tested? (ii) Regarding the
% (possible) clustering effect as a nuisance, which method gives best
% performance of fixed parameter estimates? In all simulations, the fixed
% effects and the random effects models are compared, given that simulated
% data comes from a random effects model with a Normal random effects
% distribution.

% In Section~\ref{sec:simulation}, details about the simulation are given,
% and in the following two subsections, results from the simulations are
% presented. 

% \subsubsection{Simulation}\label{sec:simulation}

% The responses, $Y_{ij}, \quad j = 1, \ldots, n_i, \quad i = 1, \ldots, n$,
% in the simulations are Bernoulli distributed with success parameters
% $P_{ij}$. Two mixed effects models were used to simulate data, 
% one including an explanatory variable and the other with
% just an intercept;
% \begin{equation}\label{eq:covmodel}
% \begin{split}
% \logit(P_{ij}) &= \log\biggl\{\frac{P_{ij}}{1 - P_{ij}}\biggr\} \\
% &= \beta_1+\beta_2 x_{ij}+u_{i}, \quad i = 1, \ldots, n; \; j = 1, \ldots, n_i,
% \end{split}
% \end{equation}
% and 
% \begin{equation}\label{eq:model}
% \logit(P_{ij})=\beta_1+u_{i}, \quad i = 1, \ldots, n; \; j = 1, \ldots,n_i,
% \end{equation}
% respectively. For both these models, $u_1, \ldots, u_n$ are independent and Gaussian 
% distributed with mean 0 and variance $\sigma^2_{u}$. A cluster effect is shared by all the members 
% of a cluster. In all simulations, we keep  
% \begin{equation*}
% \theta = \frac{e^{\beta_1}}{1 + e^{\beta_1}} = 0.85
% \end{equation*}
% fixed (implies $\beta_1 \approx 1.7346$). This makes
% the results comparable to the results by \citet{murphydunne05}. In the model including
% a covariate, the covariate is either $\text{Bernoulli}(0.5)$ or uniformly
% distributed on $(-1, 1)$. 
% The response for each individual is the outcome of
% a Bernoulli trial using the individual specific probability. A number of clusters, with a fixed 
% number of cluster members, were simulated using both these models. For each parameter configuration, 150 
% samples were drawn. All configurations presented in
% Table~\ref{tab:simncov} and Table~\ref{tab:simcov} were simulated with two
% different variances for the mixing distribution, 
% $\sigma^2_{u}=0$ and $\sigma^2_{u}=2$. In total sixteen parameter
% configurations were used in the simulations,
% eight with just the intercept and eight with an extra covariate.


% \begin{table}
% \caption[Simulation design with no covariate]{Simulation design for data with 
% no explanatory variable. All four settings were simulated with two values
% of the variance of 
% the mixing distribution, $\sigma_{u}^2=0$ and $\sigma_{u}^2=2$.}
% \label{tab:simncov} %for ref.
% \small
% \centering
% \begin{tabular}{ccc}
% \hline%\hline
% Number of clusters &   Cluster size \\
% \hline
%        40       &        2\\
%       100       &        2\\
%        40       &        5\\
%       100       &        5\\
% \hline
% \end{tabular}
% \end{table}

% \vspace{\smallskipamount}


% \begin{table}
% \caption[Simulation design with one covariate]{Simulation design for data with an 
% explanatory variable. All four settings were simulated with two values of the variance of the 
% mixing distribution, $\sigma_{u}^2=0$ and $\sigma_{u}^2=2$.}
% \label{tab:simcov}
% \small
% \begin{tabular}{ccccccc}
% \hline%\hline
% \multicolumn{3}{c}{Bernoulli covariate} & &\multicolumn{3}{c}{Uniform covariate}\tabularnewline
% Number of clusters &  & Cluster size &  & Number of clusters &  & Cluster size\tabularnewline
% \hline
% 100 &  & 2 &  & 100 &  & 2\tabularnewline
% 500 &  & 5 &  & 500 &  & 5\tabularnewline
% \hline
% \end{tabular}
% \end{table}


% \subsubsection{Performance measures of the estimators}

% We use two performance measures of the estimators, bias and root mean
% squared error (RMSE), with
% \begin{equation}
% \text{RMSE}=\sqrt{\frac{1}{N}\sum_{i=1}^{n}(\hat{\theta_{i}}-\theta)^2}=\sqrt{\text{se}^2(\hat{\theta})+\text{bias}^2(\hat{\theta})}
% \end{equation}
% Like \citet{murphydunne05} we use the same notation for the parameters in both the 
% fixed effects models (FE) and the mixed effects models (ME) despite the
% difference in interpretation between the two. In the case with no
% covariates, $\theta$ estimated via the fixed effects
% model equals the mean across all individuals, while in the mixed model
% $\theta$ should be interpreted as the probability of an event for 
% an individual which has a zero random effect.


% \subsubsection{Estimation}

% We estimate the model parameters in three different ways, first a fixed
% effects model (FE).
% The FE estimator of $\theta$ is the sample mean. 
% With a mixing distribution present this is a biased estimator of $\theta$ 
% \cite{murphydunne05}. For a normal mixing distribution the true average probability is given by
% \begin{equation}
% \int_{-\infty}^{\infty} \frac{e^{(\theta+u_i)}}{1+e^{(\theta+u_i)}} \frac{e^{\frac{-.5u^2_i}{\sigma_{u}^2}}}{\sqrt{2\pi\sigma_{u}^2}}du_i
% \end{equation}

% Second and third, mixed effects models are fitted by maximum likelihood. The necessary
% numerical integration is performed either by a Laplace approximation or Gauss-Hermite quadrature. 
% The number of quadrature points used in the Gauss-Hermite approximation
% is eight. Mixed effect models fitted via the Laplace and Gauss-Hermite
% approximations 
% are denoted ME(lap) and ME(GHQ), respectively, in text, figures and tables.


% \subsection{Testing the presence of a clustering effect}\label{sec:testing}

% Testing whether there is a clustering effect or not can be done in
% essentially two ways. The first way is to fit a mixed effects model and
% test the hypothesis $H_0: \sigma_{u} = 0$. It is superficially straightforward
% to do, since the glmmML function reports an estimate of $\sigma_{u}$ together
% with a standard error. We can thus form a sort of $t$ statistic as the
% ratio between the two. However, the null hypothesis is on the border of the
% parameter space, and therefore the standard asymptotic distributional
% properties are not present. In the simulations we use the often suggested
% approach of simply doubling the (naively) calculated $p$-value.

% The second way is to regard the cluster effects as fixed and perform a
% likelihood ratio test. Since the asymptotic theory breaks down in this case
% (the number of parameters grows with sample size), $p$-values are estimated
% by a bootstrap procedure.

% In the comparison presented here, there are no covariates but
% the clustering. The 
% results are shown in Table~\ref{tab:nocov}. 

% \begin{table}[ht!]
% \caption[Test of cluster effects in the absence of other covariates.]
% {Test of cluster effects in the absence of other covariates. The
%  number of replicates is 1000, the number of clusters is 100, and the
%  nominal level of significance is 5\%.}
% \label{tab:nocov}
% \small
% \begin{center}
% \begin{tabular}{rr|rr}
%  & & \multicolumn{2}{c}{Effect, $p$-value (\%)} \\
% Cluster size & $\sigma$ & Random  & Fixed \\ \hline
% 2 & 0.0 & 2.5 & 3.6 \\
% 2 & 0.5 & 7.5 & 9.8 \\
% 2 & 1.0 & 37.0 & 43.3 \\
% &&&\\
% 5 & 0.0 & \emph{8.9} & 3.2\\
% 5 & 0.5 & 55.5 & 37.4 \\
% 5 & 1.0 & 100.0 & 99.4 \\
% &&&\\
% 10 & 0.0 & \emph{6.7} & 3.3 \\
% 10 & 0.5 & 92.2 & 77.5 \\
% 10 & 1.0 & 100.0 & 99.4
% \end{tabular}
% \end{center}
% \end{table}


% When the cluster size is small
% (2), the fixed effects test has better performance in terms of higher
% power, but both seem to be very conservative. When cluster size is higher (5
% or 10), the random effects test violates the significance level, i.e., it
% rejects the null hypothesis, when it is true, too often, see also
% Figure~\ref{fig:clustest}. 
% \begin{figure}[t]
% \begin{center}
% \includegraphics[width=0.8\textwidth]{R/clustest.eps}
%  \caption[Test of clustering, power.]{Test of clustering, power. 1000
%    replicates, $\sigma = 0, 0.5, 1$, 100 clusters of sizes 2, 5, or
%    10. Solid line is the random effects model, the dashed line is the fixed
%  effects model.}
% \label{fig:clustest}
% \end{center}
% \end{figure}
% We have done the same kind of analyses for situations where covariates are
% present, but since the results were very close to the situation without
% covariates, they are not presented here.

% %In the second comparison, there is one covariates added to the clustering. The
% %results are shown in Table~\ref{tab:onecov}.

% %\begin{table}
% %\caption[Test of cluster effects in the presence of one covariate.]
% %{Test of cluster effects in the presence of one covariate. The
% %  number of replicates is 1000, the number of clusters is 100, and the
% %  nominal level of significance is 5\%.}
% %\label{tab:onecov}
% %\begin{center}
% %\begin{tabular}{rr|rr}
% % & & \multicolumn{2}{c}{Effect, $p$-value (\%)} \\
% %Cluster size & $\sigma$ & Random  & Fixed \\ \hline
% %2 & 0.0 & 3.0 & 1.5 \\
% %2 & 0.5 & 3.7  & 3.0 \\
% %2 & 1.0 & 3.4 & 2.1 \\
% %&&&\\
% %5 & 0.0 & 7.2 & 0.4 \\
% %5 & 0.5 & 8.0 & 1.0 \\
% %5 & 1.0 & 5.4 & 0.9 \\
% %&&&\\
% %10 & 0.0 & 7.5 & 0 \\
% %10 & 0.5 & 7.4 & 0.2 \\
% %10 & 1.0 &  &
% %\end{tabular}
% %\end{center}
% %\end{table}

% The distribution of the $p$-value for the test based on the random effects
% model under the null hypothesis is far from uniform, see
% Figure~\ref{fig:clustest}. An explanation for this odd phenomenon is that it is
% an irregular case with the null hypothesis on the border of the parameter
% space. Moreover, under the null, the estimator of the random effects
% variance has a (not so small) positive probability of being exactly zero.

% % The 
% % \begin{figure}[ht!]\label{fig:n.5}
% % \includegraphics[width=\textwidth]{R/s05.100.2.eps}
% % \caption[$\sigma = 0.5$]{1000 replicates, $\sigma = 0.5$, 100 clusters of size five each. Estimated
% %   $p$-values (power) are 7.5\% for the mixed effects and 9.8\% for the fixed
% %   effects model. Nominal level is 5\%.}
% % \end{figure}

% % \begin{figure}[ht!]\label{fig:ett}
% % \includegraphics[width=\textwidth]{R/s1.100.2.eps}
% % \caption[$\sigma = 1$]{1000 replicates, 100 clusters of size five each. Estimated
% %   $p$-values (power) are 37.0\% for the mixed effects and 43.3\% for the fixed
% %   effects model. Nominal level is 5\%.}
% % \end{figure}

% % \begin{figure}[ht!]\label{fig:null}
% % \includegraphics[width=\textwidth]{R/s0.100.5.eps}
% % \caption[$\sigma = 0$]{1000 replicates, $\sigma = 0$, 100 clusters of size five each. Estimated
% %   $p$-values are 8.9\% for the mixed effects and 3.2\% for the fixed
% %   effects model. Nominal level is 5\%.}
% % \end{figure}

% % The 
% % \begin{figure}[ht!]\label{fig:n.5}
% % \includegraphics[width=\textwidth]{R/s05.100.5.eps}
% % \caption[$\sigma = 0.5$]{1000 replicates, $\sigma = 0.5$, 100 clusters of size five each. Estimated
% %   $p$-values (power) are 55.5\% for the mixed effects and 37.4\% for the fixed
% %   effects model. Nominal level is 5\%.}
% % \end{figure}

% % \begin{figure}[ht!]\label{fig:ett}
% % \includegraphics[width=\textwidth]{R/s1.100.5.eps}
% % \caption[$\sigma = 1$]{1000 replicates, 100 clusters of size five each. Estimated
% %   $p$-values (power) are 100\% for the mixed effects and 99.4\% for the fixed
% %   effects model. Nominal level is 5\%.}
% % \end{figure}

% % \begin{figure}[ht!]\label{fig:null}
% % \includegraphics[width=\textwidth]{R/s0.100.10.eps}
% % \caption[$\sigma = 0$]{1000 replicates, $\sigma = 0$, 100 clusters of size ten each. Estimated
% %   $p$-values are 6.7\% for the mixed effects and 3.3\% for the fixed
% %   effects model. Nominal level is 5\%.}
% % \end{figure}

% % The 
% % \begin{figure}[ht!]\label{fig:n.5}
% % \includegraphics[width=\textwidth]{R/s05.100.10.eps}
% % \caption[$\sigma = 0.5$]{1000 replicates, $\sigma = 0.5$, 100 clusters of size ten each. Estimated
% %   $p$-values (power) are 92.9\% for the mixed effects and 77.5\% for the fixed
% %   effects model. Nominal level is 5\%.}
% % \end{figure}

% % \begin{figure}[ht!]\label{fig:ett}
% % \includegraphics[width=\textwidth]{R/s1.100.10.eps}
% % \caption[$\sigma = 1$]{1000 replicates, 100 clusters of size ten each. Estimated
% %   $p$-values (power) are 100\% for the mixed effects and 99.4\% for the fixed
% %   effects model. Nominal level is 5\%.}
% % \end{figure}


% \subsection{Bias and MSE of parameter estimators}

% \subsubsection{Estimation of $\theta$, data with no cluster effect}
% Looking at the tabulated estimates of $\theta$ in Table~\ref{tab:est_s0} we can see
% that the FE estimator outperform the ME estimators when there is in fact no clustering
% present. In all four settings presented, the FE performed best with respect to RMSE, the 
% Laplace estimator consistently performed worst of all three estimators. This was expected
% since Laplace approximation of the log likelihood function is unreliable when the number of
% observations per cluster is small.

% \begin{table}
% \caption[Performance of the estimators for $\theta$ (no cluster effect).]
% {Performance of the estimators of $\theta$ (no cluster effect, $\sigma_u^2~=~0$).
% Numbers presented are means based on 150 replicates.}
% \label{tab:est_s0} %for ref.
% \small
% \centering
% \begin{tabular}{ccccc}
% \hline %\hline 
% Cluster size & No. of clusters    &Model   &Bias     &RMSE\tabularnewline
% \hline
%   &      &FE      &-0.00608 &0.03785\tabularnewline
% 2 & 40  &ME(lap) &0.02126  &0.06370\tabularnewline
%    &     &ME(GHQ) &0.01256  &0.04857\tabularnewline
% \tabularnewline
%     &    &FE      &0.00083  &0.02524\tabularnewline
% 2 & 100 &ME(lap) &0.01928  &0.04813\tabularnewline
%       &  &ME(GHQ) &0.01196  &0.03517\tabularnewline
% \tabularnewline
%       &  &FE      &0.00230  &0.02486\tabularnewline
% 5 & 40 &ME(lap) &0.00716  &0.02654\tabularnewline
%     &    &ME(GHQ) &0.00685  &0.02626\tabularnewline
% \tabularnewline
%      &   &FE      &-0.00343 &0.01793\tabularnewline
% 5 & 100 &ME(lap) &0.00132  &0.01894\tabularnewline
%     &    &ME(GHQ) &0.00095  &0.01859\tabularnewline
% \hline
% \end{tabular}
% \end{table}

% \subsubsection{Estimation of $\theta$, data with cluster effect}
% In the scenario where we have clustering, the estimators perform in a different manner.
% Bias and RMSE are presented in Table \ref{tab:est_s2}, where ME(GHQ) gives the best estimate of
% $\theta$ with respect to RMSE, and has smallest bias in three of the cases. What is notable is 
% that despite the fact that FE is a biased estimator for $\theta$ when a mixing distribution is 
% present, it performs well in terms of standard error compared with the ME estimators. 
% One could say that FE has better precision but worse accuracy than the ME estimators. 
% For cluster size 2 with 40 or 100 clusters the ME(lap) has a considerably smaller bias 
% than the FE, but with respect to RMSE they do not differ that much. 
% This is due to the fact that FE has much smaller standard error.

% \begin{table}
% \caption[Performance of the estimators for $\theta$ (with cluster effect,
% $\sigma_u^2~=~2$).]
% {Performance of the estimators of $\theta$ (with cluster effect $\sigma_u^2~ =~2$).
% Numbers presented are means based on 150 replicates.}
% \label{tab:est_s2} %for ref.
% \small
% \centering
% \begin{tabular}{ccccc}
% \hline% \hline 
% Cluster size & No. of clusters    &Model   &Bias     &RMSE\tabularnewline
% \hline
%     &    &FE      &-0.06758 &0.08569\tabularnewline
% 2 & 40  &ME(lap) &0.01840  &0.09213\tabularnewline
%     &    &ME(GHQ) &0.00562  &0.07353\tabularnewline
% \tabularnewline
%      &   &FE      &-0.05977 &0.06790\tabularnewline
% 2 & 100 &ME(lap) &0.01468  &0.06466\tabularnewline
%     &    &ME(GHQ) &0.00598  &0.04593\tabularnewline
% \tabularnewline
%      &   &FE      &-0.07183 &0.08557\tabularnewline
% 5 & 40 &ME(lap) &-0.00773 &0.05207\tabularnewline
%       &  &ME(GHQ) &-0.00827 &0.04994\tabularnewline
% \tabularnewline
%     &    &FE      &-0.06996 &0.07390\tabularnewline
% 5 & 100 &ME(lap) &-0.00263 &0.02990\tabularnewline
%     &    &ME(GHQ) &-0.00252 &0.02885\tabularnewline
% \hline
% \end{tabular}
% \end{table}


% In Table \ref{tab:est_s2} the values are means of 150 replicates, in Figures \ref{fig:lap_ghq2} 
% and \ref{fig:lap_ghq5} we can see the estimates from each of the 150 replicates. Here it becomes obvious that the Laplace
% method has problems when we have few observations in each cluster or the cluster effect variance is large. 
% This weakness is easiest to observe in Figure~\ref{fig:lap_ghq2}, the lower left plot.

% \begin{figure}[t]
% \begin{center}
% \includegraphics[totalheight=0.5\textheight]{thet_fams2.eps}
% \end{center}
% \caption[Laplace vs. Gauss-Hermite, cluster size 2]
% {Laplace vs.\ Gauss-Hermite, cluster size two. The number of clusters is 40
%   (upper panels)
%   and 100 (lower panel), the random effect variance is 0 (left panels) and
%   2 (right panels), 150 replicates for each combination,
% estimates of $\beta_1$. Illustrates how ME(lap) overestimates $\beta_1$ compared
% to ME(GHQ) when cluster size is small.}
% \label{fig:lap_ghq2}
% \end{figure}


% A group of points, eight to be exact, clearly deviates from the rest. 
% What the data underlying these points have in common are that all have 80 or more
% clusters with two events. In Figure~\ref{fig:lap_ghq5} cluster size is
% increased to five and the ME(lap) estimators tendency to overestimate 
% is less pronounced. Only in the most extreme case with 40 clusters and the variance of the cluster effect
% set to 2 can a slight tendency of overestimation be observed.


% \begin{figure}[t]
% \begin{center}
% \includegraphics[totalheight=0.5\textheight]{thet_fams5.eps}
% \end{center}
% \caption[Laplace vs. Gauss-Hermite, cluster size 5]
% {Estimates of $\hat{\beta_1}$  for Laplace vs.\ Gauss-Hermite, cluster size is five. No.\ of clusters is 40
%   (upper panels) or 100 (lower panels). $\sigma^2_u = 0$ (left panels) or 2
%   (right panels). Illustrates
% how the estimates by Me(lap) approach the Me(GHQ) estimates when cluster
% size is larger. Compare to Figure~\ref{fig:lap_ghq2}.} 

% \label{fig:lap_ghq5}
% \end{figure}


% % \subsubsection{Estimation of $\theta$ with covariate present}
% % Data is generated by \eqref{eq:covmodel} with $x_{ij}$ either a dichotomous covariate (dc) or a covariate drawn 
% % from a uniform distribution (uc). For all data generated by this model $\beta_2$ is equal to 1. In Table 
% % \ref{tab:perB0} we can see that for cluster size 2 the ME(lap) estimator perform worst both with regard 
% % to bias and RMSE. While the FE estimator bias-wise performs on par with ME(GHQ) when the covariate is binary,
% % ME(GHQ) performs better relative to FE when the covariate is uniform. This is due to symmetry around 0, FE does not take
% % the covariate into account when estimating $\theta$. 

% % When cluster size is increased to 5, ME(GHQ) is best with respect to both RMSE and bias.
% % Both the fitted ME models give a considerably smaller bias when estimating the fixed effect component,
% % but when looking at the relation between bias and RMSE it is clear that the standard error of FE is much
% % smaller.

% % For data with a cluster effect, Table \ref{tab:perB2}, the ME(GHQ) performs the best. In three of the data
% % configuration ME(GHQ) is best in bias sense, but in the case with 100 clusters of size 2 with uniform covariate 
% % ME(lap) has smaller bias. But with respect to RMSE ME(GHQ) is better, this is even more obvious in Figure
% % \ref{fig:lap_ghq2beta} (lower right plot) where ME(lap) produces 14
% % estimates larger the 6. 


% % \begin{figure}[t]
% % \begin{center}
% % \includegraphics[totalheight=0.5\textheight]{thet_fams2beta.eps}
% % \end{center}
% % \caption[Laplace vs. Gauss-Hermite, model with covariate, cluster size 2]
% % {Plot title, a.b.c where a=number of cluster members,
% %  b=number of clusters, c=variance of cluster effect. 
% % 150 estimates of $\beta_1$ from a mixed effects model with no covariate,
% % fitted via Laplace approximation of the likelihood plotted against it's 
% % Gauss-Hermite quadrature counterpart.}
% % \label{fig:lap_ghq2beta}
% % \end{figure}

% % On a probability
% % scale observations that extreme equals a probability of 0.9975 or larger. When the number of observations per
% % cluster increases the ME(lap) stabilizes and comes closer and closer to ME(GHQ). The estimates produced by
% % the ME(lap) is almost identical with the ones produced by ME(GHQ), when the
% % number cluster members is increased to five. In Figure~\ref{fig:boxtheta}
% % the distributions of the estimators are compared. 

% % \begin{figure}[t]
% % \begin{center}
% % \includegraphics[totalheight=0.5\textheight]{box4_prob.eps}
% % \end{center}
% % \caption[Box plot for $\hat{\theta}$ from a model with covariate]
% % {Plot title: betaX.a.b.c where a=number of cluster members, b=number of clusters, c=variance in 
% % mixing distribution, X=f binary (0,1) covariate, X=u uniform (-1,1) covariate. The plots illustrates estimates of
% % $\theta$ from a model with covariate, 150 replicates.}
% % \label{fig:boxtheta}
% % \end{figure}


% % \begin{figure}[t]
% % \begin{center}
% % \includegraphics[totalheight=0.5\textheight]{thet_fams5beta.eps}
% % \end{center}
% % \caption[Laplace vs. Gauss-Hermite, model  with covariate, cluster size 5]
% % {Plot title, a.b.c where a=number of cluster members,
% %  b=number of clusters, c=variance of cluster effect. 
% % 150 estimates of $\beta_1$ from a mixed effects model with covariate,
% % fitted via Laplace approximation of the likelihood plotted against its 
% % Gauss-Hermite quadrature counterpart.}
% % \label{fig:lap_ghq5beta}
% % \end{figure}


% \subsubsection{Estimation of $\beta_2$}
% We use a single covariate for this simulation study. The covariate is on an individual level. We use
% both a dichotomous covariate (dc) which can assume values 0 or 1 and uniform covariate (uc) which is drawn from
% the uniform (-1, 1) distribution. For each simulated sample we generated a covariate vector from either of these
% distributions. We also ran some simulations with a fixed covariate structure, to compare with our results. In the
% fixed covariate structure simulations two covariate vectors were generated before the start of the simulations,
% one dichotomous and the other from the uniform distribution. These covariate vectors were used for all samples 
% generated during the simulation. But there was no distinguishable change in the parameter estimates.

% \begin{figure}[t]
% \begin{center}
% \includegraphics[totalheight=0.5\textheight]{box1_beta.eps}
% \end{center}
% \caption[Box plot for $\hat{\beta_2}$ from a model with covariate (no cluster effect)]
% {The plots illustrates estimates of
% $\beta_1$ from 150 replicates. Data is generated with variance in the
% mixing distribution equal to 0. Bernoulli covariate distribution (left
% panels) and uniform(-1, 1) covariate distribution (right panels). Cluster size
%  2 and 100 clusters (upper panels), and cluster size 5 and 500 clusters
%  (bottom panels).}
% \label{fig:betabox0}
% \end{figure}

% \begin{figure}[t]
% \begin{center}
% \includegraphics[totalheight=0.5\textheight]{box2_beta.eps}
% \end{center}
% \caption[Box plot for $\hat{\beta_2}$ from a model with covariate (with cluster effect)]
% {The plots illustrates estimates of
% $\beta_1$ from 150 replicates. Data is generated with variance in the
% mixing distribution equal to 2. Bernoulli covariate distribution (left
% panels) and uniform(-1, 1) covariate distribution (right panels). Cluster size
%  2 and 100 clusters (upper panels), and cluster size 5 and 500 clusters
%  (bottom panels).}
% \label{fig:betabox2}
% \end{figure}

% In Table \ref{tab:perB0}, estimates from data with no cluster effect, we see 
% that in all cases that FE has a large bias relative the other estimators. 
% With cluster size 2, 100 clusters and dichotomous covariate, FE has a extreme 
% bias compared to equivalent settings with uniform covariate. ME(lap) also 
% exhibits large bias for data with small cluster size and dichotomous covariate. 
% ME(GHQ) seems stable for all settings simulated. In Figures~\ref{fig:betabox0} 
% and \ref{fig:betabox2} we show box plots of the estimates from all 150 
% replicates. The plots clearly illustrates that the ME estimators, particularly 
% ME(GHQ), are superior to FE for estimating $\beta_1$. The difference between 
% the ME estimators almost disappears when cluster size is increased to 5.

% \begin{table}[t]
% \caption[Performance of the estimators for $\theta$ and $\beta_2$ (no cluster effect)]
% {Performance of the estimators for $\theta$ and $\beta_2$ in a model with covariate
% and the cluster effect variance equal to 0.Numbers presented are means based on the 150
%  replication made in the simulation.}
% \label{tab:perB0} %for ref.
% \small
% \centering
% \begin{tabular}{ccccccc}
% \hline% \hline
% Cluster & No.\ of & & \multicolumn{2}{c}{$\hat{\theta}$}&\multicolumn{2}{c}{$\hat{\beta}_2$}\tabularnewline
% size & clusters      & Model  & Bias    & RMSE    & Bias    & RMSE \\
% \hline
% \multicolumn{7}{c}{Bernoulli covariate} \\ \hline
%     &          & FE     & 0.04607 & 0.05084 & 3.16596 & 6.66285 \tabularnewline
% 2 & 100  & ME(lap)& 0.05247 & 0.08735 & 1.17481 & 2.60386 \tabularnewline
%     &          & ME(GHQ)& 0.01914 & 0.04375 & 0.18482 & 0.85635 \tabularnewline
%                                                                \tabularnewline
%      &         & FE     & 0.04463 & 0.04504 & 0.24669 & 0.32428 \tabularnewline
% 5 & 500  & ME(lap)& 0.00441 & 0.01198 & 0.00636 & 0.14520 \tabularnewline
%       &        & ME(GHQ)& 0.00321 & 0.01091 & 0.00492 & 0.14483 \tabularnewline
%                                                                \hline
% \multicolumn{7}{c}{Uniform(-1, 1) covariate} \\ \hline
%        &       & FE     &-0.01300 & 0.02896 & 1.16805 & 1.87843 \tabularnewline
% 2 & 100  & ME(lap)& 0.01592 & 0.04256 & 0.06231 & 0.56353 \tabularnewline
%        &       & ME(GHQ)& 0.01300 & 0.03715 & 0.02973 & 0.45861 \tabularnewline
%                                                                \tabularnewline
%         &      & FE     &-0.01377 & 0.01578 & 0.26786 & 0.30993 \tabularnewline
% 5 & 500  & ME(lap)& 0.00300 & 0.00926 & 0.00413 & 0.10483 \tabularnewline
%        &       & ME(GHQ)& 0.00290 & 0.00917 & 0.00416 & 0.10491 \tabularnewline
% \hline
% \end{tabular}
% \end{table}


% Estimates for data with cluster effect are tabulated in Table \ref{tab:perB2}. The pattern is repeated, ME(GHQ) performs
% best throughout. FE and ME(lap) have problems with small cluster sizes especially when the covariate is dichotomous.
% When cluster size is increased to 5 the difference between the two ME estimation procedures becomes small. In Figure
% \ref{fig:betabox2} we can see the estimates from all 150 replications.

% \begin{table}[t]
% \caption[Performance of the estimators for $\theta$ and $\beta_1$ (with cluster effect)]
% {Performance of the estimators for $\theta$ and $\beta_1$ in a model with covariate 
% and the cluster effect variance equal to 2. Numbers presented are means based on the 150 
% replication made in the simulation.}
% \label{tab:perB2} %for ref.
% \small
% \centering
% \begin{tabular}{ccccccc}
% \hline% 
% &&&&&&\tabularnewline
% \hline
% Cluster & No.\ of & &\multicolumn{2}{c}{$\hat{\theta}$}&\multicolumn{2}{c}{$\hat{\beta}$}\\
% size & clusters          & Model  & Bias     & RMSE    & Bias     & RMSE \\
% \hline
% \multicolumn{7}{c}{Bernoulli covariate} \\ \hline
%      &         & FE     & -0.01253 & 0.02822 &  2.56887 & 6.43923 \tabularnewline
% 2 & 100  & ME(lap)&  0.02783 & 0.08395 &  0.66964 & 2.29844 \tabularnewline
%        &       & ME(GHQ)& -0.00373 & 0.05243 &  0.18694 & 0.94111 \tabularnewline
%                                                                  \tabularnewline
%         &      & FE     & -0.01264 & 0.01516 &  0.25807 & 0.31410 \tabularnewline
% 5 &500  & ME(lap)&  0.00355 & 0.01593 & -0.01677 & 0.12653 \tabularnewline
%        &       & ME(GHQ)&  0.00161 & 0.01483 & -0.00249 & 0.12775 \tabularnewline
%                                                                  \hline
% \multicolumn{7}{c}{Uniform(-1, 1) covariate} \\ \hline             
%         &      & FE     & -0.07437 & 0.08134 &  1.11189 & 1.52533 \tabularnewline
% 2 & 100  & ME(lap)& -0.00164 & 0.05936 &  0.05326 & 0.67810 \tabularnewline
%        &       & ME(GHQ)& -0.00402 & 0.04640 &  0.00127 & 0.40519 \tabularnewline
%                                                                  \tabularnewline
%         &      & FE     & -0.07471 & 0.07537 &  0.27501 & 0.30797 \tabularnewline
% 5 & 500  & ME(lap)& -0.00139 & 0.01189 & -0.01094 & 0.10169 \tabularnewline
%     &          & ME(GHQ)& -0.00031 & 0.01155 &  0.00140 & 0.10235 \tabularnewline
% \hline
% \end{tabular}
% \end{table}

\section{Conclusion}
What model should be used to a particular data set, i.e., when is a mixed
effects model preferable over a fixed effects model? 
Generally
speaking, a random effects model is appropriate if the observed clusters may be
regarded as a random sample from a (large, possibly infinite) pool of possible clusters. The
observed clusters are of no practical interest per se, but the distribution in the
pool is. Or this distribution is regarded as a nuisance that needs to be
controlled for. A fixed effects model, on the other hand, is appropriate if
we consider the given clusters as the full universe of clusters.

In the random effects case, we expect the number of clusters to grow as
sample size grows, and the cluster sizes to remain stable. In the fixed
effects approach, on the other hand, it is expected that the number of
clusters is stable, while cluster size grows with sample size.


%\begin{figure}
%\begin{center}
%\includegraphics[totalheight=0.6\textheight]{box4_prob.eps}
%\end{center}
%\caption[Box plot for $\hat{\theta}$ from a model with covariate]
%{Plot title: betaX.a.b.c where a=number of cluster members, b=number of clusters, c=variance in 
%mixing distribution, X=f binary (0,1) covariate, X=u uniform (-1,1) covariate. The plots illustrates estimates of
%$\theta$ from a model with covariate, 150 replicates.}
%\label{fig:boxtheta}
%\end{figure}
%
%\begin{figure}
%\begin{center}
%\includegraphics[totalheight=0.6\textheight]{box1_beta.eps}
%\end{center}
%\caption[Box plot for $\hat{\beta_1}$ from a model with covariate (no cluster effect)]
%{Plot title: betaX.a.b.c where a=number of cluster members, b=number of clusters, c=variance in 
%mixing distribution, X=f binary (0,1) covariate, X=u uniform (-1,1) covariate. The plots illustrates estimates of
%$\beta_1$ from 150 replicates. Data is generated with variance in the mixing distribution equal to 0.}
%\label{fig:betabox0}
%\end{figure}

%\begin{figure}
%\begin{center}
%\includegraphics[totalheight=0.6\textheight]{box2_beta.eps}
%\end{center}
%\caption[Box plot for $\hat{\beta_1}$ from a model with covariate (with cluster effect)]
%{Plot title: betaX.a.b.c where a=number of cluster members, b=number of clusters, c=variance in 
%mixing distribution, X=f binary (0,1) covariate, X=u uniform (-1,1) covariate. The plots illustrates estimates of
%$\beta_1$ from 150 replicates. Data is generated with variance in the mixing distribution equal to 2.}
%\label{fig:betabox2}
%\end{figure}

\bibliography{cluster}

% \end{document}

% \section{References}

% %\bibliography{cluster}

% \begin{thebibliography}{12}
% \expandafter\ifx\csname natexlab\endcsname\relax\def\natexlab#1{#1}\fi
% \expandafter\ifx\csname url\endcsname\relax
%   \def\url#1{\texttt{#1}}\fi
% \expandafter\ifx\csname urlprefix\endcsname\relax\def\urlprefix{URL }\fi

% \bibitem[{Bates et~al.(2008)Bates, Maechler, and Bolker}]{lme4}
% Bates, D., Maechler, M., and Bolker, B., 2011. lme4: Linear mixed-effects models using
%   S4 classes. R package version 0.999375-38.
% \newline\urlprefix\url{http://lme4.r-forge.r-project.org/}

% \bibitem[{Broström(2011)}]{glmmML}
% Broström, G., 2011. glmmML: Generalized linear models with clustering. R
%   package version 0.81-3.

% \bibitem[BroströmHolmberg(2011)]{CSDA}
% Broström, G., Holmberg, H., 2011. Generalised linear models with clustered
% data: Fixed and random effects models. Computational Statistics and Data
% Analysis, forthcoming.

% \bibitem[{Brown(2009)}]{glmmBUGS}
% Brown, P., 2010. glmmBUGS: Generalised Linear Mixed Models and Spatial Models
%   with BUGS. R package version 1.9.
% \newline\urlprefix\url{http://CRAN.R-project.org/package=glmmBUGS}

% \bibitem[{Gray(2001)}]{gray01}
% Gray, R., 2001. Advanced statistical computing. Course Notes, BIO 248 cd.
% \newline\urlprefix\url{www.stat.wisc.edu/~mchung/teaching/stat471/stat.computi%
% ng.pdf}

% \bibitem[{Komarek(2010)}]{glmmAK}
% Komarek, A., 2010. glmmAK: Generalized Linear Mixed Models. R package version
%   1.4.
% \newline\urlprefix\url{http://CRAN.R-project.org/package=glmmAK}

% \bibitem[{Lesaffre and Spiessens(2001)}]{lesaffrespiessens01}
% Lesaffre, E., Spiessens, B., 2001. On the effect of the number of quadrature
%   points in a logistic random-effects model: an example. Applied Statistics 50,
%   325--335.

% \bibitem[{Murphy and Dunne(2005)}]{murphydunne05}
% Murphy, V., Dunne, A., 2005. Mixed effects versus fixed effects modelling of
%   binary data with inter-subject variability. Journal of Pharmacokinetics and
%   Pharmadynamics, 245--260.

% \bibitem[{Patefield(1977)}]{patefield}
% Patefield, W., 1977. On the maximized likelihood function. Sankhya Series B 39,
%   92--96.

% \bibitem[{{R Development Core Team}(2011)}]{R}
% {R Development Core Team}, 2011. R: A Language and Environment for Statistical
%   Computing. R Foundation for Statistical Computing, Vienna, Austria, {ISBN}
%   3-900051-07-0.
% \newline\urlprefix\url{http://www.R-project.org}

% \bibitem[{Venables and Ripley(2002)}]{mass}
% Venables, W.~N., Ripley, B.~D., 2002. Modern Applied Statistics with S, 4th
%   Edition. Springer, New York, iSBN 0-387-95457-0.
% \newline\urlprefix\url{http://www.stats.ox.ac.uk/pub/MASS4}

% \bibitem[{Vonesh(1996)}]{vonesh96}
% Vonesh, E.~F., 1996. A note on the use of {L}aplace's approximation for
%   nonlinear mixed-effects models. Biometrika 83, 447--452.

% \bibitem[{Yano et~al.(2001)Yano, Beal, and Sheiner}]{yanoal01}
% Yano, I., Beal, S.~L., Sheiner, L.~B., 2001. The need for mixed-effects
%   modeling with population dichotomous data. Journal of Pharmacokinetics and
%   Pharmadynamics 28, 389--412.

% \end{thebibliography}

\newpage

\appendix

\section{Analytic derivatives}\label{sec:appendix}

\subsection{The score}
The partial derivatives with respect to $\beta_m, m = 1; \ldots, p$, of the log
likelihood function~\eqref{eq:loglik} are: 
\begin{equation}%\label{eq:parb3}
\frac{\partial}{\partial \beta_m} 
\ell\big((\bb, \omega); \by, \bx\big) =
\sum_{i=1}^n  \frac{\frac{\partial}{\partial \beta_m} \int_{-\infty}^{\infty} 
\varphi(u) \prod_{j = 1}^{n_i} \Py  du}
{\int_{-\infty}^{\infty} 
\varphi(u) \prod_{j = 1}^{n_i} \Py  du}
\end{equation}
The partial derivatives in the numerators are given by
\begin{multline}\label{eq:partbeta}
\frac{\partial}{\partial \beta_m} \int_{-\infty}^{\infty} \varphi(u)
\prod_{j = 1}^{n_i} \Py  du = \\
\int_{-\infty}^{\infty} \varphi(u)
\prod_{j = 1}^{n_i} \Py \sum_{j=1}^{n_i}
x_{ijm} \Gy du, 
\end{multline}
with
\begin{equation}
G(x, y) = \frac{\partial}{\partial x} \log P(x, y) = \frac{
\frac{\partial}{\partial x}P(x, y)}{P(x, y)}
\end{equation}
The partial derivative with respect to $\omega = \log(\sigma)$ is
\begin{equation}%\label{eq:paro2}
\frac{\partial}{\partial \omega} 
\ell\big((\bb, \omega); \by, \bx\big) =
\sum_{i=1}^n  \frac{
\frac{\partial}{\partial \omega} \int_{-\infty}^{\infty} 
\varphi(u) \prod_{j = 1}^{n_i} \Py  du}
{\int_{-\infty}^{\infty} 
\varphi(u) \prod_{j = 1}^{n_i} \Py  du}
\end{equation}
From this, we get the partial derivatives in the numerators as
\begin{multline}\label{eq:partsigma}
\frac{\partial}{\partial \omega} \int_{-\infty}^{\infty} \varphi(u)
\prod_{j = 1}^{n_i} \Py  du = \\
\sigma \int_{-\infty}^{\infty} u \varphi(u)
\prod_{j = 1}^{n_i} \Py \sum_{j=1}^{n_i}
\Gy du 
\end{multline}

\subsection{The Hessian}

Some ``symbolic'' notation:
\begin{equation}
\ell = \sum_{i=1}^n \log h_i
\end{equation}

\begin{equation*}
\begin{split}
\frac{\partial \ell}{\partial \beta_m} 
&= \sum_{i=1}^n \frac{h_{\beta}(m,
  i)}{h_i}, \quad m = 1, \ldots, p,\\
\frac{\partial \ell}{\partial \omega} &= \sum_{i=1}^n \frac{h_{\omega}(i)}{h_i}
\end{split}
\end{equation*}
Here $h_{\beta}(m, i), \; i = 1, \ldots, n; \; m = 1, \ldots, p$ are given
by equation~\eqref{eq:partbeta}, and $h_{\omega}(i), \; i = 1, \ldots, n$ are
given by equation~\eqref{eq:partsigma}.

The second derivatives are needed at the solution in order to estimate
standard errors.

\begin{equation}
\begin{split}
\frac{\partial^2 \ell}{\partial \beta_k \partial \beta_m} 
&=
\sum_{i=1}^n \biggl\{\frac{h_{\beta\beta}(k, m, i)}{h_i} -
\frac{h_{\beta}(k, i)}{h_i}\frac{h_{\beta}(m, i)}{h_i}\biggr\}, \quad k, m
= 1, \ldots, p \\ 
\frac{\partial^2 \ell}{\partial \beta_k \partial \omega} &=
\sum_{i=1}^n \biggl\{\frac{h_{\beta\omega}(k, i)}{h_i} -
\frac{h_{\beta}(k, i)}{h_i}\frac{h_{\omega}(i)}{h_i}\biggr\}, \quad k = 1,
\ldots, p \\ 
\frac{\partial^2 \ell}{\partial \omega^2} &=
\sum_{i=1}^n \biggl\{\frac{h_{\omega\omega}(i)}{h_i} -
\frac{h_{\omega}(i)}{h_i}\frac{h_{\omega}(i)}{h_i}\biggr\}
\end{split}
\end{equation}
So we need to calculate $h_{\beta\beta}$,  $h_{\beta\omega}$,  and
$h_{\omega\omega}$,
\begin{multline*}
h_{\beta\beta}(k, m, i) = \\
\frac{\partial}{\partial \beta_k} \int_{-\infty}^{\infty} \varphi(u)
\prod_{j = 1}^{n_i} \Py \sum_{j=1}^{n_i}
x_{ijm} \Gy du = \\
\int_{-\infty}^{\infty} \varphi(u)
\prod_{j = 1}^{n_i} \Py \bigg\{\sum_{j=1}^{n_i}
x_{ijk} \Gy \\
\times \sum_{j=1}^{n_i} x_{ijm} \Gy
+ \sum_{j=1}^{n_i}
x_{ijk}x_{ijm} \dGy du \\
k, m = 1, \ldots, p; \; i = 1, \ldots, n
\end{multline*}
where
\begin{equation*}
H(x, y) = \frac{\partial^2}{\partial x^2} \log P(x, y) =
\frac{\partial}{\partial x} G(x, y). 
\end{equation*}

\begin{multline*}
h_{\beta\omega}(k, i) = \\
\frac{\partial}{\partial \beta_k} \sigma \int_{-\infty}^{\infty} u \varphi(u)
\prod_{j = 1}^{n_i} \Py \sum_{j=1}^{n_i}
\Gy du = \\
\sigma \int_{-\infty}^{\infty} u \varphi(u)
\prod_{j = 1}^{n_i} \Py \bigg\{\sum_{j=1}^{n_i}
x_{ijk} \Gy \\
\times \sum_{j=1}^{n_i} \Gy
+ \sum_{j=1}^{n_i}
x_{ijk} \dGy \bigg\} du \\
k = 1, \ldots, p; \; i = 1, \ldots, n
\end{multline*}

\begin{multline*}
h_{\omega\omega}(i) = \\
\frac{\partial}{\partial \omega} \sigma \int_{-\infty}^{\infty} u \varphi(u)
\prod_{j = 1}^{n_i} \Py \sum_{j=1}^{n_i}
\Gy du = \\
\sigma \int_{-\infty}^{\infty} u \varphi(u)
\prod_{j = 1}^{n_i} \Py \bigg\{\sum_{j=1}^{n_i}
\Gy \\
\times \big(1 + \sigma u \sum_{j=1}^{n_i} \Gy\big) +
\sigma u \sum_{i=1}^{n_i} \dGy\bigg\} du \\
i = 1, \ldots, n
\end{multline*}


\section{Some necessary derivatives using the Laplace or Gauss-Hermite
  approximation} 

The basic partial derivatives are:
\be
\begin{split}
g_{\sigma} &= u \sum \Gy \\
g_{\beta_m} &= \sum x_{ijm} \Gy, \quad m = 1, \ldots, p \\
g_{u\sigma} &= u \sigma \sum \dGy + \sum \Gy \\ 
g_{u\beta_m} &= \sigma \sum x_{ijm} \dGy, \quad m = 1, \ldots, p
\\
g_{uu\sigma} &= 2\sigma \sum \dGy + u \sigma^2 \sum \Iy \\
g_{uu\beta_m} &= \sigma^2 \sum x_{ijm} \Iy \\
g_{uuu} &= \frac{d^3}{d u^3}\log p(u) + \sigma^3 \sum \Iy
\end{split}
\ee  
where
\be
\Iy = \frac{\partial}{\partial x} H(x, y),
\ee

For the calculation of the Hessian we need
\be
\begin{split}
g_{\sigma\sigma} &= u^2 \sum \dGy \\
g_{\sigma\beta_m} &= u \sum x_{ijm} \dGy \\
g_{\beta_m\beta_k} &= \sum x_{ijm} x_{ijk} \dGy,
\end{split}
\ee
and
\be
\begin{split}
g_{u\sigma\sigma} &= u^2 \sigma \sum \Iy + 2 u \sum \dGy \\
g_{u\sigma\beta_m} &= u \sigma \sum x_{ijm} \Iy + \sum x_{ijm} \dGy\\
g_{u\beta_m\beta_k} &= \sigma \sum x_{ijm} x_{ijk} \Iy,
\end{split}
\ee
and
\be
\begin{split}
g_{uu\sigma\sigma} &= u^2 \sigma^2\sum \Ky\\ 
& + 4 u\sigma \sum \Iy + 2 \sum
\dGy\\ 
g_{uu\sigma\beta_m} &= u \sigma^2\sum x_{ijm} \Ky +  2 \sigma \sum x_{ijm}
\Iy \\
g_{uu\beta_m\beta_k} &= \sigma^2 \sum x_{ijm} x_{ijk} \Ky,
\end{split}
\ee
where
\be
\Ky = \frac{\partial}{\partial x} I(x, y),
\ee
and
\be
\begin{split}
g_{uuuu} &= \frac{d^4}{d u^4}\log p(u) + \sigma^4 \sum K(\bb x_{ij} +
\sigma u, y_{ij}) \\ 
g_{uuu\sigma} &= u\sigma^3 \sum K(\bb x_{ij} + \sigma u, y_{ij}) + 3
\sigma^2 \sum \Iy \\
g_{uuu\beta_m} &= \sigma^3 \sum x_{ijm}K(\bb x_{ij} + \sigma u, y_{ij}) \\
\end{split} 
\ee 

\end{document}