%\VignetteIndexEntry{Description of the algorithm}
%\VignettePackage{ifit}
%\VignetteEncoding{UTF-8}
\documentclass[nojss]{jss} % no need of \usepackage{Sweave}
\usepackage[utf8]{inputenc}
\usepackage{amsmath,amsfonts,bm}
\usepackage{enumerate}
\usepackage{algorithm,algpseudocode}
\title{Description of the \pkg{ifit} algorithm}
\author{Guido Masarotto\\University of Padova}
\Address{%
  Guido Masarotto\\
  Department of Statistical Sciences\\
  University of Padova, Italy\\
  Email: \email{guido.masarotto@unipd.it}
}
\Abstract{%
  This brief document details the algorithm implemented in the \pkg{ifit}
  package and provides an overview of the optional arguments for the
  \code{ifit::ifit} function.
}
\Keywords{%
  Indirect inference based on intermediate statistics;
  Generalized method of moments;
  Parametric estimation;
  Simulation-based model fitting}
\begin{document}
\maketitle

Package \pkg{ifit} addresses the problem of fitting a parametric model
$\bigl\{p_{\vartheta}: \vartheta \in R^p\bigr\}$ to some data $y_{obs}$.
It operates under the following assumptions:
\begin{enumerate}
\item It is not possible to analytically compute the likelihood function,
  moments, or other quantities typically used for statistical inference.
  However, for any given $\vartheta$, it is possible to simulate pseudo-data
  $y \sim p_\vartheta$.
\item The only prior information available on the parameters is a set of
  \emph{boundary constraints}: each parameter $\vartheta_i$ is known to lie
  within a given interval $[l_i, u_i]$, i.e., it is possible to assume that
  $\vartheta$ belongs to the hypercube
  $\Theta=\{(\vartheta_1, \ldots, \vartheta_p) \in R^p:
  l_i \le \vartheta_i \le u_i\}$.
\end{enumerate}

The implemented estimator is a special case of the class of methods that
\cite{jiang_indirect_2004} term \emph{indirect inference based on
intermediate statistics}. It can also be viewed as an application of the
\emph{simulated generalized method of moments} \citep{McFadden1989,
Pakes1989}. Additionally, it is closely related to the work of
\cite{cox_fitting_2012} on fitting complex parametric models.

We start by defining the score
$$ u\bigl(y_{obs};\vartheta\bigr) = t_{obs}-\tau\bigl(\vartheta\bigr),$$
where $t_{obs}=\Psi(y_{obs})$ is a vector of $q$, $q\ge p$, features
intended to summarize the evidence from the data, and
$\tau(\vartheta)=E_{\vartheta}\bigl(\Psi(y)\bigr)$ is its expected value.
The package then aims to find
$$ \hat\vartheta = \arg \min_{\vartheta \in \Theta}
   u'\bigl(y_{obs};\vartheta\bigr) V^{-1} u\bigl(y_{obs};\vartheta\bigr)$$
where $V$ is a positive-definite weighting matrix. Assuming the model is
well specified (i.e., there exists $\vartheta_0$ such that
$y_{obs}\sim p_{\vartheta_0}$), it can be shown that, under the usual
regularity and identifiability conditions, the estimator is consistent for
every positive-definite matrix $V$. However, efficiency depends on $V$, and
the most efficient choice for the weighting matrix is
$V=\Sigma\bigl(\vartheta_0\bigr)$, where
$\Sigma\bigl(\vartheta\bigr)=\text{var}_{\vartheta}\bigl(\Psi(y)\bigr)$.
Since $\vartheta_0$ is unknown, this choice of weighting matrix is not
directly possible.
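To fix ideas, the following \proglang{R} sketch (illustrative only, not the
\pkg{ifit} implementation) shows how, for a given weighting matrix $V$, the
criterion above could be approximated by simulation when $\tau(\vartheta)$
is not available in closed form; the simulator \code{rmodel} and the summary
function \code{stats} are hypothetical placeholders for user-supplied
components.
\begin{verbatim}
## Illustrative only: Monte Carlo approximation of the criterion
## u'(y_obs; theta) V^{-1} u(y_obs; theta); `rmodel` (simulator) and
## `stats` (summary statistics) are hypothetical user-supplied functions.
gmm_objective <- function(theta, t_obs, rmodel, stats, V, nsim = 200) {
    t_sim <- replicate(nsim, stats(rmodel(theta)))  # q x nsim matrix
    tau_hat <- rowMeans(t_sim)                      # estimate of tau(theta)
    u <- t_obs - tau_hat                            # score
    drop(crossprod(u, solve(V, u)))                 # quadratic form
}

## Toy example: normal model with unknown mean and standard deviation;
## in practice this criterion is minimized over the hypercube Theta.
rmodel <- function(theta) rnorm(100, theta[1], theta[2])
stats  <- function(y) c(mean(y), sd(y))
y_obs  <- rnorm(100, 1, 2)
gmm_objective(c(1, 2), stats(y_obs), rmodel, stats, V = diag(2))
\end{verbatim}
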
Although the optimal weighting matrix cannot be used directly, it is easy to
prove, under the usual assumptions, that an estimator asymptotically
equivalent to the optimal one can be obtained by solving the quasi-likelihood
equation
\begin{equation}
g(\hat\vartheta) =
J'\bigl(\hat\vartheta\bigr)\Sigma^{-1}\bigl(\hat\vartheta\bigr)
u\bigl(y_{obs};\hat\vartheta\bigr) = 0_p
\label{est:eq}
\end{equation}
where $J\bigl(\vartheta\bigr)$ denotes the Jacobian matrix
$J\bigl(\vartheta\bigr)=\partial \tau(\vartheta)/\partial \vartheta$.
In addition, continuing to assume that the model is well specified and that
all necessary regularity conditions hold, the dispersion matrix of
$\hat\vartheta$ can be estimated by
$\widehat{\text{var}}\bigl(\hat\vartheta\bigr) =
\Omega\bigl(\hat\vartheta\bigr)^{-1}$ where
$\Omega\bigl(\vartheta\bigr) = J'\bigl(\vartheta\bigr)
\Sigma^{-1}\bigl(\vartheta\bigr) J\bigl(\vartheta\bigr)$.

The implemented algorithm first performs a global search to find a promising
starting point. This is followed by a local search, which refines the
solution using a trust-region version of the Fisher scoring iteration for
solving \eqref{est:eq}. The basic iteration is defined as
$$ \vartheta_{new} = \vartheta_{old} +
   \Omega\bigl(\vartheta_{old}\bigr)^{-1}g\bigl(\vartheta_{old}\bigr). $$
Because $\tau\bigl(\vartheta\bigr)$, $J\bigl(\vartheta\bigr)$ and
$\Omega\bigl(\vartheta\bigr)$ are unknown, they are approximated through
simulations. The approach is sequential. In particular, after step $k$ of
the algorithm, $N_k$ pairs $[\vartheta_i;t_i]$, where $t_i=\Psi(y_i)$ with
$y_i \sim p_{\vartheta_i}$, are available. These simulated values are then
used to decide the location of the next set of simulations, i.e., to choose
$\vartheta_{N_k+1}, \ldots, \vartheta_{N_{k+1}}$. When needed, the
approximations of $\tau\bigl(\vartheta\bigr)$, $J\bigl(\vartheta\bigr)$, and
$\Omega\bigl(\vartheta\bigr)$ are computed from these simulated pairs using
local regression techniques. Specifically, a simple kNN average is used to
compute the necessary quantities during the global search. In contrast,
during the local search, they are obtained by fitting linear models in
neighborhoods of the current guess that include a progressively larger
number of points. The details are as follows.

\newcommand{\pp}[2]{\text{\texttt{#1}}_{#2}}
\algblock{Start}{End}
\begin{algorithmic}[1]
\State \textbf{Input} (default values in parentheses): $\pp{N}{init}$ (1000),
  $\pp{N}{elite}$ (100), $\pp{A}{elite}$ ($0.5$), $\pp{Tol}{global}$ ($0.1$),
  $\pp{NAdd}{global}$ (100), $\pp{NTot}{global}$ (20000),
  $\pp{Rho}{max}$ ($0.1$), $\pp{Lambda}{}$ ($0.1$), $\pp{Tol}{local}$ ($1$),
  $\pp{NFit}{local}$ (4000), $\pp{NAdd}{local}$ (10),
  $\pp{Tol}{model}$ ($1.5$).
\State Set $k \gets 0$.
\Start{} \textbf{Global Search}
\State \emph{Initialization.} Set $N_0 \gets \pp{N}{init}$ and draw
  $\vartheta_1, \ldots, \vartheta_{N_{0}}$ in $\Theta$ using Latin hypercube
  sampling. Simulate the corresponding summary statistics
  $t_1, \ldots, t_{N_0}$.
\State \label{glob:begin} \emph{Estimation of $\tau(\vartheta)$ and $V$.}
  Estimate $\tau(\vartheta)$ using kNN regression.
Specifically, for $i=1, \ldots, N_k$, calculate the tricube-weighted average
$$ \hat{\tau}_{k, i} =
   \frac{\sum_{r \in \mathcal{N}_{k,i}} w_{k,i,r}\, t_r}
        {\sum_{r \in \mathcal{N}_{k,i}} w_{k,i,r}},
   \qquad
   w_{k,i,r} = \left[1-
   \left(\frac{d_G(\vartheta_r,\vartheta_i)}{\overline{d}_{k,i}}
   \right)^3\right]^3, $$
where
$\mathcal{N}_{k,i}=\{r: d_{G}(\vartheta_r, \vartheta_i) \le \overline{d}_{k,i}\}$,
$d_G(\vartheta', \vartheta'')=
\sqrt{\sum_{i=1}^p [(\vartheta'_i-\vartheta''_i)/(u_i-l_i)]^2}$,
and $\overline{d}_{k,i}$ is determined such that the neighbourhood
$\mathcal{N}_{k,i}$ contains $\lfloor\sqrt{N_k}\rfloor$ points.
In addition, set $V_k \gets S_k R_k S_k$ where $S_k$ is the $q \times q$
diagonal matrix containing the Median Absolute Deviations (MADs) of the
elements of the vectors $t_i-\hat\tau_{k,i}$ (specifically, one MAD for each
summary statistic), and $R_k$ is the correlation matrix among the Gaussian
scores (or normal scores) of the same elements of $t_i-\hat\tau_{k,i}$
\citep{gaussian_2012}.
\State \emph{Elite sample.} Determine the indices
$i_{k,1}, \ldots, i_{k,E_k}$ of the
$$E_k=\Bigl\lfloor\pp{N}{elite}+
(\pp{N}{init}-\pp{N}{elite})\pp{A}{elite}^{(N_k/\pp{N}{init})^2}\Bigr\rfloor$$
pairs $(\vartheta_i, t_i)$ with the smallest Mahalanobis distances
$\bigl(t_{obs}-\hat\tau_{k,i}\bigr)'V_k^{-1}\bigl(t_{obs}-\hat\tau_{k, i}\bigr)$.
Then, compute
\begin{align*}
M_k &\gets \text{ sample mean of }
  \vartheta_{i_{k,1}}, \ldots, \vartheta_{i_{k, E_k}}\\
C_k &\gets \text{ sample covariance matrix of }
  \vartheta_{i_{k,1}}, \ldots, \vartheta_{i_{k, E_k}}
\end{align*}
and denote by $s_{k,1}, \ldots, s_{k, p}$ the corresponding sample standard
deviations, i.e., the square roots of the diagonal elements of $C_k$.
\State \emph{Convergence?} If either
\begin{enumerate}[(i)]
\item $s_{k, r} < \max\bigl(1, |M_{k, r}|\bigr) \pp{Tol}{global}
  \;\forall r=1,\ldots,p$, or
\item $N_k = \pp{NTot}{global}$,
\end{enumerate}
exit from the global search and go to \ref{glo:exit}.
\State \emph{Elite sample reproduction.} Set
$N_{k+1} \gets \min\bigl(N_k+\pp{NAdd}{global}, \pp{NTot}{global}\bigr)$ and
sample $\vartheta_{N_k+1}, \ldots, \vartheta_{N_{k+1}}$ from a mixture of
$E_k$ multivariate normal distributions (truncated to $\Theta$) with means
$\vartheta_{i_{k,1}}, \ldots, \vartheta_{i_{k, E_k}}$ and common dispersion
matrix $C_k$. Simulate the corresponding summary statistics
$t_{N_k+1}, \ldots, t_{N_{k+1}}$.
\State Set $k \gets k+1$ and go to \ref{glob:begin}.
\End{} \textbf{Global Search}
\State \label{glo:exit} Set $L_k \gets \pp{N}{elite}$,
$\rho_k \gets \pp{Rho}{max} / 10$ and $\hat\vartheta_k \gets \vartheta_{best}$
where $\vartheta_{best}$ is the ``best point'' sampled during the previous
phase, i.e., it satisfies
$$\bigl(t_{obs}-\hat\tau_{k,best}\bigr)'V_k^{-1}
  \bigl(t_{obs}-\hat\tau_{k, best}\bigr) =
  \min_{i=1, \ldots, N_k}
  \bigl(t_{obs}-\hat\tau_{k, i}\bigr)'V_k^{-1}
  \bigl(t_{obs}-\hat\tau_{k, i}\bigr).$$
\Start{} \textbf{Local Search}
\State \label{loc:begin} \emph{Estimation of $g(\hat\vartheta_k)$,
$J(\hat\vartheta_k)$ and $\Omega(\hat\vartheta_k)$.} Fit to the $L_k$ pairs
$(\vartheta_i, t_i)$ closest to $\hat\vartheta_k$, as measured by the
distance $d_L(\vartheta, \hat\vartheta_k)= \sum_{i=1}^p
\bigl[(\vartheta_i-\hat\vartheta_{k,i})/
\max(1, |\hat\vartheta_{k,i}|)\bigr]^2$, the multivariate linear regression
model $t_i = \tau+B(\vartheta_i-\hat\vartheta_k)+\text{Err}_i$. Denote by
$\hat\tau_k$, $B_k$, $W_k$, $H_k$ the least squares estimates of $\tau$,
$B$, $\text{var}(\text{Err}_i)$ and $\text{var}(\hat\tau_k)$, respectively.
Further, set
\begin{align*}
J_k &\gets \begin{cases}
  B_k & \text{if } L_k=\pp{N}{elite} \\
  (1-\pp{Lambda}{})J_{k-1}+\pp{Lambda}{} B_k & \text{otherwise}
\end{cases},\\
V_k &\gets \begin{cases}
  W_k & \text{if } L_k=\pp{N}{elite} \\
  (1-\pp{Lambda}{})V_{k-1}+\pp{Lambda}{} W_k & \text{otherwise}
\end{cases},\\
\Omega_k &\gets J'_k V_k^{-1} J_k,\\
\hat{g}_k &\gets J'_k V_k^{-1}\bigl(t_{obs}-\hat\tau_k\bigr),\\
\widehat{\text{var}}\bigl(\hat{g}_k\bigr) &\gets J'_kV_k^{-1}H_kV_k^{-1}J_k.
\end{align*}
\State \emph{New guess.} Compute $\tilde\vartheta_k \gets
\hat\vartheta_k+\delta_k$ where $\delta_k$ is the solution of the linear
programming problem
$$ \min_{\delta \in \Delta_k} \sum_{i=1}^p |r_{k,i}(\delta)|$$
with $r_k(\delta)=\Omega_k \delta-\hat{g}_k$ and
$\Delta_k= \bigl\{\delta \in R^p: \hat\vartheta_k+\delta \in \Theta
\text{ and } |\delta_i| \le \max(1, |\hat\vartheta_{k,i}|)\rho_k\bigr\}$.
\State \emph{Convergence?} If $L_k=\pp{NFit}{local}$ and
$\hat{g}'_k\widehat{\text{var}}\bigl(\hat{g}_k\bigr)^{-1}\hat{g}_k
< p \pp{Tol}{local}$, exit from the local search and go to \ref{end:local}.
\State \emph{Sampling near the new guess.} Set
$N_{k+1} \gets N_{k}+\pp{NAdd}{local}$ and draw
$\vartheta_{N_k+1}, \ldots, \vartheta_{N_{k+1}}$ uniformly in
$\bigl\{\vartheta \in \Theta:
(\vartheta-\tilde\vartheta_k)'\Omega_k(\vartheta-\tilde\vartheta_k)
\le 1\bigr\}$. Simulate the corresponding summary statistics
$t_{N_k+1}, \ldots, t_{N_{k+1}}$.
\State \emph{Accept/reject the guess. Adjust the size of the trust region.}
Compute the differences between the last sampled summary statistics and
their predictions obtained from the current linear model,
$D_i = t_i-\hat\tau_k-B_k(\vartheta_i-\tilde\vartheta_k)$ for
$i=N_k+1,\ldots,N_{k+1}$. If
$$\sum_{i=N_k+1}^{N_{k+1}} D'_i V_k^{-1} D_i <
  q (N_{k+1}-N_k)\pp{Tol}{model},$$
accept the proposal and set $\hat\vartheta_{k+1} \gets \tilde\vartheta_k$
and $\rho_{k+1} \gets \min(2\rho_k, \pp{Rho}{max})$. Otherwise, set
$\hat\vartheta_{k+1} \gets \hat\vartheta_k$ and
$\rho_{k+1} \gets \rho_k /4$.
\State Set $k \gets k+1$,
$L_k \gets \min(\pp{NFit}{local}, L_{k-1}+\pp{NAdd}{local})$ and go to
\ref{loc:begin}.
\End{} \textbf{Local Search}
\State \label{end:local} Set $\hat\vartheta \gets \tilde{\vartheta}_k$ and
$\widehat{\text{var}}(\hat\vartheta) \gets \Omega_k^{-1}$.
\end{algorithmic}
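For concreteness, the following \proglang{R} sketch (illustrative only, not
the \pkg{ifit} code) shows how the local linear fit of step \ref{loc:begin}
and the resulting scoring quantities could be computed. The arguments
\code{Theta} (matrix of sampled parameters), \code{Tsim} (matrix of the
corresponding simulated summaries), \code{theta\_hat} (current guess) and
\code{t\_obs} are hypothetical; the smoothing of $J_k$ and $V_k$, the
trust-region constraint, and the accept/reject rule of the actual algorithm
are omitted.
\begin{verbatim}
## Illustrative only: local linear fit around the current guess and the
## corresponding quasi-score, information matrix and scoring direction.
## Theta: L x p matrix of sampled parameters; Tsim: L x q matrix of the
## simulated summaries; theta_hat: current guess; t_obs: observed summaries.
scoring_step <- function(Theta, Tsim, theta_hat, t_obs) {
    X <- sweep(Theta, 2, theta_hat)              # center at current guess
    fit <- lm(Tsim ~ X)                          # multivariate least squares
    tau_hat <- coef(fit)[1, ]                    # estimate of tau(theta_hat)
    B <- t(coef(fit)[-1, , drop = FALSE])        # q x p Jacobian estimate
    W <- crossprod(residuals(fit)) / df.residual(fit)  # estimate of Sigma
    g <- crossprod(B, solve(W, t_obs - tau_hat)) # quasi-score
    Omega <- crossprod(B, solve(W, B))           # information-like matrix
    list(step = drop(solve(Omega, g)),           # unconstrained scoring step
         tau = tau_hat, J = B, V = W, Omega = Omega)
}
\end{verbatim}

\bibliography{alg}

\end{document}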