\documentclass[10pt,xcolor=x11names,compress]{beamer}
\usepackage{lmodern}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{booktabs}
%\VignetteEngine{knitr::knitr}
%\VignetteIndexEntry{groupedSurv}
\newcommand{\R}{\texttt{R}}
\newcommand{\pkgname}{\texttt{groupedSurv}}
\newcommand{\Rcpp}{\texttt{Rcpp}}
\newcommand{\Cpp}{\texttt{C++}}
\newcommand{\rfunc}[1]{\texttt{#1()}} % function name
\newcommand{\df}{\texttt{data.frame}}
\newcommand{\ie}{i.e.,}
\newcommand{\eg}{e.g.,}
\setbeamertemplate{navigation symbols}{}
\setbeamertemplate{footline}[frame number]
\setbeamertemplate{enumerate item}{\insertenumlabel}
\setbeamertemplate{enumerate subitem}{\alph{enumii}.}

\begin{document}

<<include=FALSE>>=
require(knitr)
@

<<include=FALSE>>=
library(groupedSurv)
options(width=80) # make the printing fit on the page
set.seed(1121)    # make the results repeatable
stdt <- date()
@

\begin{frame}
\title{\pkgname}
\subtitle{Efficient Estimation of Grouped Survival Models Using the Exact Likelihood Function}
\date{2020-01-31}
\titlepage
\end{frame}

\section{Introduction}
\begin{frame}{Introduction}
% \scriptsize
\begin{itemize}
\item This document provides examples demonstrating how to use the \pkgname{} package to
estimate the baseline survival rate, covariate, and fixed-effect parameters, and to compute
the efficient score statistic, as well as gene-level statistics, for grouped survival models.
\item Additional functions can analyze grouped time-to-event data accounting for the family
structure of related individuals (e.g., trios), and provide estimates of the frailty variance.
Note, however, that the current implementation of the frailty model is sensitive to departures
from model assumptions and should be considered experimental.
\item The major algorithms in this package are written in \Cpp{} and ported to \R{} via
\Rcpp{} to facilitate fast computation.
\end{itemize}
\end{frame}

\begin{frame}{Model assumptions}
% \scriptsize
Data without family structure~\cite{prentice1978}
\begin{itemize}
\item The input data are assumed to be organized as a \texttt{matrix} or \df{}, with rows
corresponding to samples and columns corresponding to the variables of interest (and
covariates) for each sample.
\item Support is also provided for GenABEL \texttt{gwaa.data} objects as alternative inputs.
\end{itemize}
Data with family structure~\cite{ripatti2004} [Experimental]
\begin{itemize}
\item The input \texttt{matrix} or \df{} is assumed to be organized such that records for each
family occur consecutively, and that records for offspring precede those for parents. The
variance matrix for the random effects is assumed to be of the form \texttt{var}$*$\texttt{K},
where \texttt{K} is a matrix of kinship coefficients between family members.
\item The following family groupings are permitted: (Individual), (Offspring, Offspring),
(Offspring, Parent), (Offspring, Parent, Parent), and (Offspring, Offspring, Parent, Parent).
Other family structures have not been implemented.
\end{itemize}
A sketch of the expected layout for family data is given on the next slide.
\end{frame}
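\begin{frame}[fragile]{Expected data layout for family data (sketch)}
\scriptsize
A minimal sketch, using hypothetical values, of the layout expected by the family-based
functions: records for each family are consecutive, and offspring records precede those of
the parents. The role codes (\texttt{'o'}, \texttt{'f'}, \texttt{'m'}) follow the family
example later in this vignette.
<<eval=FALSE>>=
# hypothetical layout for two trios; fam_group and fam_role are the
# arguments passed to, e.g., varEstFam() and groupedSurvFam()
layout <- data.frame(
  fam_group = c("1", "1", "1", "2", "2", "2"),  # family IDs, records grouped together
  fam_role  = c("o", "f", "m", "o", "f", "m"),  # offspring listed before parents
  x         = c(1, 0, 1, 0, 0, 0),              # variable of interest (e.g., genotype)
  gtime     = c(2, 3, 1, 4, 2, 2),              # grouped survival times
  delta     = c(1, 0, 1, 0, 1, 0))              # event indicators
layout
@
\end{frame}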
\begin{frame}[fragile]{Included functions}
\scriptsize
Functions for data without family structure\\
<<eval=FALSE>>=
thetaEst(Z=NULL, gtime, delta, method="BFGS")
betaEst(x, Z=NULL, alpha, theta=NULL, gtime, delta)
groupedSurv(x, Z=NULL, GenABEL.data=NULL, alpha, theta=NULL, gtime, delta,
            beta=0, nCores=1, reScore=FALSE)
geneStat(x, Z=NULL, GenABEL.data=NULL, alpha, theta=NULL, gtime, delta,
         beta=0, nCores=1,
         FUN=function(Uij, weight){sum((colSums(Uij)*weight)^2)}, geneSet)
@
Functions for data with family structure [Experimental]\\
<<eval=FALSE>>=
alphaEstFam(gtime, delta)
betaEstFam(x, fam_group, fam_role, alpha, var, gtime, delta, lower, upper)
varEstFam(x, fam_group, fam_role, alpha, gtime, delta, lower, upper, beta = 0)
groupedSurvFam(x, fam_group, fam_role, alpha, var, gtime, delta, beta=0)
varLLFam(x, fam_group, fam_role, alpha, var, gtime, delta, beta=0)
@
\end{frame}

\begin{frame}[fragile]{Usage overview}
\scriptsize
Most users will interface with the package through the \rfunc{thetaEst} and
\rfunc{groupedSurv} functions. The \rfunc{thetaEst} function provides maximum likelihood
estimates (MLEs) for the nuisance parameters, \ie{} the baseline survival rate and the
parameters for any covariates. The estimates are computed under the null hypothesis, \ie{}
that the variable of interest has no effect on the time to event. The \rfunc{thetaEst}
function requires a vector of grouped survival times, \texttt{gtime}, and a vector of event
indicators, \texttt{delta}, as arguments. If the model includes covariates, these values, in
the form of a \texttt{matrix} or \df{}, \texttt{Z}, are required as well. Optionally, users
can specify the method of optimization (\texttt{method} = ``BFGS'' or ``CG''), which is
passed to the \texttt{optim} function in \Cpp{}.\\
\vspace{2mm}
The \rfunc{groupedSurv} function is the core of the \pkgname{} package. As inputs, it
requires a \texttt{matrix} or \df{} of variables to be tested for association with the
outcome, \texttt{x}, a vector of grouped survival times and a vector of event indicators
(\texttt{gtime} and \texttt{delta}), as well as estimates for the nuisance parameters
(\texttt{alpha} and \texttt{theta}). Note that since \rfunc{thetaEst} estimates the nuisance
parameters under the null hypothesis, these estimates can be reused to calculate the
efficient score statistic for any number of variables of interest. If the model includes
covariates, the \texttt{Z} \texttt{matrix} or \df{} is also required.
\vspace{2mm}
In the context of GWAS, covariates and variables of interest can be provided as part of a
GenABEL object (\texttt{gwaa.data}), with the arguments \texttt{x} and \texttt{Z} instead
used to specify the columns of \texttt{gwaa.data} to be included in the analyses.\\
\end{frame}

\begin{frame}[fragile]{Usage overview}
\scriptsize
Users also have the option of specifying the number of cores available for parallel
processing (\texttt{nCores}). The \texttt{beta} argument is 0 by default, computing the
statistics under the null hypothesis, but a different value can be specified, \eg{} in order
to evaluate the statistic under an alternative hypothesis.\\
\vspace{2mm}
By default, the package returns a data frame containing the efficient score statistics, along
with the (unadjusted) asymptotic p-values, FWER-adjusted p-values~\cite{bonfcorrect}, and
local FDRs~\cite{qvalue1,qvalue2} for each of the variables of interest.
By setting the optional argument \texttt{reScore=TRUE}, the \rfunc{groupedSurv} function will
also return a matrix of the contributions of each sample to the efficient score statistic for
each variable of interest.\\
\vspace{2mm}
A supporting function, \rfunc{betaEst}, takes a vector of values of a variable of interest,
\texttt{x}, along with the same arguments as the \rfunc{thetaEst} function, and returns the
MLE of the log hazard ratio, $\beta$.\\
\vspace{2mm}
Please refer to the individual function manuals for more detailed explanations of the
arguments and returned values associated with each function.\\
\end{frame}
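\begin{frame}[fragile]{Typical call pattern (sketch)}
\scriptsize
A minimal, unevaluated sketch of the workflow just described, assuming grouped survival times
\texttt{gtime}, event indicators \texttt{delta}, covariates \texttt{Z}, and variables of
interest \texttt{x} are already available; a fully worked example appears in the
``Example with covariates'' section.
<<eval=FALSE>>=
# nuisance parameters estimated once, under the null hypothesis
thetaest <- thetaEst(Z=Z, gtime=gtime, delta=delta)
# efficient score statistics and p-values for the variables of interest
eff <- groupedSurv(x=x, Z=Z, alpha=thetaest$alpha, theta=thetaest$theta,
                   gtime=gtime, delta=delta, nCores=1)
@
\end{frame}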
\begin{frame}[fragile]{A note about coding grouped survival times}
\scriptsize
Under the grouped failure time model, the continuous survival time, $time \in [0,\infty)$, is
not observed. Instead, subjects are assessed for failure only at pre-specified time points,
$gtime_1, gtime_2, \dots, gtime_{r-1}$. These time points define $r$ adjacent intervals, \ie{}
$[0,gtime_1),[gtime_1,gtime_2),\dots,[gtime_{r-2},gtime_{r-1}),[gtime_{r-1},\infty)$.\\
\vspace{2mm}
The contribution of each subject to the likelihood used in the efficient score combines the
intervals the subject survived and, if applicable, the interval in which the event occurred.
For example, a subject who survives the first two intervals (\ie{} has not failed at
$gtime_1$ or $gtime_2$) but has failed by the third observation time should be coded as
(\texttt{gtime}$=gtime_3$, \texttt{delta}$=1$). Similarly, a subject who has not yet failed
at the fourth observation time, but who is lost to follow-up before the fifth observation
time, would be coded as (\texttt{gtime}$=gtime_5$, \texttt{delta}$=0$).\\
\vspace{2mm}
Note then that subjects who are censored at the first time point,
(\texttt{gtime}$=gtime_1$, \texttt{delta}$=0$), contribute no information to the likelihood.
Note also that subjects who have not failed by the final observation time point should be
considered censored at infinity, and coded (\texttt{gtime}$=$\texttt{Inf},
\texttt{delta}$=0$). These conventions are illustrated on the next slide.\\
\end{frame}
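\begin{frame}[fragile]{Coding grouped survival times: a small example}
\scriptsize
A hypothetical illustration of these conventions for four subjects, with the grouped times
coded by interval index (as in the simulated example in the next section): an event by the
third observation time, loss to follow-up before the fifth observation time, censoring at the
first time point, and no event by the final observation time.
<<eval=FALSE>>=
# hypothetical coding of gtime and delta for the four subjects described above
gtimeToy <- c(3, 5, 1, Inf)
deltaToy <- c(1, 0, 0, 0)
@
\end{frame}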
\section{Example with covariates}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% simulate data
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{Simulate grouped survival data}
\scriptsize
We first simulate continuous survival data:
<<>>=
set.seed(111)
n <- 1000
# effect size
beta <- 0.3
# covariate parameters
theta <- c(0.2, 0.2)
# variable of interest associated with outcome
MAF <- 0.05
x <- matrix(rbinom(n, 2, MAF), ncol = 1)
# additional variables of interest
xMore <- matrix(rbinom(n*100, 2, MAF), ncol = 100)
xMore <- cbind(x, xMore)
# covariate data (centered at 0)
z1 <- rnorm(n)
z2 <- rbinom(n, 1, 0.5) - 0.5
Z <- matrix(cbind(z1, z2), ncol = 2)
# continuous survival time
lam0 <- 1
cmax <- 3
lami <- lam0 * exp(x * beta + Z %*% theta)
stime <- rexp(n, lami)
ctime <- runif(n, 0, cmax)
delta <- stime < ctime
otime <- pmin(stime, ctime)
@
\end{frame}
% by(otime, x, summary)

\begin{frame}[fragile]{Generate grouped survival data}
\scriptsize
Then generate grouped survival times from the continuous survival data:
<<>>=
# grouped observation time points
ntps <- 5
r <- ntps + 1
# last observation time
maxbreakq <- 0.85
maxbreak <- qexp(maxbreakq, lam0)
# grouped survival times
breaks <- (1:ntps) * (maxbreak/ntps)
gtime <- findInterval(otime, breaks) + 1
delta[gtime == r] <- FALSE
dctime <- findInterval(ctime, breaks) + 1
delta[gtime == dctime] <- FALSE
delta <- as.numeric(delta)
gtime[which(gtime == r)] <- Inf
table(gtime, delta)
@
\end{frame}
% by(otime, x, summary)
% table(x, gtime)

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% load package
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{Example of \texttt{thetaEst}}
\scriptsize
Load \pkgname{} (after installing its dependent packages):
<<>>=
library(groupedSurv)
@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% example for thetaEst
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Estimate $\theta$ (including the baseline survival rate for each time interval and the
covariate parameters) under the null hypothesis ($\beta = 0$):\\
<<>>=
thetaest <- thetaEst(Z, gtime, delta)
thetaest
@
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% example for groupedSurv + betaEst
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{Examples of \texttt{groupedSurv} and \texttt{betaEst}}
\scriptsize
Compute the efficient score under the null hypothesis based on the estimated $\theta$:\\
<<>>=
eff <- groupedSurv(x=xMore, Z=Z, alpha=thetaest$alpha, theta=thetaest$theta,
                   gtime=gtime, delta=delta, beta=0, nCores=1)
head(eff)
@
One may wish to estimate $\beta$ for variables of interest found to be associated with the
outcome:\\
<<>>=
betaest <- betaEst(x=x, Z=Z, alpha=thetaest$alpha, theta=thetaest$theta,
                   gtime=gtime, delta=delta)
betaest
@
\end{frame}
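\begin{frame}[fragile]{Optional: per-sample score contributions}
\scriptsize
As noted in the usage overview, setting \texttt{reScore=TRUE} also returns the per-sample
contributions to the efficient score statistics. A brief, unevaluated sketch using the
simulated data; see the \rfunc{groupedSurv} manual for the exact structure of the returned
object.
<<eval=FALSE>>=
# request the per-sample score contributions along with the test statistics
effRe <- groupedSurv(x=xMore, Z=Z, alpha=thetaest$alpha, theta=thetaest$theta,
                     gtime=gtime, delta=delta, beta=0, nCores=1, reScore=TRUE)
str(effRe, max.level=1)
@
\end{frame}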
\section{Examples of alternative data sources}
\begin{frame}[fragile]{Examples of alternative data sources}
\scriptsize
One can alternatively input SNP information directly from a GenABEL object.
<<eval=FALSE>>=
library(GenABEL)
data(srdta)
GenABELdat <- srdta[1:n]
snpsToTest <- GenABELdat@gtdata@snpnames[1:200]
eff <- groupedSurv(x=snpsToTest, Z=Z, GenABEL.data=GenABELdat, alpha=thetaest$alpha,
                   theta=thetaest$theta, gtime=gtime, delta=delta, beta=0, nCores=1)
@
Genotype data can be imported from a binary PLINK~\cite{plink} file using the
\texttt{BEDMatrix} package.
<<eval=FALSE>>=
library(BEDMatrix)
path <- system.file("extdata", "example.bed", package = "BEDMatrix")
m <- BEDMatrix(path)
# extract genotypes for the specified variants
xPLINK <- m[, c("snp0_A", "snp1_C", "snp2_G")]
@
Genotype dosage data can also be extracted directly from a VCF file using the
\texttt{VariantAnnotation} package~\cite{VariantAnnotation}.
<<eval=FALSE>>=
system("wget ftp://share.sph.umich.edu/minimac3/DosageConvertor/DosageConvertor.v1.0.4.tar.gz")
system("tar -xzvf DosageConvertor.v1.0.4.tar.gz")
library(VariantAnnotation)
exampleVcfFile <- "./DosageConvertor/test/TestDataImputedVCF.dose.vcf.gz"
myvcf <- readVcf(exampleVcfFile, "hg19")
dosedat <- assay(myvcf, "DS")
xVCF <- t(dosedat)
@
\end{frame}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% example for genestat
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Analysis of gene sets}
\begin{frame}[fragile]{Analysis of gene sets}
\scriptsize
Specify SNP and gene information:
<<>>=
geneInfo <- data.frame(gene=c("BRCA1","BRCA2"), chr=c(17,13),
                       start=c(41196312, 32889611), end=c(41277500, 32973805),
                       stringsAsFactors=FALSE)
snpInfo <- data.frame(chr=c(17,17,13,13),
                      pos=c(41211653,41213996,32890026,32890572),
                      rsid=c("rs8176273","rs8176265","rs9562605","rs1799943"),
                      stringsAsFactors=FALSE)
@
Use the \texttt{snplist} package to create gene sets:
<<eval=FALSE>>=
library(snplist)
setGeneTable(geneInfo)
setSNPTable(snpInfo)
geneset <- makeGeneSet()
@
<<eval=FALSE>>=
geneset
@
\end{frame}

% snplist not actually evaluated, to avoid package dependence
<<genesetHide, echo=FALSE>>=
geneset <- list(BRCA1=c("rs8176273", "rs8176265"), BRCA2=c("rs9562605", "rs1799943"))
@

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% example for genestat
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{Simulate SNP data and compute statistics}
\scriptsize
Simulate genotype data:
<<>>=
G <- matrix(rbinom(n*nrow(snpInfo), 2, 0.5), ncol=nrow(snpInfo))
colnames(G) <- snpInfo$rsid
@
SNPs can be weighted within gene sets.
Generate dummy weights and append them:
<<>>=
for(i in seq_len(length(geneset))){
  weight <- rep(1, length(geneset[[i]]))
  geneset[[i]] <- list(geneset[[i]], weight)
}
@
Compute SKAT statistics for each gene set:
<<>>=
res <- geneStat(x=G, Z=Z, alpha=thetaest$alpha, theta=thetaest$theta,
                gtime=gtime, delta=delta, geneSet=geneset)
res$stat
@
\end{frame}

\section{Example with family structure}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% example for family structure
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{frame}[fragile]{Incorporating family structure}
\scriptsize
Generate grouped survival data:
<<>>=
rm(list=ls())
set.seed(111)
m <- 10
# family ID
fgrp <- as.character(rep(1:m, each=3))
# role within family
f_ind <- rep(c('o','f','m'), m)
# grouped survival data
gtimes <- sample(1:4, m*3, replace=TRUE)
deltas <- sample(0:1, m*3, replace=TRUE)
# variable of interest
g <- rbinom(m*3, 2, 0.1)
# parameter search bounds
upper <- 2
lower <- 0
@
\end{frame}

\begin{frame}[fragile]{Examples of \texttt{alphaEstFam} and \texttt{varEstFam}}
\scriptsize
Estimate the baseline survival rates (always under the null hypothesis):
<<>>=
alphaest <- alphaEstFam(gtimes, deltas)
alphaest
@
Estimate the frailty variance under the null by setting \texttt{beta=0}:
<<>>=
varest <- varEstFam(x=g, fam_group=fgrp, fam_role=f_ind, alpha=alphaest,
                    gtime=gtimes, delta=deltas, lower, upper, beta=0)
varest
@
\end{frame}

\begin{frame}[fragile]{Examples of \texttt{groupedSurvFam}, \texttt{PvalueFam}, and \texttt{betaEstFam}}
\scriptsize
Compute the efficient score under the null by setting \texttt{beta=0}, and compute the
associated p-values:
<<>>=
effFam <- groupedSurvFam(x=g, fam_group=fgrp, fam_role=f_ind, alpha=alphaest,
                         var=varest, gtime=gtimes, delta=deltas, beta=0)
PvalueFam(effFam)
@
Estimate $\beta$:
<<>>=
betaEstFam(x=g, fam_group=fgrp, fam_role=f_ind, alpha=alphaest, var=varest,
           gtime=gtimes, delta=deltas, lower, upper)
@
\end{frame}

\section*{References}
\begin{frame}[fragile]{References}
\scriptsize
\begin{thebibliography}{9}
\setbeamertemplate{bibliography item}[text]
\bibitem{prentice1978}
Prentice, RL and Gloeckler, LA (1978).
Regression analysis of grouped survival data with application to breast cancer data.
\emph{Biometrics}, 34(1):57-67.
\bibitem{ripatti2004}
Ripatti, S and Palmgren, J (2000).
Estimation of multivariate frailty models using penalized partial likelihood.
\emph{Biometrics}, 56(4):1016-1022.
\bibitem{bonfcorrect}
Bonferroni, CE (1935).
Il calcolo delle assicurazioni su gruppi di teste.
\emph{Studi in Onore del Professore Salvatore Ortu Carboni}, 13-60.
\bibitem{qvalue1}
Storey, JD (2003).
The positive false discovery rate: a Bayesian interpretation and the q-value.
\emph{Annals of Statistics}, 31(6):2013-2035.
\bibitem{qvalue2}
Storey, JD, Taylor, JE, and Siegmund, D (2004).
Strong control, conservative point estimation and simultaneous conservative consistency of
false discovery rates: a unified approach.
\emph{Journal of the Royal Statistical Society: Series B (Statistical Methodology)},
66:187-205.
\bibitem{plink}
Purcell, S, Neale, B, Todd-Brown, K, Thomas, L, Ferreira, MAR, Bender, D, Maller, J,
Sklar, P, de Bakker, PIW, Daly, MJ, and Sham, PC (2007).
PLINK: a tool set for whole-genome association and population-based linkage analyses.
\emph{American Journal of Human Genetics}, 81(3):559-575.
\bibitem{VariantAnnotation}
Obenchain, V, Lawrence, M, Carey, V, Gogarten, S, Shannon, P, and Morgan, M (2014).
VariantAnnotation: a Bioconductor package for exploration and annotation of genetic variants.
\emph{Bioinformatics}, 30(14):2076-2078.
% \bibitem{vcfR}
% Knaus, BJ and Gr{\"u}nwald, NJ (2017).
% vcfR: a package to manipulate and visualize variant call format data in R.
% \emph{Molecular Ecology Resources}, 17:44-53.
% \setbeamertemplate{bibliography item}[triangle]
\end{thebibliography}
\end{frame}

\section*{Session Information}
\begin{frame}[fragile]{Session Information}
\scriptsize
<<>>=
toLatex(sessionInfo(), locale=FALSE)
@
%<<>>=
%print(paste("Start Time",stdt))
%print(paste("End Time ",date()))
%@
\end{frame}
\end{document}