% -*- mode: noweb; noweb-default-code-mode: R-mode; -*-
%\VignetteIndexEntry{HowTo plw}
\documentclass[a4paper]{article}
\title{HowTo plw}
\author{Magnus {\AA}strand}

\SweaveOpts{echo=FALSE}
\usepackage{a4wide}
\usepackage[round]{natbib}

\begin{document}

\maketitle

\section{Introduction}
This vignette describes how to use {\it plw}, an R implementation of the Probe
level Locally moderated Weighted median-t (PLW) method \citep{Astrand:2007c}
for finding differentially expressed genes. PLW uses an empirical Bayes model
taking into account the dependency between variability and intensity-level.
A global covariance matrix is also used allowing for differing variances
between arrays as well as array-to-array correlations, and thus PLW performs
weighted analysis. PLW is specially designed for Affymetrix type arrays (or
other multiple-probe arrays). Instead of making inference on probe-set
summaries, comparisons are made separately for each perfect-match probe and are
then summarized into one score for the probe-set. The Locally
Moderated Weighted-t (LMW) method, applying the model of PLW on probe-set
summaries or data from single probe arrays, is also implemented in the
{\it plw} package.
See \cite{Astrand:2007c} for details on PLW and LMW, and 
\cite{Kristiansson:2005,Kristiansson:2006}, \cite{Sjogren:2007}, and
\cite{Astrand:2007b} for details on weighted analysis for microarrays. 
PLW is demonstrated in Sections~\ref{sec:data} to \ref{sect:output}, and LMW in
Section~\ref{sect:LMW}.

\section{Data}\label{sec:data}
%The R-package {\it plw} depends on a number of other R-packages from the
%Bioconductor\footnote{http://bioconductor.org/} project or the 
%Comprehensive R Archive Network\footnote{http://www.r-project.org/} (CRAN).
%All packages must be installed to be able to run {\it plw} and
%they are loaded automatically when loading {\it plw}:
The R-package {\it plw} depends on the {\it affy} package, available from the
Bioconductor\footnote{http://bioconductor.org/} project, which is loaded
automatically when loading {\it plw}: 
<<echo=TRUE,results=hide>>=
require(plw)
@
The {\it affy} package contains functions for reading CEL-file data into an
\verb*|AffyBatch| object using the function \verb*|ReadAffy|. It also contains
functions for doing low-level analysis, such as background correction,
normalization, and calculating expression indexes. For example, the \verb*|rma|
function performs all three steps in one call and returns an
\verb*|ExpressionSet| object holding RMA expression indexes. For further
details on how to read CEL-file data into R use
%<<echo=TRUE,results=hide>>=
\begin{verbatim}
> help(ReadAffy)
\end{verbatim}

In this vignette the PLW method is demonstrated using the
\verb*|AffySpikeU95Subset| data set of 6 arrays and 1016 probe-sets. The data
set was loaded using the \verb*|ReadAffy| function and is included in the
{\it plw} package. \verb*|AffySpikeU95Subset| is a sub-set of the Affymetrix
U95 Latin-Square spike-in data set of 59 arrays and 12626 probe-sets.
For these data there are 16 known differentially expressed genes/probe-sets
\cite{Cope:2004}, of which all 16 are included in
\verb*|AffySpikeU95Subset|. The data set in loaded using
<<echo=TRUE,results=hide>>=
data(AffySpikeU95Subset)
@
<<echo=TRUE,print=TRUE>>=
AffySpikeU95Subset
@
\section{Running PLW}
The \verb*|AffySpikeU95Subset| data set use data from groups a and b of the
Affymetrix U95 Latin-Square spike-in data set. Here we show how to do a
comparison of these two groups. The fifth letter of the CEL-file names holds
the group assignment of each array which we can inspect using the
\verb*|pData| function
<<echo=TRUE,print=TRUE>>=
pData(AffySpikeU95Subset)
@
We define a design using the function \verb*|model.matrix|, and a contrast
matrix for comparing groups a and b.
<<echo=TRUE,results=hide>>=
group<-factor(rep(letters[1:2],each=3))
design<-model.matrix(~group-1)
contrast<-matrix(c(1,-1),1,2)
@
<<echo=TRUE,print=TRUE>>=
design
contrast
@
Now we are ready to use the \verb*|plw| function.
<<echo=TRUE,results=hide>>=
plwFit<-plw(AffySpikeU95Subset,design=design,contrast=contrast,epsilon=1e-05)
@
<<echo=TRUE,print=TRUE>>=
plwFit
@
From the output we can see that steps 1 and 2 of the procedure used in
\verb*|plw| required \Sexpr{plwFit$iter[1]} and \Sexpr{plwFit$iter[2]}
iterations, respectively (see \cite{Astrand:2007c} for details of the procedure).
The estimated value for the $m$-parameter is \Sexpr{round(plwFit$m,3)} and the
degrees of freedom for the moderated t-statistics is 
\Sexpr{round(plwFit$dfT,1)}.

\section{Analysing PLW output}\label{sect:output}
%---------------------  DEGS --------------
There are three functions for displaying the ranking of probe-sets with respect
to differential expression, \verb*|topRankSummary|, \verb*|plotSummaryT|, and
\verb*|plotSummaryLog2FC|. All three show results for a given number of top
ranking probe-sets (e.g. probe-set ranked 1-20), for a specific list of ranks
(e.g. probe-set ranked 1,5, and 7), or for a specific list of probe-sets.
For example we can display the result for the 16 spiked-in probsets.
<<echo=TRUE,print=TRUE>>=
topRankSummary(plwFit,genes=spikedProbesU95)
@
We can also display results for probe-sets ranked 11 to 20,
<<echo=TRUE,print=TRUE>>=
topRankSummary(plwFit,genesOfRank=11:20)
@
%\clearpage
Alternatively ,we can display the result for the 20 top ranking probe-sets,
<<echo=TRUE,print=TRUE>>=
topRankSummary(plwFit,nGenes=20)
@
The other two functions plot individual values for each perfect-match probe
together with the median value. The \verb*|plotSummaryT| plots t-statistics,
whereas \verb*|plotSummaryLog2FC| plots logged fold-change values, as shown
in Figures~\ref{plotT} and \ref{plotFC}, respectively.

\begin{figure}[htbp]
  \begin{center}
<<echo=TRUE,fig=TRUE>>=
plotSummaryT(plwFit,genes=spikedProbesU95)
@
    \caption{T-statistics for spiked-in probsets.}\label{plotT}
  \end{center}
\end{figure}
\begin{figure}[htbp]
  \begin{center}
<<echo=TRUE,fig=TRUE>>=
plotSummaryLog2FC(plwFit,nGenes=15)
@
    \caption{Logged fold-change values for the 15 top ranking probe-sets.}\label{plotFC}
  \end{center}
\end{figure}
\clearpage

%---------------------  model fit --------------
The plw function uses an empirical bayes model with an inverse-gamma prior for
the unknown variances, where the scale parameter of the inverse-gamma prior is
modeled as a function of mean intensity. With the \verb*|varHistPlot| function
we can compare the fitted distribution for $\log(s^2)$ with the observed data,
and with the \verb*|scaleParameterPlot| function we can look at the fitted
curve for the scale parameter $\nu$ of the inverse-gamma prior. See
Figures~\ref{varHistPlot} and \ref{scaleParameterPlot}, respectively.

\begin{figure}[htbp]
  \begin{center}
<<echo=TRUE,fig=TRUE>>=
varHistPlot(plwFit)
@
    \caption{Comparing the fitted distribution for $\log(s^2)$ with the observed data.}\label{varHistPlot}
  \end{center}
\end{figure}

\begin{figure}[htbp]
  \begin{center}
<<echo=TRUE,fig=TRUE>>=
scaleParameterPlot(plwFit)
@
    \caption{Fitted curve for the scale parameter $\nu$ of the inverse-gamma prior.}\label{scaleParameterPlot}
  \end{center}
\end{figure}
\clearpage

%---------------------  LMW on --------------
\section{LMW on two-color microarray data}\label{sect:LMW}
In \cite{Astrand:2007c} the LMW method is used on RMA expression indexes,
and \verb|example(lmw)| shows how to use LMW on Affymetrix or other one-color
array data. This section demonstrates how to use LMW on the ApoAI data-set
\citep{Callow:2000},  comparing 8 ApoAI knockout mice with 8 normal mice using
a set of $n=16$ two-color cDNA-arrays. Data was pre-processed as described in
\citep{Callow:2000} and the analysis presented here is based on the 6068 genes
(out of 6226) having no missing values.
<<echo=TRUE,results=hide,eval=FALSE>>=
source("http://www.math.chalmers.se/~astrandm/plw/GetApoAIdata.R")
RG <- GetApoAIdata()
require(limma)
MA <- normalizeWithinArrays(RG)
rownames(MA$M) <- MA$genes$Name
ii <- apply(is.na(MA$M),1,any)
MA$A <- MA$A[!ii,]
MA$M <- MA$M[!ii,]
@
Arrays 1 to 8 is the control group with mRNA from normal mice, whereas arrays
9 to 16 are from the knockout group.  Thus, we specify a design and contrast
matrix for the comparison of knock out mice with the control group of normal
mice.
<<echo=TRUE,results=hide,eval=FALSE>>=
design <- cbind("Control-Ref"=1,"KO-Control"=MA$targets$Cy5=="ApoAI KO")
contrast <- matrix(0:1,ncol=2)
@
<<echo=TRUE,print=TRUE,eval=FALSE>>=
design
contrast
@
The analysis using LMW is done using the mean intensity of the sum of logged
green and red signal, respectively,  to model the scale parameter of the
inverse-gamma prior for error variances. Also, the spline-knots for the
scale-parameter function are set using a set of sample quantiles 
(10, 30, 50, 70, and the 90\% quantile) of the mean intensity instead of the
default knots computing using an internal function.
<<echo=TRUE,results=hide,eval=FALSE>>=
meanX <- apply(MA$A,1,mean)
knots <- quantile(meanX,seq(0.1,0.9,by=0.2))
lmwFit <- lmw(MA$M,design=design,contrast=contrast,meanX=meanX,knots=knots)
@
<<echo=TRUE,print=TRUE,eval=FALSE>>=
lmwFit
@
From the fitted model we can select the top 10 ranked genes from the analysis, 
<<echo=TRUE,print=TRUE,eval=FALSE>>=
topRankSummary(lmwFit,nGenes=10)
@
and inspect the model fit for the inverse-gamma prior together with the
estimated scale-parameter curve,
%see Figures~\ref{varHistPlot2} and \ref{scaleParameterPlot2}, respectively.
%\begin{figure}[htbp]
%  \begin{center}
%<<echo=TRUE,fig=TRUE>>=
%varHistPlot(lmwFit)
%@
%    \caption{Comparing the fitted distribution for $\log(s^2)$ with the observed data from the ApoAI knockout experiment.}\label{varHistPlot2}
%  \end{center}
%\end{figure}

%\begin{figure}[htbp]
%  \begin{center}
%<<echo=TRUE,fig=TRUE>>=
%scaleParameterPlot(lmwFit)
%@
%    \caption{Fitted curve for the scale parameter $\nu$ of the inverse-gamma prior from the analysis of the ApoAI data-set.}\label{scaleParameterPlot2}
%  \end{center}
%\end{figure}
\clearpage


%\bibliographystyle{plainnat}
%\bibliography{/users/math/astrandm/Latex/bibfile}
%\end{document}

\begin{thebibliography}{7}
\expandafter\ifx\csname natexlab\endcsname\relax\def\natexlab#1{#1}\fi
\expandafter\ifx\csname url\endcsname\relax
  \def\url#1{{\tt #1}}\fi

\bibitem[{\AA}strand et~al.(2007{\natexlab{a}}){\AA}strand, Mostad, and
  Rudemo]{Astrand:2007c}
M.~{\AA}strand, P.~Mostad, and M~Rudemo.
\newblock Empirical bayes models for multiple probe type arrays at the probe
  level.
\newblock Technical report, Chalmers University of Technology and G\"oteborg
  University, Department of Mathematical Statistics, 2007{\natexlab{a}}.
\newblock URL \url{{{http://www.math.chalmers.se/Math/Research/
  Preprints/2007/27.pdf}}}.

\bibitem[{\AA}strand et~al.(2007{\natexlab{b}}){\AA}strand, Mostad, and
  Rudemo]{Astrand:2007b}
M.~{\AA}strand, P.~Mostad, and M~Rudemo.
\newblock Improved covariance matrix estimators for weighted analysis of
  microarray data.
\newblock {\em J. Comput. Biol.}, Accepted, appearing in number 10, 2007{\natexlab{b}}.

\bibitem[Callow et~al.(2000)Callow, Dudoit, Gong, Speed, and
  Rubin]{Callow:2000}
Matthew~J. Callow, Sandrine Dudoit, Elaine~L. Gong, Terence~P. Speed, and
  Edward~M. Rubin.
\newblock {Microarray Expression Profiling Identifies Genes with Altered
  Expression in HDL-Deficient Mice}.
\newblock {\em Genome Res.}, 10\penalty0 (12):\penalty0 2022--2029, 2000.

\bibitem[Cope et~al.(2004)Cope, Irizarry, Jaffee, Wu, and Speed]{Cope:2004}
Leslie~M. Cope, Rafael~A. Irizarry, Harris~A. Jaffee, Zhijin Wu, and Terence~P.
  Speed.
\newblock {A benchmark for Affymetrix GeneChip expression measures}.
\newblock {\em Bioinformatics}, 20\penalty0 (3):\penalty0 323--331, 2004.

\bibitem[Kristiansson et~al.(2005)Kristiansson, Sj{\"o}gren, Rudemo, and
  Nerman]{Kristiansson:2005}
Erik Kristiansson, Anders Sj{\"o}gren, Mats Rudemo, and Olle Nerman.
\newblock Weighted analysis of paired microarray experiments.
\newblock {\em Stat. Appl. Genet. Mol. Biol.}, 4\penalty0 (1):\penalty0 article
  30, 2005.

\bibitem[Kristiansson et~al.(2006)Kristiansson, Sj{\"o}gren, Rudemo, and
  Nerman]{Kristiansson:2006}
Erik Kristiansson, Anders Sj{\"o}gren, Mats Rudemo, and Olle Nerman.
\newblock Quality optimised analysis of general paired microarray experiments.
\newblock {\em Stat. Appl. Genet. Mol. Biol.}, 5\penalty0 (1):\penalty0 article
  10, 2006.

\bibitem[Sj{\"o}gren et~al.(2007)Sj{\"o}gren, Kristiansson, Rudemo, and
  Nerman]{Sjogren:2007}
Anders Sj{\"o}gren, Erik Kristiansson, Mats Rudemo, and Olle Nerman.
\newblock Weighted analysis of general microarray experiments.
\newblock {\em BMC Bioinformatics}, 8\penalty0 (1):\penalty0 article 387, 2007.

\end{thebibliography}

\end{document}