\name{cosmo}
\alias{cosmo}
\title{Constrained motif detection main function}
\description{
  cosmo searches a set of unaligned DNA sequences for a shared
	motif that may, for example, represent a common transcription
	factor binding site. The algorithm is similar to MEME, but
	also allows the user to specify a set of constraints that the 	
	position weight matrix of the unknown motif must satisfy. 
	Such constraints may include bounds on the information content
	across certain regions of the unknown motif, for example, and
	can often be formulated on the basis of prior knowledge about 
	the structure of the transcription factor in question.
}
\usage{
  cosmo(seqs="browse",constraints="None", minW=6, maxW=15,
    models = "ZOOPS", revComp = TRUE, minSites = NULL, maxSites = NULL, 
    starts = 5, approx = "over", cutFac = 5, wCrit = "bic", 
    wFold = 5, wTrunc = 100, modCrit = "lik", modFold = 5, modTrunc = 100, 
    conCrit = "likCV", conFold = 5, conTrunc = 90, intCrit = "lik", 
    intFold = 5, intTrunc = 100, maxIntensity = FALSE, lstarts = FALSE, 
    backSeqs = NULL, backFold = 5, bfile = NULL, transMat = NULL,
    order = NULL, maxOrder=6, silent = FALSE) 
}
\arguments{
  \item{seqs}{ This argument specifies the sequences to be analyzed.
    If seqs == "browse", a browser appears that allows the user to
    select a file that contains the sequences in FASTA format.
    If seqs is another character string, it is assumed to give the
    path to a FASTA file containing the sequences of interest.
    Lastly, seqs may be a list with each element representing a sequence
    in the form of a single string such as "ACGTAGCTAG" ("seq" entry)
    and a description ("desc" entry).}
  \item{constraints}{ These are the constraints that are to be imposed
    on the unknown motif. If constraints == "None", cosmo() will be run
    without constraints. If constraints == "GUI" and the \code{cosmoGUI}
    package has been installed, a GUI will pop up that allows the user
    to interactively create a set of constraints, either from scratch or
    on the basis of several templates of interest. If constraints is
    another character string, it is assumed to give the path to a file
    that contains the constraint definitions in the standard text format
    (see http://cosmoweb.berkeley.edu/constraints.html). Lastly,
    constraints may be an object of class constraintSet or a list of
    such objects that defines the constraints of interest.}
  \item{minW}{ \code{numeric} indicating the minimum motif width to consider}
  \item{maxW}{ \code{numeric} indicating the maximum motif width to consider}
  \item{models}{ \code{character} a vector containing the different
    models to be considered for the distribution of motif
    occurrences ("OOPS", "ZOOPS", and "TCM"). The
    One-Occurrence-Per-Sequence (OOPS) model assumes that each
    sequence contains exactly one occurrence of the motif. The
    Zero-or-One-Occurrences-Per-Sequence model allows zero or one
    occurrences of the motif on a given sequence. The
    Two-Compoment-Mixture (TCM) model allows an arbitrary number of
    motif occurrences on a given seqence.}
  \item{revComp}{ \code{logical} indicating whether motifs are allowed
    to occur in the reverse complement orientation.}
  \item{minSites}{\code{numerical} The minimum number of motif
    occurrences in the input sequences (default: 2)}
  \item{maxSites}{\code{numerical} The maximum number of motif
    occurrences in the input sequences (default: MIN(5*number of
    sequences, 50))}
  \item{starts}{\code{numerical} number of starting values to use for
    each optimization}
  \item{approx}{approximation for TCM likelihood; one of "over", "cut", "exact"}
  \item{cutFac}{\code{numerical} if TCM model is approximated by over or
    cut models, subsequences are of length cutFac * motif width}
  \item{wCrit}{Criterion for choosing the motif width. This can be either
    "lik" for the likelihood, "aic" for Akaike's Information Criterion,
    "bic" for the Bayesian Information Criterion, "eval" for the E-value
    of the alignment of the predicted motif sites, or "likCV" for
    likelihood-based cross-validation.}
  \item{wFold}{\code{numerical} cross-validation fold for selecting
    motif width}
  \item{wTrunc}{\code{numerical} truncate loss-function for selecting
    motif width to this percentile (1-100)}
  \item{modCrit}{Criterion for choosing the model type. This can be either
    "lik" for the likelihood, "aic" for Akaike's Information Criterion,
    "bic" for the Bayesian Information Criterion, "eval" for the E-value
    of the alignment of the predicted motif sites, or "likCV" for
    likelihood-based cross-validation.}
  \item{modFold}{\code{numerical} cross-validation fold for selecting
    the model type}
  \item{modTrunc}{\code{numerical} truncate loss-function for selecting
    model type to this percentile (1-100)}
  \item{conCrit}{Criterion for choosing the constraint set. This can be either
    "lik" for the likelihood, "eval" for the E-value of the alignment of
    the predicted motif sites, "likCV" for likelihood-based cross-validation,
    or "pwmCV" for cross-validation based on the Euclidean norm between two
    position weight matrices.}
  \item{conFold}{\code{numerical} cross-validation fold for selecting
    the constraint set (likelihood cross-validation only).}
  \item{conTrunc}{\code{numerical}truncate loss-function for selecting
    constraint set to this percentile (1-100)}
  \item{intCrit}{Criterion for estimating the intensity parameter in
    the ZOOPS or TCM model. This can be either "lik" for the likelihood,
    "aic" for Akaike's Information Criterion, "bic" for the Bayesian
    Information Criterion, or "eval" for the E-value of the alignment
    of the predicted motif sites.}
  \item{intFold}{\code{numerical} cross-validation fold for selecting
    the intensity parameter}
  \item{intTrunc}{\code{numerical} truncate loss-function for selecting
    intensity parameter to this percentile (1-100)}
  \item{maxIntensity}{\code{logical} maximize likelihood function with
    respect to intensity parameter (in ZOOPS or TCM model) instead of
    using profiling approach?} 
  \item{lstarts}{\code{logical} should likelihood-based starting values
    be used rather than E-value-based starting values?}
  \item{backSeqs}{ This argument specifies the sequences that are to be
    used to estimate the background Markov model. If backseqs == NULL,
    the background model is estimated from the sequences supplied in
    the seqs argument. If backSeqs == "browse", a browser appears that
    allows the user to select a file that contains the sequences in
    FASTA format. If backSeqs is another character string, it is assumed
    to give the path to a FASTA file containing the sequences of interest.
    Lastly, backSeqs may be a list with each element representing a sequence
    in the form of a single string such as "ACGTAGCTAG" ("seq" entry)
    and a description ("desc" entry).}
  \item{backFold}{\code{numerical} cross-validation fold for selecting
    order of background Markov model.}
  \item{bfile}{\code{character} The name of a MEME-style background file
    for specifying the background Markov model. Such a file lists the
    frequencies of all tuples of all possible tuples of length up to
    order + 1. See the help file on the function \code{bfile2tmat()} for
    an example.}
  \item{transMat}{The transition matrix to use for the
    background Markov model. This is a list of matrices, with the first
    matrix given the transition probabilities for the 0th order Markov
    model, the second matrix giving the transition probabilities for a
    1st order Markov model, and so on. The entry in cell(i,j) of a k-th
    order transition matrix gives the probability of observing the
    nucleotide in column j given that the previous k nucleotides are
    equal to those in row i. Type 'data(transMats)' to look at an
    example. The function \code{\link{bgModel}} can be
    used to obtain a transition matrix from a set of sequences that can
    be used for this argument. The function \code{\link{bfile2tmat}} may
    be used to obtain a transition matrix  from a MEME-style background file.}
  \item{order}{\code{numerical} order of Markov background model} 
  \item{maxOrder}{\code{numerical} maximum order to consider for Markov background model} 
  \item{silent}{\code{logical} suppress output?}     
}
\details{

}
\value{
  An object of class \code{cosmo}, returning all the results of the motif detection analysis.
}
\references{Oliver Bembom, Sunduz Keles, and Mark J. van der Laan,
  "Supervised Detection of Conserved Motifs in DNA Sequences with cosmo"
  (2007). Statistical Applications in Genetics and Molecular Biology:
  Vol. 6 : Iss. 1, Article 8. http://www.bepress.com/sagmb/vol6/iss1/art8}
\author{Oliver Bembom, \email{bembom@berkeley.edu},  Fabian Gallusser, \email{fgallusser@berkeley.edu}}
\note{}

\seealso{
  \code{\link{bgModel}},
  \code{\link{bfile2tmat}}
}
\examples{
## initialize constraint set
## consisting of three intervals
## 1st and 3rd intervals are 3bp long
## middle interval is variable lenght
conSet <- makeConSet(numInt=3, type=c("B","V","B"),length=c(3,NA,3))

## construct two bound constraints
boundCon1 <- makeBoundCon(lower=1.0, upper=2.0)
boundCon2 <- makeBoundCon(lower=0.0, upper=1.0)

## construct palindromic constraint
## require intervals 1 and 3 to be palindromes
## to within 0.05 tolerance
palCon1 <- makePalCon(int1=1, int2=3, errBnd=0.05)

## add constraints to initial constraint set
constraint <- list(boundCon1, boundCon2, palCon1)
int <- list(1, 2, NA)
conSet <- addCon(conSet=conSet, constraint=constraint, int=int)

## path to example sequence file in FASTA format
seqFile <- system.file("Exfiles","seq.fasta",package="cosmo")

## search for motifs of width 8
## assume zero or one occurrences of motif per sequence (ZOOPS)
res <- cosmo(seqs=seqFile, constraints=conSet, minW=8, maxW=8, models="ZOOPS")
plot(res)
}
\keyword{misc}