\name{PDict-class} \docType{class} % PreprocessedTB class: \alias{class:PreprocessedTB} \alias{PreprocessedTB-class} \alias{PreprocessedTB} \alias{length,PreprocessedTB-method} \alias{width,PreprocessedTB-method} \alias{tb} \alias{tb,PreprocessedTB-method} \alias{tb.width} \alias{tb.width,PreprocessedTB-method} \alias{nnodes} \alias{hasAllFlinks} \alias{computeAllFlinks} \alias{initialize,PreprocessedTB-method} \alias{duplicated,PreprocessedTB-method} % Twobit class: \alias{class:Twobit} \alias{Twobit-class} \alias{Twobit} \alias{show,Twobit-method} \alias{initialize,Twobit-method} % ACtree2 class: \alias{class:ACtree2} \alias{ACtree2-class} \alias{ACtree2} \alias{nnodes,ACtree2-method} \alias{hasAllFlinks,ACtree2-method} \alias{computeAllFlinks,ACtree2-method} \alias{show,ACtree2-method} \alias{initialize,ACtree2-method} % PDict3Parts class: \alias{class:PDict3Parts} \alias{PDict3Parts-class} \alias{PDict3Parts} \alias{length,PDict3Parts-method} \alias{width,PDict3Parts-method} \alias{head,PDict3Parts-method} \alias{tb,PDict3Parts-method} \alias{tb.width,PDict3Parts-method} \alias{tail,PDict3Parts-method} % PDict class: \alias{class:PDict} \alias{PDict-class} \alias{length,PDict-method} \alias{width,PDict-method} \alias{names,PDict-method} \alias{names<-,PDict-method} \alias{[[,PDict-method} \alias{duplicated,PDict-method} \alias{patternFrequency} \alias{patternFrequency,PDict-method} % TB_PDict class: \alias{class:TB_PDict} \alias{TB_PDict-class} \alias{TB_PDict} \alias{head,TB_PDict-method} \alias{tb,TB_PDict-method} \alias{tb.width,TB_PDict-method} \alias{tail,TB_PDict-method} \alias{show,TB_PDict-method} % MTB_PDict class: \alias{class:MTB_PDict} \alias{MTB_PDict-class} \alias{MTB_PDict} \alias{as.list,MTB_PDict-method} \alias{show,MTB_PDict-method} % Expanded_TB_PDict class: \alias{class:Expanded_TB_PDict} \alias{Expanded_TB_PDict-class} \alias{Expanded_TB_PDict} % PDict() constructor: \alias{PDict} \alias{PDict,character-method} \alias{PDict,DNAStringSet-method} \alias{PDict,XStringViews-method} \alias{PDict,AsIs-method} \alias{PDict,probetable-method} \title{PDict objects} \description{ The PDict class is a container for storing a preprocessed dictionary of DNA patterns that can later be passed to the \code{\link{matchPDict}} function for fast matching against a reference sequence (the subject). \code{PDict} is the constructor function for creating new PDict objects. } \usage{ PDict(x, max.mismatch=NA, tb.start=NA, tb.end=NA, tb.width=NA, algorithm="ACtree2", skip.invalid.patterns=FALSE) } \arguments{ \item{x}{ A character vector, a \link{DNAStringSet} object or an \link{XStringViews} object with a \link{DNAString} subject. } \item{max.mismatch}{ A single non-negative integer or \code{NA}. See the "Allowing a small number of mismatching letters" section below. } \item{tb.start,tb.end,tb.width}{ A single integer or \code{NA}. See the "Trusted Band" section below. } \item{algorithm}{ \code{"ACtree2"} (the default) or \code{"Twobit"}. } \item{skip.invalid.patterns}{ This argument is not supported yet (and might in fact be replaced by the \code{filter} argument very soon). } } \details{ THIS IS STILL WORK IN PROGRESS! If the original dictionary \code{x} is a character vector or an \link{XStringViews} object with a \link{DNAString} subject, then the \code{PDict} constructor will first try to turn it into a \link{DNAStringSet} object. By default (i.e. if \code{PDict} is called with \code{max.mismatch=NA}, \code{tb.start=NA}, \code{tb.end=NA} and \code{tb.width=NA}) the following limitations apply: (1) the original dictionary can only contain base letters (i.e. only As, Cs, Gs and Ts), therefore IUPAC ambiguity codes are not allowed; (2) all the patterns in the dictionary must have the same length ("constant width" dictionary); and (3) later \code{matchPdict} can only be used with \code{max.mismatch=0}. A Trusted Band can be used in order to relax these limitations (see the "Trusted Band" section below). If you are planning to use the resulting \code{PDict} object in order to do inexact matching where valid hits are allowed to have a small number of mismatching letters, then see the "Allowing a small number of mismatching letters" section below. Two preprocessing algorithms are currently supported: \code{algorithm="ACtree2"} (the default) and \code{algorithm="Twobit"}. With the \code{"ACtree2"} algorithm, all the oligonucleotides in the Trusted Band are stored in a 4-ary Aho-Corasick tree. With the \code{"Twobit"} algorithm, the 2-bit-per-letter signatures of all the oligonucleotides in the Trusted Band are computed and the mapping from these signatures to the 1-based position of the corresponding oligonucleotide in the Trusted Band is stored in a way that allows very fast lookup. Only PDict objects preprocessed with the \code{"ACtree2"} algo can then be used with \code{matchPdict} (and family) and with \code{fixed="pattern"} (instead of \code{fixed=TRUE}, the default), so that IUPAC ambiguity codes in the subject are treated as ambiguities. PDict objects obtained with the \code{"Twobit"} algo don't allow this. See \code{?`\link{matchPDict-inexact}`} for more information about support of IUPAC ambiguity codes in the subject. } \section{Trusted Band}{ What's a Trusted Band? A Trusted Band is a region defined in the original dictionary where the limitations described above will apply. Why use a Trusted Band? Because the limitations described above will apply to the Trusted Band only! For example the Trusted Band cannot contain IUPAC ambiguity codes but the "head" and the "tail" can (see below for what those are). Also with a Trusted Band, if \code{matchPdict} is called with a non-null \code{max.mismatch} value then mismatching letters will be allowed in the head and the tail. Or, if \code{matchPdict} is called with \code{fixed="subject"}, then IUPAC ambiguity codes in the head and the tail will be treated as ambiguities. How to specify a Trusted Band? Use the \code{tb.start}, \code{tb.end} and \code{tb.width} arguments of the \code{PDict} constructor in order to specify a Trusted Band. This will divide each pattern in the original dictionary into three parts: a left part, a middle part and a right part. The middle part is defined by its starting and ending nucleotide positions given relatively to each pattern thru the \code{tb.start}, \code{tb.end} and \code{tb.width} arguments. It must have the same length for all patterns (this common length is called the width of the Trusted Band). The left and right parts are defined implicitely: they are the parts that remain before (prefix) and after (suffix) the middle part, respectively. Therefore three \link{DNAStringSet} objects result from this division: the first one is made of all the left parts and forms the head of the PDict object, the second one is made of all the middle parts and forms the Trusted Band of the PDict object, and the third one is made of all the right parts and forms the tail of the PDict object. In other words you can think of the process of specifying a Trusted Band as drawing 2 vertical lines on the original dictionary (note that these 2 lines are not necessarily straight lines but the horizontal space between them must be constant). When doing this, you are dividing the dictionary into three regions (from left to right): the head, the Trusted Band and the tail. Each of them is a \link{DNAStringSet} object with the same number of elements than the original dictionary and the original dictionary could easily be reconstructed from those three regions. The width of the Trusted Band must be >= 1 because Trusted Bands of width 0 are not supported. Finally note that calling \code{PDict} with \code{tb.start=NA}, \code{tb.end=NA} and \code{tb.width=NA} (the default) is equivalent to calling it with \code{tb.start=1}, \code{tb.end=-1} and \code{tb.width=NA}, which results in a full-width Trusted Band i.e. a Trusted Band that covers the entire dictionary (no head and no tail). } \section{Allowing a small number of mismatching letters}{ [TODO] } \section{Accessor methods}{ In the code snippets below, \code{x} is a PDict object. \describe{ \item{}{ \code{length(x)}: The number of patterns in \code{x}. } \item{}{ \code{width(x)}: A vector of non-negative integers containing the number of letters for each pattern in \code{x}. } \item{}{ \code{names(x)}: The names of the patterns in \code{x}. } \item{}{ \code{head(x)}: The head of \code{x} or \code{NULL} if \code{x} has no head. } \item{}{ \code{tb(x)}: The Trusted Band defined on \code{x}. } \item{}{ \code{tb.width(x)}: The width of the Trusted Band defined on \code{x}. Note that, unlike \code{width(tb(x))}, this is a single integer. And because the Trusted Band has a constant width, \code{tb.width(x)} is in fact equivalent to \code{unique(width(tb(x)))}, or to \code{width(tb(x))[1]}. } \item{}{ \code{tail(x)}: The tail of \code{x} or \code{NULL} if \code{x} has no tail. } } } \section{Subsetting methods}{ In the code snippets below, \code{x} is a PDict object. \describe{ \item{}{ \code{x[[i]]}: Extract the i-th pattern from \code{x} as a \link{DNAString} object. } } } \section{Other methods}{ In the code snippet below, \code{x} is a PDict object. \describe{ \item{}{ \code{duplicated(x)}: [TODO] } \item{}{ \code{patternFrequency(x)}: [TODO] } } } \author{H. Pages} \references{ Aho, Alfred V.; Margaret J. Corasick (June 1975). "Efficient string matching: An aid to bibliographic search". Communications of the ACM 18 (6): 333-340. } \seealso{ \code{\link{matchPDict}}, \code{\link{DNA_ALPHABET}}, \code{\link{IUPAC_CODE_MAP}}, \link{DNAStringSet-class}, \link{XStringViews-class} } \examples{ ## --------------------------------------------------------------------- ## A. NO HEAD AND NO TAIL (THE DEFAULT) ## --------------------------------------------------------------------- library(drosophila2probe) dict0 <- DNAStringSet(drosophila2probe) dict0 # The original dictionary. length(dict0) # Hundreds of thousands of patterns. unique(nchar(dict0)) # Patterns are 25-mers. pdict0 <- PDict(dict0) # Store the original dictionary in # a PDict object (preprocessing). pdict0 class(pdict0) length(pdict0) # Same as length(dict0). tb.width(pdict0) # The width of the (implicit) # Trusted Band. sum(duplicated(pdict0)) table(patternFrequency(pdict0)) # 9 patterns are repeated 3 times. pdict0[[1]] pdict0[[5]] ## --------------------------------------------------------------------- ## B. NO HEAD AND A TAIL ## --------------------------------------------------------------------- dict1 <- c("ACNG", "GT", "CGT", "AC") pdict1 <- PDict(dict1, tb.end=2) pdict1 class(pdict1) length(pdict1) width(pdict1) head(pdict1) tb(pdict1) tb.width(pdict1) width(tb(pdict1)) tail(pdict1) pdict1[[3]] } \keyword{methods} \keyword{classes}