\name{BSgenome-utils} \alias{BSgenome-utils} \alias{matchPWM,BSgenome-method} \alias{countPWM,BSgenome-method} \alias{vmatchPattern,BSgenome-method} \alias{vcountPattern,BSgenome-method} \alias{vmatchPDict,BSgenome-method} \alias{vcountPDict,BSgenome-method} \title{BSgenome utilities} \description{ Utilities for BSgenome objects. } \usage{ \S4method{matchPWM}{BSgenome}(pwm, subject, min.score = "80\%", exclude = "", maskList = logical(0), asRangedData = TRUE) \S4method{countPWM}{BSgenome}(pwm, subject, min.score = "80\%", exclude = "", maskList = logical(0)) \S4method{vmatchPattern}{BSgenome}(pattern, subject, max.mismatch = 0, min.mismatch = 0, with.indels = FALSE, fixed = TRUE, algorithm = "auto", exclude = "", maskList = logical(0), userMask = RangesList(), invertUserMask = FALSE, asRangedData = TRUE) \S4method{vcountPattern}{BSgenome}(pattern, subject, max.mismatch = 0, min.mismatch = 0, with.indels = FALSE, fixed = TRUE, algorithm = "auto", exclude = "", maskList = logical(0), userMask = RangesList(), invertUserMask = FALSE) \S4method{vmatchPDict}{BSgenome}(pdict, subject, max.mismatch = 0, min.mismatch = 0, fixed = TRUE, algorithm = "auto", verbose = FALSE, exclude = "", maskList = logical(0), asRangedData = TRUE) \S4method{vcountPDict}{BSgenome}(pdict, subject, max.mismatch = 0, min.mismatch = 0, fixed = TRUE, algorithm = "auto", collapse = FALSE, weight = 1L, verbose = FALSE, exclude = "", maskList = logical(0)) } \arguments{ \item{pwm}{ A numeric matrix with row names A, C, G and T representing a Position Weight Matrix. } \item{pattern}{ A \link[Biostrings]{DNAString} object containing the pattern sequence. } \item{pdict}{ A \link[Biostrings]{DNAStringSet} object containing the pattern sequences. } \item{subject}{ A \link{BSgenome} object containing the subject sequences. } \item{min.score}{ The minimum score for counting a match. Can be given as a character string containing a percentage (e.g. 
\code{"85\%"}) of the highest possible score or as a single number. } \item{max.mismatch, min.mismatch}{ The maximum and minimum number of mismatching letters allowed (see \code{?`\link[Biostrings]{lowlevel-matching}`} for the details). If non-zero, an inexact matching algorithm is used. } \item{with.indels}{ If \code{TRUE} then indels are allowed. In that case, \code{min.mismatch} must be \code{0} and \code{max.mismatch} is interpreted as the maximum "edit distance" allowed between the pattern and a match. Note that in order to avoid pollution by redundant matches, only the "best local matches" are returned. Roughly speaking, a "best local match" is a match that is locally both the closest (to the pattern P) and the shortest. More precisely, a substring S' of the subject S is a "best local match" iff: \preformatted{ (a) nedit(P, S') <= max.mismatch (b) for every substring S1 of S': nedit(P, S1) > nedit(P, S') (c) for every substring S2 of S that contains S': nedit(P, S2) >= nedit(P, S') } One nice property of "best local matches" is that their first and last letters are guaranteed to be aligned with letters in P (i.e. they match letters in P). } \item{fixed}{ If \code{FALSE} then IUPAC extended letters are interpreted as ambiguities (see \code{?`\link[Biostrings]{lowlevel-matching}`} for the details). } \item{algorithm}{ For \code{vmatchPattern} and \code{vcountPattern} one of the following: \code{"auto"}, \code{"naive-exact"}, \code{"naive-inexact"}, \code{"boyer-moore"}, \code{"shift-or"}, or \code{"indels"}. For \code{vmatchPDict} and \code{vcountPDict} one of the following: \code{"auto"}, \code{"naive-exact"}, \code{"naive-inexact"}, \code{"boyer-moore"}, or \code{"shift-or"}. } \item{collapse, weight}{ ignored arguments. } \item{verbose}{ \code{TRUE} or \code{FALSE}. } \item{exclude}{ A character vector with strings that will be used to filter out chromosomes whose names match these strings. 
} \item{maskList}{ A named logical vector of maskStates preferred when used with a BSgenome object. When using the bsapply function, the masks will be set to the states in this vector. } \item{userMask}{ A \linkS4class{RangesList}, containing a mask to be applied to each chromosome. See \code{\link{bsapply}}. } \item{invertUserMask}{ Whether the \code{userMask} should be inverted. } \item{asRangedData}{ A logical value to assist in migrating output type from \link[IRanges]{RangedData} (deprecated) to \link[GenomicRanges]{GRanges}. Should be \code{FALSE}. If \code{TRUE}, a warning message is issued and a RangedData object is returned. } } \value{ A \link[GenomicRanges]{GRanges} object for \code{matchPWM} with two elementMetadata columns: "score" (numeric), and "string" (DNAStringSet). A \link[GenomicRanges]{GRanges} object for \code{vmatchPattern}. A \link[GenomicRanges]{GRanges} object for \code{vmatchPDict} with one elementMetadata column: "index", which represents a mapping to a position in the original pattern dictionary. A data.frame object for \code{countPWM} and \code{vcountPattern} with three columns: "seqname" (factor), "strand" (factor), and "count" (integer). A \link[IRanges]{DataFrame} object for \code{vcountPDict} with four columns: "seqname" ('factor' Rle), "strand" ('factor' Rle), "index" (integer) and "count" ('integer' Rle). As with \code{vmatchPDict} the index column represents a mapping to a position in the original pattern dictionary. } \author{P. 
Aboyoun} \seealso{ \code{\link[Biostrings]{matchPWM}}, \code{\link[Biostrings]{matchPattern}}, \code{\link[Biostrings]{matchPDict}}, \code{\link[BSgenome]{bsapply}} } \examples{ library(BSgenome.Celegans.UCSC.ce2) data(HNF4alpha) pwm <- PWM(HNF4alpha) matchPWM(pwm, Celegans, asRangedData = FALSE) countPWM(pwm, Celegans) pattern <- consensusString(HNF4alpha) vmatchPattern(pattern, Celegans, fixed = "subject", asRangedData = FALSE) vcountPattern(pattern, Celegans, fixed = "subject") vmatchPDict(HNF4alpha[1:10], Celegans, asRangedData = FALSE) vcountPDict(HNF4alpha[1:10], Celegans) } \keyword{methods} \keyword{utilities}