\name{semsim}
\alias{semsim}
\alias{pms}
\alias{subsumers}
\alias{conceptProbs}
\alias{usageCount}
\title{Compute semantic similarity measures for terms in an object-ontology complex}
\description{Compute semantic similarity measures for terms in an object-ontology complex.}
\usage{
semsim(c1, c2, ooc, acc=NULL, pc=NULL)
conceptProbs(ooc, acc=NULL, inds=NULL)
subsumers(c1, c2, ont, acc=NULL)
pms(c1, c2, ooc, acc=NULL, pc=NULL)
usageCount(map, acc, inds)
}
\arguments{
  \item{c1}{"character": first term to be compared}
  \item{c2}{"character": second term to be compared}
  \item{ooc}{an object of class "OOC": object-ontology complex}
  \item{ont}{an object of class "ontology": annotated rooted DAG}
  \item{acc}{optional (sparse) accessibility matrix for the ontology}
  \item{pc}{optional vector of concept probabilities, if pre-computed}
  \item{map}{the OOmap component of an object-ontology complex (OOC)}
  \item{inds}{vector of numeric indices: row indices of the object-ontology map to be processed}
}
\details{
For large ontologies, computation of the term accessibility relationships
and term probabilities can be costly.  Once these are computed to support
one semsim calculation, they should be saved.  The \code{acc} and \code{pc}
parameters allow use of this saved information.
}
\value{
semsim returns the measure of semantic similarity cited by Lord et al (2003).
}
\references{PW Lord et al, Bioinformatics, 19(10)2003:1275 }
\author{Vince Carey }
%\note{ }
%\seealso{ }
\examples{
#
# we are given a graph of GOMF and the OOmap between LL and GOMF
# derived from humanLLMappings and stored as data resources in
# ontoTools -- these will have to be updated regularly
#
data(goMFgraph.1.15)
data(LL2GOMFooMap.1.15)
#
# build the rooted DAG, the ontology, and the OOC objects
#
gomfrDAG <- new("rootedDAG", root="GO:0003674", DAG=goMFgraph.1.15)
GOMFonto <- new("ontology", name="GOMF", version="bioc GO 1.15", rDAG=gomfrDAG)
LLGOMFOOC <- makeOOC(GOMFonto, LL2GOMFooMap.1.15)
#
# we are given the accessibility matrix for the GO MF graph as a
# data resource, and we can compute some term probabilities
#
data(goMFamat.1.15)
pc <- conceptProbs(LLGOMFOOC, goMFamat.1.15, inds=1:20)
#
# now we will get a sample of GO MF terms and compute the
# semantic similarities of pairs of terms in the sample
#
data(LL2GOMFcp.1.15) # full set of precomputed concept probabilities
library(GO.db)
library(Biobase)
library(combinat)
library(annotate)
# NOTE(review): GO() and the env= argument below reflect older
# annotate/GO APIs -- confirm against the current GO.db interface
GO() # get the GO environments
GOtags <- ls(GOTERM) # all GO identifiers
GOlabs <- mget(GOtags, GOTERM, ifnotfound=NA) # term objects, NA where not found
GOMFtags <- GOtags[ sapply(GOlabs,Ontology)=="MF" ] # restrict to molecular function
GOMFtags <- GOMFtags[!is.na(GOMFtags)] # drop tags lost to unfound terms
GOMFtermObs <- mget(GOMFtags,env=GOTERM)
GOMFterms <- sapply( GOMFtermObs, Term ) # human-readable term names
ntags <- length(GOMFtags)
# disambiguate duplicated term names by appending ".2"
if (any(duplicated(GOMFterms)))
 {
 dups <- (1:ntags)[duplicated(GOMFterms)]
 GOMFterms[dups] <- paste(GOMFterms[dups],".2",sep="")
 }
#names(GOMFterms) <- GOMFtags
set.seed(1234) # does not lead to common samples across platforms...
st <- sample(names(GOMFterms),size=50) # take the sample
st <- intersect(st, names(LL2GOMFcp.1.15))[1:10]
# use only those terms available in bioc GO 1.15
# thus ...
st = c("GO:0004397", "GO:0030215", "GO:0042802", "GO:0008504",
 "GO:0008640", "GO:0008528", "GO:0008375", "GO:0005436",
 "GO:0004756", "GO:0003729" )
pst <- combn(st,2) # get a matrix with the pairs of terms in columns
# hard-coded pair indices to exclude for this GO release
bad = c(4L, 12L, 19L, 25L, 31L, 32L, 33L, 34L, 35L) # can't use 8640
pst = pst[,-bad]
npst <- ncol(pst)
ss <- rep(NA,npst)
for (i in 1:npst) # compute semantic similarities
 {
 cat(i)
 ss[i] <- semsim( pst[1,i], pst[2,i], ooc=LLGOMFOOC,
   acc=goMFamat.1.15, pc=LL2GOMFcp.1.15 )
 }
print(summary(ss))
top <- (1:npst)[ss==max(ss,na.rm=TRUE)][1] # index of the most similar pair
# note -- must come to an understanding of the NAs
print( GOMFterms[ as.character(pst[,top]) ] )
pen <- (1:npst)[ss==max(ss[-top],na.rm=TRUE)][1] # second most similar
print( GOMFterms[ as.character(pst[,pen]) ] )
}
\keyword{ models }