## ----setup, cache=FALSE, echo=FALSE----------------------------------- library(knitr) options(replace.assign=FALSE,width=72) opts_chunk$set(fig.path='figs/key-', cache.path='cache/key-', fig.align='center', fig.width=3.5, fig.height=3.5, fig.show='hold', par=TRUE, tidy=FALSE, comment=NA) knit_hooks$set(par=function(before, options, envir){ if (before && options$fig.show!='none') par(mar=c(4,4,2.6,.1), cex.lab=.95,cex.axis=.9,mgp=c(2,.7,0),tcl=-.3) }, crop=hook_pdfcrop) oldopt <- options(digits=4) ## ----cap123----------------------------------------------------------- cap1 <- "Heart weight versus body weight of 97 male cats. A regression line has been fitted." cap2 <- "Q-Q plot ('Normal Probability plot') of residuals from the regression of heart weight on body weight, for the male cats." cap3 <- "Scatterplot matrix of bootstrap estimates of `Slope` versus `Intercept`, with boxplots used to summarize the marginal distributions." cap4 <- "Bootstrap resampling result, compared with simulation resampling." ## ----cats-xy, fig.width=4.25, fig.height=4.25, fig.cap=cap1----------- library(lattice) mcats <- subset(MASS::cats, Sex=="M") xyplot(Hwt ~ Bwt, data=mcats, type=c("p","r")) ## ----mcats-lm--------------------------------------------------------- mcats.lm <- lm(Hwt~Bwt, data=mcats) signif(coef(summary(mcats.lm)),3) ## ----cats-qq, fig.cap=cap2-------------------------------------------- res <- resid(mcats.lm) qqnorm(res) ## ----outlier-num}, eval=TRUE, echo=TRUE------------------------------- nout <- (1:nrow(mcats))[c(which.min(res),which.max(res))] ## ----boot-regcoefs, fig.cap=cap3, fig.width=2.85, fig.height=3-------- library(gamclass, quietly=TRUE) library(car, quietly=TRUE) mcats <- subset(MASS::cats, Sex=="M") bootmat <- bootreg(Hwt ~ Bwt, data = mcats, nboot = 1000) bootdf <- as.data.frame(bootmat) names(bootdf) <- c("Intercept", "Slope") colr <- adjustcolor(rep("black",3), alpha.f=0.25) scatterplot(Slope ~ Intercept, col=colr, data=bootdf, boxplots="xy", regLine=NA, smooth=FALSE) ## ----95CI------------------------------------------------------------- sapply(bootdf, quantile, prob=c(.025, .975)) ## ----alt-boot-omitOutlier--------------------------------------------- library(lattice) bootmat <- bootreg(formula = Hwt ~ Bwt, data = mcats[-97, ], nboot = 1000) bootdf0 <- as.data.frame(bootmat) names(bootdf0) <- c("Intercept","Slope") gphA <- xyplot(Slope ~ Intercept, data=bootdf0, alpha=0.25) ## ----alt-sim---------------------------------------------------------- simmat <- simreg(formula = Hwt ~ Bwt, data=mcats[-97, ], nsim=1000) simdf <- as.data.frame(simmat) names(simdf) <- c("Intercept","Slope") gphB <- xyplot(Slope ~ Intercept, data=simdf, alpha=0.25) ## ----alt-resamp, out.width='47%', fig.show='hold', fig.width=3.85, fig.height=3.85, fig.cap=cap4---- update(gphA, main=grid::textGrob("A: Bootstrap (outlier omitted)", x=grid::unit(.05, "npc"), y = grid::unit(.25, "npc"), just="left", gp=grid::gpar(cex=1))) update(gphB, main=grid::textGrob("B: Simulation", x = grid::unit(.05, "npc"), y = grid::unit(.25, "npc"), just="left", gp=grid::gpar(cex=1))) ## ----cap5------------------------------------------------------------- cap5 <- "Regression tree for predicting `Mileage` given `Weight`. At each node, observations for which the criterion is satisfied take the branch to the left. Thus, at the first node, `tonsWt$>=$1.218` chooses the branch to the left, while `tonsWt$<$1.218` chooses the branch to the right. Panel B plots `Mileage` versus `tonsWt`, with fitted values from the `rpart` model shown as horizontal grey lines." ## ----rpart------------------------------------------------------------ library(rpart) Car90 <- na.omit(car90[, c("Mileage","Weight")]) ## Express weight in metric tonnes Car90 <- within(Car90, tonsWt <- Weight/2240) car90.rpart <- rpart(Mileage ~ tonsWt, data=Car90) ## ----deftree, out.width='48%', fig.cap=cap5,fig.width=5.25, fig.height=5.25, fig.show='hold'---- ## Code for Panel A plot(car90.rpart) text(car90.rpart, xpd=TRUE, digits=3) mtext(side=3, line=1.25, "A: Regression tree", adj=0, cex=1.25) ## Code for Panel B plot(Mileage ~ tonsWt, data=Car90, fg='gray') wt <- with(Car90, tonsWt) hat <- predict(car90.rpart) gamclass::addhlines(wt, hat, lwd=2, col="gray") mtext(side=3, line=1.25, "B: Predicted values from tree", adj=0, cex=1.25) ## ----cap6------------------------------------------------------------- cap6 <- "Between group sum of squares for `Mileage`, as a function of the value of `tonsWt` at which the split is made. The choice $c = 1.218$ maximizes the between groups sum of squares." ## ----bss-plot, fig.width=5, h=4.25, out.width="60%", fig.cap=cap6----- ## Code BSS <- bssBYcut(tonsWt, Mileage, Car90) with(BSS, plot(xOrd, bss, xlab="Cutpoint", fg='gray', ylab="")) mtext(side=2, line=1.5, "Between groups\nsum of squares") abline(v=1.218, lty=2) ## ----cap7, out.width='80%'-------------------------------------------- cap7 <- "Scatterplot matrix of accuracies for the several models. The line $y=x$ is shown in each panel. Note that `rfOOB` is out-of-bag accuracy, i.e., calculated from the set of 95 observations, and that `rfTest` is accuracy on the test data, again for a random forest model with no preliminary smoothing. Results from hybrid models are labeled according to the name of the formula for the smooth. The final accuracy, evaluated on the test data, is for a random forest model fitted to residuals from the smooth" ## ----meuse-setup------------------------------------------------------ data("meuse", package="sp") meuse <- within(meuse, {levels(soil) <- c("1","2","2") ffreq <- as.numeric(ffreq) loglead <- log(lead) }) ## ----gam-formulae----------------------------------------------------- form1 <- ~ dist + elev + soil + ffreq form3 <- ~ s(dist, k=3) + s(elev,k=3) + soil +ffreq form3x <- ~ s(dist, k=3) + s(elev,k=3) + s(x, k=3) + soil+ffreq form8x <- ~ s(dist, k=8) + s(elev,k=8) + s(x, k=8) + soil+ffreq formlist <- list("Hybrid1"=form1, "Hybrid3"=form3, "Hybrid3x"=form3x, "Hybrid8x"=form8x) ## ----rfgam-setup------------------------------------------------------ rfVars <- c("dist", "elev", "soil", "ffreq", "x", "y") nrep <- 100 errsmat <- matrix(0, nrep, length(formlist)+2) dimnames(errsmat)[[2]] <- c(names(formlist), "rfTest", "rfOOB") ## ----rfgam-many------------------------------------------------------- n <- 95 for(i in 1:nrep){ sub <- sample(1:nrow(meuse), n) meuseOut <- meuse[-sub,] meuseIn <- meuse[sub,] errsmat[i, ] <- gamclass::gamRF(formlist=formlist, yvar="loglead", rfVars=rfVars, data=meuseIn, newdata=meuseOut, printit=FALSE) } ## ----cf-models-rfgam, fig.cap=cap7, fig.width=7.5, fig.height=7.5, out.width='65%'---- ## Code for scatterplot of results ran <- range(errsmat) at <- round(ran+c(0.02,-0.02)*diff(ran),2) lis <- list(limits=ran, at=at, labels=format(at, digits=2)) lims=list(lis,lis,lis,lis,lis,lis) library(lattice) splom(errsmat, pscales=lims, par.settings=simpleTheme(cex=0.75), col=adjustcolor("black", alpha=0.5), panel=function(x,y,...){lpoints(x,y,...) panel.abline(0,1,col="gray")} ) ## ----cf-accs-rfgam---------------------------------------------------- errdf <- stack(as.data.frame(errsmat[,-6])) names(errdf) <- c("errs","model") errdf$model <- relevel(errdf$model, ref="rfTest") errdf$sample <- factor(rep(1:nrow(errsmat),5)) errors.aov <- aov(errs ~ model+sample, data=errdf) round(coef(summary.lm(errors.aov))[1:5,], 4) ## ----cf-accs-rfgam-ests----------------------------------------------- model.tables(errors.aov, type="mean", cterms='model') ## ----cuckoos-caps----------------------------------------------------- cap8 <- "Length versus breadth, compared between cuckoo eggs laid in the different host nests. Axes for the scores are overlaid." cap9 <- "Plot of length versus breadth of cuckoo eggs, now showing a boundary line for distinguishing `wren` from `not-wren`, based on the `lda()` analysis." ## ----cuckoos-lda------------------------------------------------------ cuckoos <- within(DAAG::cuckoos, levels(species) <- abbreviate(levels(species), 8)) cuckoos.lda <- MASS::lda(species ~ length + breadth, data=cuckoos) ## ----cuckoo-basic-gph------------------------------------------------- gph <- xyplot(length ~ breadth, groups=species, data=cuckoos, type=c("p"), auto.key=list(space="right"), aspect=1, scales=list(tck=0.5), par.settings=simpleTheme(pch=16)) ## ----cuckoos-sc, fig.cap=cap8,out.width='70%', fig.width=6.25, fig.height=4.75---- ## library(latticeExtra) # This package has the function layer() LDmat <- cuckoos.lda$scaling ld1 <- LDmat[,1] ld2 <- LDmat[,2] gm <- sapply(cuckoos[, c("length", "breadth")], mean) av1 <- gm[1] + ld1[2]/ld1[1]*gm[2] av2 <- gm[1] + ld2[2]/ld2[1]*gm[2] addlayer <- latticeExtra::layer(panel.abline(av1, -ld1[2]/ld1[1], lty=1), panel.abline(av2, -ld2[2]/ld2[1], lty=2)) gph + addlayer ## ----calc-scores------------------------------------------------------ gm <- apply(cuckoos.lda$means*cuckoos.lda$prior,2,sum) # Grand 'means' ## Sweep out grand 'means' centered <- sweep(as.matrix(cuckoos[,1:2]), 2, gm) ## ----loo-cv-acc------------------------------------------------------- ## Leave-one-out cross-validation ## Accuracies for linear discriminant analysis cuckooCV.lda <- MASS::lda(species ~ length + breadth, data=cuckoos, CV=TRUE) confusion(cuckoos$species, cuckooCV.lda$class, gpnames=abbreviate(levels(cuckoos$species), 8)) ## Post-multiply by scaling matrix scores <- centered %*% cuckoos.lda$scaling ## ----qda-loo-cv-acc--------------------------------------------------- ## Accuracies for quadratic discriminant analysis cuckooCV.qda <- MASS::qda(species ~ length + breadth, data=cuckoos, CV=TRUE) acctab <-confusion(cuckoos$species, cuckooCV.qda$class, gpnames=levels(cuckoos$species), printit=FALSE)$confusion tab <- table(cuckoos$species) ## ## Overall accuracy cat("Overall accuracy =", sum(diag(acctab)*tab)/sum(tab), "\n") ## Confusion matrix round(acctab, 3) ## ----CVfalse---------------------------------------------------------- cuckoos.lda <- MASS::lda(species ~ length + breadth, data=cuckoos) ## ----calc-discrim----------------------------------------------------- x <- pretty(cuckoos$breadth, 20) y <- pretty(cuckoos$length, 20) Xcon <- expand.grid(breadth=x, length=y) cucklda.pr <- predict(cuckoos.lda, Xcon)$posterior ## ----cuckoos-contour, fig.cap=cap9,out.width='70%', fig.width=6.25, fig.height=4.75---- m <- match("wren", colnames(cucklda.pr)) ldadiff <- apply(cucklda.pr, 1, function(x)x[m]-max(x[-m])) addlayer1 <- latticeExtra::as.layer(contourplot(ldadiff ~ breadth*length, at=c(-1,0,1), labels=c("", "lda",""), label.style="flat", col='gray', data=Xcon), axes=FALSE) gph + addlayer1 ## ----cf-acc-lda-qda--------------------------------------------------- library(gamclass, quietly=TRUE) cucklda.pr <- cuckooCV.lda$posterior cucklda.pr2 <- MASS::lda(species ~ length + breadth + I(length^2) + I(breadth^2) + I(length*breadth), CV=TRUE, data=cuckoos)$posterior compareModels(groups=cuckoos$species, estprobs=list(lda=cucklda.pr, "lda plus"=cucklda.pr2), gpnames=abbreviate(levels(cuckoos$species),8)) ## ----getdata-spam----------------------------------------------------- data(spam, package='kernlab') spam[,-58] <- scale(spam[,-58]) nr <- sample(1:nrow(spam)) spam0 <- spam[nr[1:2601],] ## Training spam1 <- spam[nr[2602:3601],] ## Holdout spam01 <- spam[nr[1:3601],] ## Use for training, ## if holdout not needed spam2 <- spam[nr[3602:4601],] ## Test ## ----lda-rates-------------------------------------------------------- library('MASS') spam01.lda <- lda(type~., data=spam01) ldaRates <- ldaErr(train.lda=spam01.lda, train=spam01, test=spam2, group='type') ## ----rpart-dofun------------------------------------------------------ set.seed(29) ## Make results precisely reproducible spam01.rp <- rpart::rpart(type~., data=spam01, cp=0.0001) rpRates <- rpartErr(train.rp=spam01.rp, train=spam01, test=spam2, group='type') ## ----rf-dofun--------------------------------------------------------- set.seed(29) ## Make results precisely reproducible spam01.rf <- randomForest::randomForest(type ~ ., data=spam01) rfRates <- rfErr(train.rf=spam01.rf, train=spam01, test=spam2, group='type') ## ----acctable--------------------------------------------------------- lda_main <- c(round(ldaRates['loo'], 3), round(ldaRates['trainerr'], 3),'-', round(ldaRates['testerr'], 3)) rpartAcc <- c('-',round(rpRates['cverror'], 3),round(rpRates['trainerror'], 3), round(rpRates['testerror'], 3)) rfAcc <- c('-',round(rfRates['trainerr'], 3),round(rfRates['OOBerr'], 3), round(rfRates['testerr'], 3)) accmat <- rbind(lda_main,rpartAcc,rfAcc) rownames(accmat) <- c('lda (main effects)','rpart','randomForest') kable(accmat, col.names=c("Leave One Out CV","Training", "Out of Bag","Test rate"), align=rep('r',4)) ## ----lda-x, eval=FALSE------------------------------------------------ # mm01x <- model.matrix(type ~ 0+.^2, data=spam01) # mm2x <- model.matrix(type ~ 0+.^2, data=spam2) # spam01x.lda <- lda(x=mm01x, grouping=spam01[,'type']) # ldaRatesx <- ldaErr(train.lda=spam01x.lda, train=mm01x, test=mm2x, # group=spam01[,'type']) # round(ldaRatesx,3) ## ----errRates, echo=FALSE--------------------------------------------- c(loo=0.291, trainerr=0.027, testerr=0.164) ## ----data-Vowel------------------------------------------------------- data('Vowel', package='mlbench') Vowel$V1 <- factor(Vowel$V1) ## ----vowel-qda-------------------------------------------------------- vowelcv.qda <- qda(Class ~ ., data=Vowel, CV=TRUE) accqda <- confusion(Vowel$Class, vowelcv.qda$class, printit=FALSE)$overall print(round(accqda,3)) ## Compare with CV performance accspqda <- CVcluster(Class ~ ., id=V1, data=Vowel, nfold=15, FUN=qda, printit=FALSE)$CVaccuracy print(round(accspqda,3)) ## ----vowel-rf--------------------------------------------------------- data('Vowel', package='mlbench') Vowel$V1 <- factor(Vowel$V1) vowel.rf <- randomForest::randomForest(Class ~ ., data=Vowel) print("OOB accuracy estimate") accOOB <- 1-sum(rep(1/11,11)*vowel.rf$confusion[,"class.error"]) print(round(accOOB,3)) ## Compare with performance based on OOB choice of speakers accspOOB <- RFcluster(Class ~ ., id=V1, data=Vowel, ntree=500, progress = FALSE)$OOBaccuracy print(round(accspOOB),3)