## ----echo=FALSE,results='hide'---------------------------- library("knitr") options(prompt = "R> ", continue = " ", width = 60, useFancyQuotes = FALSE) opts_chunk$set(tidy=TRUE, highlight=FALSE, background="white") opts_chunk$set(dev=c('pdf','postscript'), fig.path='cancerfigure/') ## ----message=FALSE, eval=FALSE---------------------------- # #The data files below were downloaded on June 1, 2016 # require("gdata") # bc <- t(read.delim("GSE20194_series_matrix.txt.gz",sep="",header=FALSE,skip=80)) # colnames(bc) <- bc[1,] # bc <- bc[-1, -c(1, 2)] # ###The last column is empty with variable name !series_matrix_table_end, thus omitted # bc <- bc[, -22284] # mode(bc) <- "numeric" ### convert character to numeric # dat1 <- read.xls("GSE20194_MDACC_Sample_Info.xls", sheet=1, header=TRUE) # y <- dat1$characteristics..ER_status # y <- ifelse(y=="P", 1, -1) # table(y) # res <- rep(NA, dim(bc)[2]) # for(i in 1:dim(bc)[2]) # res[i] <- abs(t.test(bc[,i]~y)$statistic) # ### find 3000 largest absolute value of t-statistic # tmp <- order(res, decreasing=TRUE)[1:3000] # dat <- bc[, tmp] # ### standardize variables # dat <- scale(dat) ## ----message=FALSE---------------------------------------- nrun <- 100 per <- c(0, 0.05, 0.10, 0.15) learntype <- c("tree", "ls")[2] tuning <- "error" n.cores <- 4 plot.it <- TRUE ### robust tuning parameters used in bst/rbst function s <- c(0.9, 1.01,0.5, -0.2, 0.8, -0.5, -0.2) nu <- c(0.01, 0.1, 0.01, rep(0.1, 4)) m <- 100 ### boosting iteration number ### whether to truncate the predicted values in each boosting iteration? ctr.trun <- c(TRUE, rep(FALSE, 6)) ### used in bst function bsttype <- c("closs", "gloss", "qloss", "binom", "binom", "hinge", "expo") ### and corresponding labels bsttype1 <- c("ClossBoost", "GlossBoost", "QlossBoost", "LogitBoost", "LogitBoost", "HingeBoost", "AdaBoost") ### used in rbst function rbsttype <- c("closs", "gloss", "qloss", "tbinom", "binomd", "thinge", "texpo") ### and corresponding labels rbsttype1 <- c("ClossBoostQM", "GlossBoostQM", "QlossBoostQM", "TLogitBoost", "DlogitBoost", "THingeBoost", "TAdaBoost") ## ----cancer, fig.show='hold', eval=FALSE------------------ # summary7 <- function(x) c(summary(x), sd=sd(x)) # ptm <- proc.time() # for(k in 1:7){ ### k controls which family in bst, and rfamily in rbst # err.m1 <- err.m2 <- nvar.m1 <- nvar.m2 <- errbest.m1 <- errbest.m2 <- matrix(NA, ncol=4, nrow=nrun) # mstopbest.m1 <- mstopbest.m2 <- mstopcv.m1 <- mstopcv.m2 <- matrix(NA, ncol=4, nrow=nrun) # colnames(err.m1) <- colnames(err.m2) <- c("cont-0%", "cont-5%", "cont-10%", "cont-15%") # colnames(mstopcv.m1) <- colnames(mstopcv.m2) <- colnames(err.m1) # colnames(nvar.m1) <- colnames(nvar.m2) <- colnames(err.m1) # colnames(errbest.m1) <- colnames(errbest.m2) <- colnames(err.m1) # colnames(mstopbest.m1) <- colnames(mstopbest.m2) <- colnames(err.m1) # for (ii in 1:nrun){ # set.seed(1000+ii) # trid <- c(sample(which(y==1))[1:50], sample(which(y==-1))[1:50]) # dtr <- dat[trid, ] # dte <- dat[-trid, ] # ytrold <- y[trid] # yte <- y[-trid] # ### number of patients/no. variables in training and test data # dim(dtr) # dim(dte) # ###randomly contaminate data # ntr <- length(trid) # set.seed(1000+ii) # con <- sample(ntr) # for(j in 1){ ### controls learntype # for(i in 1:4){ ### i controls how many percentage of data contaminated # ytr <- ytrold # percon <- per[i] # ### randomly flip labels of the samples in training set according to pre-defined contamination level # if(percon > 0){ # ji <- con[1:(percon*ntr)] # ytr[ji] <- -ytrold[ji] # } # dat.m1 <- bst(x=dtr, y=ytr, ctrl = bst_control(mstop=m, center=FALSE, trace=FALSE, nu=nu[k], s=s[k], trun=ctr.trun[k]), family = bsttype[k], # learner = learntype[j]) # err1 <- predict(dat.m1, newdata=dte, newy=yte, type="error") # err1tr <- predict(dat.m1, newdata=dtr, newy=ytr, type="loss") # ### cross-validation to select best boosting iteration # set.seed(1000+ii) # cvm1 <- cv.bst(x=dtr, y=ytr, K=5, n.cores=n.cores, ctrl = bst_control(mstop=m, center=FALSE, trace=FALSE, nu=nu[k], s=s[k], trun=ctr.trun[k]), family = bsttype[k], # learner = learntype[j], main=bsttype[k], type=tuning, plot.it=FALSE) # optmstop <- max(10, which.min(cvm1$cv)) # err.m1[ii, i] <- err1[optmstop] # nvar.m1[ii, i] <- nsel(dat.m1, optmstop)[optmstop] # errbest.m1[ii, i] <- min(err1) # mstopbest.m1[ii, i] <- which.min(err1) # mstopcv.m1[ii, i] <- optmstop # dat.m2 <- rbst(x=dtr, y=ytr, ctrl = bst_control(mstop=m, iter=100, nu=nu[k], s=s[k], trun=ctr.trun[k], center=FALSE, trace=FALSE), rfamily = rbsttype[k], learner = learntype[j]) # err2 <- predict(dat.m2, newdata=dte, newy=yte, type="error") # err2tr <- predict(dat.m2, newdata=dtr, newy=ytr, type="loss") # ### cross-validation to select best boosting iteration # set.seed(1000+ii) # cvm2 <- cv.rbst(x=dtr, y=ytr, K=5, n.cores=n.cores, ctrl = bst_control(mstop=m, iter=100, nu=nu[k], s=s[k], trun=ctr.trun[k], center=FALSE, trace=FALSE), rfamily = rbsttype[k], # learner = learntype[j], main=rbsttype[k], type=tuning, plot.it=FALSE) # optmstop <- max(10, which.min(cvm2$cv)) # err.m2[ii, i] <- err2[optmstop] # nvar.m2[ii, i] <- nsel(dat.m2, optmstop)[optmstop] # errbest.m2[ii, i] <- min(err2) # mstopbest.m2[ii, i] <- which.min(err2) # mstopcv.m2[ii, i] <- optmstop # } # } # if(ii %% nrun ==0){ # if(bsttype[k]%in% c("closs", "gloss", "qloss")) # cat(paste("\nbst family ", bsttype1[k], ", s=", s[k], ", nu=", nu[k], sep=""), "\n") # if(bsttype[k]%in% c("binom", "hinge", "expo")) # cat(paste("\nbst family ", bsttype1[k], ", nu=", nu[k], sep=""), "\n") # cat("best misclassification error from bst\n") # print(round(apply(errbest.m1, 2, summary7),4)) # cat("CV based misclassification error from bst\n") # print(round(apply(err.m1, 2, summary7),4)) # cat("best mstop with best misclassification error from bst\n") # print(round(apply(mstopbest.m1, 2, summary7),0)) # cat("best mstop with CV from bst\n") # print(round(apply(mstopcv.m1, 2, summary7), 0)) # cat("nvar from bst\n") # print(round(apply(nvar.m1, 2, summary7),1)) # # cat(paste("\nrbst family ", rbsttype1[k], ", s=", s[k], ", nu=", nu[k], sep=""), "\n") # cat("\nbest misclassification error from rbst\n") # print(round(apply(errbest.m2, 2, summary7),4)) # cat("CV based misclassification error from rbst\n") # print(round(apply(err.m2, 2, summary7),4)) # cat("best mstop with best misclassification error from rbst\n") # print(round(apply(mstopbest.m2, 2, summary7),0)) # cat("best mstop with CV from rbst\n") # print(round(apply(mstopcv.m2, 2, summary7),0)) # cat("nvar from rbst\n") # print(round(apply(nvar.m2, 2, summary7),1)) # res <- list(err.m1=err.m1, nvar.m1=nvar.m1, # errbest.m1=errbest.m1, # mstopbest.m1=mstopbest.m1, # mstopcv.m1=mstopcv.m1, # err.m2=err.m2, nvar.m2=nvar.m2, # errbest.m2=errbest.m2, # mstopbest.m2=mstopbest.m2, # mstopcv.m2=mstopcv.m2, # s=s[k], nu=nu[k], trun=ctr.trun[k], family = bsttype[k], rfamily = rbsttype[k]) # if(plot.it){ # par(mfrow=c(2, 1)) # boxplot(err.m1, main="Misclassification error", subset="", sub=bsttype1[k]) # boxplot(err.m2, main="Misclassification error", subset="", sub=rbsttype1[k]) # boxplot(nvar.m1, main="No. variables", subset="", sub=bsttype1[k]) # boxplot(nvar.m2, main="No. variables", subset="", sub=rbsttype1[k]) # } # check <- FALSE # if(check){ # par(mfrow=c(3, 1)) # title <- paste('percentage of contamination ', percon, sep='') # plot(err2tr, main=title, ylab="Loss value", xlab="Iteration", type="l", lty="dashed", col="red") # points(err1tr, type="l", lty="solid", col="black") # legend("topright", c(bsttype1[k], rbsttype1[k]), lty=c("solid", "dashed"), col=c("black","red")) # plot(err2, main=title, ylab="Misclassification error", xlab="Iteration", type="l", lty="dashed", col="red") # points(err1, type="l") # legend("bottomright", c(bsttype1[k], rbsttype1[k]), lty=c("solid", "dashed"), col=c("black","red")) # plot(nsel(dat.m2, m), main=title, ylab="No. variables", xlab="Iteration", lty="dashed", col="red", type="l") # points(nsel(dat.m1, m), ylab="No. variables", xlab="Iteration", lty="solid", type="l", col="black") # legend("bottomright", c(bsttype1[k], rbsttype1[k]), lty=c("solid", "dashed"), col=c("black","red")) # } # } # } # } # print(proc.time()-ptm) ## ----sessionInfo------------------------------------------ sessionInfo();