## ----echo=FALSE,results='hide'----------------------------
library("knitr")
options(prompt = "R> ", continue = " ", width = 60, useFancyQuotes = FALSE)
opts_chunk$set(tidy=TRUE, highlight=FALSE, background="white")
opts_chunk$set(dev=c('pdf','postscript'), fig.path='cancerfigure/')

## ----message=FALSE, eval=FALSE----------------------------
# ### The data files below were downloaded on June 1, 2016
# require("openxlsx")
# library("mpath")
# bc <- t(read.delim("GSE20194_series_matrix.txt.gz", sep="", header=FALSE, skip=80))
# colnames(bc) <- bc[1, ]
# bc <- bc[-1, -c(1, 2)]
# ### the last column is empty with variable name !series_matrix_table_end, thus omitted
# bc <- bc[, -22284]
# mode(bc) <- "numeric" ### convert character to numeric
# ### NOTE: openxlsx reads .xlsx files; if the sample-info file is a true .xls,
# ### convert it first or read it with, e.g., gdata::read.xls() instead
# dat1 <- openxlsx::read.xlsx("GSE20194_MDACC_Sample_Info.xls", sheet=1, colNames=TRUE)
# y <- dat1$characteristics..ER_status
# y <- ifelse(y=="P", 1, -1)
# table(y)
# ### screen genes: absolute two-sample t-statistic of each gene against ER status
# res <- rep(NA, dim(bc)[2])
# for(i in 1:dim(bc)[2])
#     res[i] <- abs(t.test(bc[, i] ~ y)$statistic)
# ### keep the 3000 genes with the largest absolute t-statistics
# tmp <- order(res, decreasing=TRUE)[1:3000]
# dat <- bc[, tmp]
# ### standardize variables
# dat <- scale(dat)

## ----message=FALSE----------------------------------------
### number of replications
nrun <- 100
### penalty types
penalty <- c("enet", "snet", "mnet")
### smallest value of lambda, as a fraction of lambda.max, the smallest value
### for which all coefficients except the intercept are zero
ratio <- 0.25
type.path <- "nonactive"
nlam <- ifelse(type.path != "onestep", 30, 100)
### the training data are contaminated by randomly switching response labels
### at pre-specified proportions
per <- c(0, 0.05, 0.10, 0.15)
### quantity minimized when selecting the tuning parameter
tuning <- "error"
n.cores <- 5
### robust nonconvex loss functions (rfamily) and the logistic loss
type <- c("closs", "gloss", "qloss", "binomial")
### corresponding labels
type1 <- c("Closs", "Gloss", "Qloss", "Logistic")
### corresponding tuning parameter s for the nonconvex losses
s <- c(0.9, 1.01, 0.5)
mstop <- 50
plot.it <- TRUE
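### The illustrative chunk below is not part of the original analysis: it is a
### minimal sketch of how the contamination proportions in per translate into
### randomly flipped class labels, mirroring the flipping logic used inside
### the main loop below. The toy objects ytoy, ycont, con, ji are made up here
### for demonstration only.

## ----contamination-sketch, eval=FALSE----------------------
# set.seed(1)
# ytoy <- sample(c(-1, 1), 20, replace=TRUE) ### toy +/-1 labels
# con <- sample(length(ytoy))                ### random permutation of indices
# percon <- per[2]                           ### e.g., 5% contamination
# ycont <- ytoy
# if(percon > 0){
#     ji <- con[1:(percon*length(ytoy))]     ### first 5% of the shuffled indices
#     ycont[ji] <- -ytoy[ji]                 ### flip their labels
# }
# table(flipped = ycont != ytoy)             ### exactly 1 of 20 labels flipped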
## ----brcancer, fig.show='hold', eval=FALSE----------------
# summary7 <- function(x) c(summary(x), sd=sd(x))
# ptm <- proc.time()
# for(k in (1:4)){ ### k selects the loss (rfamily/family), see type above
#     if(type[k]=="gloss") nu <- 0.1 else nu <- 0.01
#     for(j in (1:3)){ ### j selects the penalty, see penalty above
#         gam <- ifelse(penalty[j]=="snet", 3.7, 12)
#         err.m1 <- nvar.m1 <- errbest.m1 <- lambest.m1 <- matrix(NA, ncol=4, nrow=nrun)
#         nvarbest.m1 <- mstopcv.m1 <- matrix(NA, ncol=4, nrow=nrun)
#         colnames(err.m1) <- c("cont-0%", "cont-5%", "cont-10%", "cont-15%")
#         colnames(mstopcv.m1) <- colnames(nvarbest.m1) <- colnames(err.m1)
#         colnames(nvar.m1) <- colnames(err.m1)
#         colnames(errbest.m1) <- colnames(err.m1)
#         colnames(lambest.m1) <- colnames(err.m1)
#         for(ii in 1:nrun){
#             set.seed(1000+ii)
#             trid <- c(sample(which(y==1))[1:50], sample(which(y==-1))[1:50])
#             dtr <- dat[trid, ]
#             dte <- dat[-trid, ]
#             ytrold <- y[trid]
#             yte <- y[-trid]
#             ### number of patients/variables in the training and test data
#             dim(dtr)
#             dim(dte)
#             ### randomly contaminate the data
#             ntr <- length(trid)
#             set.seed(1000+ii)
#             con <- sample(ntr)
#             for(i in (1:4)){ ### i selects the contamination proportion, see per above
#                 ytr <- ytrold
#                 percon <- per[i]
#                 ### randomly flip labels in the training set according to the
#                 ### pre-defined contamination level
#                 if(percon > 0){
#                     ji <- con[1:(percon*ntr)]
#                     ytr[ji] <- -ytrold[ji]
#                 }
#                 ### fit a model with nclreg for a nonconvex loss or glmreg for the
#                 ### logistic loss, and select the penalization parameter by cross-validation
#                 if(type[k] %in% c("closs", "gloss", "qloss")){
#                     dat.m1 <- nclreg(x=dtr, y=ytr, s=s[k], iter=100, rfamily=type[k],
#                                      penalty=penalty[j], lambda.min.ratio=ratio, gamma=gam,
#                                      mstop.init=mstop, nu.init=nu, type.path=type.path,
#                                      decreasing=FALSE, type.init="bst")
#                     lambda <- dat.m1$lambda[1:nlam]
#                     set.seed(1000+ii)
#                     cvm1 <- cv.nclreg(x=dtr, y=ytr, nfolds=5, n.cores=n.cores, parallel=TRUE,
#                                       s=s[k], lambda=lambda, rfamily=type[k], penalty=penalty[j],
#                                       gamma=gam, type=tuning, plot.it=FALSE,
#                                       type.init=dat.m1$type.init, mstop.init=dat.m1$mstop.init,
#                                       nu.init=dat.m1$nu.init, type.path=type.path,
#                                       decreasing=dat.m1$decreasing)
#                     err1 <- predict(dat.m1, newdata=dte, newy=yte, type="error")
#                 }
#                 else{
#                     dat.m1 <- glmreg(x=dtr, y=(ytr+1)/2, family=type[k], penalty=penalty[j],
#                                      lambda.min.ratio=ratio, gamma=gam)
#                     set.seed(1000+ii)
#                     cvm1 <- cv.glmreg(x=dtr, y=(ytr+1)/2, nfolds=5, n.cores=n.cores,
#                                       parallel=TRUE, lambda=dat.m1$lambda, family=type[k],
#                                       penalty=penalty[j], gamma=gam, plot.it=FALSE)
#                     err1 <- apply((yte > -1) != predict(dat.m1, newx=dte, type="class"), 2, mean)
#                 }
#                 optmstop <- cvm1$lambda.which
#                 err.m1[ii, i] <- err1[optmstop]
#                 if(ii==1) varid <- names(predict(dat.m1, which=optmstop, type="nonzero")) else
#                     varid <- intersect(varid, names(predict(dat.m1, which=optmstop, type="nonzero")))
#                 nvar.m1[ii, i] <- length(predict(dat.m1, which=optmstop, type="nonzero"))
#                 errbest.m1[ii, i] <- min(err1, na.rm=TRUE)
#                 lambest.m1[ii, i] <- which.min(err1)
#                 nvarbest.m1[ii, i] <- length(predict(dat.m1, which=which.min(err1), type="nonzero"))
#             }
#             if(ii %% nrun == 0){ ### report summaries after the last replication
#                 if(type[k] %in% c("closs", "gloss", "qloss"))
#                     cat(paste("\nfamily ", type1[k], ", s=", s[k], sep=""), "\n")
#                 else cat(paste("\nfamily ", type1[k], sep=""), "\n")
#                 pentype <- switch(penalty[j],
#                                   "enet"="LASSO",
#                                   "mnet"="MCP",
#                                   "snet"="SCAD")
#                 cat("penalty=", pentype, "\n")
#                 if(penalty[j] %in% c("snet", "mnet")) cat("gamma=", gam, "\n")
#                 cat("common variables selected:", varid, "\n")
#                 cat("best misclassification error\n")
#                 print(round(apply(errbest.m1, 2, summary7), 4))
#                 cat("which lambda has the best error\n")
#                 print(round(apply(lambest.m1, 2, summary7), 1))
#                 cat("number of variables selected with the best error\n")
#                 print(round(apply(nvarbest.m1, 2, summary7), 1))
#                 cat("CV-based misclassification error\n")
#                 print(round(apply(err.m1, 2, summary7), 4))
#                 cat("number of variables selected by CV\n")
#                 print(round(apply(nvar.m1, 2, summary7), 1))
#                 if(plot.it){
#                     par(mfrow=c(2, 1))
#                     boxplot(err.m1, main="Misclassification error",
#                             sub=paste(type1[k], "-", pentype, sep=""))
#                     boxplot(nvar.m1, main="No. variables",
#                             sub=paste(type1[k], "-", pentype, sep=""))
#                 }
#             }
#         }
#     }
# }
# print(proc.time()-ptm)

## ----sessionInfo------------------------------------------
sessionInfo()
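### The appendix chunk below is likewise an added illustrative sketch, not part
### of the original analysis: on small simulated data it walks through the same
### fit/cross-validate/evaluate pattern that the main loop applies in the
### logistic-loss branch, i.e., fit a glmreg penalized path, pick the lambda
### index via cv.glmreg's lambda.which, then inspect test-set class predictions
### and the selected variables. The objects x, ybin, xnew are simulated here.

## ----glmreg-sketch, eval=FALSE------------------------------
# library("mpath")
# set.seed(1)
# n <- 100; p <- 20
# x <- matrix(rnorm(n*p), n, p)
# ybin <- rbinom(n, 1, plogis(x[, 1] - x[, 2]))  ### 0/1 response
# fit <- glmreg(x=x, y=ybin, family="binomial", penalty="enet")
# cvfit <- cv.glmreg(x=x, y=ybin, family="binomial", penalty="enet",
#                    nfolds=5, plot.it=FALSE)
# opt <- cvfit$lambda.which                      ### CV-selected lambda index
# xnew <- matrix(rnorm(10*p), 10, p)             ### pretend test data
# predict(fit, newx=xnew, type="class")[, opt]   ### class predictions at opt
# predict(fit, which=opt, type="nonzero")        ### variables selected at opt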