## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set(echo = TRUE, fig.width = 6, fig.height = 6) ## ----------------------------------------------------------------------------- library(polyRAD) maphmcfile <- system.file("extdata", "ClareMap_HapMap.hmc.txt", package = "polyRAD") maphmcfile mydata <- readHMC(maphmcfile, possiblePloidies = list(2, c(2, 2)), taxaPloidy = 2) mydata ## ----------------------------------------------------------------------------- GetTaxa(mydata)[c(1:10,293:299)] ## ----------------------------------------------------------------------------- mydata <- SetDonorParent(mydata, "Kaskade-Justin") mydata <- SetRecurrentParent(mydata, "Zebrinus-Justin") ## ----------------------------------------------------------------------------- mydata$taxaPloidy[c("IGR-2011-001", "p196-150A-c", "p877-348-b")] <- 1L mydata ## ----------------------------------------------------------------------------- alignfile <- system.file("extdata", "ClareMap_alignments.csv", package = "polyRAD") aligndata <- read.csv(alignfile, row.names = 1) head(aligndata) mydata$locTable$Chr <- aligndata[GetLoci(mydata), 1] mydata$locTable$Pos <- aligndata[GetLoci(mydata), 2] head(mydata$locTable) ## ----eval = FALSE------------------------------------------------------------- # mydata <- AddPCA(mydata) ## ----------------------------------------------------------------------------- load(system.file("extdata", "examplePCA.RData", package = "polyRAD")) mydata$PCA <- examplePCA ## ----------------------------------------------------------------------------- plot(mydata) ## ----------------------------------------------------------------------------- realprogeny <- GetTaxa(mydata)[mydata$PCA[,"PC1"] > -10 & mydata$PCA[,"PC1"] < 10] # eliminate the one doubled haploid line in this group realprogeny <- realprogeny[!realprogeny %in% c("IGR-2011-001", "p196-150A-c", "p877-348-b")] # also retain parents keeptaxa <- c(realprogeny, GetDonorParent(mydata), GetRecurrentParent(mydata)) mydata <- SubsetByTaxon(mydata, taxa = keeptaxa) plot(mydata) ## ----------------------------------------------------------------------------- mydata2 <- PipelineMapping2Parents(mydata, freqAllowedDeviation = 0.06, useLinkage = FALSE, minLikelihoodRatio = 2) ## ----------------------------------------------------------------------------- overdispersionP <- TestOverdispersion(mydata2, to_test = 8:15) sapply(overdispersionP[names(overdispersionP) != "optimal"], quantile, probs = c(0.01, 0.25, 0.5, 0.75, 0.99)) ## ----------------------------------------------------------------------------- my_ovdisp <- overdispersionP$optimal ## ----message = FALSE---------------------------------------------------------- myhindhe <- HindHeMapping(mydata, ploidy = 2L) hist(colMeans(myhindhe, na.rm = TRUE), col = "lightgrey", xlab = "Hind/He", main = "Histogram of Hind/He by locus") ## ----------------------------------------------------------------------------- set.seed(720) ExpectedHindHeMapping(mydata, ploidy = 2, overdispersion = my_ovdisp, reps = 2, contamRate = 0.001, errorRate = 0.001) ## ----------------------------------------------------------------------------- goodMarkers <- colnames(myhindhe)[which(colMeans(myhindhe, na.rm = TRUE) < 0.53 & colMeans(myhindhe, na.rm = TRUE) > 0.43)] mydata <- SubsetByLocus(mydata, goodMarkers) ## ----------------------------------------------------------------------------- mydata <- PipelineMapping2Parents(mydata, freqAllowedDeviation = 0.06, useLinkage = TRUE, overdispersion = my_ovdisp, minLikelihoodRatio = 2) ## ----------------------------------------------------------------------------- table(mydata$alleleFreq) ## ----------------------------------------------------------------------------- mydata$alleleDepth["Map1-089",1:8] mydata$genotypeLikelihood[[1,"2"]][,"Map1-089",1:8] mydata$genotypeLikelihood[[2,"2"]][,"Map1-089",1:8] ## ----------------------------------------------------------------------------- mydata$priorProb[[1,"2"]][,1:8] mydata$priorProb[[2,"2"]][,1:8] ## ----------------------------------------------------------------------------- mydata$ploidyChiSq[,1:8] ## ----------------------------------------------------------------------------- plot(mydata$ploidyChiSq[1,], mydata$ploidyChiSq[2,], xlab = "Chi-squared for diploid model", ylab = "Chi-squared for tetraploid model") ## ----------------------------------------------------------------------------- mydata$posteriorProb[[1,"2"]][,"Map1-089",1:8] mydata$posteriorProb[[2,"2"]][,"Map1-089",1:8] ## ----------------------------------------------------------------------------- mydata <- SubsetByPloidy(mydata, ploidies = list(2)) ## ----------------------------------------------------------------------------- mydata <- RemoveUngenotypedLoci(mydata) ## ----------------------------------------------------------------------------- mywm <- GetWeightedMeanGenotypes(mydata) round(mywm[c(276, 277, 1:5), 9:12], 3) ## ----------------------------------------------------------------------------- mydata$likelyGeno_donor[,1:8] mydata$likelyGeno_recurrent[,1:8] ## ----echo = FALSE------------------------------------------------------------- # Determine if VariantAnnotation is installed, so we know whether to # execute the rest of the vignette. haveVA <- requireNamespace("VariantAnnotation", quietly = TRUE) ## ----message=FALSE, warning=FALSE, eval = haveVA------------------------------ library(VariantAnnotation) myVCF <- system.file("extdata", "Msi01genes.vcf", package = "polyRAD") ## ----eval=FALSE--------------------------------------------------------------- # mybg <- bgzip(myVCF) # indexTabix(mybg, format = "vcf") ## ----------------------------------------------------------------------------- pldfile <- system.file("extdata", "Msi_ploidies.txt", package = "polyRAD") msi_ploidies <- read.table(pldfile, sep = "\t", header = FALSE) head(msi_ploidies) table(msi_ploidies$V2) pld_vect <- msi_ploidies$V2 names(pld_vect) <- msi_ploidies$V1 ## ----eval = haveVA------------------------------------------------------------ mydata <- VCF2RADdata(myVCF, possiblePloidies = list(2, c(2,2)), expectedLoci = 100, expectedAlleles = 500, taxaPloidy = pld_vect) mydata ## ----echo = FALSE, eval = !haveVA--------------------------------------------- # # If we don't have VariantAnnotation, load in the dataset # load(system.file("extdata", "vcfdata.RData", package = "polyRAD")) ## ----------------------------------------------------------------------------- overdispersionP <- TestOverdispersion(mydata, to_test = 8:14) sapply(overdispersionP[names(overdispersionP) != "optimal"], quantile, probs = c(0.01, 0.25, 0.5, 0.75, 0.99)) ## ----------------------------------------------------------------------------- my_ovdisp <- overdispersionP$optimal ## ----------------------------------------------------------------------------- myhindhe <- HindHe(mydata) myhindheByLoc <- colMeans(myhindhe, na.rm = TRUE) hist(myhindheByLoc, col = "lightgrey", xlab = "Hind/He", main = "Histogram of Hind/He by locus") abline(v = 0.5, col = "blue", lwd = 2) ## ----------------------------------------------------------------------------- mydata <- AddAlleleFreqHWE(mydata) theseloci <- GetLoci(mydata)[mydata$alleles2loc[mydata$alleleFreq >= 0.05 & mydata$alleleFreq < 0.5]] theseloci <- unique(theseloci) myhindheByLoc2 <- colMeans(myhindhe[mydata$taxaPloidy == 2L, theseloci], na.rm = TRUE) hist(myhindheByLoc2, col = "lightgrey", xlab = "Hind/He", main = "Histogram of Hind/He by locus, MAF >= 0.05") abline(v = 0.5, col = "blue", lwd = 2) ## ----------------------------------------------------------------------------- set.seed(803) ExpectedHindHe(mydata, inbreeding = 0.25, ploidy = 2, overdispersion = my_ovdisp, reps = 10, contamRate = 0.001, errorRate = 0.001) ## ----------------------------------------------------------------------------- mean(myhindheByLoc < 0.24) # about 29% of markers would be removed keeploci <- names(myhindheByLoc)[myhindheByLoc >= 0.24] mydata <- SubsetByLocus(mydata, keeploci) ## ----message = FALSE---------------------------------------------------------- mydataHWE <- IterateHWE(mydata, tol = 1e-3, overdispersion = 10) ## ----------------------------------------------------------------------------- hist(mydataHWE$alleleFreq, breaks = 20, col = "lightgrey") ## ----message = FALSE---------------------------------------------------------- set.seed(3908) mydataPopStruct <- IteratePopStruct(mydata, nPcsInit = 8, tol = 5e-03, overdispersion = 10) ## ----------------------------------------------------------------------------- hist(mydataPopStruct$alleleFreq, breaks = 20, col = "lightgrey") ## ----------------------------------------------------------------------------- plot(mydataPopStruct) ## ----------------------------------------------------------------------------- myallele <- 1 freqcol <- heat.colors(101)[round(mydataPopStruct$alleleFreqByTaxa[,myallele] * 100) + 1] plot(mydataPopStruct, pch = 21, bg = freqcol) ## ----------------------------------------------------------------------------- plot(mydataPopStruct$ploidyChiSq[1,], mydataPopStruct$ploidyChiSq[2,], xlab = "Chi-squared for diploid model", ylab = "Chi-squared for allotetraploid model", log = "xy") abline(a = 0, b = 1, col = "blue", lwd = 2) ## ----message = FALSE, eval = requireNamespace("ggplot2", quietly = TRUE)------ myChiSqRat <- mydataPopStruct$ploidyChiSq[1,] / mydataPopStruct$ploidyChiSq[2,] myChiSqRat <- tapply(myChiSqRat, mydataPopStruct$alleles2loc, mean) allelesPerLoc <- as.vector(table(mydataPopStruct$alleles2loc)) library(ggplot2) ggplot(mapping = aes(x = myhindheByLoc[GetLoci(mydata)], y = myChiSqRat, fill = as.factor(allelesPerLoc))) + geom_point(shape = 21, size = 3) + labs(x = "Hind/He", y = "Ratio of Chi-squared values, diploid to allotetraploid", fill = "Alleles per locus") + geom_hline(yintercept = 1) + geom_vline(xintercept = 0.5) + scale_fill_brewer(palette = "YlOrRd") ## ----------------------------------------------------------------------------- wmgenoPopStruct <- GetWeightedMeanGenotypes(mydataPopStruct) wmgenoPopStruct[1:10,1:5] ## ----eval = FALSE------------------------------------------------------------- # myHindHe <- HindHe(mydata) # TotDepthT <- rowSums(mydata$locDepth) ## ----------------------------------------------------------------------------- print(load(system.file("extdata", "MsaHindHe0.RData", package = "polyRAD"))) ## ----------------------------------------------------------------------------- myHindHeByInd <- rowMeans(myHindHe, na.rm = TRUE) ## ----eval = requireNamespace("ggplot2", quietly = TRUE)----------------------- ggplot(data.frame(Depth = TotDepthT, HindHe = myHindHeByInd, Ploidy = ploidies), mapping = aes(x = Depth, y = HindHe, color = Ploidy)) + geom_point() + scale_x_log10() + facet_wrap(~ Ploidy) + geom_hline(data = data.frame(Ploidy = c("2x", "3x", "4x"), ExpHindHe = c(1/2, 2/3, 3/4)), mapping = aes(yintercept = ExpHindHe), lty = 2) + labs(x = "Read Depth", y = "Hind/He", color = "Ploidy") ## ----------------------------------------------------------------------------- myHindHe2x <- myHindHe[ploidies == "2x",] myHindHe4x <- myHindHe[ploidies == "4x",] ## ----------------------------------------------------------------------------- myHindHeByLoc2x <- colMeans(myHindHe2x, na.rm = TRUE) hist(myHindHeByLoc2x, breaks = 50, xlab = "Hind/He", main = "Distribution of Hind/He among loci in diploids", col = "lightgrey") abline(v = 0.5, col = "blue", lwd = 2) myHindHeByLoc4x <- colMeans(myHindHe4x, na.rm = TRUE) hist(myHindHeByLoc4x, breaks = 50, xlab = "Hind/He", main = "Distribution of Hind/He among loci in tetraploids", col = "lightgrey") abline(v = 0.75, col = "blue", lwd = 2) ## ----------------------------------------------------------------------------- goodLoci <- colnames(myHindHe)[myHindHeByLoc2x < 0.5 & myHindHeByLoc4x < 0.75] length(goodLoci) # 611 out of 1000 markers retained head(goodLoci) ## ----eval = FALSE------------------------------------------------------------- # library(polyRAD) # library(VariantAnnotation) # # # Two files produced by the TASSEL-GBSv2 pipeline using two different # # enzyme systems. # NsiI_file <- "170705Msi_NsiI_genotypes.vcf.bgz" # PstI_file <- "170608Msi_PstI_genotypes.vcf.bgz" # # # The vector allSam was defined outside of this script, and contains the # # names of all samples that I wanted to import. Below I find sample names # # within the VCF files that match those samples. # NsiI_sam <- allSam[allSam %in% samples(scanVcfHeader(NsiI_file))] # PstI_sam <- allSam[allSam %in% samples(scanVcfHeader(PstI_file))] # # # Import two RADdata objects, assuming diploidy. A large yield size was # # used due to the computer having 64 Gb RAM; on a typical laptop you # # would probably want to keep the default of 5000. # PstI_RAD <- VCF2RADdata(PstI_file, samples = PstI_sam, yieldSize = 5e4, # expectedAlleles = 1e6, expectedLoci = 2e5) # NsiI_RAD <- VCF2RADdata(NsiI_file, samples = NsiI_sam, yieldSize = 5e4, # expectedAlleles = 1e6, expectedLoci = 2e5) # # # remove any loci duplicated across the two sets # nLoci(PstI_RAD) # 116757 # nLoci(NsiI_RAD) # 187434 # nAlleles(PstI_RAD) # 478210 # nAlleles(NsiI_RAD) # 952511 # NsiI_keeploci <- which(!GetLoci(NsiI_RAD) %in% GetLoci(PstI_RAD)) # cat(nLoci(NsiI_RAD) - length(NsiI_keeploci), # file = "180522Num_duplicate_loci.txt") #992 duplicate # NsiI_RAD <- SubsetByLocus(NsiI_RAD, NsiI_keeploci) # # # combine allele depth into one matrix # PstI_depth <- PstI_RAD$alleleDepth # NsiI_depth <- NsiI_RAD$alleleDepth # total_depth <- matrix(0L, nrow = length(allSam), # ncol = ncol(PstI_depth) + ncol(NsiI_depth), # dimnames = list(allSam, # c(colnames(PstI_depth), # colnames(NsiI_depth)))) # total_depth[,colnames(PstI_depth)] <- PstI_depth[allSam,] # total_depth[rownames(NsiI_depth),colnames(NsiI_depth)] <- NsiI_depth # # # combine other slots # total_alleles2loc <- c(PstI_RAD$alleles2loc, # NsiI_RAD$alleles2loc + nLoci(PstI_RAD)) # total_locTable <- rbind(PstI_RAD$locTable, NsiI_RAD$locTable) # total_alleleNucleotides <- c(PstI_RAD$alleleNucleotides, # NsiI_RAD$alleleNucleotides) # # # build new RADdata object and save # total_RAD <- RADdata(total_depth, total_alleles2loc, total_locTable, # list(2L), 0.001, total_alleleNucleotides) # #save(total_RAD, file = "180524_RADdata_NsiIPstI.RData") # # # Make groups representing pairs of chromosomes, and one group for all # # non-assembled scaffolds. # splitlist <- list(c("^01$", "^02$"), # c("^03$", "^04$"), # c("^05$", "^06$"), # c("^07$", "^08$"), # c("^09$", "^10$"), # c("^11$", "^12$"), # c("^13$", "^14$", "^15$"), # c("^16$", "^17$"), # c("^18$", "^194"), "^SCAFFOLD") # # split by chromosome and save seperate objects # SplitByChromosome(total_RAD, chromlist = splitlist, # chromlist.use.regex = TRUE, fileprefix = "180524splitRAD") # # # files with RADdata objects # splitfiles <- grep("^180524splitRAD", list.files("."), value = TRUE) # # # list to hold markers formatted for GAPIT/FarmCPU # GAPITlist <- list() # length(GAPITlist) <- length(splitfiles) # # # loop through RADdata objects # for(i in 1:length(splitfiles)){ # load(splitfiles[i]) # splitRADdata <- IteratePopStructLD(splitRADdata) # GAPITlist[[i]] <- ExportGAPIT(splitRADdata) # } # #save(GAPITlist, file = "180524GAPITlist.RData") # # # put together into one dataset for FarmCPU # GM.all <- rbind(GAPITlist[[1]]$GM, GAPITlist[[2]]$GM, GAPITlist[[3]]$GM, # GAPITlist[[4]]$GM, GAPITlist[[5]]$GM, GAPITlist[[6]]$GM, # GAPITlist[[7]]$GM, GAPITlist[[8]]$GM, # GAPITlist[[9]]$GM, GAPITlist[[10]]$GM) # GD.all <- cbind(GAPITlist[[1]]$GD, GAPITlist[[2]]$GD[,-1], # GAPITlist[[3]]$GD[,-1], GAPITlist[[4]]$GD[,-1], # GAPITlist[[5]]$GD[,-1], GAPITlist[[6]]$GD[,-1], # GAPITlist[[7]]$GD[,-1], GAPITlist[[8]]$GD[,-1], # GAPITlist[[9]]$GD[,-1], GAPITlist[[10]]$GD[,-1]) # #save(GD.all, GM.all, file = "180525GM_GD_all_polyRAD.RData") # 1076888 markers