## ----setup, include=FALSE-----------------------------------------------------
# Chunk defaults for the whole document: echo the code and use 6 x 6 figures.
knitr::opts_chunk$set(echo = TRUE, fig.width = 6, fig.height = 6)

## -----------------------------------------------------------------------------
library(polyRAD)
# Resolve the path of the example HapMap file shipped in polyRAD's extdata
# directory, and print it.
maphmcfile <- system.file("extdata", "ClareMap_HapMap.hmc.txt", 
                          package = "polyRAD")
maphmcfile

# Import the file with readHMC.  Two candidate ploidy configurations are
# supplied (2 and c(2, 2)) and every taxon starts with ploidy 2; individual
# taxa are overridden further down.  The bare `mydata` prints the object.
mydata <- readHMC(maphmcfile,
                  possiblePloidies = list(2, c(2, 2)),
                  taxaPloidy = 2)
mydata

## -----------------------------------------------------------------------------
# Peek at a sample of taxa names: the first ten, then entries 293 to 299.
GetTaxa(mydata)[c(seq_len(10), seq(293, 299))]

## -----------------------------------------------------------------------------
# Record the donor parent first, then the recurrent parent, on the same object.
mydata <- SetRecurrentParent(
  SetDonorParent(mydata, "Kaskade-Justin"),
  "Zebrinus-Justin"
)

## -----------------------------------------------------------------------------
# Override the ploidy of three named taxa to 1, then print the object.
mydata$taxaPloidy[
  c("IGR-2011-001", "p196-150A-c", "p877-348-b")
] <- 1L
mydata

## -----------------------------------------------------------------------------
# Read the alignment table bundled with polyRAD.  Its first column becomes the
# row names, which is what allows the lookup by locus name below.
alignfile <- system.file("extdata", "ClareMap_alignments.csv",
                         package = "polyRAD")
aligndata <- read.csv(alignfile, row.names = 1)
head(aligndata)

# Fetch the locus names once, then copy the first two alignment columns into
# the locus table as Chr and Pos.
locus_names <- GetLoci(mydata)
mydata$locTable$Chr <- aligndata[locus_names, 1]
mydata$locTable$Pos <- aligndata[locus_names, 2]
head(mydata$locTable)

## ----eval = FALSE-------------------------------------------------------------
# Not evaluated: this would compute the PCA directly.  The next chunk loads a
# stored result instead.
#  mydata <- AddPCA(mydata)

## -----------------------------------------------------------------------------
# load() brings the objects saved in examplePCA.RData into the workspace;
# examplePCA is then stored in the PCA slot of mydata.
load(system.file("extdata", "examplePCA.RData", package = "polyRAD"))
mydata$PCA <- examplePCA

## -----------------------------------------------------------------------------
# Plot the object (presumably drawn from the PCA slot just filled in --
# confirm against the polyRAD plot method).
plot(mydata)

## -----------------------------------------------------------------------------
# Keep taxa whose first principal component lies strictly between -10 and 10.
pc1_scores <- mydata$PCA[, "PC1"]
realprogeny <- GetTaxa(mydata)[pc1_scores > -10 & pc1_scores < 10]

# eliminate the one doubled haploid line in this group (all three names are
# listed; those not in the group are simply ignored by %in%)
excluded_lines <- c("IGR-2011-001", "p196-150A-c", "p877-348-b")
realprogeny <- realprogeny[!(realprogeny %in% excluded_lines)]

# also retain parents
keeptaxa <- c(
  realprogeny,
  GetDonorParent(mydata),
  GetRecurrentParent(mydata)
)

mydata <- SubsetByTaxon(mydata, taxa = keeptaxa)
plot(mydata)

## -----------------------------------------------------------------------------
# Preliminary run without linkage, stored in mydata2 so that mydata itself is
# left untouched for the later filtering steps.
mydata2 <- PipelineMapping2Parents(
  mydata,
  freqAllowedDeviation = 0.06,
  useLinkage = FALSE,
  minLikelihoodRatio = 2
)

## -----------------------------------------------------------------------------
overdispersionP <- TestOverdispersion(mydata2, to_test = 8:15)

# Summarise every element of the result except "optimal" with a fixed set of
# quantiles.
quantile_probs <- c(0.01, 0.25, 0.5, 0.75, 0.99)
tested_values <- overdispersionP[names(overdispersionP) != "optimal"]
sapply(tested_values, function(p) quantile(p, probs = quantile_probs))

## -----------------------------------------------------------------------------
# Keep the value flagged as optimal for the later modelling calls.
my_ovdisp <- overdispersionP$optimal

## ----message = FALSE----------------------------------------------------------
myhindhe <- HindHeMapping(mydata, ploidy = 2L)
# Column means with missing values ignored; computed once and reused for both
# the histogram and the marker filter below.
hindhe_by_locus <- colMeans(myhindhe, na.rm = TRUE)
hist(hindhe_by_locus,
     col = "lightgrey",
     xlab = "Hind/He",
     main = "Histogram of Hind/He by locus")

## -----------------------------------------------------------------------------
# Fixed seed so the simulated expectation is reproducible.
set.seed(720)
ExpectedHindHeMapping(
  mydata,
  ploidy = 2,
  overdispersion = my_ovdisp,
  reps = 2,
  contamRate = 0.001,
  errorRate = 0.001
)

## -----------------------------------------------------------------------------
# Retain markers whose mean lies strictly between 0.43 and 0.53; which()
# drops any NA comparisons.
in_range <- hindhe_by_locus < 0.53 & hindhe_by_locus > 0.43
goodMarkers <- colnames(myhindhe)[which(in_range)]
mydata <- SubsetByLocus(mydata, goodMarkers)

## -----------------------------------------------------------------------------
# Final run on the filtered markers, now with linkage and the overdispersion
# value selected earlier.
mydata <- PipelineMapping2Parents(
  mydata,
  freqAllowedDeviation = 0.06,
  useLinkage = TRUE,
  overdispersion = my_ovdisp,
  minLikelihoodRatio = 2
)

## -----------------------------------------------------------------------------
table(mydata$alleleFreq)

## -----------------------------------------------------------------------------
# The inspection chunks below all look at the same named entry and the same
# first eight columns, so both are defined once here.
example_id <- "Map1-089"
first_eight <- 1:8
mydata$alleleDepth[example_id, first_eight]
mydata$genotypeLikelihood[[1, "2"]][, example_id, first_eight]
mydata$genotypeLikelihood[[2, "2"]][, example_id, first_eight]

## -----------------------------------------------------------------------------
mydata$priorProb[[1, "2"]][, first_eight]
mydata$priorProb[[2, "2"]][, first_eight]

## -----------------------------------------------------------------------------
mydata$ploidyChiSq[, first_eight]

## -----------------------------------------------------------------------------
# Row 1 against row 2 of the chi-squared matrix.
chisq_by_ploidy <- mydata$ploidyChiSq
plot(chisq_by_ploidy[1, ], chisq_by_ploidy[2, ],
     xlab = "Chi-squared for diploid model",
     ylab = "Chi-squared for tetraploid model")

## -----------------------------------------------------------------------------
mydata$posteriorProb[[1, "2"]][, example_id, first_eight]
mydata$posteriorProb[[2, "2"]][, example_id, first_eight]

## -----------------------------------------------------------------------------
# Keep only the ploidy configuration 2.
mydata <- SubsetByPloidy(mydata, ploidies = list(2))

## -----------------------------------------------------------------------------
mydata <- RemoveUngenotypedLoci(mydata)

## -----------------------------------------------------------------------------
# Show rows 276, 277 and 1 to 5, columns 9 to 12, rounded to three decimals.
mywm <- GetWeightedMeanGenotypes(mydata)
round(mywm[c(276, 277, 1:5), 9:12], 3)

## -----------------------------------------------------------------------------
mydata$likelyGeno_donor[, first_eight]
mydata$likelyGeno_recurrent[, first_eight]

## ----echo = FALSE-------------------------------------------------------------
# Determine if VariantAnnotation is installed, so we know whether to
# execute the rest of the vignette.  requireNamespace() returns TRUE/FALSE
# without attaching the package; quietly = TRUE suppresses its messages.
haveVA <- requireNamespace("VariantAnnotation", quietly = TRUE)

## ----message=FALSE, warning=FALSE, eval = haveVA------------------------------
# Per the chunk header this chunk is only evaluated when haveVA is TRUE.
library(VariantAnnotation)

# Path of the example VCF shipped in polyRAD's extdata directory.
myVCF <- system.file("extdata", "Msi01genes.vcf", package = "polyRAD")

## ----eval=FALSE---------------------------------------------------------------
# Not evaluated: compress the VCF with bgzip and index the result.
#  mybg <- bgzip(myVCF)
#  indexTabix(mybg, format = "vcf")

## -----------------------------------------------------------------------------
# Tab-delimited file without a header row, so the columns are named V1 and V2.
pldfile <- system.file("extdata", "Msi_ploidies.txt", package = "polyRAD")
msi_ploidies <- read.table(pldfile, header = FALSE, sep = "\t")
head(msi_ploidies)
table(msi_ploidies$V2)
# Named vector: values taken from V2, names taken from V1.
pld_vect <- setNames(msi_ploidies$V2, msi_ploidies$V1)

## ----eval = haveVA------------------------------------------------------------
# Import the VCF, handing the per-taxon ploidy vector built above to
# VCF2RADdata, then print the resulting object.
mydata <- VCF2RADdata(
  myVCF,
  possiblePloidies = list(2, c(2, 2)),
  expectedLoci = 100,
  expectedAlleles = 500,
  taxaPloidy = pld_vect
)
mydata

## ----echo = FALSE, eval = !haveVA---------------------------------------------
#  # If we don't have VariantAnnotation, load in the dataset
#  load(system.file("extdata", "vcfdata.RData", package = "polyRAD"))

## -----------------------------------------------------------------------------
overdispersionP <- TestOverdispersion(mydata, to_test = 8:14)

# Summarise every element of the result except "optimal" with a fixed set of
# quantiles.
quantile_probs <- c(0.01, 0.25, 0.5, 0.75, 0.99)
tested_values <- overdispersionP[names(overdispersionP) != "optimal"]
sapply(tested_values, function(p) quantile(p, probs = quantile_probs))

## -----------------------------------------------------------------------------
# Keep the value flagged as optimal for the later modelling calls.
my_ovdisp <- overdispersionP$optimal

## -----------------------------------------------------------------------------
# Hind/He matrix, then its column means with missing values ignored.
myhindhe <- HindHe(mydata)
myhindheByLoc <- colMeans(myhindhe, na.rm = TRUE)
hist(myhindheByLoc,
     col = "lightgrey",
     xlab = "Hind/He",
     main = "Histogram of Hind/He by locus")
# Vertical reference line at 0.5.
abline(v = 0.5, col = "blue", lwd = 2)

## -----------------------------------------------------------------------------
mydata <- AddAlleleFreqHWE(mydata)
# Alleles whose frequency is at least 0.05 and below 0.5, mapped to their loci
# through alleles2loc and de-duplicated.
allele_freq <- mydata$alleleFreq
in_maf_window <- allele_freq >= 0.05 & allele_freq < 0.5
theseloci <- unique(GetLoci(mydata)[mydata$alleles2loc[in_maf_window]])
# Restrict the Hind/He matrix to taxa with ploidy 2 and to those loci before
# averaging by column.
ploidy_two <- mydata$taxaPloidy == 2L
myhindheByLoc2 <- colMeans(myhindhe[ploidy_two, theseloci], na.rm = TRUE)
hist(myhindheByLoc2,
     col = "lightgrey",
     xlab = "Hind/He",
     main = "Histogram of Hind/He by locus, MAF >= 0.05")
abline(v = 0.5, col = "blue", lwd = 2)

## -----------------------------------------------------------------------------
# Fixed seed so the simulated expectation is reproducible.
set.seed(803)
ExpectedHindHe(
  mydata,
  inbreeding = 0.25,
  ploidy = 2,
  overdispersion = my_ovdisp,
  reps = 10,
  contamRate = 0.001,
  errorRate = 0.001
)

## -----------------------------------------------------------------------------
mean(myhindheByLoc < 0.24) # about 29% of markers would be removed
# Keep the loci at or above the 0.24 cutoff.
keeploci <- names(myhindheByLoc)[myhindheByLoc >= 0.24]
mydata <- SubsetByLocus(mydata, keeploci)

## ----message = FALSE----------------------------------------------------------
# Genotype estimation with IterateHWE.  The overdispersion argument takes
# my_ovdisp, the value selected from the TestOverdispersion results, instead
# of a hard-coded number.  This keeps the call consistent with the
# ExpectedHindHe simulation above and lets it follow the data if the
# estimate changes.
mydataHWE <- IterateHWE(mydata, tol = 1e-3, overdispersion = my_ovdisp)

## -----------------------------------------------------------------------------
# Distribution of the allele frequencies stored on the result.
hist(mydataHWE$alleleFreq, breaks = 20, col = "lightgrey")

## ----message = FALSE----------------------------------------------------------
# Genotype estimation with IteratePopStruct, starting from 8 principal
# components (nPcsInit).  A seed is set before the call so the result is
# reproducible.  The same estimated overdispersion value is used here.
set.seed(3908)
mydataPopStruct <- IteratePopStruct(mydata, nPcsInit = 8, tol = 5e-03,
                                    overdispersion = my_ovdisp)

## -----------------------------------------------------------------------------
hist(mydataPopStruct$alleleFreq, breaks = 20, col = "lightgrey")

## -----------------------------------------------------------------------------
plot(mydataPopStruct)

## -----------------------------------------------------------------------------
myallele <- 1
# Turn the chosen allele's per-taxon frequency into an index from 1 to 101
# (frequency * 100, rounded, plus one) into a 101-colour heat palette.
heat_palette <- heat.colors(101)
palette_index <- round(mydataPopStruct$alleleFreqByTaxa[, myallele] * 100) + 1
freqcol <- heat_palette[palette_index]
plot(mydataPopStruct, pch = 21, bg = freqcol)

## -----------------------------------------------------------------------------
# Row 1 against row 2 of the chi-squared matrix on log-log axes, with a blue
# reference line of intercept 0 and slope 1.
chisq_popstruct <- mydataPopStruct$ploidyChiSq
plot(chisq_popstruct[1, ], chisq_popstruct[2, ],
     xlab = "Chi-squared for diploid model",
     ylab = "Chi-squared for allotetraploid model",
     log = "xy")
abline(a = 0, b = 1, col = "blue", lwd = 2)

## ----message = FALSE, eval = requireNamespace("ggplot2", quietly = TRUE)------
# Per-allele ratio of chi-squared row 1 to row 2, averaged within each locus
# (grouping by alleles2loc), plus the number of alleles at each locus.
chisq_popstruct <- mydataPopStruct$ploidyChiSq
myChiSqRat <- chisq_popstruct[1, ] / chisq_popstruct[2, ]
myChiSqRat <- tapply(myChiSqRat, mydataPopStruct$alleles2loc, mean)
allelesPerLoc <- as.vector(table(mydataPopStruct$alleles2loc))

library(ggplot2)
# Hind/He values for the loci still present in mydata, and the allele counts
# as a factor so that they get a discrete fill scale.
hindhe_for_loci <- myhindheByLoc[GetLoci(mydata)]
allele_count_factor <- as.factor(allelesPerLoc)
ggplot(mapping = aes(x = hindhe_for_loci,
                     y = myChiSqRat,
                     fill = allele_count_factor)) +
  geom_point(shape = 21, size = 3) +
  labs(x = "Hind/He",
       y = "Ratio of Chi-squared values, diploid to allotetraploid",
       fill = "Alleles per locus") +
  geom_hline(yintercept = 1) +
  geom_vline(xintercept = 0.5) +
  scale_fill_brewer(palette = "YlOrRd")

## -----------------------------------------------------------------------------
# Show the first ten rows and five columns of the weighted mean genotypes.
wmgenoPopStruct <- GetWeightedMeanGenotypes(mydataPopStruct)
wmgenoPopStruct[seq_len(10), seq_len(5)]

## ----eval = FALSE-------------------------------------------------------------
#  myHindHe <- HindHe(mydata)
#  TotDepthT <- rowSums(mydata$locDepth)

## -----------------------------------------------------------------------------
# Load the stored objects and print their names.  The objects used below
# (myHindHe, TotDepthT, ploidies) are presumably supplied by this file.
hindhe_rdata <- system.file("extdata", "MsaHindHe0.RData", package = "polyRAD")
print(load(hindhe_rdata))

## -----------------------------------------------------------------------------
# Row means of the Hind/He matrix, ignoring missing values.
myHindHeByInd <- rowMeans(myHindHe, na.rm = TRUE)

## ----eval = requireNamespace("ggplot2", quietly = TRUE)-----------------------
# One row per individual for the points, and one row per ploidy label for the
# dashed reference lines drawn in each facet.
per_individual <- data.frame(Depth = TotDepthT,
                             HindHe = myHindHeByInd,
                             Ploidy = ploidies)
reference_lines <- data.frame(Ploidy = c("2x", "3x", "4x"),
                              ExpHindHe = c(1/2, 2/3, 3/4))
ggplot(per_individual,
       mapping = aes(x = Depth, y = HindHe, color = Ploidy)) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(~ Ploidy) +
  geom_hline(data = reference_lines,
             mapping = aes(yintercept = ExpHindHe),
             lty = 2) +
  labs(x = "Read Depth", y = "Hind/He", color = "Ploidy")

## -----------------------------------------------------------------------------
# Split the Hind/He matrix by the "2x" and "4x" ploidy labels.
rows_2x <- ploidies == "2x"
rows_4x <- ploidies == "4x"
myHindHe2x <- myHindHe[rows_2x, ]
myHindHe4x <- myHindHe[rows_4x, ]

## -----------------------------------------------------------------------------
# Column means within each group, each with a histogram and a blue reference
# line (0.5 for the 2x group, 0.75 for the 4x group).
myHindHeByLoc2x <- colMeans(myHindHe2x, na.rm = TRUE)
hist(myHindHeByLoc2x,
     breaks = 50,
     xlab = "Hind/He",
     main = "Distribution of Hind/He among loci in diploids",
     col = "lightgrey")
abline(v = 0.5, col = "blue", lwd = 2)

myHindHeByLoc4x <- colMeans(myHindHe4x, na.rm = TRUE)
hist(myHindHeByLoc4x,
     breaks = 50,
     xlab = "Hind/He",
     main = "Distribution of Hind/He among loci in tetraploids",
     col = "lightgrey")
abline(v = 0.75, col = "blue", lwd = 2)

## -----------------------------------------------------------------------------
# A locus is kept only when it falls below the reference value in both groups.
below_both <- myHindHeByLoc2x < 0.5 & myHindHeByLoc4x < 0.75
goodLoci <- colnames(myHindHe)[below_both]
length(goodLoci) # 611 out of 1000 markers retained
head(goodLoci)

## ----eval = FALSE-------------------------------------------------------------
#  library(polyRAD)
#  library(VariantAnnotation)
#  
#  # Two files produced by the TASSEL-GBSv2 pipeline using two different
#  # enzyme systems.
#  NsiI_file <- "170705Msi_NsiI_genotypes.vcf.bgz"
#  PstI_file <- "170608Msi_PstI_genotypes.vcf.bgz"
#  
#  # The vector allSam was defined outside of this script, and contains the
#  # names of all samples that I wanted to import.  Below I find sample names
#  # within the VCF files that match those samples.
#  NsiI_sam <- allSam[allSam %in% samples(scanVcfHeader(NsiI_file))]
#  PstI_sam <- allSam[allSam %in% samples(scanVcfHeader(PstI_file))]
#  
#  # Import two RADdata objects, assuming diploidy.  A large yield size was
#  # used due to the computer having 64 Gb RAM; on a typical laptop you
#  # would probably want to keep the default of 5000.
#  PstI_RAD <- VCF2RADdata(PstI_file, samples = PstI_sam, yieldSize = 5e4,
#                          expectedAlleles = 1e6, expectedLoci = 2e5)
#  NsiI_RAD <- VCF2RADdata(NsiI_file, samples = NsiI_sam, yieldSize = 5e4,
#                          expectedAlleles = 1e6, expectedLoci = 2e5)
#  
#  # remove any loci duplicated across the two sets
#  nLoci(PstI_RAD)    # 116757
#  nLoci(NsiI_RAD)    # 187434
#  nAlleles(PstI_RAD) # 478210
#  nAlleles(NsiI_RAD) # 952511
#  NsiI_keeploci <- which(!GetLoci(NsiI_RAD) %in% GetLoci(PstI_RAD))
#  cat(nLoci(NsiI_RAD) - length(NsiI_keeploci),
#      file = "180522Num_duplicate_loci.txt") #992 duplicate
#  NsiI_RAD <- SubsetByLocus(NsiI_RAD, NsiI_keeploci)
#  
#  # combine allele depth into one matrix
#  PstI_depth <- PstI_RAD$alleleDepth
#  NsiI_depth <- NsiI_RAD$alleleDepth
#  total_depth <- matrix(0L, nrow = length(allSam),
#                        ncol = ncol(PstI_depth) + ncol(NsiI_depth),
#                        dimnames = list(allSam,
#                                        c(colnames(PstI_depth),
#                                          colnames(NsiI_depth))))
#  total_depth[rownames(PstI_depth),colnames(PstI_depth)] <- PstI_depth
#  total_depth[rownames(NsiI_depth),colnames(NsiI_depth)] <- NsiI_depth
#  
#  # combine other slots
#  total_alleles2loc <- c(PstI_RAD$alleles2loc,
#                         NsiI_RAD$alleles2loc + nLoci(PstI_RAD))
#  total_locTable <- rbind(PstI_RAD$locTable, NsiI_RAD$locTable)
#  total_alleleNucleotides <- c(PstI_RAD$alleleNucleotides,
#                               NsiI_RAD$alleleNucleotides)
#  
#  # build new RADdata object and save
#  total_RAD <- RADdata(total_depth, total_alleles2loc, total_locTable,
#                       list(2L), 0.001, total_alleleNucleotides)
#  #save(total_RAD, file = "180524_RADdata_NsiIPstI.RData")
#  
#  # Make groups representing pairs of chromosomes, and one group for all
#  # non-assembled scaffolds.
#  splitlist <- list(c("^01$", "^02$"),
#                    c("^03$", "^04$"),
#                    c("^05$", "^06$"),
#                    c("^07$", "^08$"),
#                    c("^09$", "^10$"),
#                    c("^11$", "^12$"),
#                    c("^13$", "^14$", "^15$"),
#                    c("^16$", "^17$"),
#                    c("^18$", "^19$"), "^SCAFFOLD")
#  # split by chromosome and save separate objects
#  SplitByChromosome(total_RAD, chromlist = splitlist,
#                    chromlist.use.regex = TRUE, fileprefix = "180524splitRAD")
#  
#  # files with RADdata objects
#  splitfiles <- grep("^180524splitRAD", list.files("."), value = TRUE)
#  
#  # list to hold markers formatted for GAPIT/FarmCPU
#  GAPITlist <- list()
#  length(GAPITlist) <- length(splitfiles)
#  
#  # loop through RADdata objects
#  for(i in 1:length(splitfiles)){
#    load(splitfiles[i])
#    splitRADdata <- IteratePopStructLD(splitRADdata)
#    GAPITlist[[i]] <- ExportGAPIT(splitRADdata)
#  }
#  #save(GAPITlist, file = "180524GAPITlist.RData")
#  
#  # put together into one dataset for FarmCPU
#  GM.all <- rbind(GAPITlist[[1]]$GM, GAPITlist[[2]]$GM, GAPITlist[[3]]$GM,
#                  GAPITlist[[4]]$GM, GAPITlist[[5]]$GM, GAPITlist[[6]]$GM,
#                  GAPITlist[[7]]$GM, GAPITlist[[8]]$GM,
#                  GAPITlist[[9]]$GM, GAPITlist[[10]]$GM)
#  GD.all <- cbind(GAPITlist[[1]]$GD, GAPITlist[[2]]$GD[,-1],
#                  GAPITlist[[3]]$GD[,-1], GAPITlist[[4]]$GD[,-1],
#                  GAPITlist[[5]]$GD[,-1], GAPITlist[[6]]$GD[,-1],
#                  GAPITlist[[7]]$GD[,-1], GAPITlist[[8]]$GD[,-1],
#                  GAPITlist[[9]]$GD[,-1], GAPITlist[[10]]$GD[,-1])
#  #save(GD.all, GM.all, file = "180525GM_GD_all_polyRAD.RData") # 1076888 markers