--- title: "Generating Allele Similarity Clusters (ASC) for BIOMED IGHV Reference" author: "Ayelet Peres & William D. Lees & Gur Yaari" date: "Last modified `r Sys.Date()`" output: bookdown::html_document2: base_format: rmarkdown::html_vignette toc: yes toc_depth: 3 pdf_document: dev: pdf fig_height: 20 fig_width: 15 highlight: pygments toc: yes toc_depth: 3 template: null md_document: fig_height: 20 fig_width: 15 preserve_yaml: no toc: yes toc_depth: 3 always_allow_html: yes bibliography: bibliography.bib csl: ieee-with-url.csl link-citations: yes urlcolor: blue geometry: margin=1in fontsize: 11pt vignette: > %\usepackage[utf8]{inputenc} %\VignetteIndexEntry{Generating Allele Similarity Clusters (ASC) for BIOMED IGHV Reference} %\VignetteEngine{knitr::rmarkdown} --- # Overview This tutorial demonstrates how to generate Allele Similarity Clusters (ASC) for a BIOMED-style V reference using the OGRDB IGHV reference. You will: - Infer ASC clusters - Merge with PIgLET thresholds - Output ASC-formatted references - Or load precomputed versions --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) suppressMessages({ library(data.table) library(tigger) library(piglet) }) ``` 1. Load and Process the Reference Sets - Load the OGRDB reference - Mask the FWR1 region to match BIOMED-2 library ```{r, eval=FALSE} url <- "https://bitbucket.org/yaarilab/piglet/raw/70b7d4491e25e7197e2a94bd890ce5b6e3b506a8/data-raw/HVGERM_OGRDB.fasta" tmp_dest_file <- file.path(tempdir(), "HVGERM_OGRDB.fasta") download.file(url, tmp_dest_file, mode = "wb") ref_ogrdb <- readIgFasta(tmp_dest_file) ref_ogrdb_frw1 <- piglet::artificialFRW1Germline(ref_ogrdb) ``` 2. Infer Allele Similarity Clusters (ASC) ```{r, eval=FALSE} asc_frw1 <- inferAlleleClusters(ref_ogrdb_frw1) allele_table_frw1 <- setDT(asc_frw1@alleleClusterTable)[, .(imgt_allele, new_allele)] setnames(allele_table_frw1, c("allele", "asc_allele")) ``` 3. Merge with Threshold Table from PIgLET ```{r, eval=FALSE} allele_table_piglet <- fread("https://bitbucket.org/yaarilab/piglet/raw/70b7d4491e25e7197e2a94bd890ce5b6e3b506a8/data-raw/allele_threshold_table.tsv") allele_table_frw1$threshold <- 1e-04 allele_table_frw1$threshold <- apply(allele_table_frw1, 1, function(x){ gene <- unlist(strsplit(x[["allele"]],"[*]")) alleles <- unlist(strsplit(gene[2],"_")) gene <- gene[1] alleles <- paste0(gene,"*",alleles) thresh <- allele_table_piglet[allele %in% alleles, sum(threshold)] thresh }) allele_table_frw1 <- rbind( allele_table_frw1[,.(allele,asc_allele,threshold)], allele_table_piglet[!grepl("V",allele),] ) allele_table_frw1[,tag:=substr(allele, 4, 4)] ``` 4. Load Precomputed Partial BIOMED Reference ```{r, eval=FALSE} url <- "https://bitbucket.org/yaarilab/piglet/raw/70b7d4491e25e7197e2a94bd890ce5b6e3b506a8/data-raw/HVGERM_ogrdb_asc_partial.fasta" tmp_dest_file <- file.path(tempdir(), "HVGERM_ogrdb_asc_partial.fasta") download.file(url, tmp_dest_file, mode = "wb") asc_germline <- readIgFasta(tmp_dest_file) allele_table <- fread("https://bitbucket.org/yaarilab/piglet/raw/70b7d4491e25e7197e2a94bd890ce5b6e3b506a8/data-raw/allele_threshold_table_ogrdb_partial.tsv") ``` 5. Convert Annotations to ASC format ```{r, eval=FALSE} data <- tigger::AIRRDb data$v_call_or <- data$v_call allele_table_split <- allele_table[, { parts <- strsplit(allele, "\\*")[[1]] gene <- parts[1] alleles <- strsplit(parts[2], "_")[[1]] expanded <- paste0(gene, "*", alleles) .(allele = expanded, asc_allele, threshold, tag) }, by = .I] allele_table_split[, I := NULL] asc_data <- assignAlleleClusters(data, allele_table_split, v_call = "v_call", from_col = "allele", to_col = "asc_allele") ``` 6. Infer Genotypes ```{r, eval=FALSE} # using the asc annotations asc_genotype <- inferGenotypeAllele( asc_data, allele_threshold_table = allele_table, call = "v_call", # change to the column call you want to genotype asc_annotation = TRUE, # if you use iuis names then set to FALSE single_assignment = TRUE, # if you want to use the single assignment algorithm find_unmutated = FALSE # change to TRUE to filter mutated reads # germline_db = asc_germline # Uncomment if you want to filter mutated reads ) # using the biomed annotations, make sure to convert the v_call to the collapsed biomed annotations allele_table_biomed <- allele_table allele_table_biomed[, asc_allele := allele] allele_table_split <- allele_table_biomed[, { parts <- strsplit(allele, "\\*")[[1]] gene <- parts[1] alleles <- strsplit(parts[2], "_")[[1]] expanded <- paste0(gene, "*", alleles) .(allele = expanded, asc_allele, threshold, tag) }, by = .I] allele_table_split[, I := NULL] biomed_data <- assignAlleleClusters(data, allele_table_split, v_call = "v_call", from_col = "allele", to_col = "asc_allele") asc_genotype_biomed <- inferGenotypeAllele( biomed_data, allele_threshold_table = allele_table, call = "v_call", # change to the column call you want to genotype asc_annotation = FALSE, # if you use iuis names then set to FALSE single_assignment = TRUE ) ```