## ----echo=FALSE, results="hide", message=FALSE-------------------------------- knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE, tidy = FALSE) library(BiocStyle) set.seed(42) ## ----------------------------------------------------------------------------- suppressMessages(library(immApex)) suppressMessages(library(ggplot2)) suppressMessages(library(viridis)) suppressMessages(library(dplyr)) suppressMessages(library(igraph)) suppressMessages(library(tidygraph)) suppressMessages(library(ggraph)) ## ----eval=knitr::is_html_output()--------------------------------------------- # Function to check IMGT website availability is_imgt_available <- function() { tryCatch({ r <- httr::HEAD("https://www.imgt.org", timeout(5)) httr::status_code(r) == 200 }, error = function(e) { FALSE }) } # Run getIMGT only if the website is available if (is_imgt_available()) { TRBV_aa <- getIMGT(species = "human", chain = "TRB", frame = "inframe", region = "v", sequence.type = "aa") # Display first sequence as an example TRBV_aa[[1]][1] } else { # Display a message if IMGT is not available "IMGT website is not accessible at the moment." } ## ----------------------------------------------------------------------------- data("immapex_example.data") Adaptive_example <- formatGenes(immapex_example.data[["Adaptive"]], region = "v", technology = "Adaptive", simplify.format = TRUE) head(Adaptive_example[,c("aminoAcid","vGeneName", "v_IMGT", "v_IMGT.check")]) ## ----------------------------------------------------------------------------- if (is_imgt_available()) { Adaptive_example <- inferCDR(Adaptive_example, chain = "TRB", reference = TRBV_aa, technology = "Adaptive", sequence.type = "aa", sequences = c("CDR1", "CDR2")) Adaptive_example[200:210,c("CDR1_IMGT", "CDR2_IMGT")] } else { # Display a message if IMGT is not available "IMGT website is not accessible at the moment." } ## ----------------------------------------------------------------------------- sequences <- generateSequences(prefix.motif = "CAS", suffix.motif = "YF", number.of.sequences = 1000, min.length = 8, max.length = 16) sequences <- unique(sequences) head(sequences) ## ----------------------------------------------------------------------------- nucleotide.sequences <- generateSequences(number.of.sequences = 1000, min.length = 8, max.length = 16, sequence.dictionary = c("A", "C", "T", "G")) head(nucleotide.sequences) ## ----------------------------------------------------------------------------- mutated.sequences <- mutateSequences(sequences, number.of.sequences = 1, position.start = 3, position.end = 8) head(sequences) head(mutated.sequences) ## ----------------------------------------------------------------------------- freq.matrix <- calculateFrequency(sequences, max.length = 20) head(freq.matrix[, 1:10]) ## ----------------------------------------------------------------------------- shannon.entropy <- calculateEntropy(sequences, method = "shannon") head(shannon.entropy) ## ----------------------------------------------------------------------------- # Calculate the mean of Atchley factors at each position atchley.profile <- calculateProperty(sequences, property.set = "atchleyFactors", summary.fun = "mean") head(atchley.profile[, 1:6]) ## ----------------------------------------------------------------------------- # Using the built-in AIRR data data_airr <- immapex_example.data[["AIRR"]] # Calculate paired V-J gene usage as percentages vj_usage <- calculateGeneUsage(data_airr, loci = c("v_call", "j_call"), summary = "percent") vj_usage[1:5, 1:5] ## ----------------------------------------------------------------------------- motif.counts <- calculateMotif(sequences, motif.lengths = 3, min.depth = 5) head(motif.counts) ## ----------------------------------------------------------------------------- ppm.matrix <- probabilityMatrix(sequences) head(ppm.matrix) ## ----------------------------------------------------------------------------- set.seed(42) # Generate a sample background frequency back.freq <- sample(1:1000, 20) names(back.freq) <- amino.acids back.freq <- back.freq/sum(back.freq) pwm.matrix <- probabilityMatrix(sequences, max.length = 20, convert.PWM = TRUE, background.frequencies = back.freq) head(pwm.matrix) ## ----------------------------------------------------------------------------- adj.matrix <- adjacencyMatrix(sequences, normalize = FALSE) adj.matrix[1:10, 1:10] ## ----------------------------------------------------------------------------- # Building an edge list g1 <- buildNetwork(data.frame(sequences = sequences), seq_col = "sequences", threshold = 2) # Forming network graph <- graph_from_edgelist(as.matrix(g1[,1:2])) E(graph)$weight <- g1[,3] # Remove isolated nodes for clearer visualization graph <- delete_vertices(graph, which(degree(graph) == 0)) # Convert to tidygraph for use with ggraph g_tidy <- as_tbl_graph(graph) # Plot the network ggraph(g_tidy, layout = "fr") + geom_edge_link(aes(width = weight), color = "black", alpha = 0.5) + geom_node_point(aes(size = degree(g_tidy, mode = "all")), fill = "steelblue", color= "black", shape = 21) + theme_void() + scale_edge_width(range = c(0.1, 0.5)) + labs(size = "Degree") + theme(legend.position = "right") ## ----------------------------------------------------------------------------- # Generate one-hot encoding using the main function enc_onehot <- sequenceEncoder(sequences, mode = "onehot", verbose = FALSE) # You can achieve the same result with the alias enc_onehot_alias <- onehotEncoder(sequences, verbose = FALSE) # View the first few columns of the flattened matrix output head(enc_onehot$flattened[, 1:10]) ## ----------------------------------------------------------------------------- # Generate property-based encoding using FASGAI factors enc_prop <- sequenceEncoder(sequences, mode = "property", property.set = "FASGAI", verbose = FALSE) # The propertyEncoder() alias is a convenient shortcut enc_prop_alias <- propertyEncoder(sequences, property.set = "FASGAI", verbose = FALSE) # View the first few columns of the flattened property matrix head(enc_prop$flattened[, 1:6]) ## ----------------------------------------------------------------------------- # Generate a geometric embedding for each sequence enc_geo <- sequenceEncoder(sequences, mode = "geometric", method = "BLOSUM62", theta = pi / 3, verbose = FALSE) # The alias provides a direct path to this functionality enc_geo_alias <- geometricEncoder(sequences, method = "BLOSUM62", theta = pi / 3, verbose = FALSE) # The output is a single summary matrix where each row is a sequence head(enc_geo$summary) ## ----------------------------------------------------------------------------- token.matrix <- tokenizeSequences(input.sequences = c(sequences, mutated.sequences), add.startstop = TRUE, start.token = "!", stop.token = "^", convert.to.matrix = TRUE) head(token.matrix[,1:18]) ## ----------------------------------------------------------------------------- property.matrix <- propertyEncoder(input.sequences = c(sequences, mutated.sequences), property.set = "FASGAI") property.sequences <- sequenceDecoder(property.matrix[[1]], mode = "property", property.set = "FASGAI", call.threshold = 1) head(sequences) head(property.sequences) ## ----tidy=FALSE--------------------------------------------------------------- sequence.matrix <- onehotEncoder(input.sequences = c(sequences, mutated.sequences)) OHE.sequences <- sequenceDecoder(sequence.matrix, mode= "onehot") head(OHE.sequences) ## ----------------------------------------------------------------------------- # Step 1a: Generate two distinct classes of sequences class1.sequences <- generateSequences(prefix.motif = "CAS", min.length = 3, number.of.sequences = 500) class2.sequences <- generateSequences(prefix.motif = "CSG", min.length = 3, number.of.sequences = 500) # Combine sequences and create labels all.sequences <- c(class1.sequences, class2.sequences) labels <- as.factor(c(rep("Class1", 500), rep("Class2", 500))) # Step 1b: Use propertyEncoder to create a feature matrix from Atchley factors feature.matrix <- propertyEncoder(all.sequences, property.set = "atchleyFactors", verbose = FALSE) # Combine the flattened feature matrix and labels into the final data frame training.data <- data.frame(feature.matrix$flattened, labels) ## ----------------------------------------------------------------------------- suppressMessages(library(randomForest)) # Train the Random Forest model set.seed(42) # for reproducibility rf.model <- randomForest(labels ~ ., data = training.data, ntree = 100, importance = TRUE) # Set importance=TRUE to calculate scores # Print the confusion matrix to see model performance print(rf.model) ## ----------------------------------------------------------------------------- # Extract feature importance data from the model importance.data <- as.data.frame(importance(rf.model)) importance.data$Feature <- rownames(importance.data) # Get the top 15 most important features top_features <- importance.data[order(importance.data$MeanDecreaseGini, decreasing = TRUE), ][1:15,] # Plot using ggplot2 ggplot(top_features, aes(x = MeanDecreaseGini, y = reorder(Feature, MeanDecreaseGini))) + geom_col(aes(fill = MeanDecreaseGini)) + scale_fill_viridis_c() + labs(title = "Top 15 Most Important Features", x = "Mean Decrease in Gini Impurity", y = "Feature (Property and Position)") + theme_classic() + theme(legend.position = "none") ## ----------------------------------------------------------------------------- # Generate three distinct groups of sequences groupA <- generateSequences(prefix.motif = "CA", number.of.sequences = 100, min.length = 2) groupB <- generateSequences(prefix.motif = "QR", number.of.sequences = 100, min.length = 2) groupC <- generateSequences(prefix.motif = "YY", number.of.sequences = 100, min.length = 2) # Combine them and create a vector of original group IDs for later visualization mixed.sequences <- c(groupA, groupB, groupC) original.groups <- as.factor(rep(c("Group A", "Group B", "Group C"), each = 100)) # Use geometricEncoder to create embeddings geometric.embeddings <- geometricEncoder(mixed.sequences, method = "BLOSUM62", verbose = FALSE) ## ----------------------------------------------------------------------------- # Perform PCA on the embedding matrix pca.result <- prcomp(geometric.embeddings$summary, center = TRUE, scale. = TRUE) # Prepare data for plotting with ggplot2 pca.data <- data.frame(PC1 = pca.result$x[,1], PC2 = pca.result$x[,2], Group = original.groups) # Plot PC1 vs PC2 using ggplot2 and viridis ggplot(pca.data, aes(x = PC1, y = PC2, color = Group)) + geom_point(size = 3, alpha = 0.8) + scale_color_viridis(discrete = TRUE) + labs(title = "PCA of Geometric Sequence Embeddings", x = "Principal Component 1", y = "Principal Component 2") + theme_classic() ## ----------------------------------------------------------------------------- set.seed(42) # Generate three distinct groups of sequences to simulate clonal families family1 <- unique(generateSequences(prefix.motif = "CASS", suffix.motif = "YF", min.length = 6, number.of.sequences = 60)) family2 <- unique(generateSequences(prefix.motif = "CARS", suffix.motif = "GF", min.length = 6, number.of.sequences = 60)) family3 <- unique(generateSequences(prefix.motif = "CSVA", suffix.motif = "HF", min.length = 6, number.of.sequences = 60)) # Combine into a single data frame all_sequences_df <- data.frame( sequence = c(family1, family2, family3), original_family = c(rep("Family 1", 42), rep("Family 2", 40), rep("Family 3", 46)) ) ## ----------------------------------------------------------------------------- # Build the edge list: connect sequences with an edit distance of 3 or less edge_list <- buildNetwork(all_sequences_df, seq_col = "sequence", threshold = 3) # Replace numerical edge list with the sequences edge_list_sequences <- as.matrix(data.frame( from = all_sequences_df$sequence[as.numeric(edge_list$from)], to = all_sequences_df$sequence[as.numeric(edge_list$to)], dist = edge_list$dist )) # Create a graph object from the edge list and node data # This graph now contains all our sequences as nodes sequence_graph <- graph_from_data_frame(d = edge_list_sequences, vertices = all_sequences_df, directed = FALSE) E(sequence_graph)$weight <- edge_list_sequences[,3] ## ----------------------------------------------------------------------------- # Perform community detection using the Walktrap algorithm communities <- cluster_walktrap(sequence_graph) # Add the community membership as an attribute to the graph nodes V(sequence_graph)$community <- communities$membership ## ----------------------------------------------------------------------------- # Convert to tidygraph for use with ggraph g_tidy <- as_tbl_graph(sequence_graph) # Plot the network, coloring nodes by their detected community ggraph(g_tidy, layout = "fr") + geom_edge_link(aes(alpha = weight), show.legend = FALSE) + geom_node_point(aes(color = as.factor(community)), size = 4) + scale_color_viridis(discrete = TRUE, option = "D") + labs(title = "Sequence Network with Detected Communities", color = "Detected\nCommunity") + theme_void() ## ----------------------------------------------------------------------------- sessionInfo()