## ----echo=FALSE, results="hide", message=FALSE--------------------------------
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE, tidy = FALSE)
library(BiocStyle)
set.seed(42)

## -----------------------------------------------------------------------------
suppressMessages(library(immApex))
suppressMessages(library(ggplot2))
suppressMessages(library(viridis))
suppressMessages(library(dplyr))
suppressMessages(library(igraph))
suppressMessages(library(tidygraph))
suppressMessages(library(ggraph))

## ----eval=knitr::is_html_output()---------------------------------------------
# Function to check IMGT website availability
is_imgt_available <- function() {
  tryCatch({
    r <- httr::HEAD("https://www.imgt.org", timeout(5))
    httr::status_code(r) == 200
  }, error = function(e) {
    FALSE
  })
}

# Run getIMGT only if the website is available
if (is_imgt_available()) {
  TRBV_aa <- getIMGT(species = "human",
                     chain = "TRB",
                     frame = "inframe",
                     region = "v",
                     sequence.type = "aa")

  # Display first sequence as an example
  TRBV_aa[[1]][1]
} else {
  # Display a message if IMGT is not available
  "IMGT website is not accessible at the moment."
}

## -----------------------------------------------------------------------------
data("immapex_example.data")
Adaptive_example <- formatGenes(immapex_example.data[["Adaptive"]],
                                region = "v",
                                technology = "Adaptive", 
                                simplify.format = TRUE) 

head(Adaptive_example[,c("aminoAcid","vGeneName", "v_IMGT", "v_IMGT.check")])

## -----------------------------------------------------------------------------
if (is_imgt_available()) {
  Adaptive_example <- inferCDR(Adaptive_example,
                               chain = "TRB", 
                               reference = TRBV_aa,
                               technology = "Adaptive", 
                               sequence.type = "aa",
                               sequences = c("CDR1", "CDR2"))
  
  Adaptive_example[200:210,c("CDR1_IMGT", "CDR2_IMGT")]
} else {
  # Display a message if IMGT is not available
  "IMGT website is not accessible at the moment."
}

## -----------------------------------------------------------------------------
sequences <- generateSequences(prefix.motif = "CAS",
                               suffix.motif = "YF",
                               number.of.sequences = 1000,
                               min.length = 8,
                               max.length = 16)
sequences <- unique(sequences)
head(sequences)

## -----------------------------------------------------------------------------
nucleotide.sequences <- generateSequences(number.of.sequences = 1000,
                                          min.length = 8,
                                          max.length = 16, 
                                          sequence.dictionary = c("A", "C", "T", "G"))
head(nucleotide.sequences)

## -----------------------------------------------------------------------------
mutated.sequences <- mutateSequences(sequences, 
                                     number.of.sequences = 1,
                                     position.start = 3,                                  
                                     position.end = 8)
head(sequences)
head(mutated.sequences)

## -----------------------------------------------------------------------------
freq.matrix <- calculateFrequency(sequences, 
                                  max.length = 20)
head(freq.matrix[, 1:10])

## -----------------------------------------------------------------------------
shannon.entropy <- calculateEntropy(sequences, 
                                    method = "shannon")
head(shannon.entropy)

## -----------------------------------------------------------------------------
# Calculate the mean of Atchley factors at each position
atchley.profile <- calculateProperty(sequences, 
                                     property.set = "atchleyFactors", 
                                     summary.fun = "mean")
head(atchley.profile[, 1:6])

## -----------------------------------------------------------------------------
# Using the built-in AIRR data
data_airr <- immapex_example.data[["AIRR"]]

# Calculate paired V-J gene usage as percentages
vj_usage <- calculateGeneUsage(data_airr, 
                               loci = c("v_call", "j_call"),
                               summary = "percent")
vj_usage[1:5, 1:5]

## -----------------------------------------------------------------------------
motif.counts <- calculateMotif(sequences, 
                               motif.lengths = 3, 
                               min.depth = 5)
head(motif.counts)

## -----------------------------------------------------------------------------
ppm.matrix <- probabilityMatrix(sequences)
head(ppm.matrix)

## -----------------------------------------------------------------------------
set.seed(42)
# Generate a sample background frequency
back.freq <- sample(1:1000, 20)
names(back.freq) <- amino.acids
back.freq <- back.freq/sum(back.freq)

pwm.matrix <- probabilityMatrix(sequences,
                                max.length = 20,
                                convert.PWM = TRUE,
                                background.frequencies = back.freq)
head(pwm.matrix)

## -----------------------------------------------------------------------------
adj.matrix <- adjacencyMatrix(sequences, 
                              normalize = FALSE)
adj.matrix[1:10, 1:10]

## -----------------------------------------------------------------------------
# Building an edge list
g1 <- buildNetwork(data.frame(sequences = sequences),
                   seq_col = "sequences",
                   threshold = 2)

# Forming network
graph <- graph_from_edgelist(as.matrix(g1[,1:2]))
E(graph)$weight <- g1[,3]

# Remove isolated nodes for clearer visualization
graph <- delete_vertices(graph, which(degree(graph) == 0))

# Convert to tidygraph for use with ggraph
g_tidy <- as_tbl_graph(graph)

# Plot the network
ggraph(g_tidy, layout = "fr") + 
  geom_edge_link(aes(width = weight), color = "black", alpha = 0.5) + 
  geom_node_point(aes(size = degree(g_tidy, mode = "all")),
                  fill = "steelblue", color= "black", shape = 21) +
  theme_void() + 
  scale_edge_width(range = c(0.1, 0.5)) + 
  labs(size = "Degree") +
  theme(legend.position = "right")

## -----------------------------------------------------------------------------
# Generate one-hot encoding using the main function
enc_onehot <- sequenceEncoder(sequences, mode = "onehot", verbose = FALSE)

# You can achieve the same result with the alias
enc_onehot_alias <- onehotEncoder(sequences, verbose = FALSE)

# View the first few columns of the flattened matrix output
head(enc_onehot$flattened[, 1:10])

## -----------------------------------------------------------------------------
# Generate property-based encoding using FASGAI factors
enc_prop <- sequenceEncoder(sequences,
                            mode = "property",
                            property.set = "FASGAI",
                            verbose = FALSE)

# The propertyEncoder() alias is a convenient shortcut
enc_prop_alias <- propertyEncoder(sequences, property.set = "FASGAI", verbose = FALSE)

# View the first few columns of the flattened property matrix
head(enc_prop$flattened[, 1:6])

## -----------------------------------------------------------------------------
# Generate a geometric embedding for each sequence
enc_geo <- sequenceEncoder(sequences,
                           mode = "geometric",
                           method = "BLOSUM62",
                           theta = pi / 3,
                           verbose = FALSE)

# The alias provides a direct path to this functionality
enc_geo_alias <- geometricEncoder(sequences, 
                                  method = "BLOSUM62", 
                                  theta = pi / 3, 
                                  verbose = FALSE)

# The output is a single summary matrix where each row is a sequence
head(enc_geo$summary)

## -----------------------------------------------------------------------------
token.matrix <- tokenizeSequences(input.sequences =  c(sequences, mutated.sequences), 
                                  add.startstop = TRUE,
                                  start.token = "!",
                                  stop.token = "^", 
                                  convert.to.matrix = TRUE)
head(token.matrix[,1:18])

## -----------------------------------------------------------------------------
property.matrix <- propertyEncoder(input.sequences =  c(sequences, mutated.sequences), 
                                   property.set = "FASGAI")

property.sequences <- sequenceDecoder(property.matrix[[1]],
                                      mode = "property",
                                      property.set  = "FASGAI",
                                      call.threshold = 1)
head(sequences)
head(property.sequences)

## ----tidy=FALSE---------------------------------------------------------------
sequence.matrix <- onehotEncoder(input.sequences =  c(sequences, mutated.sequences))

OHE.sequences <- sequenceDecoder(sequence.matrix,
                                 mode= "onehot")

head(OHE.sequences)

## -----------------------------------------------------------------------------
# Step 1a: Generate two distinct classes of sequences
class1.sequences <- generateSequences(prefix.motif = "CAS",
                                      min.length = 3,
                                      number.of.sequences = 500)

class2.sequences <- generateSequences(prefix.motif = "CSG",
                                      min.length = 3,
                                      number.of.sequences = 500)

# Combine sequences and create labels
all.sequences <- c(class1.sequences, class2.sequences)
labels <- as.factor(c(rep("Class1", 500), rep("Class2", 500)))

# Step 1b: Use propertyEncoder to create a feature matrix from Atchley factors
feature.matrix <- propertyEncoder(all.sequences, 
                                  property.set = "atchleyFactors",
                                  verbose = FALSE)

# Combine the flattened feature matrix and labels into the final data frame
training.data <- data.frame(feature.matrix$flattened, labels)

## -----------------------------------------------------------------------------
suppressMessages(library(randomForest))

# Train the Random Forest model
set.seed(42) # for reproducibility
rf.model <- randomForest(labels ~ ., 
                         data = training.data, 
                         ntree = 100, 
                         importance = TRUE) # Set importance=TRUE to calculate scores

# Print the confusion matrix to see model performance
print(rf.model)

## -----------------------------------------------------------------------------
# Extract feature importance data from the model
importance.data <- as.data.frame(importance(rf.model))
importance.data$Feature <- rownames(importance.data)

# Get the top 15 most important features
top_features <- importance.data[order(importance.data$MeanDecreaseGini, decreasing = TRUE), ][1:15,]

# Plot using ggplot2
ggplot(top_features, aes(x = MeanDecreaseGini, y = reorder(Feature, MeanDecreaseGini))) +
  geom_col(aes(fill = MeanDecreaseGini)) +
  scale_fill_viridis_c() +
  labs(title = "Top 15 Most Important Features",
       x = "Mean Decrease in Gini Impurity",
       y = "Feature (Property and Position)") +
  theme_classic() +
  theme(legend.position = "none")

## -----------------------------------------------------------------------------
# Generate three distinct groups of sequences
groupA <- generateSequences(prefix.motif = "CA", 
                            number.of.sequences = 100, 
                            min.length = 2)
groupB <- generateSequences(prefix.motif = "QR", 
                            number.of.sequences = 100, 
                            min.length = 2)
groupC <- generateSequences(prefix.motif = "YY", 
                            number.of.sequences = 100, 
                            min.length = 2)

# Combine them and create a vector of original group IDs for later visualization
mixed.sequences <- c(groupA, groupB, groupC)
original.groups <- as.factor(rep(c("Group A", "Group B", "Group C"), each = 100))

# Use geometricEncoder to create embeddings
geometric.embeddings <- geometricEncoder(mixed.sequences, 
                                         method = "BLOSUM62", 
                                         verbose = FALSE)

## -----------------------------------------------------------------------------
# Perform PCA on the embedding matrix
pca.result <- prcomp(geometric.embeddings$summary, center = TRUE, scale. = TRUE)

# Prepare data for plotting with ggplot2
pca.data <- data.frame(PC1 = pca.result$x[,1],
                       PC2 = pca.result$x[,2],
                       Group = original.groups)

# Plot PC1 vs PC2 using ggplot2 and viridis
ggplot(pca.data, aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 3, alpha = 0.8) +
  scale_color_viridis(discrete = TRUE) +
  labs(title = "PCA of Geometric Sequence Embeddings",
       x = "Principal Component 1",
       y = "Principal Component 2") +
  theme_classic()

## -----------------------------------------------------------------------------
set.seed(42)
# Generate three distinct groups of sequences to simulate clonal families
family1 <- unique(generateSequences(prefix.motif = "CASS", 
                             suffix.motif = "YF", 
                             min.length = 6,
                             number.of.sequences = 60))
family2 <- unique(generateSequences(prefix.motif = "CARS", 
                             suffix.motif = "GF",
                             min.length = 6,
                             number.of.sequences = 60))
family3 <- unique(generateSequences(prefix.motif = "CSVA", 
                             suffix.motif = "HF", 
                             min.length = 6,
                             number.of.sequences = 60))

# Combine into a single data frame
all_sequences_df <- data.frame(
  sequence = c(family1, family2, family3),
  original_family = c(rep("Family 1", 42), rep("Family 2", 40), rep("Family 3", 46))
)

## -----------------------------------------------------------------------------
# Build the edge list: connect sequences with an edit distance of 3 or less
edge_list <- buildNetwork(all_sequences_df,
                          seq_col = "sequence",
                          threshold = 3)

# Replace numerical edge list with the sequences
edge_list_sequences <- as.matrix(data.frame(
  from = all_sequences_df$sequence[as.numeric(edge_list$from)],
  to = all_sequences_df$sequence[as.numeric(edge_list$to)],
  dist = edge_list$dist
))

# Create a graph object from the edge list and node data
# This graph now contains all our sequences as nodes
sequence_graph <- graph_from_data_frame(d = edge_list_sequences, 
                                        vertices = all_sequences_df, 
                                        directed = FALSE)
E(sequence_graph)$weight <- edge_list_sequences[,3]

## -----------------------------------------------------------------------------
# Perform community detection using the Walktrap algorithm
communities <- cluster_walktrap(sequence_graph)

# Add the community membership as an attribute to the graph nodes
V(sequence_graph)$community <- communities$membership

## -----------------------------------------------------------------------------
# Convert to tidygraph for use with ggraph
g_tidy <- as_tbl_graph(sequence_graph)

# Plot the network, coloring nodes by their detected community
ggraph(g_tidy, layout = "fr") + 
  geom_edge_link(aes(alpha = weight), show.legend = FALSE) +
  geom_node_point(aes(color = as.factor(community)), size = 4) +
  scale_color_viridis(discrete = TRUE, option = "D") +
  labs(title = "Sequence Network with Detected Communities",
       color = "Detected\nCommunity") +
  theme_void()

## -----------------------------------------------------------------------------
sessionInfo()