---
title: "Bioinformatics Workflows with evanverse"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Bioinformatics Workflows with evanverse}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
# Defaults applied to every chunk in this vignette.
knitr::opts_chunk$set(
  # Figures: 10 x 7 inch canvas at 300 dpi, scaled to the page width
  out.width = "100%",
  dpi = 300,
  fig.height = 7,
  fig.width = 10,
  # Text: show source and results in one block, results prefixed with "#>"
  comment = "#>",
  collapse = TRUE
)
```

```{r setup, message = FALSE}
library(evanverse)
library(dplyr)
library(ggplot2)
library(tidyr)
```

# 🧬 Bioinformatics Workflows with evanverse

The `evanverse` package provides specialized tools for common bioinformatics workflows, including gene ID conversion, gene set analysis, pathway enrichment visualization, and biological data download utilities. This comprehensive guide demonstrates practical applications in genomics and systems biology.

## 🎯 Overview of Bioinformatics Functions

### Core Bioinformatics Tools

| Function | Purpose | Common Use Cases |
|----------|---------|------------------|
| `convert_gene_id()` | Gene identifier conversion | Symbol ↔ Ensembl, Entrez ↔ Symbol |
| `download_gene_ref()` | Gene reference annotation downloads | Annotation files, gene models |
| `gmt2df()` | GMT file to data frame | Pathway analysis, gene set processing |
| `gmt2list()` | GMT file to named list | Enrichment analysis, functional annotation |
| `download_geo_data()` | GEO data retrieval | Public dataset analysis |
| `plot_venn()` | Venn diagram analysis | Gene set overlaps, differential expression |
| `plot_forest()` | Forest plots | Meta-analysis, effect sizes |

## 🔄 Gene Identifier Conversion

### Basic Gene ID Conversion

Gene identifier conversion is fundamental in bioinformatics for integrating datasets from different sources.

```{r gene-conversion-demo, eval = FALSE}
# Example gene symbols commonly used in cancer research.
# Each entry has to be the symbol of one specific gene: "RAS" is the name of a
# gene family, so the family member KRAS is used (matching the mock example
# that follows this chunk).
cancer_genes <- c(
  "BRCA1", "BRCA2", "TP53", "EGFR", "MYC", "KRAS", "PIK3CA", "AKT1"
)

# Convert gene symbols to Ensembl IDs
ensembl_ids <- convert_gene_id(
  genes = cancer_genes,
  from = "symbol",
  to = "ensembl",
  species = "human"
)

# Display conversion results next to the input symbols.
# NOTE(review): this assumes convert_gene_id() returns one ID per input symbol
# in input order -- confirm against the function's documentation.
conversion_table <- data.frame(
  Gene_Symbol = cancer_genes,
  Ensembl_ID = ensembl_ids
)

print(conversion_table)
```

```{r gene-conversion-mock}
# Mock example for demonstration (since biomaRt requires internet)
cancer_genes <- c("BRCA1", "BRCA2", "TP53", "EGFR", "MYC", "KRAS", "PIK3CA", "AKT1")

# Simulated conversion results: one row per symbol, with the Ensembl gene ID
# and Entrez gene ID listed in the same order as `cancer_genes`.
mock_conversion <- data.frame(
  Gene_Symbol = cancer_genes,
  Ensembl_ID = c(
    "ENSG00000012048", "ENSG00000139618", "ENSG00000141510",
    "ENSG00000146648", "ENSG00000136997", "ENSG00000133703",
    # PIK3CA is ENSG00000121879 (ENSG00000171608 is PIK3CD)
    "ENSG00000121879", "ENSG00000142208"
  ),
  Entrez_ID = c(672, 675, 7157, 1956, 4609, 3845, 5290, 207),
  stringsAsFactors = FALSE
)

cat("🧬 Gene ID Conversion Example\n")
cat("=============================\n")
print(mock_conversion)

# The mock table holds a row for every input, so the success rate is 100%
cat("\n📊 Conversion Summary:\n")
cat("  • Input genes:", length(cancer_genes), "\n")
cat("  • Successful conversions:", nrow(mock_conversion), "\n")
cat("  • Success rate:", round(100 * nrow(mock_conversion) / length(cancer_genes), 1), "%\n")
```

### Advanced Conversion Workflows

```{r conversion-workflow}
# Simulate a real-world scenario with mixed identifier types
mixed_identifiers <- c(
  "BRCA1", "ENSG00000139618", "7157", "EGFR",
  "ENSG00000136997", "3845", "PIK3CA", "207"
)

# Classify each identifier by its textual pattern.
#
# @param ids Vector of gene identifiers (coerced with as.character()).
# @return Unnamed character vector of the same length as `ids` holding
#   "ensembl" (starts with "ENSG"), "entrez" (digits only) or "symbol"
#   (everything else). Empty input gives character(0).
#
# The checks are vectorised, so the result type does not depend on the input
# length, and the result carries no names that would otherwise end up as
# redundant row names in the table printed below.
detect_id_type <- function(ids) {
  ids <- as.character(ids)
  id_type <- rep("symbol", length(ids))
  id_type[grepl("^[0-9]+$", ids)] <- "entrez"
  id_type[grepl("^ENSG", ids)] <- "ensembl"
  id_type
}

id_types <- detect_id_type(mixed_identifiers)
cat("🔍 Identifier Type Detection:\n")
print(data.frame(
  Identifier = mixed_identifiers,
  Detected_Type = id_types,
  stringsAsFactors = FALSE
))

# Group by identifier type for batch conversion
id_groups <- split(mixed_identifiers, id_types)
cat("\n📦 Grouped Identifiers for Conversion:\n")
str(id_groups)
```

## 📊 Gene Set Analysis with GMT Files

### Processing GMT Files

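GMT (Gene Matrix Transposed) is the standard tab-delimited file format for gene set collections used in pathway analysis: each line holds one gene set, given as its name, a description, and its member genes.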

```{r gmt-processing-demo, eval = FALSE}
# Example: Process a pathway GMT file
# The calls are left commented out because "path/to/..." is a placeholder;
# point them at a GMT file on disk before running.
# gmt2df() reads the file into a data frame, gmt2list() into a named list.
# pathway_df <- gmt2df("path/to/c2.cp.kegg.v7.4.symbols.gmt")
# pathway_list <- gmt2list("path/to/c2.cp.kegg.v7.4.symbols.gmt")

# Display structure
# head(pathway_df, 10)     # first ten rows of the data frame
# length(pathway_list)     # number of elements in the list
```

```{r gmt-mock-demo}
# Mock gene sets with the shape of a parsed GMT file: a named list holding one
# character vector of gene symbols per pathway.
mock_pathways <- list(
  KEGG_GLYCOLYSIS_GLUCONEOGENESIS = c(
    "HK1", "HK2", "GPI", "PFKL", "ALDOA", "TPI1", "GAPDH", "PGK1", "PGAM1",
    "ENO1", "PKM", "LDHA", "PDK1"
  ),
  KEGG_CITRATE_CYCLE = c(
    "CS", "ACO1", "IDH1", "OGDH", "SUCLA2", "SDHA", "FH", "MDH1", "PCK1",
    "PDK1", "DLAT"
  ),
  KEGG_FATTY_ACID_SYNTHESIS = c(
    "ACACA", "FASN", "ACLY", "ACC2", "ELOVL6", "SCD", "FADS1", "FADS2",
    "ACSL1", "GPAM"
  ),
  KEGG_DNA_REPAIR = c(
    "BRCA1", "BRCA2", "TP53", "ATM", "CHEK1", "CHEK2", "RAD51", "XRCC1",
    "PARP1", "MSH2", "MLH1"
  )
)

# Number of genes in each pathway (named by pathway)
pathway_sizes <- lengths(mock_pathways)

# Long format, one row per gene-pathway pair (simulating gmt2df output):
# repeat each pathway name once per member gene and flatten the gene vectors.
mock_gmt_df <- data.frame(
  pathway = rep(names(mock_pathways), times = pathway_sizes),
  gene = unlist(mock_pathways, use.names = FALSE),
  stringsAsFactors = FALSE
)

cat("📋 GMT File Processing Results\n")
cat("==============================\n")
cat("Number of pathways:", length(mock_pathways), "\n")
cat("Total gene-pathway associations:", nrow(mock_gmt_df), "\n")
cat("Average genes per pathway:", round(mean(pathway_sizes), 1), "\n\n")

cat("Sample pathway data frame:\n")
print(head(mock_gmt_df, 12))

# Pathway size distribution
cat("\n📊 Pathway Size Distribution:\n")
print(data.frame(
  Pathway = names(pathway_sizes),
  Gene_Count = pathway_sizes
))
```

### Gene Set Overlap Analysis

```{r gene-set-overlap, fig.cap="Gene set overlap analysis showing relationships between biological pathways"}
# Compare the first three mock pathways (a three-set Venn diagram)
pathway_genes <- mock_pathways[1:3]

# Venn diagram of the three gene sets, labelled with the pathway names
venn_plot <- plot_venn(
  set1 = pathway_genes[[1]],
  set2 = pathway_genes[[2]],
  set3 = pathway_genes[[3]],
  category.names = names(pathway_genes),
  fill = get_palette("vividset", type = "qualitative", n = 3),
  title = "Metabolic Pathway Gene Overlaps"
)

print(venn_plot)

# Overlap statistics: size of the union first, then every pairwise overlap
all_genes <- unique(unlist(pathway_genes))
cat("\n🔍 Detailed Overlap Analysis:\n")
cat("===============================\n")
cat("Total unique genes across pathways:", length(all_genes), "\n")

# Each column of `pathway_pairs` is one (i, j) pair with i < j, in the order
# (1, 2), (1, 3), (2, 3)
pathway_names <- names(pathway_genes)
short_names <- gsub("KEGG_", "", pathway_names)
pathway_pairs <- combn(seq_along(pathway_names), 2)
for (k in seq_len(ncol(pathway_pairs))) {
  i <- pathway_pairs[1, k]
  j <- pathway_pairs[2, k]
  overlap <- length(intersect(pathway_genes[[i]], pathway_genes[[j]]))
  cat(sprintf("%s ∩ %s: %d genes\n", short_names[i], short_names[j], overlap))
}
```

## 🎯 Differential Expression Analysis Workflow

### Simulated RNA-seq Analysis

```{r rnaseq-workflow, fig.cap="Differential expression analysis visualization with volcano plot"}
# Simulated RNA-seq differential expression results. All random draws follow
# from the seed, so the statements that consume random numbers (rnorm, rbeta,
# then the two sample() calls) must stay in this order.
set.seed(123)
n_genes <- 2000
n_shifted <- 200

# Baseline log fold changes and raw p-values for every gene
log_fc <- rnorm(n_genes, mean = 0, sd = 1.2)
raw_p <- rbeta(n_genes, shape1 = 1, shape2 = 10)

# Turn a random subset into strong hits: move the fold change by +/-2 and
# shrink the p-value a hundredfold
significant_indices <- sample(seq_len(n_genes), n_shifted)
log_fc[significant_indices] <- log_fc[significant_indices] +
  sample(c(-2, 2), n_shifted, replace = TRUE)
raw_p[significant_indices] <- raw_p[significant_indices] * 0.01

gene_results <- data.frame(
  Gene = paste0("Gene_", seq_len(n_genes)),
  LogFC = log_fc,
  PValue = raw_p,
  stringsAsFactors = FALSE
)

# Benjamini-Hochberg adjusted p-values
gene_results$FDR <- p.adjust(gene_results$PValue, method = "BH")

# A gene is called up/down-regulated when FDR < 0.05 and |LogFC| > 1
passes_fdr <- gene_results$FDR < 0.05
gene_results$Regulation <- ifelse(
  passes_fdr & gene_results$LogFC > 1,
  "Up-regulated",
  ifelse(
    passes_fdr & gene_results$LogFC < -1,
    "Down-regulated",
    "Not Significant"
  )
)

# Point colours keyed by regulation class
volcano_colors <- c(
  "Up-regulated" = get_palette("vividset", type = "qualitative", n = 3)[1],
  "Down-regulated" = get_palette("vividset", type = "qualitative", n = 3)[2],
  "Not Significant" = "#CCCCCC"
)

# Volcano plot: effect size on x, significance on y; the dashed lines mark the
# same thresholds used for the classification above
volcano_theme <- theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold", color = "#0D47A1"),
    plot.subtitle = element_text(size = 11, color = "#666666"),
    legend.position = "bottom"
  )

p1 <- ggplot(gene_results, aes(x = LogFC, y = -log10(FDR), color = Regulation)) +
  geom_point(alpha = 0.6, size = 1.2) +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "#666666") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "#666666") +
  scale_color_manual(values = volcano_colors) +
  labs(
    title = "Differential Gene Expression Analysis",
    subtitle = "Volcano plot showing treatment vs. control comparison",
    x = "Log₂ Fold Change",
    y = "-log₁₀(FDR-adjusted p-value)",
    color = "Regulation"
  ) +
  volcano_theme

print(p1)

# Counts per regulation class
regulation_summary <- table(gene_results$Regulation)
cat("\n📊 Differential Expression Summary:\n")
cat("===================================\n")
print(regulation_summary)

# Ten up-regulated genes with the largest fold change
cat("\nTop 10 up-regulated genes (by fold change):\n")
up_regulated <- gene_results[gene_results$Regulation == "Up-regulated", ]
top_up <- head(arrange(up_regulated, desc(LogFC)), 10)
print(top_up[, c("Gene", "LogFC", "FDR")])
```

### Pathway Enrichment Analysis

```{r pathway-enrichment, fig.cap="Pathway enrichment analysis showing biological processes affected by treatment"}
# Simulated enrichment table: one row per pathway with the gene ratio, the
# FDR-adjusted p-value and the number of genes behind the result
enrichment_results <- data.frame(
  Pathway = c(
    "Cell Cycle", "Apoptosis", "DNA Repair", "Inflammation",
    "Metabolism", "Signaling", "Transport", "Development"
  ),
  GeneRatio = c(0.15, 0.22, 0.18, 0.31, 0.09, 0.25, 0.12, 0.08),
  FDR = c(0.001, 0.003, 0.008, 0.0001, 0.045, 0.002, 0.021, 0.089),
  GeneCount = c(23, 34, 28, 48, 14, 39, 18, 12),
  stringsAsFactors = FALSE
)

# Enrichment score = -log10(FDR), so larger means more significant
enrichment_results <- transform(
  enrichment_results,
  EnrichmentScore = -log10(FDR)
)

# Dot plot: pathways ordered by score, colour = score, point size = gene count
enrichment_theme <- theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold", color = "#0D47A1"),
    plot.subtitle = element_text(size = 11, color = "#666666"),
    panel.grid.major.y = element_blank(),
    legend.position = "right"
  )

p2 <- ggplot(
  enrichment_results,
  aes(x = GeneRatio, y = reorder(Pathway, EnrichmentScore))
) +
  geom_point(aes(color = EnrichmentScore, size = GeneCount), alpha = 0.8) +
  geom_vline(xintercept = 0.1, linetype = "dashed", color = "#666666", alpha = 0.7) +
  scale_color_gradientn(
    colors = get_palette("warm_blush", type = "sequential", n = 4),
    name = "-log₁₀(FDR)"
  ) +
  scale_size_continuous(name = "Gene Count", range = c(3, 12)) +
  labs(
    title = "Pathway Enrichment Analysis",
    subtitle = "Biological processes enriched in differentially expressed genes",
    x = "Gene Ratio (enriched genes / pathway total)",
    y = "Biological Pathway"
  ) +
  enrichment_theme

print(p2)

# Text summary restricted to pathways with FDR < 0.05
cat("\n🎯 Pathway Enrichment Summary:\n")
cat("==============================\n")
significant_pathways <- subset(enrichment_results, FDR < 0.05)
top_pathway <- with(significant_pathways, Pathway[which.max(EnrichmentScore)])
cat("Significant pathways (FDR < 0.05):", nrow(significant_pathways), "\n")
cat("Most enriched pathway:", top_pathway, "\n")
cat("Total genes in significant pathways:", sum(significant_pathways$GeneCount), "\n")
```

## 🌐 Multi-omics Integration

### Combining Genomics and Transcriptomics

```{r multiomics-integration, fig.cap="Multi-omics data integration showing genomic variants and expression changes"}
# Simulate multi-omics data integration
set.seed(456)
selected_genes <- c("BRCA1", "TP53", "EGFR", "MYC", "KRAS", "PIK3CA", "AKT1", "PTEN")
data_types <- c("Mutation", "CNV", "Expression")

# One value per gene for each data type, listed in `selected_genes` order
mutation_frequency <- c(0.12, 0.34, 0.08, 0.15, 0.22, 0.09, 0.06, 0.18)  # 0-1
copy_number <- c(-0.5, -1.2, 1.8, 0.3, 0.8, -0.8, 1.1, -1.5)            # -2 to 2
expression_fc <- c(-1.5, -2.8, 2.1, 1.8, -1.2, 2.3, -0.8, -2.1)         # -3 to 3

# Create integrated omics data in long format.
# `Value` is stacked block by block (all mutation values, then all CNV values,
# then all expression values), so `Gene` has to cycle through the gene list
# once per block and `DataType` has to repeat each label for a whole block.
# Otherwise values get attached to the wrong gene and data type.
omics_data <- data.frame(
  Gene = rep(selected_genes, times = length(data_types)),
  DataType = rep(data_types, each = length(selected_genes)),
  Value = c(mutation_frequency, copy_number, expression_fc),
  # Illustrative grouping column; it is not used by the plot or summary below
  Patient_Group = rep(c("Group_A", "Group_B", "Group_C"), length(selected_genes)),
  stringsAsFactors = FALSE
)

# Normalize values for visualization: z-score within each data type so the
# three measurement scales become comparable
omics_data$Normalized_Value <- ave(
  omics_data$Value,
  omics_data$DataType,
  FUN = function(x) as.numeric(scale(x))
)

# Clamp z-scores to the colour-scale limits. A value just outside the limits
# would otherwise be treated as missing and drawn as a grey tile.
z_limit <- 2
omics_data$Display_Value <- pmin(pmax(omics_data$Normalized_Value, -z_limit), z_limit)

# Create heatmap (genes x data types). `linewidth` sets the tile border width;
# it replaces `size` for lines from ggplot2 3.4.0 onwards.
p3 <- ggplot(omics_data, aes(x = DataType, y = Gene, fill = Display_Value)) +
  geom_tile(color = "white", linewidth = 0.5) +
  scale_fill_gradientn(
    colors = get_palette("gradient_rd_bu", type = "diverging", n = 11),
    name = "Z-score",
    limits = c(-z_limit, z_limit),
    breaks = c(-2, -1, 0, 1, 2)
  ) +
  labs(
    title = "Multi-omics Cancer Gene Analysis",
    subtitle = "Integrated view of mutations, copy number, and expression",
    x = "Data Type",
    y = "Cancer-related Genes"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold", color = "#0D47A1"),
    plot.subtitle = element_text(size = 11, color = "#666666"),
    panel.grid = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

print(p3)

# Summary by data type (computed on the raw, un-normalized values)
cat("\n🧬 Multi-omics Data Summary:\n")
cat("============================\n")
summary_stats <- omics_data %>%
  group_by(DataType) %>%
  summarise(
    Mean_Value = round(mean(Value), 3),
    SD_Value = round(sd(Value), 3),
    Min_Value = round(min(Value), 3),
    Max_Value = round(max(Value), 3),
    .groups = "drop"
  )
print(summary_stats)
```

## 📈 Survival Analysis Visualization

### Forest Plot for Hazard Ratios

```{r survival-analysis, fig.cap="Forest plot showing hazard ratios for genetic markers in survival analysis"}
# Simulated survival results: hazard ratio, 95% CI bounds and p-value per gene
survival_data <- data.frame(
  Gene = c("BRCA1", "BRCA2", "TP53", "EGFR", "MYC", "KRAS", "PIK3CA", "AKT1"),
  HazardRatio = c(1.23, 0.87, 1.45, 1.12, 0.92, 1.67, 1.34, 0.78),
  CI_Lower = c(0.98, 0.71, 1.18, 0.89, 0.75, 1.32, 1.05, 0.61),
  CI_Upper = c(1.55, 1.07, 1.78, 1.41, 1.13, 2.11, 1.71, 0.99),
  PValue = c(0.067, 0.189, 0.001, 0.324, 0.445, 0.0001, 0.018, 0.041),
  stringsAsFactors = FALSE
)

# Significance stars: p < 0.001 "***", p < 0.01 "**", p < 0.05 "*", else "ns".
# findInterval() counts how many cut-offs are <= p, which indexes the labels;
# a p-value equal to a cut-off falls into the less significant class.
significance_labels <- c("***", "**", "*", "ns")
p_cutoffs <- c(0.001, 0.01, 0.05)
survival_data$Significance <-
  significance_labels[findInterval(survival_data$PValue, p_cutoffs) + 1]

# Forest plot of the hazard ratios with their confidence intervals
p4 <- plot_forest(
  data = survival_data,
  label_col = "Gene",
  estimate_col = "HazardRatio",
  lower_col = "CI_Lower",
  upper_col = "CI_Upper",
  p_col = "PValue"
)

print(p4)

# Text summary of the markers with p < 0.05, split by direction of effect
cat("\n🎯 Survival Analysis Summary:\n")
cat("=============================\n")
significant_genes <- subset(survival_data, PValue < 0.05)
n_risk <- sum(significant_genes$HazardRatio > 1)
n_protective <- sum(significant_genes$HazardRatio < 1)
cat("Significant prognostic markers:", nrow(significant_genes), "\n")
cat("Risk factors (HR > 1):", n_risk, "\n")
cat("Protective factors (HR < 1):", n_protective, "\n")

print(significant_genes[, c("Gene", "HazardRatio", "PValue", "Significance")])
```

## 🔬 Clinical Data Integration

### Biomarker Discovery Pipeline

```{r biomarker-discovery, fig.cap="Biomarker discovery showing gene expression patterns across clinical subtypes"}
# Simulate clinical biomarker data
set.seed(789)
n_patients <- 120
n_biomarkers <- 20

# Generate patient clinical data: subtype and stage are drawn with the given
# probabilities, age from a normal and survival time from an exponential
# distribution
clinical_data <- data.frame(
  Patient_ID = paste0("P", seq_len(n_patients)),
  Subtype = sample(c("Luminal_A", "Luminal_B", "HER2+", "TNBC"), n_patients,
                   replace = TRUE, prob = c(0.4, 0.2, 0.15, 0.25)),
  Stage = sample(c("I", "II", "III", "IV"), n_patients,
                 replace = TRUE, prob = c(0.3, 0.35, 0.25, 0.1)),
  Age = round(rnorm(n_patients, 55, 12)),
  Survival_Months = round(rexp(n_patients, rate = 0.02)),
  stringsAsFactors = FALSE
)

# Generate biomarker expression data: a patients x biomarkers matrix of
# baseline expression values
biomarker_genes <- paste0("Biomarker_", seq_len(n_biomarkers))
expression_data <- matrix(rnorm(n_patients * n_biomarkers, mean = 5, sd = 2),
                          nrow = n_patients, ncol = n_biomarkers)
colnames(expression_data) <- biomarker_genes
rownames(expression_data) <- clinical_data$Patient_ID

# Patient IDs per subtype, used to index matrix rows by name
luminal_a_patients <- clinical_data$Patient_ID[clinical_data$Subtype == "Luminal_A"]
her2_patients <- clinical_data$Patient_ID[clinical_data$Subtype == "HER2+"]
tnbc_patients <- clinical_data$Patient_ID[clinical_data$Subtype == "TNBC"]

# Simulate subtype-specific biomarkers by raising one biomarker per subtype
expression_data[luminal_a_patients, "Biomarker_1"] <-
  expression_data[luminal_a_patients, "Biomarker_1"] + 3

expression_data[her2_patients, "Biomarker_5"] <-
  expression_data[her2_patients, "Biomarker_5"] + 4

expression_data[tnbc_patients, "Biomarker_12"] <-
  expression_data[tnbc_patients, "Biomarker_12"] + 2.5

# Convert to long format for visualization: one row per patient-biomarker
# pair, then attach the clinical annotations.
# pivot_longer() replaces the superseded gather(); the patient IDs are taken
# from the matrix row names directly rather than through the `.` pronoun.
expression_long <- as.data.frame(expression_data) %>%
  mutate(Patient_ID = rownames(expression_data)) %>%
  pivot_longer(
    cols = -Patient_ID,
    names_to = "Biomarker",
    values_to = "Expression"
  ) %>%
  left_join(clinical_data, by = "Patient_ID")

# Select biomarkers for visualization: the three that were shifted above plus
# Biomarker_8, which was left at baseline
top_biomarkers <- c("Biomarker_1", "Biomarker_5", "Biomarker_12", "Biomarker_8")
plot_data <- expression_long %>%
  filter(Biomarker %in% top_biomarkers)

# Create biomarker expression plot: one panel per biomarker, one box per subtype
p5 <- ggplot(plot_data, aes(x = Subtype, y = Expression, fill = Subtype)) +
  geom_boxplot(alpha = 0.7, outlier.alpha = 0.5) +
  geom_jitter(alpha = 0.3, width = 0.2, size = 0.8) +
  scale_fill_manual(
    values = get_palette("vividset", type = "qualitative", n = 4)
  ) +
  facet_wrap(~Biomarker, scales = "free_y", ncol = 2) +
  labs(
    title = "Biomarker Expression Across Cancer Subtypes",
    subtitle = "Potential subtype-specific biomarkers for precision medicine",
    x = "Cancer Subtype",
    y = "Expression Level (log2 normalized)",
    fill = "Subtype"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold", color = "#0D47A1"),
    plot.subtitle = element_text(size = 11, color = "#666666"),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "bottom",
    strip.background = element_rect(fill = "#E3F2FD", color = NA)
  )

print(p5)

# Statistical summary
cat("\n📊 Biomarker Analysis Summary:\n")
cat("==============================\n")
subtype_counts <- table(clinical_data$Subtype)
print(subtype_counts)

# Mean and SD per biomarker and subtype, highest mean first within a biomarker
cat("\nMean expression by subtype for key biomarkers:\n")
biomarker_summary <- plot_data %>%
  group_by(Biomarker, Subtype) %>%
  summarise(
    Mean_Expression = round(mean(Expression), 2),
    SD = round(sd(Expression), 2),
    .groups = "drop"
  ) %>%
  arrange(Biomarker, desc(Mean_Expression))

print(biomarker_summary)
```

## 🛠️ Data Download and Management

### Public Dataset Retrieval

```{r data-management, eval = FALSE}
# Reference-data downloads (this chunk is not evaluated when the vignette is
# built). The calls need an internet connection and may take time.

# Destinations for the downloaded files
geo_download_dir <- "data/geo_downloads"
kegg_gmt_file <- "data/pathways/kegg_pathways.gmt"

# Gene reference annotation: human, build hg38, gene-level features
gene_ref <- download_gene_ref(species = "human", build = "hg38", feature_type = "gene")

# GEO dataset
# NOTE(review): "GSE123456" looks like a placeholder accession -- replace it
# with the series you need.
geo_data <- download_geo_data(geo_id = "GSE123456", destdir = geo_download_dir)

# Pathway database: the KEGG gene set collection (GMT) from MSigDB 7.4
pathway_url <- "https://data.broadinstitute.org/gsea-msigdb/msigdb/release/7.4/c2.cp.kegg.v7.4.symbols.gmt"
download_url(url = pathway_url, dest = kegg_gmt_file)
```

```{r data-management-demo}
# Demonstrate file organization for bioinformatics projects.
# The tree is written by a single cat() call: knitr places output directly
# after the expression that produced it, so one call per line would break the
# tree up with source lines in the rendered vignette. `sep = ""` keeps the
# strings exactly as written.
cat(
  "📁 Recommended Project Structure for Bioinformatics:\n",
  "==================================================\n",
  "project/\n",
  "├── data/\n",
  "│   ├── raw/                 # Original data files\n",
  "│   ├── processed/           # Cleaned/normalized data\n",
  "│   ├── reference/           # Genome annotations, databases\n",
  "│   └── results/             # Analysis outputs\n",
  "├── scripts/\n",
  "│   ├── preprocessing/       # Data cleaning scripts\n",
  "│   ├── analysis/            # Statistical analysis\n",
  "│   └── visualization/       # Plotting scripts\n",
  "├── docs/                    # Documentation, protocols\n",
  "└── reports/                 # Final reports, publications\n\n",
  sep = ""
)

# Common file types: extension and description are paired by position
file_extensions <- c("fastq.gz", "bam", "vcf", "gmt", "gff3", "bed")
file_descriptions <- c(
  "Raw sequencing reads",
  "Aligned sequencing data",
  "Variant calls",
  "Gene set definitions",
  "Gene annotations",
  "Genomic intervals"
)

file_info <- data.frame(
  Extension = file_extensions,
  Description = file_descriptions,
  stringsAsFactors = FALSE
)

cat("🗂️ Common Bioinformatics File Types:\n")
print(file_info)
```

## 🎯 Best Practices for Bioinformatics Workflows

### Reproducible Analysis Guidelines

```{r best-practices}
# Print the best-practice notes as one block of text.
# A single cat() call is used because knitr places output directly after the
# expression that produced it; one call per line would alternate source and
# output in the rendered vignette. `sep = ""` keeps the strings as written.
cat(
  "🔬 BIOINFORMATICS BEST PRACTICES\n",
  "================================\n\n",

  "📋 Data Management:\n",
  "  • Use version control (Git) for all scripts\n",
  "  • Document data provenance and processing steps\n",
  "  • Implement checkpoints and intermediate file saves\n",
  "  • Use consistent file naming conventions\n\n",

  "🧬 Gene Identifier Handling:\n",
  "  • Always validate gene ID conversions\n",
  "  • Store original identifiers alongside converted ones\n",
  "  • Document the genome build and annotation version\n",
  "  • Handle missing/ambiguous identifiers gracefully\n\n",

  "📊 Statistical Analysis:\n",
  "  • Apply appropriate multiple testing corrections\n",
  "  • Set significance thresholds before analysis\n",
  "  • Report effect sizes along with p-values\n",
  "  • Validate results with independent datasets when possible\n\n",

  "🎨 Visualization Guidelines:\n",
  "  • Use color-blind friendly palettes\n",
  "  • Include appropriate scales and legends\n",
  "  • Provide clear titles and axis labels\n",
  "  • Consider publication requirements for figures\n",
  sep = ""
)
```

### Quality Control Checklist

```{r qc-checklist}
# Print the quality-control checklist as one block of text.
# A single cat() call is used because knitr places output directly after the
# expression that produced it; one call per line would alternate source and
# output in the rendered vignette. `sep = ""` keeps the strings as written.
cat(
  "✅ QUALITY CONTROL CHECKLIST\n",
  "============================\n\n",

  "🔍 Data Quality:\n",
  "  [ ] Check for missing values and outliers\n",
  "  [ ] Verify sample sizes and statistical power\n",
  "  [ ] Validate gene identifier mappings\n",
  "  [ ] Assess data distribution and normalization\n\n",

  "📈 Analysis Validation:\n",
  "  [ ] Cross-validate results with different methods\n",
  "  [ ] Perform sensitivity analyses\n",
  "  [ ] Check for batch effects and confounders\n",
  "  [ ] Compare with known biological expectations\n\n",

  "📊 Results Reporting:\n",
  "  [ ] Include sample sizes and effect sizes\n",
  "  [ ] Report confidence intervals\n",
  "  [ ] Document software versions and parameters\n",
  "  [ ] Provide supplementary data and code\n",
  sep = ""
)
```

## 🚀 Advanced Workflow Examples

### Complete Analysis Pipeline

```{r complete-pipeline}
cat("🔄 COMPLETE BIOINFORMATICS PIPELINE EXAMPLE\n")
cat("===========================================\n\n")

# Overview of a complete analysis workflow. The three vectors below are
# aligned by position: entry k of each describes pipeline step k.
step_names <- c(
  "Data Import & Quality Control",
  "Gene ID Conversion & Mapping",
  "Differential Expression Analysis",
  "Multiple Testing Correction",
  "Pathway Enrichment Analysis",
  "Gene Set Overlap Analysis",
  "Visualization & Plotting",
  "Results Export & Reporting"
)

step_functions <- c(
  "read_table_flex(), file_info()",
  "convert_gene_id(), replace_void()",
  "User analysis + evanverse utilities",
  "Built-in R functions",
  "gmt2df(), gmt2list()",
  "plot_venn(), combine_logic()",
  "plot_forest(), get_palette()",
  "write_xlsx_flex(), remind()"
)

step_durations <- c(
  "5-10 min", "10-15 min", "30-60 min", "5 min",
  "15-30 min", "10-20 min", "20-40 min", "10-15 min"
)

pipeline_steps <- data.frame(
  Step = seq_len(8),
  Process = step_names,
  evanverse_Functions = step_functions,
  Estimated_Time = step_durations
)

print(pipeline_steps)

cat("\n⏱️ Total Estimated Pipeline Time: 2-4 hours\n")
cat("🎯 Key Success Factors:\n")
cat("  • Proper data validation at each step\n")
cat("  • Consistent identifier handling\n")
cat("  • Appropriate statistical methods\n")
cat("  • Clear documentation and visualization\n")
```

## 🎯 Summary and Next Steps

The evanverse bioinformatics toolkit provides:

- ✅ **Gene identifier conversion** with species and build support
- ✅ **Pathway analysis tools** for GMT file processing
- ✅ **Visualization functions** optimized for biological data
- ✅ **Data download utilities** for public repositories
- ✅ **Multi-omics integration** capabilities
- ✅ **Quality control helpers** for robust analysis

### Continue Learning:

- 📊 [Package Management](package-management.html) - Advanced installation techniques
- 🎨 [Color Palette Guide](color-palettes.html) - Bioinformatics color schemes
- 📚 [Comprehensive Guide](comprehensive-guide.html) - Complete package overview

### Essential Bioinformatics Functions:

```{r bio-quick-ref, eval = FALSE}
# Quick reference (not evaluated). `genes`, `gene_sets` and `survival_data`
# stand for your own objects; the argument names match the worked examples
# earlier in this vignette.

# Gene identifier conversion
convert_gene_id(genes, from = "symbol", to = "ensembl", species = "human")

# Pathway analysis: read gene sets, then compare up to three of them
pathways <- gmt2list("pathways.gmt")
plot_venn(
  set1 = gene_sets[[1]],
  set2 = gene_sets[[2]],
  set3 = gene_sets[[3]],
  category.names = names(gene_sets)[1:3],
  fill = get_palette("vividset", type = "qualitative", n = 3)
)

# Data visualization: forest plot from estimate, CI bounds and p-value columns
plot_forest(
  data = survival_data,
  label_col = "Gene",
  estimate_col = "HazardRatio",
  lower_col = "CI_Lower",
  upper_col = "CI_Upper",
  p_col = "PValue"
)
get_palette("gradient_rd_bu", type = "diverging", n = 11)

# Data management
download_geo_data("GSE123456")
read_table_flex("expression_data.txt")
```

---

*🧬 Accelerate your bioinformatics research with evanverse!*