## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup_data---------------------------------------------------------------
library(selection.index)
library(MASS) # For robust ridge regression implementation (lm.ridge)

# 1. Load the built-in synthetic maize datasets
data("maize_pheno")
data("maize_geno")

# Ensure markers are numeric matrices
X <- as.matrix(maize_geno)

# Ensure pheno lines up perfectly with geno
# (Note: maize_pheno contains 4 repetitions per genotype; we take the means)
Y_agg <- mean_performance(
  data = maize_pheno[, c("Yield", "PlantHeight", "DaysToMaturity")],
  genotypes = maize_pheno$Genotype,
  replications = maize_pheno$Block
)

# Sort both datasets to ensure identical ordering
Y_agg <- Y_agg[order(Y_agg$Genotypes), ]
X <- X[order(rownames(X)), ]

# Ensure overlapping genotypes
common_genotypes <- intersect(Y_agg$Genotypes, rownames(X))
Y_agg <- Y_agg[Y_agg$Genotypes %in% common_genotypes, ]
X <- X[rownames(X) %in% common_genotypes, ]

# Select only multi-traits relevant to breeding
# Yield, PlantHeight, DaysToMaturity
Y <- as.matrix(Y_agg[, c("Yield", "PlantHeight", "DaysToMaturity")])

## ----calculate_gebv-----------------------------------------------------------
# Simulate Genomic Estimated Breeding Values (GEBVs) using Ridge Regression
# In best practice, you'd use cross-validation or a separate training/testing set
# We use lambda = 100 to handle the p >> n dimensionality problem
gebv_mat <- matrix(0, nrow = nrow(Y), ncol = ncol(Y))
colnames(gebv_mat) <- colnames(Y)

for (i in seq_len(ncol(Y))) {
  # Fit a ridge regression model for trait `i` using markers `X`
  model_ridge <- lm.ridge(Y[, i] ~ X, lambda = 100)

  # Predict values using coef() which correctly un-scales the coefficients: Include intercept
  betas <- coef(model_ridge)
  intercept <- betas[1]
  beta <- betas[-1] # Exclude intercept

  # Calculate the GEBV for the trait
  gebv_mat[, i] <- intercept + (X %*% beta)
}

head(gebv_mat, 3)

## ----calculated_lgsi----------------------------------------------------------
# 2. Compute Trait Covariances
pmat <- cov(Y) # Phenotypic Covariance (Approximation)

# In optimal practice, the true Genomic Covariance Matrix (\Gamma) is
# estimated using Restricted Maximum Likelihood (REML) utilizing both general
# phenotypic and genotypic information. Here, we simulate \Gamma as proportional
# to the phenotypic variance (assuming a high heritability correlation).
gmat <- pmat * 0.4 # Approximate Genotypic Covariance (Approximating \Gamma)

# 3. Define Economic Weights
weights <- data.frame(
  Trait = c("Yield", "PlantHeight", "DaysToMaturity"),
  Weight = c(5, -0.1, -0.1)
)
wmat <- weight_mat(weights)

# 4. Calculate Linear Genomic Selection Index (LGSI)
# For the testing population where we only use genomic values
lgsi_result <- lgsi(
  gebv_mat = gebv_mat,
  gmat = gmat,
  wmat = wmat
)

# Output Summary
lgsi_result$summary

## ----calculate_clgsi----------------------------------------------------------
clgsi_result <- clgsi(
  phen_mat = Y, # Observed phenotypic data
  gebv_mat = gebv_mat, # Genomic Estimated Breeding Values
  pmat = pmat, # Expected Phenotypic traits covariance
  gmat = gmat, # Expected Genotypic traits covariance
  wmat = wmat
)

clgsi_result$summary