## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)
library(compositional.mle)

## ----normal-likelihood--------------------------------------------------------
set.seed(42)
x <- rnorm(50, mean = 3, sd = 1.5)

# Log-likelihood function for the normal distribution
loglike <- function(theta) {
  mu <- theta[1]
  sigma <- theta[2]
  if (sigma <= 0) return(-Inf)
  sum(dnorm(x, mean = mu, sd = sigma, log = TRUE))
}

# Visualize the log-likelihood surface
mu_grid <- seq(1, 5, length.out = 50)
sigma_grid <- seq(0.5, 3, length.out = 50)
ll_surface <- outer(mu_grid, sigma_grid, function(m, s) {
  mapply(function(mi, si) loglike(c(mi, si)), m, s)
})

# Closed-form MLE: mu.hat = mean(x); sigma.hat uses the 1/n (not 1/(n - 1)) variance
mu_hat <- mean(x)
sigma_hat <- sqrt(mean((x - mu_hat)^2))

# Contour plot
contour(mu_grid, sigma_grid, ll_surface, nlevels = 20,
        xlab = expression(mu), ylab = expression(sigma),
        main = "Log-Likelihood Surface")
points(mu_hat, sigma_hat, pch = 19, col = "red", cex = 1.5)
legend("topright", "MLE", pch = 19, col = "red")

## ----score-example------------------------------------------------------------
# Score function (gradient of the log-likelihood) for the normal distribution
score <- function(theta) {
  mu <- theta[1]
  sigma <- theta[2]
  n <- length(x)
  c(
    sum(x - mu) / sigma^2,                  # d/d_mu
    -n / sigma + sum((x - mu)^2) / sigma^3  # d/d_sigma
  )
}

# At a point away from the MLE, the score points toward the MLE
theta_start <- c(1, 0.8)
s <- score(theta_start)
cat("Score at (1, 0.8):", round(s, 2), "\n")
cat("Direction: move", ifelse(s[1] > 0, "right", "left"), "in mu,",
    ifelse(s[2] > 0, "up", "down"), "in sigma\n")
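
## ----score-check---------------------------------------------------------------
# Quick sanity check (base R only; num_grad() is an ad-hoc helper written for
# this vignette, not part of the compositional.mle API): compare the analytic
# score with a central finite-difference approximation of the log-likelihood
# gradient. The two rows should agree to several decimal places.
num_grad <- function(f, theta, h = 1e-6) {
  sapply(seq_along(theta), function(j) {
    e <- replace(numeric(length(theta)), j, h)
    (f(theta + e) - f(theta - e)) / (2 * h)
  })
}
round(rbind(analytic = score(theta_start),
            numeric  = num_grad(loglike, theta_start)), 4)
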
## ----gradient-demo------------------------------------------------------------
# Demonstrate gradient ascent with different step sizes
run_gradient_ascent <- function(eta, max_iter = 50) {
  theta <- c(1, 0.8)
  path <- matrix(NA, max_iter + 1, 2)
  path[1, ] <- theta
  for (i in 1:max_iter) {
    theta <- theta + eta * score(theta)
    if (theta[2] <= 0) theta[2] <- 0.01  # Enforce the sigma > 0 constraint
    path[i + 1, ] <- theta
  }
  path
}

# Compare step sizes
path_small <- run_gradient_ascent(0.001)
path_good <- run_gradient_ascent(0.01)
path_large <- run_gradient_ascent(0.05)

# Plot paths
contour(mu_grid, sigma_grid, ll_surface, nlevels = 15,
        xlab = expression(mu), ylab = expression(sigma),
        main = "Gradient Ascent: Effect of Step Size")
lines(path_small[, 1], path_small[, 2], col = "blue", lwd = 2)
lines(path_good[, 1], path_good[, 2], col = "green", lwd = 2)
lines(path_large[1:20, 1], path_large[1:20, 2], col = "red", lwd = 2)
points(mu_hat, sigma_hat, pch = 19, cex = 1.5)
legend("topright", c("Small (0.001)", "Good (0.01)", "Large (0.05)"),
       col = c("blue", "green", "red"), lwd = 2)

## ----line-search-demo---------------------------------------------------------
# Define the problem
problem <- mle_problem(
  loglike = loglike,
  score = score,
  constraint = mle_constraint(
    support = function(theta) theta[2] > 0,
    project = function(theta) c(theta[1], max(theta[2], 1e-6))
  )
)

result <- gradient_ascent(max_iter = 50)(problem, theta0 = c(1, 0.8))
cat("Final estimate:", round(result$theta.hat, 4), "\n")
cat("Iterations:", result$iterations, "\n")
cat("Converged:", result$converged, "\n")

## ----newton-comparison--------------------------------------------------------
# Fisher information matrix for the normal distribution
fisher <- function(theta) {
  sigma <- theta[2]
  n <- length(x)
  matrix(c(
    n / sigma^2, 0,
    0, 2 * n / sigma^2
  ), nrow = 2)
}

# Define the problem with Fisher information
problem_with_fisher <- mle_problem(
  loglike = loglike,
  score = score,
  fisher = fisher,
  constraint = mle_constraint(
    support = function(theta) theta[2] > 0,
    project = function(theta) c(theta[1], max(theta[2], 1e-6))
  )
)

# Run gradient ascent
result_ga <- gradient_ascent(max_iter = 100)(problem, theta0 = c(1, 0.8))

# Run Newton-Raphson
result_nr <- newton_raphson(max_iter = 100)(problem_with_fisher, theta0 = c(1, 0.8))

cat("Gradient Ascent: ", result_ga$iterations, "iterations\n")
cat("Newton-Raphson: ", result_nr$iterations, "iterations\n")

## ----compose-demo-------------------------------------------------------------
# Coarse-to-fine: grid search finds a good region, Newton-Raphson polishes
strategy <- grid_search(lower = c(0, 0.1), upper = c(6, 4), n = 5) %>>%
  newton_raphson(max_iter = 20)
result <- strategy(problem_with_fisher, theta0 = c(1, 0.8))
cat("Result:", round(result$theta.hat, 4), "\n")

# Race different methods and keep the best result
strategy <- gradient_ascent() %|% bfgs() %|% nelder_mead()
result <- strategy(problem, theta0 = c(1, 0.8))
cat("Winner:", result$solver, "\n")

## ----constraint-demo----------------------------------------------------------
# The constraint keeps sigma positive throughout optimization
result <- gradient_ascent(max_iter = 100)(problem, theta0 = c(0, 0.1))
cat("Final sigma:", result$theta.hat[2], "> 0 (constraint satisfied)\n")

## ----penalty-demo-------------------------------------------------------------
# Original log-likelihood (maximum at theta = (3, 2))
loglike_simple <- function(theta) -sum((theta - c(3, 2))^2)

# Add an L2 penalty
loglike_l2 <- with_penalty(loglike_simple, penalty_l2(), lambda = 1)

# Compare the two objectives at the unpenalized maximum
theta <- c(3, 2)
cat("At theta = (3, 2):\n")
cat("  Original:", loglike_simple(theta), "\n")
cat("  With L2 penalty:", loglike_l2(theta), "\n")
cat("  The penalty shrinks the solution toward zero\n")
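
## ----penalty-check------------------------------------------------------------
# Follow-up sketch (base R only; the names fit_plain and fit_l2 are ad hoc for
# this vignette, not part of the compositional.mle API). It verifies the
# shrinkage numerically by maximizing both objectives with optim(), assuming
# loglike_l2 can be evaluated like an ordinary function of theta, as above.
# The exact penalized maximizer depends on how penalty_l2() scales the penalty,
# but it should lie between the origin and the unpenalized maximum (3, 2).
fit_plain <- optim(c(0, 0), function(theta) -loglike_simple(theta))
fit_l2    <- optim(c(0, 0), function(theta) -loglike_l2(theta))
cat("Unpenalized maximizer:  ", round(fit_plain$par, 3), "\n")
cat("L2-penalized maximizer: ", round(fit_l2$par, 3), "\n")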