## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)
library(compositional.mle)

## ----normal-likelihood--------------------------------------------------------
set.seed(42)
x <- rnorm(50, mean = 3, sd = 1.5)

# Log-likelihood function for the normal distribution
loglike <- function(theta) {
  mu <- theta[1]
  sigma <- theta[2]
  if (sigma <= 0) return(-Inf)
  sum(dnorm(x, mean = mu, sd = sigma, log = TRUE))
}

# Visualize the log-likelihood surface
mu_grid <- seq(1, 5, length.out = 50)
sigma_grid <- seq(0.5, 3, length.out = 50)
ll_surface <- outer(mu_grid, sigma_grid, function(m, s) {
  mapply(function(mi, si) loglike(c(mi, si)), m, s)
})

# Closed-form MLE: mu.hat = mean(x); sigma.hat uses the 1/n (not 1/(n - 1)) variance
mu_hat <- mean(x)
sigma_hat <- sqrt(mean((x - mu_hat)^2))

# Contour plot
contour(mu_grid, sigma_grid, ll_surface, nlevels = 20,
        xlab = expression(mu), ylab = expression(sigma),
        main = "Log-Likelihood Surface")
points(mu_hat, sigma_hat, pch = 19, col = "red", cex = 1.5)
legend("topright", "MLE", pch = 19, col = "red")

## ----score-example------------------------------------------------------------
# Score function (gradient of the log-likelihood) for the normal distribution
score <- function(theta) {
  mu <- theta[1]
  sigma <- theta[2]
  n <- length(x)
  c(
    sum(x - mu) / sigma^2,                  # d/d_mu
    -n / sigma + sum((x - mu)^2) / sigma^3  # d/d_sigma
  )
}

# At a point away from the MLE, the score points toward the MLE
theta_start <- c(1, 0.8)
s <- score(theta_start)
cat("Score at (1, 0.8):", round(s, 2), "\n")
cat("Direction: move", ifelse(s[1] > 0, "right", "left"), "in mu,",
    ifelse(s[2] > 0, "up", "down"), "in sigma\n")
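
## ----score-check---------------------------------------------------------------
# Quick sanity check (base R only; num_grad() is an ad-hoc helper written for
# this vignette, not part of the compositional.mle API): compare the analytic
# score with a central finite-difference approximation of the log-likelihood
# gradient. The two rows should agree to several decimal places.
num_grad <- function(f, theta, h = 1e-6) {
  sapply(seq_along(theta), function(j) {
    e <- replace(numeric(length(theta)), j, h)
    (f(theta + e) - f(theta - e)) / (2 * h)
  })
}
round(rbind(analytic = score(theta_start),
            numeric  = num_grad(loglike, theta_start)), 4)
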
## ----gradient-demo------------------------------------------------------------
# Demonstrate gradient ascent with different step sizes
run_gradient_ascent <- function(eta, max_iter = 50) {
  theta <- c(1, 0.8)
  path <- matrix(NA, max_iter + 1, 2)
  path[1, ] <- theta
  for (i in 1:max_iter) {
    theta <- theta + eta * score(theta)
    if (theta[2] <= 0) theta[2] <- 0.01  # Enforce the sigma > 0 constraint
    path[i + 1, ] <- theta
  }
  path
}

# Compare step sizes
path_small <- run_gradient_ascent(0.001)
path_good <- run_gradient_ascent(0.01)
path_large <- run_gradient_ascent(0.05)

# Plot paths
contour(mu_grid, sigma_grid, ll_surface, nlevels = 15,
        xlab = expression(mu), ylab = expression(sigma),
        main = "Gradient Ascent: Effect of Step Size")
lines(path_small[, 1], path_small[, 2], col = "blue", lwd = 2)
lines(path_good[, 1], path_good[, 2], col = "green", lwd = 2)
lines(path_large[1:20, 1], path_large[1:20, 2], col = "red", lwd = 2)
points(mu_hat, sigma_hat, pch = 19, cex = 1.5)
legend("topright", c("Small (0.001)", "Good (0.01)", "Large (0.05)"),
       col = c("blue", "green", "red"), lwd = 2)

## ----line-search-demo---------------------------------------------------------
# Define the problem
problem <- mle_problem(
  loglike = loglike,
  score = score,
  constraint = mle_constraint(
    support = function(theta) theta[2] > 0,
    project = function(theta) c(theta[1], max(theta[2], 1e-6))
  )
)

result <- gradient_ascent(max_iter = 50)(problem, theta0 = c(1, 0.8))
cat("Final estimate:", round(result$theta.hat, 4), "\n")
cat("Iterations:", result$iterations, "\n")
cat("Converged:", result$converged, "\n")

## ----newton-comparison--------------------------------------------------------
# Fisher information matrix for the normal distribution
fisher <- function(theta) {
  sigma <- theta[2]
  n <- length(x)
  matrix(c(
    n / sigma^2, 0,
    0, 2 * n / sigma^2
  ), nrow = 2)
}

# Define the problem with Fisher information
problem_with_fisher <- mle_problem(
  loglike = loglike,
  score = score,
  fisher = fisher,
  constraint = mle_constraint(
    support = function(theta) theta[2] > 0,
    project = function(theta) c(theta[1], max(theta[2], 1e-6))
  )
)

# Run gradient ascent
result_ga <- gradient_ascent(max_iter = 100)(problem, theta0 = c(1, 0.8))

# Run Newton-Raphson
result_nr <- newton_raphson(max_iter = 100)(problem_with_fisher, theta0 = c(1, 0.8))

cat("Gradient Ascent: ", result_ga$iterations, "iterations\n")
cat("Newton-Raphson: ", result_nr$iterations, "iterations\n")

## ----compose-demo-------------------------------------------------------------
# Coarse-to-fine: grid search finds a good region, Newton-Raphson polishes
strategy <- grid_search(lower = c(0, 0.1), upper = c(6, 4), n = 5) %>>%
  newton_raphson(max_iter = 20)
result <- strategy(problem_with_fisher, theta0 = c(1, 0.8))
cat("Result:", round(result$theta.hat, 4), "\n")

# Race different methods and keep the best result
strategy <- gradient_ascent() %|% bfgs() %|% nelder_mead()
result <- strategy(problem, theta0 = c(1, 0.8))
cat("Winner:", result$solver, "\n")

## ----constraint-demo----------------------------------------------------------
# The constraint keeps sigma positive throughout optimization
result <- gradient_ascent(max_iter = 100)(problem, theta0 = c(0, 0.1))
cat("Final sigma:", result$theta.hat[2], "> 0 (constraint satisfied)\n")

## ----penalty-demo-------------------------------------------------------------
# Original log-likelihood (maximum at theta = (3, 2))
loglike_simple <- function(theta) -sum((theta - c(3, 2))^2)

# Add an L2 penalty
loglike_l2 <- with_penalty(loglike_simple, penalty_l2(), lambda = 1)

# Compare the two objectives at the unpenalized maximum
theta <- c(3, 2)
cat("At theta = (3, 2):\n")
cat("  Original:", loglike_simple(theta), "\n")
cat("  With L2 penalty:", loglike_l2(theta), "\n")
cat("  The penalty shrinks the solution toward zero\n")
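
## ----penalty-check------------------------------------------------------------
# Follow-up sketch (base R only; the names fit_plain and fit_l2 are ad hoc for
# this vignette, not part of the compositional.mle API). It verifies the
# shrinkage numerically by maximizing both objectives with optim(), assuming
# loglike_l2 can be evaluated like an ordinary function of theta, as above.
# The exact penalized maximizer depends on how penalty_l2() scales the penalty,
# but it should lie between the origin and the unpenalized maximum (3, 2).
fit_plain <- optim(c(0, 0), function(theta) -loglike_simple(theta))
fit_l2    <- optim(c(0, 0), function(theta) -loglike_l2(theta))
cat("Unpenalized maximizer:  ", round(fit_plain$par, 3), "\n")
cat("L2-penalized maximizer: ", round(fit_l2$par, 3), "\n")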