## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set(echo = TRUE,eval = FALSE,echo = T) ## ----------------------------------------------------------------------------- # library(magrittr) # library(fastai) # # tst_param = function(val, grad = NULL) { # "Create a tensor with `val` and a gradient of `grad` for testing" # res = tensor(val) %>% float() # # if(is.null(grad)) { # grad = tensor(val / 10) # } else { # grad = tensor(grad) # } # # res$grad = grad %>% float() # res # } ## ----------------------------------------------------------------------------- # p = tst_param(1., 0.1) # p ## ----------------------------------------------------------------------------- # sgd_step(p, 1.) # p ## ----------------------------------------------------------------------------- # p$grad ## ----------------------------------------------------------------------------- # p = tst_param(1., 0.1) # weight_decay(p, 1., 0.1) # p ## ----------------------------------------------------------------------------- # p = tst_param(1., 0.1) # l2_reg(p, 1., 0.1) # p$grad ## ----------------------------------------------------------------------------- # params = L(lapply(0:3, function(x) tst_param(x))) # # opt = Optimizer(params, sgd_step, lr=0.1) # # opt$step() # # str(params$items) ## ----------------------------------------------------------------------------- # params = L(lapply(0:3, function(x) tst_param(x))) # # opt = Optimizer(params, list(weight_decay, sgd_step), lr=0.1, wd = 0.1) # # opt$step() # # str(params$items) ## ----------------------------------------------------------------------------- # params = L(lapply(0:3, function(x) tst_param(x))) # # opt = Optimizer(params, sgd_step, lr=0.1) # # try(params[3]$grad <- NULL, # TRUE) # # params[3]$grad # # opt$step() # # str(params$items) ## ----------------------------------------------------------------------------- # params = L(lapply(0:3, function(x) tst_param(x))) # # opt = Optimizer(list(params[0:1],params[2:3]), sgd_step, lr=0.1) # # opt$hypers$items[[1]][[1]] = 0.01 # # opt$step() # # str(params$items) ## ----------------------------------------------------------------------------- # params = L(lapply(0:3, function(x) tst_param(x))) # # opt = Optimizer(params, list(weight_decay, sgd_step), lr=0.1, wd = 0.1) # # opt$zero_grad() # # str(params$items) ## ----------------------------------------------------------------------------- # p = tst_param(c(1,2,3), c(4,5,6)) # state = average_grad(p, mom = 0.9, dampening = FALSE, grad_avg = NULL) # p$grad # # tensor([4., 5., 6.]) # # state = average_grad(p, mom=0.9, dampening = TRUE) # p$grad*0.1 # # tensor([0.4000, 0.5000, 0.6000]) # p$grad*(0.1*0.9+0.1) # # tensor([0.7600, 0.9500, 1.1400]) ## ----------------------------------------------------------------------------- # p = tst_param(c(1,2,3), c(4,5,6)) # state = average_sqr_grad(p, sqr_mom = 0.99, dampening = FALSE) # # p$grad$pow(2) # # tensor([16., 25., 36.]) # # p$grad$pow(2) * 1.99 # # tensor([31.8400, 49.7500, 71.6400]) # # state = average_sqr_grad(p, sqr_mom = 0.99) # p$grad$pow(2) * 1e-2 # # tensor([0.1600, 0.2500, 0.3600]) # state = average_sqr_grad(p, sqr_mom = 0.99) # # p$grad$pow(2)*(0.01*0.99+0.01) # # tensor([0.3184, 0.4975, 0.7164]) # # params = L(lapply(0:3, function(x) tst_param(x))) # opt = Optimizer(params, sgd_step, lr = 0.1) # opt$freeze_to(1L) ## ----------------------------------------------------------------------------- # params = L(lapply(0:3, function(x) tst_param(x))) # opt = SGD(params, lr = 0.1) # opt$step() # str(params$items) ## ----------------------------------------------------------------------------- # params = L(lapply(0:3, function(x) tst_param(x))) # opt = SGD(params, lr = 0.1, mom = 0.9) # opt$step() # str(params$items) ## ----------------------------------------------------------------------------- # params = L(lapply(0:3, function(x) tst_param(x))) # #Weight decay # opt = SGD(params, lr=0.1, mom=0.9, wd=0.1) # opt$step() # str(params$items) ## ----------------------------------------------------------------------------- # params = L(lapply(0:3, function(x) tst_param(x))) # #L2 reg # opt = SGD(params, lr=0.1, mom=0.9, wd=0.1, decouple_wd=FALSE) # opt$step() # str(params$items) ## ----------------------------------------------------------------------------- # params = tst_param(c(1:3), c(0.1,0.2,0.3)) # opt = RMSProp(params, lr=0.1) # opt$step() # opt$step() # step = (-0.1 * 0.1) / (sqrt((0.01*0.99+0.01) * 0.1**2) + 1e-8) # params; tensor(c(step, 1+step, 2+step)) ## ----------------------------------------------------------------------------- # params = tst_param(c(1:3), c(0.1,0.2,0.3)) # opt = RMSProp(params, lr=0.1, mom=0.9) # opt$step() # opt$step() # step = (- 0.1 * (0.1 + 0.9*0.1)) / (sqrt((0.01*0.99+0.01) * 0.1**2) + 1e-8) # params; tensor(c(step, 1+step, 2+step)) ## ----------------------------------------------------------------------------- # params = tst_param(c(1:3), c(0.1,0.2,0.3)) # opt = Adam(params, lr=0.1, wd=0) # opt$step() # step = (-0.1 * 0.1) / (sqrt(0.1**2) + 1e-8) # params; tensor(c(1+step, 2+step, 3+step)) ## ----------------------------------------------------------------------------- # opt$step() # params;tensor(tensor(c(1+2*step, 2+2*step, 3+2*step))) ## ----------------------------------------------------------------------------- # beta = 0.99 # r_inf = 2/(1-beta) - 1 # rs = lapply(5:500, function(s) {r_inf - 2*s*beta**s/(1-beta**s)}) %>% as.numeric() # v = sqrt(((rs-4) * (rs-2) * r_inf)/((r_inf-4)*(r_inf-2)*rs)) # df_high = data.frame(x = 1:length(v), y = v) # # library(highcharter) # hchart(df_high,'line', hcaes(x,y)) ## ----------------------------------------------------------------------------- # params = tst_param(c(1:3), c(0.1,0.2,0.3)) # opt = QHAdam(params, lr=0.1) # opt$step() # step = (-0.1 * (((1-0.7) * 0.1) + (0.7 * 0.1)) )/ ( # sqrt(((1-1.0) * 0.1**2) + (1.0 * 0.1**2)) + 1e-8) # params; tensor(c(1+step, 2+step, 3+step)) # # tensor([0.9000, 1.9000, 2.9000]) # # tensor([0.9000, 1.9000, 2.9000]) # opt$step() # params; tensor(c(1+2*step, 2+2*step, 3+2*step)) # # tensor([0.8000, 1.8000, 2.8000]) # # tensor([0.8000, 1.8000, 2.8000]) ## ----------------------------------------------------------------------------- # params = list(tst_param(c(1:3), c(0.1,0.2,0.3)), tst_param(c(1:3), c(0.01,0.02,0.03))) # opt = Larc(params, lr=0.1) # opt$step() # #First param local lr is 0.02 < lr so it's not clipped # opt$state[params[[1]]]['local_lr'] ## ----------------------------------------------------------------------------- # opt$state[params[[2]]]['local_lr'] ## ----------------------------------------------------------------------------- # params = list(tst_param(c(1:3), c(0.1,0.2,0.3)), tst_param(c(1:3), c(0.01,0.02,0.03))) # opt = Larc(params, lr=0.1, clip = FALSE) # opt$step() # #Second param local lr is 0.2 > lr so it's clipped # opt$state[params[[1]]]['local_lr'] ## ----------------------------------------------------------------------------- # opt$state[params[[2]]]['local_lr'] ## ----------------------------------------------------------------------------- # params = tst_param(c(1:3), c(0.1,0.2,0.3)) # opt = Lamb(params, lr=0.1) # opt$step() # params ## ----------------------------------------------------------------------------- # params = tst_param(c(1:3), c(0.1,0.2,0.3)) # p = params$data$clone() # g = tensor(c(0.1,0.2,0.3)) # opt = Lookahead(SGD(params, lr=0.1)) # # for(i in 1:5) { # opt$step() # } # #first 5 steps are normal SGD steps # params; p - g * 0.5 # # tensor([0.9500, 1.9000, 2.8500]) # # tensor([0.9500, 1.9000, 2.8500]) # # #Since k=6, sixth step is a moving average of the 6 SGD steps with the initial weight # opt$step() # params; p * 0.5 + (p-g*0.6) * 0.5 # # tensor([0.9700, 1.9400, 2.9100]) # # tensor([0.9700, 1.9400, 2.9100])