---
title: "get-started"
output: rmarkdown::html_vignette
vignette: >
%\VignetteIndexEntry{get-started}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
```
Find non-linear formulas that fit your input data.
You can systematically explore and memoize the possible formulas and their cross-validation performance, in a parallel and incremental fashion.
Three interoperable search functions are available:
- `random.search` performs a random exploration,
- `genetic.search` employs a genetic optimization algorithm
- `exaustive.search` explores all the space not already explored (WARNING: it may take a long time and doesn't have intermediate save points).
The library is designed with a massive parallelization use in mind, that's why you have
a `base.filepath` for shared results among all parallel processors,
and a `res.filepath` with local results only.
Periodically, the multiple `res.filepath` will be joined together in a shared `base.filepath`.
Let's break this down step by step.
## Load the library and set-up the data
```{r setup}
library(symbolicr)
# Simulate 100 observations of two regressors drawn from uniform ranges.
x1 <- runif(100, min = 2, max = 67)
x2 <- runif(100, min = 0.01, max = 0.1)
# Regressors collected in the data frame used as search input.
X <- data.frame(x1 = x1, x2 = x2)
# Response: a known non-linear formula of the regressors plus small
# Gaussian noise, so the search has a ground truth to recover.
y <- log10(X$x1^2 * X$x2) + rnorm(100, mean = 0, sd = 0.001)
```
## Define hyper-parameters
```
# Shared result files, one per formula length. The genetic-search branch
# passes l1.filepath, l2.filepath and l3.filepath together as its shared
# files, so all three paths must exist as variables.
# NOTE(review): `type` is not defined anywhere in this vignette; set it to the
# file-name suffix of your experiment before running this chunk.
l1.filepath <- paste0('regression/regression',type,'.exploration.l1.rData')
l2.filepath <- paste0('regression/regression',type,'.exploration.l2.rData')
l3.filepath <- paste0('regression/regression',type,'.exploration.l3.rData')
# create file
# NOTE(review): saveRDS overwrites an existing file at l1.filepath; confirm
# that resetting previously stored results is intended. Only the l1 file is
# created here; whether the l2/l3 files must pre-exist is not visible from
# this vignette.
saveRDS(empty.sample(), l1.filepath)
# K and N are forwarded to every search function.
# NOTE(review): presumably cross-validation folds and repetitions; confirm
# against the package documentation.
K <- 15
N <- 30
# Upper bound on the formula length, passed to genetic.search only.
max.formula.len <- 2
```
Note that you can control the formula space, and the algorithm employed
by just setting three variables
- seed
- formula.len
- n.squares
Thus, you can run multiple independent search processes, either on your computer or server-side, with a command similar to this one:
```
R --no-echo --no-restore --file=find_f.R --args 3 2 1006
```
Because the seed (1006) is not below 1000, this runs the genetic search, looking for formulas of maximum length 3 and order up to 2.
You just need to read the variables from the command line:
```
# Positional arguments that follow `--args` on the R command line.
options <- commandArgs(trailingOnly = TRUE)
# Expected order on the command line: formula.len, n.squares, seed.
# A missing argument becomes NA after the integer conversion.
seed <- as.integer(options[3L])
n.squares <- as.integer(options[2L])
formula.len <- as.integer(options[1L])
```
otherwise, you can hard-code them
```
# Hard-coded alternative to reading the values from the command line.
# Each `best.*` variable starts out equal to its plain counterpart.
n.squares <- 1
best.n.squares <- n.squares
formula.len <- 2
best.formula.len <- formula.len
# A seed of 1000 or more selects the genetic search in the dispatch chunk.
seed <- 1010
```
## Define rData file for checkpoint save
Used only in genetic and random search.
Exhaustive search will append the missing formulas to the file pointed to by `base.filepath`.
```
# A per-process checkpoint file is only defined for a positive seed; the
# exhaustive search (seed 0) is not given a local file.
# NOTE(review): `out.folder` and `type` are not defined in this vignette and
# must be set beforehand.
if (seed > 0) {
  res.filepath <- file.path(
    out.folder,
    paste0(
      'regression', type,
      '.exploration.fl.', formula.len,
      '.ord.', n.squares,
      'seed.', seed,
      '.rData'
    )
  )
}
```
## Define non-linear transformations
- `rdf` is the full dataset
- `x` is the variable to which we should apply the non-linearity
- `z` is a list with `min` and `absmin` fields
```
# Named list of candidate non-linear transformations.
# Every entry is a function(rdf, x, z) where
#   rdf: the full dataset (columns are accessed by name, e.g. rdf$x1),
#   x:   the variable the non-linearity is applied to,
#   z:   a list with `min` and `absmin` fields (only `z$min` is used here).
# Each function returns the transformed values of `x`.
transformations <- list(
  # Base-10 logarithm of x with no shift: non-positive x gives -Inf or NaN.
  "log" = function(rdf, x, z) {
    log10(x)
  },
  # Base-10 logarithm of x shifted by 0.1 + |z$min|.
  "log10" = function(rdf, x, z) {
    log10(0.1 + abs(z$min) + x)
  },
  # Shifted logarithm that also adds the x1 column of the dataset, itself
  # shifted by the absolute value of its smallest non-missing entry.
  "log_fwhm_fcrn_p" = function(rdf, x, z) {
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    pos_x1 <- abs(min(rdf$x1, na.rm = TRUE)) + rdf$x1
    log10(0.1 + abs(z$min) + x + pos_x1)
  },
  "my_log10" = function(rdf, x, z) {
    # WHATEVER fancy function you like
    # Keep only the rows where x1 exceeds 0.1; every other row is turned
    # into NA here and mapped to 0 at the end.
    mask <- rdf$x1 > 0.1
    mask[mask == 0] <- NA
    x.nyte <- mask * x
    l <- log10(0.1 + abs(z$min) + x.nyte)
    # Masked rows (and any NaN produced by log10) become 0.
    l[is.na(l)] <- 0
    l
  },
  # Reciprocal of x shifted by 0.1 + |z$min|.
  "inv" = function(rdf, x, z) {
    1 / (0.1 + abs(z$min) + x)
  },
  # Logistic sigmoid of x.
  "sigmoid" = function(rdf, x, z) {
    1 / (1 + exp(-x))
  },
  # Mirrored sigmoid, i.e. sigmoid(-x).
  "invsigmoid" = function(rdf, x, z) {
    1 / (1 + exp(x))
  }
)
```
## Start symbolic regression
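The seed value selects the algorithm: `0` runs the exhaustive search, any other value below 1000 runs the random search, and a value of 1000 or more runs the genetic search.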
```
# Pick the search algorithm from the seed:
#   seed == 0          -> exhaustive search,
#   other seed < 1000  -> random search,
#   otherwise          -> genetic search.
# The searches run on the data built in the setup chunk: `X` holds the
# regressors and `y` the response.
if (seed == 0) {
  base.filepath <- paste0('regression/regression',type,'.exploration.l',formula.len,'.rData')
  res.new <- exaustive.search(
    X, y,
    n.squares = n.squares,
    formula.len = formula.len,
    K = K, N = N, seed = seed,
    transformations = transformations,
    custom.abs.mins = list(),
    glob.filepath = base.filepath,
    chunk.size = NULL, cv.norm = TRUE
  )
} else if (seed < 1000) {
  base.filepath <- paste0('regression/regression',type,'.exploration.l',formula.len,'.rData')
  # general "random" search to find good candidates
  new.sample.res <- random.search(
    X, y, n.squares, formula.len,
    maxiter = 10, K = K, N = N,
    transformations = transformations,
    glob.filepath = base.filepath,
    local.filepath = res.filepath,
    memoization = TRUE, cv.norm = TRUE
  )
} else {
  # genetic can change formula length
  # NOTE(review): l1.filepath, l2.filepath and l3.filepath must all be
  # defined before this branch runs.
  base.filepath <- c(
    l1.filepath,
    l2.filepath,
    l3.filepath#,
    #l4.filepath
  )
  # finetuning procedure using genetic algorithm
  # Candidate variable sets handed to the genetic search as `best.vars.l`.
  best.vars.l <- list(
    c('log.x1')
  )
  best.finetuned <- genetic.search(
    X,
    y,
    n.squares = n.squares,
    max.formula.len = max.formula.len,
    maxiter = 100,
    transformations = transformations,
    glob.filepath = base.filepath,
    local.filepath = res.filepath,
    memoization = TRUE,
    pcrossover = 0.1,
    pmutation = 0.9,
    seed = NULL,
    keepBest = TRUE,
    K = K,
    N = N,
    popSize = 100,
    best.vars.l = best.vars.l,
    cv.norm = TRUE
  )
}
```