---
title: "join-results"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{join-results}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
# Chunk defaults for the whole vignette: collapse each chunk's source and
# output into one block, and prefix every output line with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
```

```{r setup}
library(symbolicr)
# Folder from which the independent result files are read.
out.folder <- "regression"

```

## Collecting results from parallel independent tests

The library is designed with massively parallel use in mind, 
which results in multiple independent rData files (`res.filepath`), each containing formula performances.

Periodically, you have to join all these `res.filepath` files into a shared `base.filepath`.
This code is an example of how to do that.

### Define base.filepaths

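Base filepaths may be split by formula length. 
I find this useful to avoid having a single big rData file.
The snippets below assume that a `type` value (used in the file names) is already defined in your environment.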

```
# NOTE: `type` is not defined in this vignette; it must already exist in your
# environment before running this snippet.

# Shared base files, one per formula length. They are built from `out.folder`
# (the same folder the independent result files are read from) instead of
# repeating the folder name in every path.
l1.filepath <- file.path(out.folder, paste0('regression', type, '.exploration.l1.rData'))
l2.filepath <- file.path(out.folder, paste0('regression', type, '.exploration.l2.rData'))
l3.filepath <- file.path(out.folder, paste0('regression', type, '.exploration.l3.rData'))

# Load the results accumulated so far for each formula length.
l1.res <- readRDS(l1.filepath)
l2.res <- readRDS(l2.filepath)
l3.res <- readRDS(l3.filepath)
```

### Check exploration space percentage

You may want to know what fraction of the formula space you have explored.
Here is one way to do that.
It assumes that the objects used below (`regressors.df` and the transformation list `transformations`) are in your environment, as defined in the get-started vignette.

```
max.n.squares <- 2
# Regressor names returned by symbolicr::compute.regressors.names().
complete.regressors <- symbolicr::compute.regressors.names(
  regressors.df, max.n.squares, transformations
)
tot.regressors <- length(complete.regressors)

# Percentage of the choose(tot.regressors, len) combinations of `len`
# regressors that already have a row in `res`.
explored.pct <- function(res, len) {
  100 * nrow(res) / choose(tot.regressors, len)
}

# percentage of L=1
explored.pct(l1.res, 1)

# percentage of L=2
# (the original author recorded 10.61434 here, presumably from their own run)
explored.pct(l2.res, 2)

# percentage of L=3
explored.pct(l3.res, 3)
```

### Load all the independent results in one shot

```
# Every (formula.len, n.squares, seed) combination that may have written a file.
hyperparams <- expand.grid(formula.len = seq(1, 3), n.squares = seq(0, 2), seed = seq(0, 1010))

# Zero-row table with the expected result columns. It stands in for missing
# files and is the fallback when no file is found at all, so that everything
# below always works on a data frame (never on NULL).
empty.res <- data.frame(base.pe = double(), base.cor = double(), base.r.squared = double(),
                        base.max.pe = double(), base.iqr.pe = double(), base.max.cooksd = double(),
                        base.max.cooksd.name = character(),
                        vars = character(), n.squares = integer(), formula.len = integer())

# Read them ALL!
# Map() walks the three columns in parallel, in row order, which avoids
# coercing the data frame to a matrix as apply() does.
# `type` and `out.folder` must already be defined in your environment.
new.res.l <- Map(
  function(flen, nsq, seed) {
    # Seed 0 uses the file name without a seed suffix.
    if (seed == 0) {
      file.name <- paste0('regression', type, '.exploration.fl.', flen, '.ord.', nsq, '.rData')
    } else {
      # NOTE(review): there is no '.' between the n.squares value and 'seed.'
      # (unlike the other name parts) - confirm this matches the file names
      # the parallel runs actually write.
      file.name <- paste0('regression', type, '.exploration.fl.', flen, '.ord.', nsq, 'seed.', seed, '.rData')
    }
    res.filepath <- file.path(out.folder, file.name)

    if (!file.exists(res.filepath)) {
      return(empty.res)
    }
    # Show which files were actually found and loaded.
    print(res.filepath)
    readRDS(res.filepath)
  },
  hyperparams$formula.len, hyperparams$n.squares, hyperparams$seed
)

# drop empty
new.res.l <- new.res.l[vapply(new.res.l, nrow, integer(1)) > 0]
# With no file found the list is empty and do.call(rbind, ...) would return
# NULL; fall back to the zero-row table instead.
new.res <- if (length(new.res.l) > 0) do.call(rbind, new.res.l) else empty.res

# Keep the first occurrence of each formula (identified by `vars`).
dedup.res <- new.res[!duplicated(new.res$vars), ]

# how many NEW formulas of length 1/2/3?
# NOTE(review): this is de-duplicated only within the freshly loaded files,
# not against l1.res/l2.res/l3.res, so the tables may still contain formulas
# that are already in the shared files.
l1.dedup.res <- dedup.res[dedup.res$formula.len == 1, ]
l2.dedup.res <- dedup.res[dedup.res$formula.len == 2, ]
l3.dedup.res <- dedup.res[dedup.res$formula.len == 3, ]
```

### Differential analysis (preparation)

You may want to take note of the number of new results that will be appended to the shared rData files,
so that you can later evaluate only the new results.

```
# Print the row count of each per-length table of freshly loaded results;
# write these numbers down for the later analysis of the results.
invisible(lapply(
  list(l1.dedup.res, l2.dedup.res, l3.dedup.res),
  function(fresh) print(nrow(fresh))
))
```

### Update shared results

```
# For each formula length: append the freshly loaded rows to the shared table,
# keep the first occurrence of every `vars` value, and overwrite the base file.

# formula length 1
l1.res <- rbind(l1.res, l1.dedup.res)
l1.res.dedup <- l1.res[!duplicated(l1.res$vars), ]
saveRDS(l1.res.dedup, l1.filepath)

# formula length 2
l2.res <- rbind(l2.res, l2.dedup.res)
l2.res.dedup <- l2.res[!duplicated(l2.res$vars), ]
saveRDS(l2.res.dedup, l2.filepath)

# formula length 3
l3.res <- rbind(l3.res, l3.dedup.res)
l3.res.dedup <- l3.res[!duplicated(l3.res$vars), ]
saveRDS(l3.res.dedup, l3.filepath)
```