---
title: "smoothPLS_ScalarFD_02"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{smoothPLS_ScalarFD_02}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

```{r setup}
library(SmoothPLS)
library(ggplot2)
```

This notebook presents a way of performing the SmoothPLS algorithm on Scalar
Functional Data. It shows that Smooth PLS outperforms FPLS with a time vector
`regul_time` without a lot of points, and even with a lot of points.

# Parameters

```{r}
nind <- 100        # number of individuals (train set)
start <- 0         # first time point
end <- 100         # last time point
lambda_0 <- 0.2    # exponential law parameter for state 0
lambda_1 <- 0.1    # exponential law parameter for state 1
prob_start <- 0.5  # probability of starting with state 1
curve_type <- "num"

# Train/Test ratio: the test set holds floor(nind * TTRatio / (1 - TTRatio))
# individuals.
TTRatio <- 0.2

NotS_ratio <- 0.2      # noise variance over total variance for Y
beta_0_real <- 5.4321  # intercept value for the link between X(t) and Y

nbasis <- 10  # number of basis functions
norder <- 4   # 4 for a cubic B-splines basis

regul_time <- seq(start, end, 5)    # coarse regularization grid
regul_time_0 <- seq(start, end, 1)  # fine grid, used for plotting

beta_real_func <- beta_4_real_func  # link between X(t) and Y
int_mode <- 2  # integration mode, in case of integration errors
```

# Synthetic SFD data

We can also generate synthetic Scalar Functional Data (SFD). The important
input is `type = 'num'`.

# Generate_X_df

For SFD, two new arguments of `generate_X_df` are important: the noise added
to the signal and the seed for repeatability.
```{r}
# Noisy numeric curves; noise_sd sets the noise level, seed makes the draw
# reproducible.
df <- generate_X_df(nind = nind, start = start, end = end,
                    curve_type = "num", noise_sd = 0.15, seed = 123)
head(df)
```

```{r}
# Visualisation
ggplot(df, aes(x = time, y = value, group = id, color = factor(id))) +
  geom_line(alpha = 0.8) +
  labs(title = "Noised cosinus curves",
       x = "Time", y = "Value", color = "Individual") +
  theme_minimal() +
  theme(legend.position = "none")
```

# Data manipulation

## regularize_time_series

```{r}
df_regul <- regularize_time_series(df, time_seq = regul_time,
                                   curve_type = "num")
df_regul
```

## convert_to_wide_format

```{r}
df_wide <- convert_to_wide_format(df_regul)
df_wide
```

# generate_Y_df

```{r}
plot(regul_time_0, beta_real_func(regul_time_0))
```

WARNING! Here we have to regularize X(t) before evaluating Y!

```{r}
Y_df <- generate_Y_df(df = df_regul, curve_type = "num",
                      beta_real_func_or_list = beta_real_func,
                      beta_0_real = beta_0_real,
                      NotS_ratio = NotS_ratio, seed = 123)
Y <- Y_df$Y_noised
head(Y_df)
```

```{r hist Y noised}
oldpar <- par(mfrow = c(1, 2))
hist(Y_df$Y_real)
hist(Y_df$Y_noised)
par(oldpar)  # restore graphical parameters
```

# Basis

```{r basis creation}
basis <- create_bspline_basis(start, end, nbasis, norder)
# Alternative basis, kept for reference:
# basis <- fda::create.fourier.basis(rangeval = c(start, end), nbasis = nbasis)
plot(basis, main = paste0(nbasis, " Function basis"))
```

# Naive PLS

```{r}
naive_pls_obj <- naivePLS(df_list = df, Y = Y_df$Y_noised,
                          regul_time_obj = regul_time,
                          curve_type_obj = "num",
                          id_col_obj = "id", time_col_obj = "time",
                          print_steps = TRUE, plot_rmsep = TRUE,
                          print_nbComp = TRUE, plot_reg_curves = TRUE)
```

## Regression coefficients

```{r}
for (i in 2:length(naive_pls_obj$reg_obj)) {
  plot(naive_pls_obj$reg_obj[[i]])
  title(naive_pls_obj$curves_names[i - 1])
}
plot(naive_pls_obj$reg_obj$Num_1, xlab = 'regul_time',
     ylab = 'beta coefficient value',
     main = "naive pls regression coefficients")
lines(x = regul_time, y = naive_pls_obj$reg_obj$Num_1[, 2], col = 'blue')
# FIX: beta_real_func() takes a single time vector everywhere else in this
# vignette; the extra 'end' argument was wrong, and the y values must be
# evaluated on regul_time_0 to match the x vector.
lines(x = regul_time_0, y = beta_real_func(regul_time_0), col = 'red')
```

# FPLS

```{r}
fpls_obj <-
funcPLS(df_list = df, Y = Y_df$Y_noised, basis_obj = basis,
          curve_type_obj = 'num', regul_time_obj = regul_time,
          id_col_obj = 'id', time_col_obj = 'time',
          print_steps = TRUE, plot_rmsep = TRUE,
          print_nbComp = TRUE, plot_reg_curves = TRUE)
```

# Smooth PLS

```{r}
spls_obj <- smoothPLS(df_list = df, Y = Y_df$Y_noised, basis_obj = basis,
                      orth_obj = TRUE, curve_type_obj = curve_type,
                      regul_time_obj = regul_time, int_mode = int_mode,
                      print_steps = TRUE, plot_rmsep = TRUE,
                      print_nbComp = TRUE, plot_reg_curves = TRUE)
```

## Delta

```{r}
# Regression intercept and functional coefficient estimated by SmoothPLS.
delta_0 <- spls_obj$reg_obj$Intercept
print(delta_0)
delta <- spls_obj$reg_obj$NumFD_1

# Common y-range so the true and estimated curves share one frame.
y_lim <- eval_max_min_y(f_list = list(beta_real_func, delta),
                        regul_time = regul_time)
plot(regul_time, beta_real_func(regul_time), type = 'l',
     xlab = "Beta_t", ylim = y_lim)
plot(delta, add = TRUE, col = 'blue')
```

# Curve comparison

```{r}
cat("curve_1 : smooth PLS regression curve.\n")
cat("curve_2 : functional PLS regression curve.\n")
cat("curve_3 : naive PLS regression coefficients\n")
evaluate_curves_distances(
  real_f = beta_real_func,
  regul_time = regul_time,
  fun_fd_list = list(
    delta,
    fpls_obj$reg_obj$NumFD_1,
    approxfun(x = regul_time, y = naive_pls_obj$opti_reg_coef)
  )
)
```

```{r}
y_lim <- eval_max_min_y(
  f_list = list(
    spls_obj$reg_obj$NumFD_1,
    fpls_obj$reg_obj$NumFD_1,
    approxfun(x = regul_time, y = naive_pls_obj$opti_reg_coef),
    beta_real_func
  ),
  regul_time = regul_time_0
)
plot(regul_time_0, beta_real_func(regul_time_0), col = 'black', ylim = y_lim,
     xlab = 'Time', ylab = 'Value', type = 'l')
lines(regul_time_0,
      approxfun(x = regul_time, y = naive_pls_obj$opti_reg_coef)(regul_time_0),
      col = 'green')
title(paste0(names(spls_obj$reg_obj)[2], " regression curves"))
plot(spls_obj$reg_obj$NumFD_1, col = 'blue', add = TRUE)
plot(fpls_obj$reg_obj$NumFD_1, col = 'red', add = TRUE)
legend("bottomleft",
       legend = c("Real curve", "NaivePLS coef",
                  "SmoothPLS reg curve", "FunctionalPLS reg curve"),
       col = c("black", "green", "blue", "red"), lty = 1, lwd = 1)
```

```{r}
y_lim <- eval_max_min_y(f_list = list(beta_real_func,
                                      spls_obj$reg_obj$NumFD_1,
                                      fpls_obj$reg_obj$NumFD_1),
                        regul_time = regul_time)
plot(regul_time_0, beta_real_func(regul_time_0), type = 'l',
     xlab = "Beta_t", ylim = y_lim, lwd = 2)
plot(spls_obj$reg_obj$NumFD_1, add = TRUE, col = 'blue', lwd = 1)
plot(fpls_obj$reg_obj$NumFD_1, add = TRUE, col = 'red', lwd = 1)
legend("bottomleft",
       legend = c("Real curve", "SmoothPLS reg curve",
                  "FunctionalPLS reg curve"),
       col = c("black", "blue", "red"), lty = 1, lwd = 1)
```

```{r}
evaluate_curves_distances(real_f = beta_real_func, regul_time = regul_time,
                          fun_fd_list = list(spls_obj$reg_obj$NumFD_1,
                                             fpls_obj$reg_obj$NumFD_1))
```

Plot the second derivatives for the smoothness:

```{r}
plot(fda::deriv.fd(expr = spls_obj$reg_obj$NumFD_1, Lfdobj = 2), col = 'blue')
plot(fda::deriv.fd(expr = fpls_obj$reg_obj$NumFD_1, Lfdobj = 2),
     add = TRUE, col = 'red')
title("Second derivatives")
legend("bottomleft", legend = c("delta_SmoothPLS''", "delta_FPLS''"),
       col = c("blue", "red"), lty = 1, lwd = 1)
```

# Results

```{r}
# One row per model; metric columns are filled in below.
train_results <- data.frame(matrix(ncol = 5, nrow = 3))
colnames(train_results) <- c("PRESS", "RMSE", "MAE", "R2", "var_Y")
rownames(train_results) <- c("NaivePLS", "FPLS", "SmoothPLS")
test_results <- train_results
```

```{r}
print(paste0("There is ", 100 * NotS_ratio, "% of noise in Y"))
```

## Train Set

```{r}
Y_train <- Y_df$Y_noised

# Naive
Y_hat <- predict(naive_pls_obj$plsr_model, ncomp = naive_pls_obj$nbCP_opti,
                 newdata = as.matrix(df_wide[, -c(1)]))
train_results["NaivePLS", ] <- evaluate_results(Y_train, Y_hat)

# FPLS
# NOTE(review): the direct plsr prediction below was dead code — its result
# was immediately overwritten by smoothPLS_predict() — so it is kept only as
# a commented reference.
# Y_hat_fpls <- (predict(fpls_obj$plsr_model, ncomp = fpls_obj$nbCP_opti,
#                        newdata = fpls_obj$trans_alphas) +
#                  fpls_obj$reg_obj$Intercept + mean(Y))
Y_hat_fpls <- smoothPLS_predict(df_predict_list = df,
                                delta_list = fpls_obj$reg_obj,
                                curve_type_obj = curve_type,
                                int_mode = int_mode,
                                regul_time_obj = regul_time)
train_results["FPLS", ] <- evaluate_results(Y_train, Y_hat_fpls)

# Smooth PLS
Y_hat_spls <-
smoothPLS_predict(df_predict_list = df, delta_list = spls_obj$reg_obj,
                    curve_type_obj = curve_type, int_mode = int_mode,
                    regul_time_obj = regul_time)
train_results["SmoothPLS", ] <- evaluate_results(Y_train, Y_hat_spls)

# Number of PLS components kept by each model (adds the nb_cp column).
train_results["NaivePLS", "nb_cp"] <- naive_pls_obj$nbCP_opti
train_results["FPLS", "nb_cp"] <- fpls_obj$nbCP_opti
train_results["SmoothPLS", "nb_cp"] <- spls_obj$nbCP_opti
```

```{r}
train_results
```

## Test Set

Test set creation

```{r}
nind_test <- number_of_test_id(TTRatio = TTRatio, nind = nind)
df_test <- generate_X_df(nind = nind_test, start = start, end = end,
                         curve_type = curve_type,
                         lambda_0 = lambda_0, lambda_1 = lambda_1,
                         prob_start = prob_start)
# FIX: every argument is now named (beta_0_real and NotS_ratio were passed
# positionally among named arguments, which is fragile).
Y_df_test <- generate_Y_df(df_test, curve_type = curve_type,
                           beta_real_func_or_list = beta_real_func,
                           beta_0_real = beta_0_real,
                           NotS_ratio = NotS_ratio,
                           id_col = 'id', time_col = 'time')
df_test_regul <- regularize_time_series(df_test, time_seq = regul_time,
                                        curve_type = curve_type)
df_test_wide <- convert_to_wide_format(df_test_regul)
```

Evaluation

```{r}
Y_test <- Y_df_test$Y_noised

# Naive
Y_hat <- predict(naive_pls_obj$plsr_model, ncomp = naive_pls_obj$nbCP_opti,
                 newdata = as.matrix(df_test_wide[, -c(1)]))
test_results["NaivePLS", ] <- evaluate_results(Y_test, Y_hat)

# FPLS
Y_hat_fpls <- smoothPLS_predict(df_predict_list = df_test,
                                delta_list = fpls_obj$reg_obj,
                                curve_type_obj = curve_type,
                                int_mode = int_mode,
                                regul_time_obj = regul_time)
test_results["FPLS", ] <- evaluate_results(Y_test, Y_hat_fpls)

# Smooth PLS
# Consistency fix: use the named spls_obj$reg_obj (as in the train-set
# evaluation) instead of the unnamed list(delta_0, delta), which held the
# same two elements (Intercept and NumFD_1).
Y_hat_spls <- smoothPLS_predict(df_predict_list = df_test,
                                delta_list = spls_obj$reg_obj,
                                curve_type_obj = curve_type,
                                int_mode = int_mode,
                                regul_time_obj = regul_time)
test_results["SmoothPLS", ] <- evaluate_results(Y_test, Y_hat_spls)

test_results["NaivePLS", "nb_cp"] <- naive_pls_obj$nbCP_opti
test_results["FPLS", "nb_cp"] <- fpls_obj$nbCP_opti
test_results["SmoothPLS", "nb_cp"] <- spls_obj$nbCP_opti
```

```{r}
test_results
```

## Plot results

```{r}
train_results
test_results
```

```{r}
# Compare the metrics of all three models side by side.
plot_model_metrics_base(train_results, test_results)
```

```{r}
# Same comparison restricted to the two functional approaches.
plot_model_metrics_base(train_results, test_results,
                        models_to_plot = c('FPLS', 'SmoothPLS'))
```