## ----include = FALSE----------------------------------------------------------
# Vignette build settings: switch off rmarkdown's html_vignette title check.
options(rmarkdown.html_vignette.check_title = FALSE)
# Fold each chunk's source and output into one block; prefix output with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----rfae---------------------------------------------------------------------
# Load libraries
library(RFAE)
library(data.table)
library(ggplot2)
library(arf)
library(ranger)
# Fix the RNG state so the split (and everything downstream) is reproducible.
set.seed(42)

# Train-test split: draw 100 row indices of iris for training; the remaining
# rows form the test set. seq_len() is used instead of 1:nrow() because it
# stays well-defined (empty) for a zero-row data frame.
trn <- sample(seq_len(nrow(iris)), 100)
tst <- setdiff(seq_len(nrow(iris)), trn)

# Fit both forests on the training rows with 50 trees each: a ranger forest
# predicting Species from the other columns, and an adversarial random forest
# grown without parallel execution.
rf <- ranger(
  Species ~ .,
  data = iris[trn, ],
  num.trees = 50
)
arf <- adversarial_rf(
  iris[trn, ],
  num_trees = 50,
  parallel = FALSE
)

## ----par, eval=FALSE----------------------------------------------------------
# # Register cores - Unix
# library(doParallel)
# registerDoParallel(cores = 2)

## ----par2, eval=FALSE---------------------------------------------------------
# # Register cores - Windows
# library(doParallel)
# cl <- makeCluster(2)
# registerDoParallel(cl)

## ----rfae3--------------------------------------------------------------------
# Rerun in parallel. `parallel = TRUE` is spelled out so the contrast with the
# sequential fit above is visible; it only has an effect once a foreach
# backend has been registered (see the doParallel chunks above).
arf <- adversarial_rf(iris[trn, ], num_trees = 50, parallel = TRUE)
# Same call style as the first ranger fit: `data` passed by name.
rf <- ranger(Species ~ ., data = iris[trn, ], num.trees = 50)

## ----encoding-----------------------------------------------------------------
# Build one encoding map per forest type from the training rows.
# k = 2 keeps the embedding two-dimensional so that it can be plotted.
emap_arf <- encode(arf, iris[trn, ], k = 2)
emap_rf <- encode(rf, iris[trn, ], k = 2)

# Show the raw inputs next to their embeddings, five rows each.
# Original training rows
head(iris[trn, ], 5)
# Corresponding rows of the ARF embedding
emap_arf$Z[seq_len(5), ]
# Corresponding rows of the RF embedding
emap_rf$Z[seq_len(5), ]

## ----encoding2, fig.height=5, fig.width=7-------------------------------------
# Scatter the two ARF embedding coordinates of the training rows, coloured by
# species.
train_emb <- data.frame(
  dim1 = emap_arf$Z[, 1],
  dim2 = emap_arf$Z[, 2],
  class = iris$Species[trn]
)
ggplot(data = train_emb, mapping = aes(dim1, dim2, colour = class)) +
  geom_point(alpha = 0.8, size = 2) +
  labs(
    colour = "Species",
    x = "Diffusion Component 1",
    y = "Diffusion Component 2"
  ) +
  theme_minimal()

## ----encoding4----------------------------------------------------------------
# Pull the two-dimensional object stored as `A` in the ARF encoding and show
# its top-left 5 x 5 corner.
A <- emap_arf$A
A[seq_len(5), seq_len(5)]

## ----encoding5, fig.height=5, fig.width=7-------------------------------------
# Embed the held-out rows using the encoding learned on the training data.
emb <- predict(emap_arf, arf, iris[tst, ])

# Scatter the two embedding coordinates of the test rows, coloured by species.
test_emb <- data.frame(
  dim1 = emb[, 1],
  dim2 = emb[, 2],
  class = iris$Species[tst]
)

ggplot(data = test_emb, mapping = aes(dim1, dim2, colour = class)) +
  geom_point(alpha = 0.8, size = 2) +
  labs(
    colour = "Species",
    x = "Diffusion Component 1",
    y = "Diffusion Component 2"
  ) +
  theme_minimal()

## ----decoding1----------------------------------------------------------------
# Map the test embeddings back to the input space with decode_knn().
out <- decode_knn(arf, emap_arf, emb)
# First five reconstructed test rows
out$x_hat[seq_len(5), ]
# The same five rows of the original test data, for comparison
head(iris[tst, ], 5)

## ----errors-------------------------------------------------------------------
# Compare the reconstructed test rows against the original test rows.
errors <- reconstruction_error(out$x_hat, iris[tst, ])
# Per-feature error for the numerical features
errors$num_error
# Per-feature error for the categorical features
errors$cat_error
# Average error over the numerical features
errors$num_avg
# Average error over the categorical features
errors$cat_avg
# Overall reconstruction error
errors$ovr_error

## ----errors2, fig.height=5, fig.width=7---------------------------------------

# Plot the reconstruction error of each feature, with the overall error drawn
# as a dashed reference line.
# One row per feature: its name, its error, and whether it is numeric or
# categorical.
error_df <- data.frame(
  Variable = c(names(errors$num_error), names(errors$cat_error)),
  Error = c(unlist(errors$num_error), unlist(errors$cat_error)),
  Type = c(
    rep("Numeric", length(errors$num_error)),
    rep("Categorical", length(errors$cat_error))
  )
)

# Bars are ordered by increasing error via reorder().
ggplot(error_df, aes(x = reorder(Variable, Error), y = Error, fill = Type)) +
  geom_col(width = 0.7) +
  # The overall error is a single constant, so it is set as a parameter rather
  # than mapped through aes(); mapping it would recycle the value over every
  # row of error_df and draw one overplotted line per row.
  geom_hline(
    yintercept = errors$ovr_error,
    linetype = "dashed", color = "red"
  ) +
  # Label the reference line just above it, near the left edge of the plot.
  annotate(
    "text",
    x = 1.5, y = errors$ovr_error + 0.02,
    label = paste("Avg Error:", round(errors$ovr_error, 3))
  ) +
  theme_minimal() +
  labs(
    title = "Reconstruction Error by Feature",
    x = NULL,
    y = "Distortion"
  )