## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)

## ----setup--------------------------------------------------------------------
library(tidylearn)
library(dplyr)
library(ggplot2)

## -----------------------------------------------------------------------------
# Reduce dimensions before classification
reduced <- tl_reduce_dimensions(iris, response = "Species",
                                method = "pca", n_components = 3)

# Inspect reduced data
head(reduced$data)

## -----------------------------------------------------------------------------
# Train classifier on reduced features (remove the .obs_id column first)
reduced_data <- reduced$data %>% select(-starts_with(".obs"))
model_reduced <- tl_model(reduced_data, Species ~ ., method = "logistic")
print(model_reduced)

## -----------------------------------------------------------------------------
# Make in-sample predictions (accuracy on the training data itself)
preds <- predict(model_reduced)
accuracy <- mean(preds$.pred == iris$Species)
cat("Accuracy with PCA features:", round(accuracy * 100, 1), "%\n")

## -----------------------------------------------------------------------------
# Split data for a fair comparison on held-out observations
split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 123)

# Model with original features
model_original <- tl_model(split$train, Species ~ ., method = "logistic")
preds_original <- predict(model_original, new_data = split$test)
acc_original <- mean(preds_original$.pred == split$test$Species)

# Model with PCA features
reduced_train <- tl_reduce_dimensions(split$train, response = "Species",
                                      method = "pca", n_components = 3)
# Remove the .obs_id column before modeling (it's just an identifier)
reduced_train_data <- reduced_train$data %>% select(-starts_with(".obs"))
model_pca <- tl_model(reduced_train_data, Species ~ ., method = "logistic")

# Transform the test data with the same PCA fitted on the training set
test_predictors <- split$test %>% select(-Species)
test_transformed <- predict(reduced_train$reduction_model, new_data = test_predictors)
test_transformed$Species <- split$test$Species
test_transformed <- test_transformed %>% select(-starts_with(".obs"))

preds_pca <- predict(model_pca, new_data = test_transformed)
acc_pca <- mean(preds_pca$.pred == split$test$Species)

# Compare results
cat("Original features (4):", round(acc_original * 100, 1), "%\n")
cat("PCA features (3):", round(acc_pca * 100, 1), "%\n")
cat("Feature reduction:", round((1 - 3/4) * 100, 1), "%\n")

## -----------------------------------------------------------------------------
# Add cluster features
data_clustered <- tl_add_cluster_features(iris, response = "Species",
                                          method = "kmeans", k = 3)

# Check new features
names(data_clustered)

## -----------------------------------------------------------------------------
# Train a model with cluster features
model_cluster <- tl_model(data_clustered, Species ~ ., method = "forest")
print(model_cluster)
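## ----eval=FALSE---------------------------------------------------------------
# # Optional sketch (not run): visualize how the k-means cluster feature lines
# # up with the true species. This assumes tl_add_cluster_features() names the
# # new column "cluster_kmeans", as the test-set code further below relies on.
# ggplot(data_clustered, aes(x = Sepal.Length, y = Sepal.Width,
#                            color = cluster_kmeans, shape = Species)) +
#   geom_point(size = 2) +
#   labs(title = "k-means cluster feature vs. true species")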
## -----------------------------------------------------------------------------
# Compare models with and without cluster features
split_comp <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 42)

# Without cluster features
model_no_cluster <- tl_model(split_comp$train, Species ~ ., method = "forest")
preds_no_cluster <- predict(model_no_cluster, new_data = split_comp$test)
acc_no_cluster <- mean(preds_no_cluster$.pred == split_comp$test$Species)

# With cluster features
train_clustered <- tl_add_cluster_features(split_comp$train, response = "Species",
                                           method = "kmeans", k = 3)
model_with_cluster <- tl_model(train_clustered, Species ~ ., method = "forest")

# Retrieve the fitted cluster model to assign clusters to the test data
cluster_model <- attr(train_clustered, "cluster_model")
test_clusters <- predict(cluster_model, new_data = split_comp$test[, -5])
test_clustered <- split_comp$test
test_clustered$cluster_kmeans <- as.factor(test_clusters$cluster)

preds_with_cluster <- predict(model_with_cluster, new_data = test_clustered)
acc_with_cluster <- mean(preds_with_cluster$.pred == split_comp$test$Species)

cat("Without cluster features:", round(acc_no_cluster * 100, 1), "%\n")
cat("With cluster features:", round(acc_with_cluster * 100, 1), "%\n")

## -----------------------------------------------------------------------------
# Use only 10% of the labels
set.seed(123)
labeled_indices <- sample(nrow(iris), size = 15)  # only 15 out of 150 labeled!

# Train a semi-supervised model
model_semi <- tl_semisupervised(iris, Species ~ .,
                                labeled_indices = labeled_indices,
                                cluster_method = "kmeans",
                                supervised_method = "logistic")
print(model_semi)

## -----------------------------------------------------------------------------
# Check how labels were propagated from labeled points to clusters
label_mapping <- model_semi$semisupervised_info$label_mapping
print(label_mapping)

## -----------------------------------------------------------------------------
# Evaluate performance (in-sample, over all 150 observations)
preds_semi <- predict(model_semi)
accuracy_semi <- mean(preds_semi$.pred == iris$Species)
cat("Accuracy with only", length(labeled_indices), "labels:",
    round(accuracy_semi * 100, 1), "%\n")
cat("Proportion of data labeled:",
    round(length(labeled_indices) / nrow(iris) * 100, 1), "%\n")

## -----------------------------------------------------------------------------
# Fully supervised baseline trained on the same 15 labeled rows
labeled_data <- iris[labeled_indices, ]
model_full <- tl_model(labeled_data, Species ~ ., method = "logistic")
preds_full <- predict(model_full, new_data = iris)
accuracy_full <- mean(preds_full$.pred == iris$Species)

cat("Fully supervised (15 samples):", round(accuracy_full * 100, 1), "%\n")
cat("Semi-supervised (15 labels + propagation):", round(accuracy_semi * 100, 1), "%\n")

## ----eval=FALSE---------------------------------------------------------------
# # Flag anomalies as a feature
# model_anomaly_flag <- tl_anomaly_aware(iris, Species ~ .,
#                                        response = "Species",
#                                        anomaly_method = "dbscan",
#                                        action = "flag",
#                                        supervised_method = "logistic")
#
# # Check anomaly info
# cat("Anomalies detected:", model_anomaly_flag$anomaly_info$n_anomalies, "\n")

## ----eval=FALSE---------------------------------------------------------------
# # Remove anomalies before training
# model_anomaly_remove <- tl_anomaly_aware(iris, Species ~ .,
#                                          response = "Species",
#                                          anomaly_method = "dbscan",
#                                          action = "remove",
#                                          supervised_method = "logistic")
#
# cat("Anomalies removed:", model_anomaly_remove$anomalies_removed, "\n")

## -----------------------------------------------------------------------------
# Train separate models for different clusters
stratified_models <- tl_stratified_models(mtcars, mpg ~ .,
                                          cluster_method = "kmeans", k = 3,
                                          supervised_method = "linear")

# Check structure
names(stratified_models)
length(stratified_models$supervised_models)

## -----------------------------------------------------------------------------
# Predictions using the stratified models
preds_stratified <- predict(stratified_models)
head(preds_stratified)
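## ----eval=FALSE---------------------------------------------------------------
# # Optional sketch (not run): inspect the per-cluster models. This assumes each
# # element of stratified_models$supervised_models is an ordinary tidylearn
# # model that can be printed like the single models above.
# for (m in stratified_models$supervised_models) {
#   print(m)
# }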
## -----------------------------------------------------------------------------
# Calculate RMSE
rmse_stratified <- sqrt(mean((preds_stratified$.pred - mtcars$mpg)^2))
cat("Stratified Model RMSE:", round(rmse_stratified, 2), "\n")

# Compare with a single global model
model_single <- tl_model(mtcars, mpg ~ ., method = "linear")
preds_single <- predict(model_single)
rmse_single <- sqrt(mean((preds_single$.pred - mtcars$mpg)^2))
cat("Single Model RMSE:", round(rmse_single, 2), "\n")

## -----------------------------------------------------------------------------
# Step 1: Split data
workflow_split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 42)

# Step 2: Reduce dimensions
workflow_reduced <- tl_reduce_dimensions(workflow_split$train, response = "Species",
                                         method = "pca", n_components = 3)

# Step 3: Add cluster features to the reduced data (remove .obs_id first)
workflow_reduced_clean <- workflow_reduced$data %>% select(-starts_with(".obs"))
workflow_clustered <- tl_add_cluster_features(workflow_reduced_clean, response = "Species",
                                              method = "kmeans", k = 3)

# Step 4: Train the final model
workflow_model <- tl_model(workflow_clustered, Species ~ ., method = "forest")
print(workflow_model)

## -----------------------------------------------------------------------------
# Transform the test data through the same pipeline
# 1. Apply the PCA transformation
test_pca <- predict(workflow_reduced$reduction_model,
                    new_data = workflow_split$test[, -5])
test_pca$Species <- workflow_split$test$Species

# 2. Get cluster assignments
cluster_model_wf <- attr(workflow_clustered, "cluster_model")
test_clusters_wf <- predict(cluster_model_wf,
                            new_data = test_pca[, grep("PC", names(test_pca))])
test_pca$cluster_kmeans <- as.factor(test_clusters_wf$cluster)

# 3. Predict
workflow_preds <- predict(workflow_model, new_data = test_pca)
workflow_accuracy <- mean(workflow_preds$.pred == workflow_split$test$Species)
cat("Complete Workflow Accuracy:", round(workflow_accuracy * 100, 1), "%\n")

## -----------------------------------------------------------------------------
# Simulate credit data
set.seed(42)
n <- 500
credit_data <- data.frame(
  age = rnorm(n, 40, 12),
  income = rnorm(n, 50000, 20000),
  debt_ratio = runif(n, 0, 0.5),
  credit_score = rnorm(n, 700, 100),
  years_employed = rpois(n, 5)
)

# Create the target variable (default risk)
credit_data$default <- factor(
  ifelse(credit_data$debt_ratio > 0.4 & credit_data$credit_score < 650, "Yes", "No")
)

# Split data
credit_split <- tl_split(credit_data, prop = 0.7, stratify = "default", seed = 123)

## -----------------------------------------------------------------------------
# Add customer segments as cluster features
credit_clustered <- tl_add_cluster_features(credit_split$train, response = "default",
                                            method = "kmeans", k = 4)
model_credit <- tl_model(credit_clustered, default ~ ., method = "forest")

# Transform the test data
cluster_model_credit <- attr(credit_clustered, "cluster_model")
test_clusters_credit <- predict(cluster_model_credit, new_data = credit_split$test[, -6])
test_credit <- credit_split$test
test_credit$cluster_kmeans <- as.factor(test_clusters_credit$cluster)

preds_credit <- predict(model_credit, new_data = test_credit)
accuracy_credit <- mean(preds_credit$.pred == credit_split$test$default)
cat("Credit Risk Model Accuracy:", round(accuracy_credit * 100, 1), "%\n")
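## ----eval=FALSE---------------------------------------------------------------
# # Optional sketch (not run): accuracy alone can hide poor recall on the rare
# # "Yes" class, so cross-tabulate predictions against actual defaults in base R.
# table(predicted = preds_credit$.pred,
#       actual = credit_split$test$default)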
"pca", n_components = 3) # Remove .obs_id column before clustering final_reduced_clean <- final_reduced$data %>% select(-starts_with(".obs")) final_clustered <- tl_add_cluster_features(final_reduced_clean, response = "Species", method = "kmeans", k = 3) final_model <- tl_model(final_clustered, Species ~ ., method = "logistic") cat("Final integrated model created successfully!\n") print(final_model)