catboost models

Function Works
tidypredict_fit(), tidypredict_sql(), parse_model()
tidypredict_to_column()
tidypredict_test()
tidypredict_interval(), tidypredict_sql_interval()
parsnip

tidypredict_ functions

library(catboost)
# Prepare data: three numeric mtcars columns as predictors, hp as the
# regression target
X <- data.matrix(mtcars[, c("mpg", "cyl", "disp")])
y <- mtcars$hp

# Wrap the feature matrix and label in a CatBoost pool with explicit
# feature names
pool <- catboost.load_pool(
  X,
  label = y,
  feature_names = as.list(c("mpg", "cyl", "disp"))
)

# Train a small RMSE model (10 iterations, depth 3); logging and file
# output are switched off
model <- catboost.train(
  pool,
  params = list(
    iterations = 10L,
    depth = 3L,
    learning_rate = 0.5,
    loss_function = "RMSE",
    logging_level = "Silent",
    allow_writing_files = FALSE
  )
)

# Pipe mtcars through tidypredict_to_column() with the trained model and
# inspect the result
mtcars %>%
  tidypredict_to_column(model) %>%
  glimpse()

Supported objectives

CatBoost supports many objective functions. The following objectives are supported by tidypredict:

Regression objectives (identity transform)

Binary classification (sigmoid transform)

Multiclass classification

Binary classification example

# Predictors are the same three mtcars columns; the label is the 0/1
# transmission indicator `am`
X_bin <- data.matrix(mtcars[c("mpg", "cyl", "disp")])
y_bin <- mtcars[["am"]]

# Build the training pool with explicit feature names
pool_bin <- catboost.load_pool(
  data = X_bin,
  label = y_bin,
  feature_names = as.list(c("mpg", "cyl", "disp"))
)

# Fit a small Logloss model (10 iterations, depth 3) without logging or
# file output
model_bin <- catboost.train(
  learn_pool = pool_bin,
  params = list(
    iterations = 10L,
    depth = 3L,
    learning_rate = 0.5,
    loss_function = "Logloss",
    logging_level = "Silent",
    allow_writing_files = FALSE
  )
)

# Run tidypredict_test() on the binary model, supplying the feature matrix
# through xg_df
tidypredict_test(model_bin, xg_df = X_bin)

Multiclass classification example

# The four iris measurement columns are the predictors; Species is recoded
# to zero-based integer class labels
X_multi <- data.matrix(iris[1:4])
y_multi <- as.integer(iris[["Species"]]) - 1L

# Build the training pool, taking feature names from the matrix columns
pool_multi <- catboost.load_pool(
  data = X_multi,
  label = y_multi,
  feature_names = as.list(colnames(X_multi))
)

# Fit a small MultiClass model (10 iterations, depth 3) without logging or
# file output
model_multi <- catboost.train(
  learn_pool = pool_multi,
  params = list(
    iterations = 10L,
    depth = 3L,
    learning_rate = 0.5,
    loss_function = "MultiClass",
    logging_level = "Silent",
    allow_writing_files = FALSE
  )
)

# For a multiclass model, tidypredict_fit() returns a list of formulas,
# one per class; show the names of that list
formulas <- tidypredict_fit(model_multi)
names(formulas)

Test multiclass predictions:

# Test the multiclass model's predictions, supplying the feature matrix
# through xg_df
tidypredict_test(model_multi, xg_df = X_multi)

Categorical features

CatBoost models can use categorical features with one-hot encoding.

With raw CatBoost

For raw CatBoost models, you need to manually establish the hash-to-category mapping:

# Example data: one numeric feature, one categorical feature stored as a
# factor (catboost.load_pool() treats factor columns of a data frame as
# categorical), and a numeric target
set.seed(123)
df_cat <- data.frame(
  num_feat = rnorm(100),
  cat_feat = factor(sample(c("A", "B", "C"), 100, replace = TRUE)),
  target = rnorm(100)
)

# Build the pool straight from the data frame columns
pool_cat <- catboost.load_pool(
  df_cat[, c("num_feat", "cat_feat")],
  label = df_cat$target
)

# Train a small RMSE model; one_hot_max_size = 10 is set so that the
# three-level categorical feature is one-hot encoded
model_cat <- catboost.train(
  pool_cat,
  params = list(
    iterations = 10L,
    depth = 3L,
    learning_rate = 0.5,
    loss_function = "RMSE",
    logging_level = "Silent",
    allow_writing_files = FALSE,
    one_hot_max_size = 10
  )
)

# Parse and set category mapping manually
pm_cat <- parse_model(model_cat)
pm_cat <- set_catboost_categories(pm_cat, model_cat, df_cat)

# Now use the parsed model
tidypredict_fit(pm_cat)

Parse model spec

Here is an example of the model spec:

# Parse the trained regression model into a tidypredict model spec
pm <- parse_model(model)

# Show the top two levels of the spec, then the structure of the first tree
str(pm, max.level = 2)
str(pm$trees[1])

Limitations