% % %
PortfolioTesteR lets you test ML-based stock selection with the same workflow you already know.
All examples use the bundled weekly/daily datasets; no internet is required.
# Attach the package under test and data.table, silencing startup banners
suppressPackageStartupMessages({
  library(PortfolioTesteR)
  library(data.table)
})

# Load both bundled sample price tables in one call
data(sample_prices_weekly, sample_prices_daily)

# Optional: first rows of the first six columns of the weekly table
head(sample_prices_weekly[, 1:6])
#> Key: <Date>
#> Date AAPL AMZN BA BAC CAT
#> <Date> <num> <num> <num> <num> <num>
#> 1: 2017-01-06 27.23376 39.7995 147.5085 18.70115 76.05074
#> 2: 2017-01-13 27.49475 40.8570 147.2582 18.97326 77.22778
#> 3: 2017-01-20 27.71648 40.4165 147.9072 18.66817 77.95100
#> 4: 2017-01-27 28.16688 41.7885 155.4819 19.26185 81.58563
#> 5: 2017-02-03 29.81370 40.5100 150.5680 19.20414 76.87957
#> 6: 2017-02-10 30.64816 41.3730 155.4447 19.03097 79.37682
Simple pooled regression on tabular features. The helpers build lagged features and future labels (4-week horizon).
# Tabular inputs: features from the weekly and daily price tables, and
# log labels from the weekly table with a horizon of 4
X <- ml_prepare_features(sample_prices_weekly, sample_prices_daily)
Y <- make_labels(sample_prices_weekly, type = "log", horizon = 4L)

# Baseline learner; the returned object supplies the fit/predict pair
ridge <- ml_make_model("ridge")

# Seed the RNG immediately before the run
set.seed(1)
res_baseline <- ml_backtest(
  name = "Baseline Ridge (pooled)",
  # data
  features_list = X,
  labels = Y,
  prices = sample_prices_weekly,
  # learner
  fit_fn = ridge$fit,
  predict_fn = ridge$predict,
  # walk-forward setup: 2y IS, 1m OOS, monthly step
  schedule = list(is = 104L, oos = 4L, step = 4L),
  # IS-only scaling (no leakage)
  transform = "zscore",
  # portfolio construction
  selection = list(top_k = 15L),
  weighting = list(method = "rank"),
  initial_capital = 1e5
)

# Summary of the backtest component of the result
print(res_baseline$backtest)
#> Backtest Result: Baseline Ridge (pooled)
#> =====================================
#> Warmup Period: 104 observations (no trading)
#> Active Period: 2018-12-31 to 2019-12-31 (54 observations)
#> Initial Capital: $1e+05
#> Final Value: $135,743
#> Total Return (active period): 35.74%
#> Total Return (full period): 35.74%
#> Transactions: 784
#>
#> Annualized Return: 34.22%
#> Annualized Volatility: 11.74%
#> Sharpe Ratio: 2.91
#> Max Drawdown: -6.08%
# Draw the "performance" plot for the baseline backtest
plot(res_baseline$backtest, type = "performance")
This section is optional. It runs only if at least one of `ranger` or `xgboost` is installed.
# The availability flags and the `heavy` switch are not defined anywhere in
# the visible code. Define them only when they are missing, so values set
# elsewhere are respected and the chunk does not stop with
# "object '...' not found" when it is run as shown.
if (!exists("has_rf")) {
  has_rf <- requireNamespace("ranger", quietly = TRUE)
}
if (!exists("has_xgb")) {
  has_xgb <- requireNamespace("xgboost", quietly = TRUE)
}
if (!exists("heavy")) {
  heavy <- FALSE # assumption: the lighter settings are the default
}

# Start with ridge (always available)
models <- list(ml_make_model("ridge"))

# Add RF only if ranger is installed
if (has_rf) {
  models <- c(models, list(
    ml_make_model(
      "rf",
      num.trees = if (heavy) 400L else 200L,
      mtry = 3L
    )
  ))
}

# Add XGBoost only if xgboost is installed
if (has_xgb) {
  models <- c(models, list(
    ml_make_model(
      "xgboost",
      params = list(objective = "reg:squarederror", max_depth = 4L, eta = 0.07),
      nrounds = if (heavy) 300L else 150L
    )
  ))
}

# Build the ensemble from whatever is available; do.call() spreads the list
# so each learner is passed as a separate argument
ens <- do.call(ml_make_ensemble, models)

# Seed the RNG immediately before the run
set.seed(2)
res_ens <- ml_backtest(
  features_list = X, labels = Y,
  fit_fn = ens$fit, predict_fn = ens$predict,
  schedule = list(is = 104L, oos = 4L, step = 4L),
  transform = "zscore",
  selection = list(top_k = 15L),
  weighting = list(method = "rank"),
  prices = sample_prices_weekly,
  initial_capital = 1e5,
  name = "Ensemble (available learners)"
)
print(res_ens$backtest)
#> Backtest Result: Ensemble (available learners)
#> =====================================
#> Warmup Period: 104 observations (no trading)
#> Active Period: 2018-12-31 to 2019-12-31 (54 observations)
#> Initial Capital: $1e+05
#> Final Value: $133,347
#> Total Return (active period): 33.35%
#> Total Return (full period): 33.35%
#> Transactions: 872
#>
#> Annualized Return: 31.93%
#> Annualized Volatility: 12.13%
#> Sharpe Ratio: 2.63
#> Max Drawdown: -7.33%
# Draw the "performance" plot for the ensemble backtest
plot(res_ens$backtest, type = "performance")
This section is optional and runs only if `xgboost` is installed.
# `heavy` is not defined in the visible code; fall back to the lighter
# settings when it is missing so the chunk does not stop with
# "object 'heavy' not found". A value set elsewhere is left untouched.
if (!exists("heavy")) {
  heavy <- FALSE
}

# Every column of the weekly table except Date is treated as a symbol
symbols <- setdiff(names(sample_prices_weekly), "Date")
gmap <- demo_sector_map(symbols, n_groups = 4L) # demo mapping for the sample data

xgb_g <- ml_make_model(
  "xgboost",
  params = list(objective = "reg:squarederror", max_depth = 3L, eta = 0.05),
  nrounds = if (heavy) 250L else 150L
)

# Seed the RNG immediately before the run
set.seed(3)
res_xgb_sect <- ml_backtest(
  features_list = X, labels = Y,
  fit_fn = xgb_g$fit, predict_fn = xgb_g$predict,
  schedule = list(is = 104L, oos = 4L, step = 4L),
  group = "per_group", group_map = gmap, # sector-neutral training
  transform = "zscore",
  selection = list(top_k = 15L),
  weighting = list(method = "softmax", temperature = 12),
  prices = sample_prices_weekly,
  initial_capital = 1e5,
  name = "XGBoost (per-sector neutral)"
)
print(res_xgb_sect$backtest)
#> Backtest Result: XGBoost (per-sector neutral)
#> =====================================
#> Warmup Period: 104 observations (no trading)
#> Active Period: 2018-12-31 to 2019-12-31 (54 observations)
#> Initial Capital: $1e+05
#> Final Value: $129,909
#> Total Return (active period): 29.91%
#> Total Return (full period): 29.91%
#> Transactions: 870
#>
#> Annualized Return: 28.66%
#> Annualized Volatility: 10.95%
#> Sharpe Ratio: 2.62
#> Max Drawdown: -6.74%
# Draw the "performance" plot for the per-sector XGBoost backtest
plot(res_xgb_sect$backtest, type = "performance")
Optional and disabled by default. Enable locally by running `Sys.setenv(RUN_SEQ = "true")` before knitting, and only if your installation supports sequence models.
# `heavy` is not defined in the visible code; fall back to the lighter
# settings when it is missing so the chunk does not stop with
# "object 'heavy' not found". A value set elsewhere is left untouched.
if (!exists("heavy")) {
  heavy <- FALSE
}

# NOTE(review): tf_autograph_scope() is not defined in the visible code;
# presumably it comes from the package or a hidden setup chunk - confirm.
tf_autograph_scope() # isolate TF Autograph cache to avoid detritus NOTE

# Build a compact 'returns pyramid' of momentum-like sequences (lagged)
r1 <- panel_lag(calc_momentum(sample_prices_weekly, 1L), 1L)
r4 <- panel_lag(calc_momentum(sample_prices_weekly, 4L), 1L)
r12 <- panel_lag(calc_momentum(sample_prices_weekly, 12L), 1L)
features_seq <- list(r1 = r1, r4 = r4, r12 = r12)

# Same label construction as the tabular examples (horizon 4, log type)
Y_seq <- make_labels(sample_prices_weekly, horizon = 4L, type = "log")

# Model size and training length depend on the `heavy` switch
units <- if (heavy) 32L else 8L
epochs <- if (heavy) 12L else 4L

seq_gru <- ml_make_seq_model(
  "gru",
  steps = 26L,
  units = units,
  epochs = epochs,
  seed = 42L
)

# Seed the R RNG immediately before the run
set.seed(42)
res_seq_gru <- ml_backtest_seq(
  features_list = features_seq,
  labels = Y_seq,
  steps = 26L, # same value as passed to ml_make_seq_model() above
  horizon = 4L, # same value as used for Y_seq above
  fit_fn = seq_gru$fit,
  predict_fn = seq_gru$predict,
  schedule = list(is = 104L, oos = 4L, step = 4L),
  group = "pooled",
  normalize = "zscore",
  selection = list(top_k = 10L),
  weighting = list(method = "softmax", temperature = 12),
  prices = sample_prices_weekly,
  initial_capital = 1e5,
  name = "Seq-GRU (pooled): returns pyramid (26x3)"
)
print(res_seq_gru$backtest)
plot(res_seq_gru$backtest, type = "performance")
Optional and disabled by default. Enable with `RUN_SEQ="true"`.
# `heavy` is not defined in the visible code; fall back to the lighter
# settings when it is missing so the chunk does not stop with
# "object 'heavy' not found". A value set elsewhere is left untouched.
if (!exists("heavy")) {
  heavy <- FALSE
}

# NOTE(review): tf_autograph_scope() is not defined in the visible code;
# presumably it comes from the package or a hidden setup chunk - confirm.
tf_autograph_scope() # isolate TF Autograph cache for this chunk too

# Every column of the weekly table except Date is treated as a symbol
symbols <- setdiff(names(sample_prices_weekly), "Date")
gmap <- demo_sector_map(symbols, n_groups = 4L)

# Model size and training length depend on the `heavy` switch
units_g <- if (heavy) 32L else 8L
epochs_g <- if (heavy) 12L else 4L

seq_gru_g <- ml_make_seq_model(
  "gru",
  steps = 26L,
  units = units_g,
  epochs = epochs_g,
  seed = 123L
)

# Seed the R RNG immediately before the run
set.seed(123)

# features_seq and Y_seq are the objects created in the pooled Seq-GRU chunk;
# that chunk must have been run first.
res_seq_gru_g <- ml_backtest_seq(
  features_list = features_seq,
  labels = Y_seq,
  steps = 26L, # same value as passed to ml_make_seq_model() above
  horizon = 4L,
  fit_fn = seq_gru_g$fit,
  predict_fn = seq_gru_g$predict,
  schedule = list(is = 104L, oos = 4L, step = 4L),
  group = "per_group", group_map = gmap,
  normalize = "zscore",
  selection = list(top_k = 10L),
  weighting = list(method = "softmax", temperature = 12),
  prices = sample_prices_weekly,
  initial_capital = 1e5,
  name = "Seq-GRU (per-sector neutral): returns pyramid (26x3)"
)
print(res_seq_gru_g$backtest)
plot(res_seq_gru_g$backtest, type = "performance")
Runs only off-CRAN (when `NOT_CRAN=true`).
# Candidate values for the two tuned settings
topk_vals <- c(8L, 10L, 12L, 15L)
temp_vals <- c(8, 12, 16)

# Score the ridge learner (X, Y and ridge come from the baseline chunk) over
# the grid below; method and transform are held at a single value each.
score_tbl <- tune_ml_backtest(
  features_list = X, labels = Y, prices = sample_prices_weekly,
  fit_fn = ridge$fit, predict_fn = ridge$predict,
  schedule = list(is = 104L, oos = 4L, step = 4L),
  grid = list(
    top_k = topk_vals,
    temperature = temp_vals,
    method = "softmax",
    transform = "zscore"
  )
)

# Show up to ten rows, highest `sharpe` first. head() is used instead of
# `[1:10]` because indexing a data.table past its last row pads the result
# with all-NA rows, which would happen as soon as the grid is shrunk below
# ten combinations.
# NOTE(review): assumes score_tbl is a data.table with a `sharpe` column, as
# implied by the bare `order(-sharpe)` inside `[` - confirm.
head(score_tbl[order(-sharpe)], 10L)
Try next:
1) Replace GRU with LSTM/CNN1D if available (still gated by `RUN_SEQ`).
2) Blend tabular and sequence scores (e.g., average ranks) before weighting.
3) Add exposure caps or per-sector position limits when selecting Top-K.
# Print the R version, platform, locale and package versions for this run
sessionInfo()
#> R version 4.2.1 (2022-06-23 ucrt)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 26200)
#>
#> Matrix products: default
#>
#> locale:
#> [1] LC_COLLATE=C
#> [2] LC_CTYPE=English_United Kingdom.utf8
#> [3] LC_MONETARY=English_United Kingdom.utf8
#> [4] LC_NUMERIC=C
#> [5] LC_TIME=English_United Kingdom.utf8
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] data.table_1.16.0 PortfolioTesteR_0.1.3
#>
#> loaded via a namespace (and not attached):
#> [1] Rcpp_1.0.13 bslib_0.9.0 compiler_4.2.1 jquerylib_0.1.4
#> [5] iterators_1.0.14 tools_4.2.1 xts_0.14.1 digest_0.6.37
#> [9] jsonlite_2.0.0 evaluate_1.0.5 lifecycle_1.0.4 lattice_0.20-45
#> [13] rlang_1.1.6 Matrix_1.6-5 foreach_1.5.2 cli_3.6.1
#> [17] rstudioapi_0.17.1 curl_7.0.0 yaml_2.3.10 xfun_0.53
#> [21] fastmap_1.2.0 ranger_0.16.0 knitr_1.50 sass_0.4.10
#> [25] xgboost_1.7.11.1 grid_4.2.1 glmnet_4.1-10 R6_2.6.1
#> [29] survival_3.3-1 rmarkdown_2.29 TTR_0.24.4 codetools_0.2-18
#> [33] htmltools_0.5.8.1 splines_4.2.1 shape_1.4.6.1 cachem_1.1.0
#> [37] zoo_1.8-12