--- title: "Automatic Protection" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Automatic Protection} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r, include = FALSE} required <- c("bench", "brio", "callr", "cli", "cpp11", "decor", "desc", "glue", "purrr", "readr", "stringr", "utils", "vctrs", "withr") if (!all(vapply(required, requireNamespace, logical(1), quietly = TRUE))) { knitr::opts_chunk$set(eval = FALSE) knitr::knit_exit() } ``` ```{r, include=FALSE} library(cppally) ``` ```{r, include=FALSE} cpp_source <- function(..., code, debug = FALSE, env = parent.frame()){ preamble <- c("#include ", "using namespace cppally;") code <- paste(c(preamble, code), collapse = "\n") cppally::cpp_source(debug = debug, env = env, code = code, ...) } chunk_impl <- function(x, language){ paste0("```", language, "\n", x, "\n```\n") } as_code_chunk <- function(x, language){ cat(chunk_impl(x, language)) } as_cpp_chunk <- function(x){ as_code_chunk(x, "cpp") } ``` ```{r, include=FALSE} # Compile necessary examples in one-go # All examples are benchmarks so debug = FALSE # cppally-only examples examples <- c( bench_protect_insert_release_cppally = ' #include using namespace cppally; #include [[cppally::register]] double bench_protect_insert_release_cppally(int n) { SEXP dummy = Rf_ScalarInteger(42); R_PreserveObject(dummy); auto start = std::chrono::high_resolution_clock::now(); for (int i = 0; i < n; ++i) { r_sexp x(dummy); // insert into pool } // destructor → release from pool auto end = std::chrono::high_resolution_clock::now(); R_ReleaseObject(dummy); double ns = std::chrono::duration(end - start).count(); return ns / n; // nanoseconds per insert+release cycle } ', bench_protect_copy_cppally = ' #include using namespace cppally; #include [[cppally::register]] double bench_protect_copy_cppally(int n) { SEXP dummy = Rf_ScalarInteger(42); r_sexp dummy2 = r_sexp(dummy); auto start = std::chrono::high_resolution_clock::now(); for (int i = 0; i < n; ++i) { r_sexp x = dummy2; // Copy } auto end = std::chrono::high_resolution_clock::now(); double ns = std::chrono::duration(end - start).count(); return ns / n; // nanoseconds per copy } ', cppally_na_count = ' [[cppally::register]] int cppally_na_count(r_vec x){ r_size_t n = x.length(); int na_count = 0; for (r_size_t i = 0; i < n; ++i){ r_str str = x.get(i); // r_str protects the underlying CHARSXP na_count += is_na(str); } return na_count; } ', cppally_fast_na_count = ' [[cppally::register]] int cppally_fast_na_count(r_vec x){ r_size_t n = x.length(); int na_count = 0; for (r_size_t i = 0; i < n; ++i){ r_str_view str = x.get(i); // `r_str_view` does NOT re-protect the underlying CHARSXP na_count += is_na(str); } return na_count; } ', cppally_fast_na_count_v2 = ' [[cppally::register]] int cppally_fast_na_count_v2(r_vec x){ r_size_t n = x.length(); int na_count = 0; for (r_size_t i = 0; i < n; ++i){ // view() is safe in a short-lived read-only context na_count += is_na(x.view(i)); } return na_count; } ' ) # cpp11-using examples (and the R C API baseline). Compiled separately so the # cpp11 includes / linking_to / namespace usage stays out of the cppally-only # translation unit. cpp11_examples <- c( bench_protect_insert_release_cpp11 = ' #include [[cppally::linking_to("cpp11")]] using namespace cpp11; #include [[cppally::register]] double bench_protect_insert_release_cpp11(int n) { SEXP dummy = Rf_ScalarInteger(42); R_PreserveObject(dummy); auto start = std::chrono::high_resolution_clock::now(); for (int i = 0; i < n; ++i) { sexp x(dummy); // insert into pool } // destructor → release from pool auto end = std::chrono::high_resolution_clock::now(); R_ReleaseObject(dummy); double ns = std::chrono::duration(end - start).count(); return ns / n; // nanoseconds per insert+release cycle } ', bench_protect_copy_cpp11 = ' #include [[cppally::linking_to("cpp11")]] using namespace cpp11; #include [[cppally::register]] double bench_protect_copy_cpp11(int n) { SEXP dummy = Rf_ScalarInteger(42); sexp dummy2 = sexp(dummy); auto start = std::chrono::high_resolution_clock::now(); for (int i = 0; i < n; ++i) { sexp x = dummy2; // Copy } auto end = std::chrono::high_resolution_clock::now(); double ns = std::chrono::duration(end - start).count(); return ns / n; // nanoseconds per copy } ', C_na_count = ' // Pure R C API NA count - As fast as it can reasonably get [[cppally::register]] // Registered via cppally for convenience int C_na_count(SEXP x){ r_size_t n = Rf_xlength(x); int na_count = 0; const SEXP *p_x = STRING_PTR_RO(x); for (r_size_t i = 0; i < n; ++i){ SEXP str = p_x[i]; // No protection so no extra overhead na_count += str == NA_STRING; } return na_count; } ', cpp11_na_count = ' #include [[cppally::linking_to("cpp11")]] [[cppally::register]] int cpp11_na_count(SEXP x){ using namespace cpp11; strings x_(x); R_xlen_t n = x_.size(); int na_count = 0; for (R_xlen_t i = 0; i < n; ++i){ r_string str = x_[i]; // r_string protects the underlying CHARSXP na_count += cpp11::is_na(str); } return na_count; } ' ) # Display-only snippets (no [[cppally::register]] tag, so not exposed to R). # Compiled in their own batch since they are illustrative rather than benchmarked. display_only <- c( view_good = ' void good(r_str x){ r_str_view str = x; if (str.cpp_str() == "true"){ print("true"); } else { print("false"); } } ', view_bad = ' r_str_view bad(){ r_str new_str("I will be destroyed at the end of `bad()`"); r_str_view bad_str = new_str; // A view of new_str return bad_str; // Points to underlying CHARSXP but nothing protecting it } ' ) cpp_source(code = paste(examples, collapse = "\n")) cpp_source(code = paste(cpp11_examples, collapse = "\n")) cpp_source(code = paste(display_only, collapse = "\n")) ``` ## Motivation and design Heavily inspired by cpp11's double-linked protection pool, this vignette explores cppally's automatic protection design and benchmarks its overhead against cpp11. Like cpp11, cppally offers automatic protection for all cppally types enabling users of the API to not need to think about protection of every R object. cppally uses a double-linked protection pool design like cpp11, but instead of a single pair list, we implement a double-linked chain of VECSXP (vector list) chunks with a reference counting system. This offers much less overhead in inserting and releasing and practically free copying due to the reference counting design. Because we utilise vector list chunks, adding nodes is cheap (via `SET_VECTOR_ELT()`). When an `r_sexp` is copied, we increment the reference count associated with that node and only release the node once the reference count is 0, which means all `r_sexp` objects pointing to that `SEXP` have been destroyed. The vector list chunks initialise at size 2^10 and double in size every time a chunk is filled, to a maximum size of 2^14. While these sizes seem to strike a nice balance, they are somewhat arbitrary and likely not optimal, so there is likely room for improvement. ## Protection benchmark: cpp11 vs cppally All functions have been compiled with C++20, GCC 14.2.0 with -O2 optimisations. First, let's load cppally ```{r} library(cppally) ``` **Insert/release benchmark** ```{r, echo=FALSE, comment="", results='asis'} as_cpp_chunk(cpp11_examples[["bench_protect_insert_release_cpp11"]]) ``` ```{r, echo=FALSE, comment="", results='asis'} as_cpp_chunk(examples[["bench_protect_insert_release_cppally"]]) ``` **Results in nanoseconds per insert & release** ```{r} insert_release_cpp11 <- replicate(10^4, bench_protect_insert_release_cpp11(10^4)) mean(insert_release_cpp11) insert_release_cppally <- replicate(10^4, bench_protect_insert_release_cppally(10^4)) mean(insert_release_cppally) ``` On my machine, cpp11 performs an insert & release every ~`r round(mean(insert_release_cpp11))` nanoseconds. cppally performs significantly better, with ~`r round(mean(insert_release_cppally))` nanoseconds per insert & release. **Copy benchmark** ```{r, echo=FALSE, comment="", results='asis'} as_cpp_chunk(cpp11_examples[["bench_protect_copy_cpp11"]]) ``` ```{r, echo=FALSE, comment="", results='asis'} as_cpp_chunk(examples[["bench_protect_copy_cppally"]]) ``` **Results in nanoseconds per copy** ```{r} copy_sexp_cpp11 <- replicate(10^4, bench_protect_copy_cpp11(10^4)) mean(copy_sexp_cpp11) copy_sexp_cppally <- replicate(10^4, bench_protect_copy_cppally(10^4)) mean(copy_sexp_cppally) ``` In these benchmark results we can see a drastic difference, with cpp11 at ~`r round(mean(copy_sexp_cpp11))` ns/copy and cppally at ~`r round(mean(copy_sexp_cppally), 1)` ns/copy. **Impact of protection overhead, a real example** Let's look at a real example of how much protection overhead can have an impact on performance. **Example:** Counting the number of `NA` values in a character vector ```{r, echo=FALSE, comment="", results='asis'} as_cpp_chunk(cpp11_examples[["C_na_count"]]) as_cpp_chunk(cpp11_examples[["cpp11_na_count"]]) as_cpp_chunk(examples[["cppally_na_count"]]) ``` ```{r} set.seed(42) x <- sample(letters, 10^5, TRUE) x[sample.int(length(x), 10^3)] <- NA ``` **R C API results** - Extremely fast <30 microseconds ```{r} library(bench) mark(C_na_count(x)) ``` **cpp11 results** - ~5 milliseconds ```{r} mark(cpp11_na_count(x)) ``` **cppally results** - ~750 microseconds ```{r} mark(cppally_na_count(x)) ``` Counting values is a simple operation and because of its simplicity, the overhead associated with protection will dominate the benchmark. We can see that when removing the protection overhead and using the R C API directly, we get a baseline result of 30 microseconds. Think of that as the best possible result for this operation. The same operation with cpp11 results in a function execution time of 5 milliseconds, 180x slower than the R C API. Considerably better (but still relatively slower), `cppally_na_count()` results in a function execution time of 730 microseconds, 27x slower than the R C API. This is much better though it still goes to show that for certain operations, one may want to consider other approaches where performance is critical. ## cppally views: A solution to the protection overhead problem As we saw in the previous section, certain performance-heavy functions can be slowed down by cppally protection overhead. Luckily cppally offers some tools to avoid this if it becomes a real issue. ### r_str_view In the previous section, we extracted a temporary `r_str`, checked if it is `NA`, and then incremented the `NA` count if so. We didn't end up using the `r_str` element beyond just reading its value which means we could have improved performance by using `r_str_view`, a non-owning version of `r_str`. ```{r, echo=FALSE, comment="", results='asis'} as_cpp_chunk(examples[["cppally_fast_na_count"]]) ``` ```{r} mark(cppally_fast_na_count(x)) ``` Looking at the benchmark results, we have effectively eliminated the protection overhead and the median execution time is much closer to that of the R C API. ### view() We could have also used `view()`, a non-owning version of `get()` that extracts elements but in a non-owning context. ```{r, echo=FALSE, comment="", results='asis'} as_cpp_chunk(examples[["cppally_fast_na_count_v2"]]) ``` ```{r} mark(cppally_fast_na_count_v2(x)) ``` The results are similar to that of `cppally_fast_na_count()`. ### Using views safely views can be used to eliminate the small overhead associated with automatic protection of objects wrapping SEXP, but they must be used carefully. As demonstrated above, `r_str_view` is one such class which is a lightweight wrapper around a `SEXP` and never protects the underlying `SEXP`. Its lifetime must be shorter than the object it is pointing to. **DO**: ```{r, echo=FALSE, comment="", results='asis'} as_cpp_chunk(display_only[["view_good"]]) ``` The above example is safe because `str` is not returned by `good()` AND does not outlive `x`. **DON'T**: ```{r, echo=FALSE, comment="", results='asis'} as_cpp_chunk(display_only[["view_bad"]]) ``` The above is a classic example of what **not** to do with string views, which is to have the view outlive the owner. In this case `bad_str` is returned at the end of `bad()` at which point `new_str` goes out of scope and gets destroyed. This means nothing is protecting the underlying `CHARSXP` that `new_str` was protecting and once that `CHARSXP` is garbage-collected by R, `bad_str` will become a dangling-pointer, leading to use-after-free bugs (undefined behaviour, crashes, corrupted data, etc). To conclude, `views` can be a useful tool for performance-critical functions with the downside being that one must pay more attention to view object lifetimes to ensure they do not outlive the SEXP they are pointing to.