---
title: "Data Generation: Mechanisms and Examples"
css: css/datagen.css
name: RMSTpowerBoost-DataGen
output:
  rmarkdown::html_vignette:
    toc: true
    fig_caption: true
    code_folding: show
    df_print: paged
    highlight: null
    self_contained: true
vignette: >
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
  %\VignetteIndexEntry{Data Generation: Mechanisms and Examples}
  %\VignetteDepends{RMSTpowerBoost}
---


```{r setup, include=FALSE, purl=FALSE}
is_pkgdown <- identical(Sys.getenv("IN_PKGDOWN"), "true")

cache_root <- Sys.getenv("RMSTPOWERBOOST_VIGNETTE_CACHE", unset = "")
if (!nzchar(cache_root)) {
  # R CMD build renders vignettes in a temporary copy; local builds need a stable cache path.
  cache_root <- if (nzchar(Sys.getenv("_R_CHECK_PACKAGE_NAME_"))) {
    "cache"
  } else {
    file.path(tools::R_user_dir("RMSTpowerBoost", "cache"), "vignettes")
  }
}
cache_path <- file.path(cache_root, "RMSTpowerBoost-DataGen", "html")
dir.create(cache_path, recursive = TRUE, showWarnings = FALSE)
cache_path <- paste0(normalizePath(cache_path, winslash = "/", mustWork = FALSE), "/")

pkg_root <- if (file.exists("../DESCRIPTION") && dir.exists("../R")) ".." else "."
pkg_version <- read.dcf(file.path(pkg_root, "DESCRIPTION"))[1, "Version"]
r_files <- list.files(file.path(pkg_root, "R"), pattern = "[.]R$",
                      recursive = TRUE, full.names = TRUE)
cache_extra <- paste(
  getRversion(),
  pkg_version,
  paste(unname(tools::md5sum(r_files)), collapse = ""),
  sep = "-"
)

knitr::opts_chunk$set(
  collapse = TRUE,
  cache = TRUE,
  cache.path = cache_path,
  cache.extra = cache_extra,
  cache.lazy = FALSE,
  purl = FALSE,
  comment = NA,
  fig.width = 6, fig.height = 4,
  warning = FALSE, message = FALSE
)
library(RMSTpowerBoost)
set.seed(1)
```

# Quick start with `rmst.sim()`

For common settings — AFT or PH baseline, random censoring, and standard covariate distributions — `rmst.sim()` generates data from one function call. Use `covar_continuous()`, `covar_binary()`, and `covar_categorical()` to define covariates directly.

```{r quick_start, eval=TRUE}
df <- rmst.sim(
  n              = 200,
  model          = "aft_lognormal",
  baseline       = list(mu = 2.5, sigma = 0.6),
  treat_effect   = -0.25,
  covariates     = list(
    covar_continuous("age",  mean = 62, sd = 10),
    covar_binary("sex",      p = 0.45),
    covar_continuous("x",    dist = "lognormal", meanlog = 0, sdlog = 0.5)
  ),
  target_censoring = 0.25,
  L    = 36,
  seed = 42
)

print(df)
```

The result is a data frame of class `rmst_simdata`. Pass it directly to `rmst.power()` or `rmst.ss()`:

```r
rmst.power(Surv(time, status) ~ age + sex + x,
           data = df, arm = "arm",
           sample_sizes = c(100, 200, 300), L = 36)
```

For designs requiring explicit covariate-dependent censoring or stratified assignment, use the full recipe system described below.

---

# What this covers

We show how to construct a **recipe** (covariates (X), treatment (A), event time (Y), and censoring (C)), and then simulate datasets with `simulate_from_recipe()`. We also show batch generation via `generate_recipe_sets()` and how to read compact metadata back with `load_recipe_sets()`.

The simulator takes a plain **list** as the recipe. No YAML is required.

## Recipe skeleton

```r
list(
  n = 300,
  covariates = list(defs = list(/* see Covariates */)),
  treatment  = list(/* see Treatment */),
  event_time = list(/* see Event-time engines */),
  censoring  = list(/* see Censoring */),
  seed = 42
)
```

---

# Covariates

Each covariate has a `name`, `type` (`"continuous"` or `"categorical"`), a `dist` with `params`, and optional `transform` steps applied after generation (e.g., `"center(a)"`, `"scale(b)"`).

**Available distributions**

| Type        | `dist` (string) | Parameters                                |
| ----------- | --------------- | ----------------------------------------- |
| continuous  | `normal`        | `mean`, `sd`                              |
| continuous  | `lognormal`     | `meanlog`, `sdlog`                        |
| continuous  | `gamma`         | `shape`, `scale`                          |
| continuous  | `weibull`       | `shape`, `scale`                          |
| continuous  | `uniform`       | `min`, `max`                              |
| continuous  | `beta`          | `shape1`, `shape2`                        |
| continuous  | `t`             | `df`                                      |
| categorical | `bernoulli`     | `p` (probability of 1)                    |
| categorical | `categorical`   | `prob = c(...)`, `labels = c(...)` (opt.) |
| categorical | `ordinal`       | `prob = c(...)`, `labels = c(...)` (opt.) |

**Example: define covariates**

```r
covs <- list(
  list(name="age",   type="continuous",  dist="normal",     params=list(mean=62, sd=10),
       transform=c("center(60)","scale(10)")),
  list(name="sex",   type="categorical", dist="bernoulli",  params=list(p=0.45)),
  list(name="stage", type="categorical", dist="ordinal",
       params=list(prob=c(0.3,0.5,0.2), labels=c("I","II","III"))),
  list(name="x",     type="continuous",  dist="lognormal",  params=list(meanlog=0, sdlog=0.6))
)
```

---

# Treatment

Choose one `assignment`:

| Assignment        | Key fields                           | Meaning                                                                               |
| ----------------- | ------------------------------------ | ------------------------------------------------------------------------------------- |
| `"randomization"` | `allocation = "a:b"`                 | Bernoulli with probability (p_1 = a/(a+b)).                                           |
| `"stratified"`    | `allocation`, `stratify_by = c(...)` | Same allocation **within** each stratum defined by listed **categorical** covariates. |
| `"logistic_ps"`   | `ps_model = list(formula, beta)`     | Treatment probability is `plogis(eta)` from the user-specified model.                 |

**Examples**

Randomization:

```r
tr_rand <- list(assignment="randomization", allocation="1:1")
```

Stratified by `"stage"`:

```r
tr_strat <- list(assignment="stratified", allocation="2:1", stratify_by=c("stage"))
```

Logistic propensity:

```r
tr_ps <- list(
  assignment = "logistic_ps",
  ps_model  = list(
    formula = "~ 1 + x + sex",
    beta    = c(-0.3, 1.2, -0.6)  # (Intercept), x, sex
  )
)
```

---

# Event-time engines

Let `Z` be treatment (0/1), `X` be covariates, and `eta` be the **linear predictor** (defined in `effects`, below). Supported engines and baseline parameterizations:

| Model (user-facing) | `model` value       | Baseline parameters                       | Notes                                                                            |
| ------------------- | ------------------- | ----------------------------------------- | -------------------------------------------------------------------------------- |
| AFT Lognormal       | `"aft_lognormal"`   | `mu`, `sigma`                             | `log(T) = mu + eta + sigma * epsilon`, with `epsilon ~ N(0, 1)`.                 |
| AFT Weibull         | `"aft_weibull"`     | `shape`, `scale`                          | Baseline survival `S0(t) = exp(-(t/scale)^shape)` with an AFT shift via `eta`.   |
| AFT Log-Logistic    | `"aft_loglogistic"` | `shape`, `scale`                          | Log-logistic AFT model with `T = scale * exp(eta) * (U/(1-U))^(1/shape)`.        |
| PH Exponential      | `"ph_pwexp"`        | `rates = c(rate)`, `cuts = numeric(0)`    | Piecewise exponential with a **single** segment is exponential.                  |
| PH Weibull          | `"ph_weibull"`      | `shape`, `scale`                          | Proportional hazards with a Weibull baseline.                                    |
| PH Gompertz         | `"ph_gompertz"`     | `rate`, `gamma`                           | Hazard `h(t) = rate * exp(gamma * t)`.                                           |
| PH Piecewise Exp    | `"ph_pwexp"`        | `rates = c(r1,r2,...)`, `cuts = c(...)`   | Segment-specific hazard is `r_s * exp(eta)`.                                     |

**Effects and linear predictor**

Specify effects on the appropriate scale (AFT: log-time; PH: log-hazard):

```r
effects = list(
  intercept  = 0,                      # default is 0
  treatment  = -0.25,
  covariates = list(age = 0.01, sex = -0.2)
  # or: formula="~ age + sex", beta=c(0.01, -0.2)
)
```

> `effects$covariates` must be a **named list** (e.g., `list(age=0.01)`), not a named vector from `c()`.

---

# Censoring

We assume a **single censoring mechanism** that may depend on observed covariates (but not on the unobserved event time after conditioning on those covariates). Two user-facing modes are supported:

| Mode               | Fields                                                                                                                                                            | Semantics                                                                                                                                                                                       |
| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `"target_overall"` | `target`, `admin_time`                                                                                                                                            | Finds an exponential random-censoring rate so the overall censoring fraction is approximately `target`, with an administrative floor at `admin_time` (if provided).                              |
| `"explicit"`       | Any of: `administrative = list(time=...)`; `random = list(dist="exponential", params=list(rate=...))`; `dependent = list(formula="~ ...", base=..., beta=c(...))` | Compose administrative, random, and **covariate-dependent** censoring directly. The `dependent` block defines a covariate-dependent **log-rate** model; it does **not** denote competing risks. |

> **Note.** No cause-specific or competing-risk censoring is generated here; the simulator's `dependent` option is strictly **covariate-dependent** censoring under a single mechanism.

## Examples

### Target overall censoring

```{r cz_target, eval=TRUE}
cz_target <- list(
  mode        = "target_overall",
  target      = 0.25,   # desired overall censoring fraction
  admin_time  = 36      # optional administrative cutoff (same units as event time)
)
```

### Explicit mix (administrative + random)

```{r cz_explicit, eval=TRUE}
cz_explicit <- list(
  mode = "explicit",
  administrative = list(time = 36),                                      # admin cutoff
  random         = list(dist = "exponential", params = list(rate = 0.02))# random censoring
)
```

### Explicit **covariate-dependent** censoring

The dependent block uses a **log-rate** model:
$$
\log \lambda_c(X) = \log(b) + X^\top \beta,
$$
where `b > 0` is the *baseline rate*. Implementation uses `base` as a **rate**, and `beta` on the **log-rate** scale. When `base` is supplied, the **formula must not include an intercept**.

```{r cz_dep_rate, eval=TRUE}
cz_dep <- list(
  mode = "explicit",
  dependent = list(
    # sex is coded numeric 0/1 in these recipes; if yours is a factor, use I(as.numeric(sex)-1)
    formula = "~ 0 + age + sex",   # NO intercept because we use 'base' as the rate
    base    = 0.03,                # positive baseline rate
    beta    = c(0.012, 0.35)       # (age, sex) on log-rate scale
  ),
  administrative = list(time = 48) # optional admin cutoff
)
```

> If you prefer a log-intercept style, omit `base` and include an intercept in the formula and in `beta`, e.g. `formula="~ 1 + age + sex"; beta=c(-3.5, 0.012, 0.35)`.

---

# Using a censoring recipe inside a full simulation

```{r full_sim_single_mechanism, eval=TRUE}
# Covariates (sex will be numeric 0/1 here)
covs <- list(
  list(name="age", type="continuous",  dist="normal",    params=list(mean=62, sd=10)),
  list(name="sex", type="categorical", dist="bernoulli", params=list(p=0.45))
)

# Censoring: single covariate-dependent mechanism using a positive base rate
cz_dep <- list(
  mode = "explicit",
  dependent = list(
    formula = "~ 0 + age + sex",  # NO intercept; sex is numeric 0/1
    base    = 0.03,               # baseline censoring rate (>0)
    beta    = c(0.012, 0.35)      # (age, sex) on log-rate scale
  ),
  administrative = list(time = 48)
)

# Quick sanity check (optional)
mm   <- model.matrix(~ 0 + age + sex, data.frame(age=c(50,70), sex=c(0,1)))
rate <- cz_dep$dependent$base * exp(mm %*% matrix(cz_dep$dependent$beta, ncol=1))
stopifnot(all(rate > 0 & is.finite(rate)))

# Recipe
rec <- list(
  n = 300,
  covariates = list(defs = covs),
  treatment  = list(assignment="randomization", allocation="1:1"),
  event_time = list(
    model    = "ph_pwexp",
    baseline = list(rates = c(0.05), cuts = numeric(0)), # exponential baseline
    effects  = list(intercept=0, treatment=-0.3, covariates=list(age=0.01, sex=-0.2))
  ),
  censoring  = cz_dep,
  seed = 2025
)

# Simulate
dat <- simulate_from_recipe(validate_recipe(rec))

# Output
knitr::kable(
  data.frame(achieved_censoring = attr(dat, "achieved_censoring")),
  caption = "Achieved censoring rate"
)
knitr::kable(utils::head(dat, 8), caption = "First 8 rows of simulated data")
```

---

# Worked examples

We now build full recipes and call `simulate_from_recipe()`. We report realized censoring via `attr(dat, "achieved_censoring")`.

## Example 1 — AFT Lognormal

```{r ex1_lognormal, eval=TRUE}
# Covariates
covs1 <- list(
  list(name="age",   type="continuous",  dist="normal",     params=list(mean=62, sd=10),
       transform=c("center(60)","scale(10)")),
  list(name="sex",   type="categorical", dist="bernoulli",  params=list(p=0.45)),
  list(name="stage", type="categorical", dist="ordinal",
       params=list(prob=c(0.3,0.5,0.2), labels=c("I","II","III"))),
  list(name="x",     type="continuous",  dist="lognormal",  params=list(meanlog=0, sdlog=0.6))
)

# Recipe
rec1 <- list(
  n = 300,
  covariates = list(defs = covs1),
  treatment  = list(assignment="randomization", allocation="1:1"),
  event_time = list(model="aft_lognormal",
                    baseline=list(mu=3.0, sigma=0.6),
                    effects=list(intercept=0, treatment=-0.25,
                                 covariates=list(age=0.01, sex=-0.2, x=0.05))),
  censoring  = list(mode="target_overall", target=0.25, admin_time=36),
  seed = 11
)

# Simulate
dat1 <- simulate_from_recipe(validate_recipe(rec1))

# Display: data (first rows)
knitr::kable(utils::head(dat1, 8), caption = "Example 1 — AFT Lognormal: first 8 rows")

# Display: key attributes (compact)
attr_tbl1 <- data.frame(
  attribute = c("achieved_censoring", "n_rows", "n_cols", "class"),
  value = c(
    as.character(attr(dat1, "achieved_censoring")),
    nrow(dat1),
    ncol(dat1),
    paste(class(dat1), collapse = ", ")
  )
)
knitr::kable(attr_tbl1, caption = "Example 1 — Attributes")
```

## Example 2 — AFT Weibull

```{r ex2_weibull, eval=TRUE}
# Start from Example 1 and tweak the event-time engine
rec2 <- rec1
rec2$event_time <- list(
  model    = "aft_weibull",
  baseline = list(shape=1.3, scale=12),
  effects  = list(intercept=0, treatment=-0.20, covariates=list(age=0.008, x=0.04))
)

dat2 <- simulate_from_recipe(validate_recipe(rec2), seed=12)

# Display: data (first rows)
knitr::kable(utils::head(dat2, 8), caption = "Example 2 — AFT Weibull: first 8 rows")

# Display: key attributes (compact)
attr_tbl2 <- data.frame(
  attribute = c("achieved_censoring", "n_rows", "n_cols", "class"),
  value = c(
    as.character(attr(dat2, "achieved_censoring")),
    nrow(dat2),
    ncol(dat2),
    paste(class(dat2), collapse = ", ")
  )
)
knitr::kable(attr_tbl2, caption = "Example 2 — Attributes")
```

## Example 3 — PH Exponential (single segment)

```{r ex3_ph_exp1, eval=TRUE}
rec3 <- list(
  n = 400,
  covariates = list(defs = covs1),
  treatment  = list(assignment="randomization", allocation="1:1"),
  event_time = list(
    model    = "ph_pwexp",
    baseline = list(rates = c(0.05), cuts = numeric(0)),
    effects  = list(intercept=0, treatment=-0.3, covariates=list(age=0.01, x=0.03))
  ),
  censoring  = list(mode="target_overall", target=0.20, admin_time=30),
  seed = 13
)

dat3 <- simulate_from_recipe(validate_recipe(rec3))

# Display: data (first rows)
knitr::kable(utils::head(dat3, 8), caption = "Example 3 — PH Exponential: first 8 rows")

# Display: a small summary + attributes
summ3 <- data.frame(
  Metric = c("n", "Events", "Censoring rate", "Mean time", "Median time"),
  Value  = c(nrow(dat3),
             sum(dat3$status == 1, na.rm = TRUE),
             round(mean(dat3$status == 0, na.rm = TRUE), 3),
             round(mean(dat3$time,   na.rm = TRUE), 2),
             round(stats::median(dat3$time, na.rm = TRUE), 2))
)
knitr::kable(summ3, caption = "Example 3 — Summary")

attr_tbl3 <- data.frame(
  attribute = c("achieved_censoring", "n_rows", "n_cols", "class"),
  value = c(
    as.character(attr(dat3, "achieved_censoring")),
    nrow(dat3),
    ncol(dat3),
    paste(class(dat3), collapse = ", ")
  )
)
knitr::kable(attr_tbl3, caption = "Example 3 — Attributes")
```

## Example 4 — PH Piecewise Exponential (multi-segment)

```{r ex4_ph_exp_multi, eval=TRUE}
rec4 <- list(
  n = 500,
  covariates = list(defs = list(
    list(name="age", type="continuous",  dist="normal",    params=list(mean=60, sd=8)),
    list(name="sex", type="categorical", dist="bernoulli", params=list(p=0.5)),
    list(name="x",   type="continuous",  dist="lognormal", params=list(meanlog=0, sdlog=0.5))
  )),
  treatment  = list(assignment="randomization", allocation="1:1"),
  event_time = list(
    model    = "ph_pwexp",
    baseline = list(rates=c(0.10, 0.06, 0.03), cuts=c(6, 18)),
    effects  = list(intercept=0, treatment=-0.4, covariates=list(age=0.01, x=0.03))
  ),
  censoring  = list(mode="target_overall", target=0.25, admin_time=30),
  seed = 123
)

dat4 <- simulate_from_recipe(validate_recipe(rec4))

# Display: data (first rows)
knitr::kable(utils::head(dat4, 8), caption = "Example 4 — PH Piecewise Exponential (multi-segment): first 8 rows")

# Display: summary + attributes
summ4 <- data.frame(
  Metric = c("n", "Events", "Censoring rate", "Mean time", "Median time"),
  Value  = c(nrow(dat4),
             sum(dat4$status == 1, na.rm = TRUE),
             round(mean(dat4$status == 0, na.rm = TRUE), 3),
             round(mean(dat4$time,   na.rm = TRUE), 2),
             round(stats::median(dat4$time, na.rm = TRUE), 2))
)
knitr::kable(summ4, caption = "Example 4 — Summary")

attr_tbl4 <- data.frame(
  attribute = c("achieved_censoring", "n_rows", "n_cols", "class"),
  value = c(
    as.character(attr(dat4, "achieved_censoring")),
    nrow(dat4),
    ncol(dat4),
    paste(class(dat4), collapse = ", ")
  )
)
knitr::kable(attr_tbl4, caption = "Example 4 — Attributes")
```

---

# Generate data based on formula (event & censoring)

This example uses `~` formulas for both event-time covariate effects and covariate-dependent censoring. We keep the treatment effect as a separate scalar to avoid design ambiguity.

```{r ex_formula_both_selfcontained, eval=TRUE}
# Covariates (sex numeric 0/1)
covs_f <- list(
  list(name="age", type="continuous",  dist="normal",    params=list(mean=62, sd=10)),
  list(name="sex", type="categorical", dist="bernoulli", params=list(p=0.45))
)

trt_f <- list(assignment="randomization", allocation="1:1")

# Event-time engine: AFT lognormal with formula-based covariate effects
evt_f <- list(
  model    = "aft_lognormal",
  baseline = list(mu = 3.1, sigma = 0.55),
  effects  = list(
    treatment  = -0.25,                 # effect of treatment (log-time scale)
    formula    = "~ 0 + age + sex",     # covariate effects via formula (no intercept)
    beta       = c(0.010, -0.20)        # (age, sex) -- match order in formula
  )
)

# Censoring: covariate-dependent via log-rate model (single mechanism)
cz_f <- list(
  mode = "explicit",
  dependent = list(
    formula = "~ 0 + age + sex",  # NO intercept; sex numeric 0/1
    base    = 0.03,               # positive rate
    beta    = c(0.012, 0.35)      # (age, sex) on log-rate scale
  ),
  administrative = list(time = 48)  # optional admin cutoff
)

# Full recipe
rec_f <- list(
  n = 350,
  covariates = list(defs = covs_f),
  treatment  = trt_f,
  event_time = evt_f,
  censoring  = cz_f,
  seed = 777
)

# Simulate
dat_f <- simulate_from_recipe(validate_recipe(rec_f))

# Display
knitr::kable(
  data.frame(achieved_censoring = attr(dat_f, "achieved_censoring")),
  caption = "Achieved censoring rate (formula-based example)"
)
knitr::kable(utils::head(dat_f, 8), caption = "First 8 rows — formula-based example")
```

---

# Batch generation with metadata

For simulation studies, write multiple scenarios and formats together. The writer creates a `manifest.rds` with a **list-column** `meta` describing each dataset. The loader reattaches attributes when reading back.

```{r batch_selfcontained, eval=TRUE}
# Build a base recipe here (self-contained)
base <- validate_recipe(list(
  n = 200,
  covariates = list(defs = list(
    list(name="age", type="continuous",  dist="normal", params=list(mean=62, sd=10)),
    list(name="sex", type="categorical", dist="bernoulli", params=list(p=0.45)),
    list(name="x",   type="continuous",  dist="lognormal", params=list(meanlog=0, sdlog=0.6))
  )),
  treatment  = list(assignment="randomization", allocation="1:1"),
  event_time = list(
    model    = "aft_weibull",
    baseline = list(shape=1.3, scale=12),
    effects  = list(intercept=0, treatment=-0.20, covariates=list(age=0.008, x=0.04))
  ),
  censoring  = list(mode="target_overall", target=0.25, admin_time=36),
  seed = 3026
))

out_dir <- file.path(tempdir(), "rmstss-manifest-demo")
unlink(out_dir, recursive = TRUE, force = TRUE)

man <- generate_recipe_sets(
  base_recipe = base,
  vary = list(n = c(200, 400),
              "event_time.effects.treatment" = c(-0.15, -0.25)),
  out_dir  = out_dir,
  formats  = c("rds","csv"),
  n_reps   = 1,
  seed_base = 2025
)

# Inspect the first row's compact metadata (fields only; no file paths)
m <- readRDS(file.path(out_dir, "manifest.rds"))
names(m)
if ("meta" %in% names(m) && length(m$meta[[1]]) > 0) {
  list(model = m$meta[[1]]$model,
       baseline = m$meta[[1]]$baseline,
       effects = m$meta[[1]]$effects,
       achieved_censoring = m$meta[[1]]$achieved_censoring,
       n = m$meta[[1]]$n)
} else {
  "Manifest is minimal (older run); use rebuild_manifest() to enrich."
}

# Load datasets back
sets <- load_recipe_sets(file.path(out_dir, "manifest.rds"))
attr(sets[[1]]$data, "achieved_censoring")
str(sets[[1]]$meta)
```

---

# Reproducibility tips

Set `seed` in the recipe or pass `seed=` to `simulate_from_recipe()`.

> **Assumptions.** The censoring hazard depends on observed covariates via a user-specified single mechanism. There are no competing risks. Datasets contain only `time`, `status`, treatment, and covariates - no dependent-censoring indicators.