## ----dev-load, include=FALSE-------------------------------------------------- # Prefer source build when available (works in RStudio, pkgdown, or local render) if (requireNamespace("devtools", quietly = TRUE) && file.exists(file.path("..","DESCRIPTION"))) { # Don't error on CRAN/build machines that don't have devtools or the source path try(devtools::load_all("..", quiet = TRUE), silent = TRUE) } # If we've already loaded from source, avoid re-attaching a different installed build later from_source <- try({ "DAGassist" %in% loadedNamespaces() && grepl(normalizePath(".."), getNamespaceInfo(asNamespace("DAGassist"), "path"), fixed = TRUE) }, silent = TRUE) from_source <- isTRUE(from_source) # Feature gates (computed *after* attempting load_all) has_show <- tryCatch({ "show" %in% names(formals(DAGassist::DAGassist)) }, error = function(e) FALSE) # Robust check: dev build defines a private .report_dotwhisker helper has_dotwhisker <- tryCatch({ exists(".report_dotwhisker", envir = asNamespace("DAGassist"), inherits = FALSE) }, error = function(e) FALSE) ## ----make-df, include=FALSE--------------------------------------------------- simulate_toy_panel <- function( n_id = 1000, n_t = 5, seed = 20251225, p_band = c(0.001, 0.08), max_tries = 160 ) { stopifnot(length(p_band) == 2, p_band[1] < p_band[2]) clamp <- function(x, lo, hi) pmin(pmax(x, lo), hi) inv_logit <- function(z) 1 / (1 + exp(-z)) rmultinom1 <- function(prob_named) { levs <- names(prob_named) levs[which.max(stats::runif(1) <= cumsum(prob_named))] } set.seed(seed) id <- seq_len(n_id) gender <- factor(sample(c("Female", "Male"), n_id, replace = TRUE, prob = c(0.51, 0.49))) immigrant <- rbinom(n_id, 1, 0.13) urban <- rbinom(n_id, 1, 0.72) class_levels <- c("Low", "Working", "Middle", "Upper") class <- factor(sample(class_levels, n_id, replace = TRUE, prob = c(0.18, 0.38, 0.34, 0.10)), levels = class_levels, ordered = TRUE) class_num <- as.integer(class) - 1L age0 <- round(clamp(rnorm(n_id, mean = 35, sd = 16), 0, 100 - (n_t - 1)), 1) # ----------------------- # Contract type (baseline; affects edu_year, income, children) # ----------------------- contract_levels <- c("Informal", "Temporary", "Permanent") contract0 <- character(n_id) for (i in seq_len(n_id)) { cnum <- class_num[i] u <- urban[i] im <- immigrant[i] gF <- (gender[i] == "Female") a0 <- age0[i] # Scores: Permanent higher for higher class + urban; Informal higher for immigrant score <- c( Informal = 0.15 - 0.25*cnum + 0.10*im - 0.05*u + 0.02*(a0 - 35), Temporary = 0.05 + 0.05*cnum - 0.05*im + 0.02*u + 0.01*(a0 - 35), Permanent = -0.20 + 0.22*cnum - 0.08*im + 0.10*u - 0.02*gF + 0.01*(a0 - 35) ) prob <- exp(score) / sum(exp(score)) contract0[i] <- rmultinom1(prob) } contract <- factor(rep(contract0, each = n_t), levels = contract_levels) # Numeric index for linear effects (Informal=0, Temporary=1, Permanent=2) contract_num <- as.integer(factor(contract0, levels = contract_levels)) - 1L contract_num_p <- rep(contract_num, each = n_t) # ----------------------- # Preference for children (baseline; affects children only) # ----------------------- # Interpretable desired children: ~0 to 5, centered at ~2 pref0 <- clamp(rnorm(n_id, mean = 2.0, sd = 1.0), 0, 5) pref <- rep(pref0, each = n_t) # Centered version for linear predictors pref_c <- pref0 - 2.0 pref_c_p <- rep(pref_c, each = n_t) target_deg_adult <- c( HS_grad = 0.40, Some_college = 0.28, BA = 0.22, MA = 0.08, PhD_or_prof = 0.02 ) stopifnot(abs(sum(target_deg_adult) - 1) < 1e-8) edu_me_sd <- 0.70 income_intercept <- 10.45 income_sd <- 0.58 marriage_intercept <- -3.75 children_cap <- 12L edu_fert_scale <- 1.25 assign_deg_by_quantiles <- function(z, age0, target_deg_adult) { adult_idx <- which(age0 >= 25) qs <- stats::quantile( z[adult_idx], probs = cumsum(target_deg_adult)[1:4], names = FALSE, type = 8 ) assign_deg <- function(zv, cuts) { out <- rep("PhD_or_prof", length(zv)) out[zv < cuts[4]] <- "MA" out[zv < cuts[3]] <- "BA" out[zv < cuts[2]] <- "Some_college" out[zv < cuts[1]] <- "HS_grad" out } deg <- rep(NA_character_, length(z)) deg[adult_idx] <- assign_deg(z[adult_idx], qs) # 22–24: allow BA but NOT grad degrees idx_22_24 <- which(age0 >= 22 & age0 < 25) if (length(idx_22_24) > 0) { tmp <- assign_deg(z[idx_22_24], qs) tmp[tmp %in% c("MA", "PhD_or_prof")] <- "BA" deg[idx_22_24] <- tmp } # 18–21: allow Some_college but NOT BA+ idx_18_21 <- which(age0 >= 18 & age0 < 22) if (length(idx_18_21) > 0) { tmp <- assign_deg(z[idx_18_21], qs) tmp[tmp %in% c("BA", "MA", "PhD_or_prof")] <- "Some_college" deg[idx_18_21] <- tmp } # <18: in progress deg[age0 < 18] <- "In_progress" deg[is.na(deg)] <- "HS_grad" deg } # helper: map degree to target years of schooling degree_to_target_years <- function(deg, age0) { target_years <- numeric(length(deg)) # K-12 accumulation for in-progress kids/teens k12 <- clamp(age0 - 5, 0, 13) target_years[deg == "In_progress"] <- k12[deg == "In_progress"] target_years[deg == "HS_grad"] <- 12 + rnorm(sum(deg == "HS_grad"), 0, 0.30) target_years[deg == "Some_college"] <- 13.5 + rnorm(sum(deg == "Some_college"), 0, 0.45) target_years[deg == "BA"] <- 16 + rnorm(sum(deg == "BA"), 0, 0.55) target_years[deg == "MA"] <- 18 + rnorm(sum(deg == "MA"), 0, 0.65) target_years[deg == "PhD_or_prof"] <- 21 + rnorm(sum(deg == "PhD_or_prof"), 0, 0.70) # feasibility given age max_feasible <- clamp(age0 - 5, 0, 22) clamp(target_years, 0, max_feasible) } # ---- Try loop: redraw endogenous noise until calibration holds ---- for (try_idx in seq_len(max_tries)) { set.seed(seed + 7000 + try_idx) # Panel skeleton year <- rep(0:(n_t - 1), times = n_id) pid <- rep(id, each = n_t) age <- clamp(rep(age0, each = n_t) + year, 0, 100) gender_p <- rep(gender, each = n_t) immigrant_p <- rep(immigrant, each = n_t) urban_p <- rep(urban, each = n_t) class_p <- rep(class, each = n_t) class_num_p <- rep(class_num, each = n_t) # ----------------------- # Education propensity index z # ----------------------- abil <- rnorm(n_id, 0, 1) z <- 0.85 * abil + 0.55 * class_num + 0.20 * urban + (-0.10) * immigrant + 0.05 * (gender == "Female") + (-0.04) * (age0 - 35) + 0.22 * contract_num + # <--- contract raises schooling rnorm(n_id, 0, 0.35) # Robust degree assignment (adult shares forced; HS > BA forced) deg <- assign_deg_by_quantiles(z, age0, target_deg_adult) edu_degree <- factor(rep(deg, each = n_t), levels = c("In_progress","HS_grad","Some_college","BA","MA","PhD_or_prof")) # True baseline education from degree + feasibility edu0_true <- degree_to_target_years(deg, age0) edu_target <- clamp(edu0_true, 0, 22) # target is baseline level (then accrual for young) # Evolve education in panel (<25 can grow toward a slightly higher personal target) # Give some additional accumulation for young people (esp. HS->BA pathways). edu_target2 <- edu_target bump <- rep(0, n_id) bump[deg %in% c("HS_grad","Some_college","BA") & age0 < 23] <- rnorm(sum(deg %in% c("HS_grad","Some_college","BA") & age0 < 23), 1.2, 0.6) bump <- clamp(bump, 0, 3.0) edu_target2 <- clamp(edu_target2 + bump, 0, 22) edu_true <- numeric(length(pid)) edu_true[year == 0] <- edu0_true for (i in seq_len(n_id)) { idx <- which(pid == i) for (t in seq_along(idx)) { if (t == 1) next prev <- edu_true[idx[t - 1]] a <- age[idx[t]] if (a < 25 && prev < edu_target2[i]) { delta <- clamp(rnorm(1, mean = 1.0, sd = 0.25), 0, 1.2) edu_true[idx[t]] <- pmin(prev + delta, edu_target2[i]) } else { edu_true[idx[t]] <- prev } } } # observed edu_year (measurement error) edu_year <- clamp(edu_true + rnorm(length(pid), 0, edu_me_sd), 0, 22) # ----------------------- # Job stability (time-varying; treatment-induced intermediate Z) # - affected by edu_true (treatment) and baseline contract # - affects mediators (income, married, birth_control) and outcome (children) # ----------------------- rho_job <- 0.70 # persistence over time job_trait_i <- rnorm(n_id, 0, 0.8) # stable id-level trait job_stability_t <- numeric(length(pid)) for (i in seq_len(n_id)) { idx <- which(pid == i) # baseline level at t=0 base0 <- -0.25 + 0.10 * (edu_true[idx[1]] - 12) + 0.30 * contract_num[i] + 0.10 * class_num[i] + 0.12 * urban[i] + (-0.10) * immigrant[i] + 0.02 * (age0[i] - 35) + 0.05 * (gender[i] == "Female") + job_trait_i[i] + rnorm(1, 0, 0.45) job_stability_t[idx[1]] <- clamp(base0, -3, 3) if (length(idx) > 1) { for (t in 2:length(idx)) { prev <- job_stability_t[idx[t - 1]] # current-period innovation (edu affects job stability each period) innov <- -0.05 + 0.08 * (edu_true[idx[t]] - 12) + 0.08 * (age[idx[t]] - 35) / 10 + rnorm(1, 0, 0.50) job_stability_t[idx[t]] <- clamp(rho_job * prev + (1 - rho_job) * (base0 + innov), -3, 3) } } } # ----------------------- # Religion (depends on baseline edu_true + class + urban) # ----------------------- rel_levels <- c("Christian","Muslim","Hindu","Buddhist","Jewish","Unaffiliated","Other") religion0 <- character(n_id) for (i in seq_len(n_id)) { e <- edu0_true[i]; cnum <- class_num[i]; u <- urban[i] score <- c( Christian = 0.95 - 0.010*e - 0.02*u + 0.03*cnum, Muslim = -0.45 - 0.006*e + 0.02*u, Hindu = -1.55 + 0.004*e, Buddhist = -1.60 + 0.004*e, Jewish = -1.85 + 0.030*e + 0.04*cnum + 0.03*u, Unaffiliated = 0.10 + 0.045*e + 0.08*u, Other = -0.95 + 0.004*e ) prob <- exp(score) / sum(exp(score)) religion0[i] <- rmultinom1(prob) } religion <- factor(rep(religion0, each = n_t), levels = rel_levels) # ----------------------- # Married (no married < 17; edu delays slightly) # ----------------------- rel_marriage_effect <- setNames(c( Christian = 0.18, Muslim = 0.20, Hindu = 0.10, Buddhist = 0.06, Jewish = 0.10, Unaffiliated = -0.15, Other = 0.00 ), rel_levels) lp_married <- marriage_intercept + 0.22 * (age - 18) + (-0.00115) * (age - 38)^2 + (-0.012) * edu_true + 0.35 * job_stability_t + # <--- add rel_marriage_effect[as.character(religion)] married <- rbinom(length(pid), 1, inv_logit(lp_married)) married[age < 17] <- 0L # ----------------------- # Birth control (edu increases use; marriage increases use) # ----------------------- rel_bc_effect <- setNames(c( Christian = -0.05, Muslim = -0.20, Hindu = -0.10, Buddhist = -0.05, Jewish = 0.05, Unaffiliated = 0.15, Other = 0.00 ), rel_levels) lp_bc <- -1.05 + 0.085 * edu_true + 0.55 * married + 0.25 * job_stability_t + # <--- add 0.04 * (age - 18) - 0.0009 * (age - 30)^2 + rel_bc_effect[as.character(religion)] birth_control <- rbinom(length(pid), 1, inv_logit(lp_bc)) # ----------------------- # Income (US-like) # ----------------------- lp_inc <- income_intercept + 0.060 * (edu_true - 12) + 0.12 * urban_p + (-0.05) * immigrant_p + 0.10 * class_num_p + (-0.03) * (gender_p == "Female") + 0.030 * (age - 25) - 0.00055 * (age - 45)^2 + 0.18 * contract_num_p + # from earlier 0.22 * job_stability_t income <- exp(rnorm(length(pid), mean = lp_inc, sd = income_sd)) income[age < 16] <- exp(rnorm(sum(age < 16), mean = 8.45, sd = 0.25)) # ----------------------- # Children: degree gradient + negative edu effect, with dispersion # ----------------------- deg_fert_effect <- c( In_progress = 0.00, HS_grad = 0.00, Some_college = -0.06, BA = -0.10, MA = -0.12, PhD_or_prof = -0.14 ) deg_eff_i <- deg_fert_effect[deg] fert_i <- rnorm(n_id, 0, 0.95) fert_pref <- rep(fert_i, each = n_t) rel_fert_effect <- setNames(c( Christian = 0.10, Muslim = 0.18, Hindu = 0.08, Buddhist = -0.05, Jewish = -0.02, Unaffiliated = -0.12, Other = 0.00 ), rel_levels) idx0 <- which(year == 0) job0 <- job_stability_t[idx0] rel0 <- rel_fert_effect[religion0] mar0 <- married[idx0] bc0 <- birth_control[idx0] inc0 <- income[idx0] a0 <- age0 age_term0 <- ifelse(a0 < 15, -10, 0.11 * (pmin(a0, 42) - 18) - 0.0022 * (pmin(a0, 42) - 28)^2) beta_edu_children0 <- (-0.020 * edu_fert_scale) mu_children0 <- exp( -0.85 + age_term0 + 0.55 * mar0 + (-0.45) * bc0 + beta_edu_children0 * edu0_true + deg_eff_i + (-0.0000007) * inc0 + 0.04 * class_num + 0.18 * contract_num + 0.45 * pref_c + 0.25 * job0 + # <--- add rel0 + fert_i ) children0 <- rnbinom(n_id, size = 1.3, mu = mu_children0) children0[a0 < 15] <- 0L children0 <- pmin(children0, children_cap) fertile <- (age >= 15 & age <= 45) beta_edu_births <- (-0.008 * edu_fert_scale) lp_birth <- -3.35 + 0.80 * married + (-0.90) * birth_control + beta_edu_births * edu_true + rep(deg_eff_i, each = n_t) + 0.02 * (age - 22) - 0.0010 * (age - 30)^2 + rel_fert_effect[as.character(religion)] + 0.12 * contract_num_p + 0.35 * pref_c_p + 0.18 * job_stability_t + # <--- add fert_pref births <- rpois(length(pid), exp(lp_birth) * fertile) births <- pmin(births, 2L) children <- numeric(length(pid)) for (i in seq_len(n_id)) { idx <- which(pid == i) children[idx] <- children0[i] + cumsum(births[idx]) } children <- pmin(children, children_cap) # Assemble df <- data.frame( id = pid, year = year, age = age, gender = gender_p, immigrant = factor(immigrant_p, levels = c(0, 1), labels = c("No", "Yes")), urban = factor(urban_p, levels = c(0, 1), labels = c("Rural", "Urban")), class = class_p, religion = religion, contract = contract, pref = pref, edu_year = round(edu_year, 1), edu_degree = edu_degree, married = married, birth_control = birth_control, income = round(income, 0), children = children, job_stability_t = job_stability_t ) # ----------------------- # Calibration: LAST WAVE, age-adjusted edu_year # ----------------------- dL <- df[df$year == max(df$year), ] fit <- lm(children ~ edu_year + age, data = dL) cc <- summary(fit)$coefficients est <- unname(cc["edu_year", "Estimate"]) pval <- unname(cc["edu_year", "Pr(>|t|)"]) if (is.na(pval)) pval <- 0 lo <- p_band[1]; hi <- p_band[2] ok <- (est < 0) && (pval >= lo) && (pval <= hi) if (ok || try_idx == max_tries) { attr(df, "pval_last_wave_age_adj") <- pval attr(df, "edu_est_last_wave_age_adj") <- est attr(df, "edu_fert_scale_used") <- edu_fert_scale attr(df, "edu_me_sd_used") <- edu_me_sd attr(df, "tries") <- try_idx return(df) } # If too weak / wrong sign: strengthen negative edu effect if (est >= 0 || pval > hi) edu_fert_scale <- edu_fert_scale * 1.12 # If too significant: weaken edu effect and add measurement error if (pval < lo) { edu_fert_scale <- edu_fert_scale * 0.85 edu_me_sd <- min(edu_me_sd * 1.08, 2.0) } } } # ---- Example usage ---- dat <- simulate_toy_panel(n_id = 1000, n_t = 5, seed = 20251225) d0 <- subset(dat, year == 0) dL <- subset(dat, year == max(year)) # Calibration check (what we target) summary(lm(children ~ edu_year + age, data = dL)) attr(dat, "pval_last_wave_age_adj") attr(dat, "edu_est_last_wave_age_adj") attr(dat, "tries") # Attainment realism (adult-only) d0_adult <- subset(d0, age >= 25 & edu_degree != "In_progress") prop.table(table(d0_adult$edu_degree)) # Degree-type result (adult, completed degrees) dA <- subset(dL, age >= 25 & edu_degree != "In_progress") dA$edu_degree <- relevel(dA$edu_degree, "HS_grad") df <- dat ## ----summary, echo=FALSE------------------------------------------------------ toy_summary_split <- function(dat, top_k = 3, digits = 2) { stopifnot(is.data.frame(dat)) fmt <- function(x) { ifelse(is.na(x), NA_character_, format(round(x, digits), nsmall = digits, trim = TRUE)) } num_table <- function(df) { out <- lapply(names(df), function(nm) { x <- df[[nm]] s <- summary(x) data.frame( variable = nm, type = class(x)[1], Min = fmt(unname(s["Min."])), Q1 = fmt(unname(s["1st Qu."])), Median = fmt(unname(s["Median"])), Mean = fmt(unname(s["Mean"])), Q3 = fmt(unname(s["3rd Qu."])), Max = fmt(unname(s["Max."])), stringsAsFactors = FALSE ) }) res <- do.call(rbind, out) rownames(res) <- NULL res } fac_table <- function(df) { out <- lapply(names(df), function(nm) { x <- df[[nm]] if (is.logical(x)) x <- factor(x, levels = c(FALSE, TRUE)) if (is.character(x)) x <- factor(x) x <- as.factor(x) tab <- sort(table(x, useNA = "no"), decreasing = TRUE) k <- min(top_k, length(tab)) top <- tab[seq_len(k)] rest <- sum(tab) - sum(top) parts <- paste0(names(top), ":", as.integer(top)) if (rest > 0) parts <- c(parts, paste0("(Other):", as.integer(rest))) data.frame( variable = nm, type = class(df[[nm]])[1], top_levels = paste(parts, collapse = " "), stringsAsFactors = FALSE ) }) res <- do.call(rbind, out) rownames(res) <- NULL res } is_cat <- vapply(dat, function(x) is.factor(x) || is.character(x) || is.logical(x), logical(1)) num_df <- dat[!is_cat] cat_df <- dat[is_cat] list( numeric = if (ncol(num_df)) num_table(num_df) else data.frame(), categorical = if (ncol(cat_df)) fac_table(cat_df) else data.frame() ) } # vignette usage: tabs <- toy_summary_split(dat, top_k = 3) knitr::kable(tabs$numeric, align = "l") knitr::kable(tabs$categorical, align = "l") ## ----example-dag, echo=FALSE, message=FALSE, fig.width=8, fig.height=5, warning=FALSE, fig.cap="*Example: The Causal Effects of Family Background and Life Course Events on Fertility Patterns*"---- library(dagitty) library(ggdag) library(tidyverse) x_pos <- c( children = 10, income = 5, edu_year = 0, urban = 10, immigrant = 10, class = 3, birth_control = 8, married = 5, gender = 3, age = 7, pref = 12, contract = 5, job_stability_t = 7 ) y_pos <- c( children = 0, income = 0.5, edu_year = 0, urban = 2, immigrant = -2, class = -4, birth_control = -4, married = -4, gender = 2, age = 2, pref = 0, contract = 2, job_stability_t = -2 ) dag_model <- dagify( children ~ income + edu_year + urban + immigrant + class + religion + birth_control + married + gender + age + pref + contract + job_stability_t, edu_year ~ class + immigrant + urban + gender + age + contract, job_stability_t ~ edu_year + contract + class + urban + immigrant + age + gender, income ~ edu_year + urban + immigrant + class + gender + age + contract +job_stability_t, birth_control ~ edu_year + religion + married + age + job_stability_t, married ~ edu_year + religion + age + job_stability_t, coords = list(x=x_pos, y=y_pos), exposure = "edu_year", outcome = "children", labels = c( children = "Children", income = "Income", edu_year = "Education", urban = "Urban/Rural", immigrant = "Immigrant", class = "Parents' Class", birth_control = "Birth Control", married = "Married", gender = "Gender", age = "Age", pref = "Preference\nfor Children", contract = "Contract Type", job_stability_t = "Job Stability" ) ) #check if valid DAG #is.dagitty(dag_model) #dagitty::isAcyclic(dag_model) #dagitty::findCycle(dag_model) ggdag_status(dag_model, text = FALSE, use_labels = "label") + geom_dag_label_repel(aes(label = NA)) + theme_dag() + scale_fill_manual( values = c( exposure = "black", outcome = "grey30", none = "grey90" ) ) + scale_color_manual( values = c( exposure = "black", outcome = "grey30", none = "grey60" ) ) + theme(legend.position = "none") ## ----roles-table-------------------------------------------------------------- DAGassist(dag_model, show="roles") ## ----model-comparison--------------------------------------------------------- DAGassist(dag_model, formula = lm(children ~ edu_year + age + class + gender + immigrant + urban + birth_control + income + married + job_stability_t + contract + pref, data = dat)) ## ----sate--------------------------------------------------------------------- DAGassist(dag_model, formula = lm(children ~ edu_year + age + class + gender + immigrant + urban + birth_control + income + married + job_stability_t + contract + pref, data = dat), estimand = "SATE") ## ----cde-est, warning=FALSE, fig.width=8, fig.height=5, fig.cap="*Visualizing all estimands*"---- library(DirectEffects) DAGassist(dag_model, formula = lm(children ~ edu_year + age + class + gender + immigrant + urban + birth_control + income + married + job_stability_t + contract + pref, data = dat), estimand = c("SATE", "SACDE"), type = "dotwhisker")