# Runnable code for a knitr document demonstrating the datadiff package.
# Each `## ----label----` marker below starts one chunk; chunks flagged
# `eval = FALSE` are kept as commented-out code and are not executed.

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(datadiff)

## ----first-comparison---------------------------------------------------------
library(datadiff)

# Reference dataset.
ref <- data.frame(
  id = 1:4,
  revenue = c(1000.00, 2000.00, 3000.00, 4000.00),
  category = c("A", "B", "C", "D"),
  active = c(TRUE, TRUE, FALSE, TRUE)
)

# Candidate dataset: same shape, with small numeric drift and case changes.
cand <- data.frame(
  id = 1:4,
  revenue = c(1000.005, 2000.001, 3000.009, 4000.00), # tiny differences
  category = c("a", "b", "c", "D"), # lowercase (last value unchanged)
  active = c(TRUE, TRUE, FALSE, TRUE)
)

## ----first-rules, results = "hide"--------------------------------------------
rules_path <- tempfile(fileext = ".yaml")
write_rules_template(
  ref,
  key = "id",
  path = rules_path,
  numeric_abs = 0.01, # accept differences up to 0.01
  character_case_insensitive = TRUE # ignore case for all char columns
)

## ----first-result-------------------------------------------------------------
result <- compare_datasets_from_yaml(
  ref,
  cand,
  key = "id",
  path = rules_path
)
result$all_passed

## ----return-structure---------------------------------------------------------
names(result)

## ----applied-rules------------------------------------------------------------
result$applied_rules$revenue
result$applied_rules$category

## ----col-presence-------------------------------------------------------------
result$missing_in_candidate
result$extra_in_candidate

## ----failing-rows-------------------------------------------------------------
ref_fail <- data.frame(id = 1:5, value = c(1, 2, 3, 4, 5))
cand_fail <- data.frame(
  id = 1:5,
  value = c(1, 2, 99, 4, 99) # rows 3 and 5 wrong
)

result_fail <- compare_datasets_from_yaml(ref_fail, cand_fail, key = "id")
result_fail$all_passed

# Rows that failed at least one step
# NOTE(review): the element is accessed as `reponse` (not `response`);
# confirm the spelling against the output of `names(result)`.
failed_rows <- pointblank::get_sundered_data(
  result_fail$reponse,
  type = "fail"
)
failed_rows

## ----extract-params, eval = FALSE---------------------------------------------
# # Keep only the first 100 failing rows per validation step
# result <- compare_datasets_from_yaml(ref, cand, key = "id",
#                                      get_first_n = 100)
#
# # Random sample of 50 failing rows per step
# result <- compare_datasets_from_yaml(ref, cand, key = "id",
#                                      sample_n = 50)
#
# # 10% of failing rows, capped at 500
# result <- compare_datasets_from_yaml(ref, cand, key = "id",
#                                      sample_frac = 0.1, sample_limit = 500)
#
# # Disable extraction entirely (fastest — only pass/fail counts are kept)
# result <- compare_datasets_from_yaml(ref, cand, key = "id",
#                                      extract_failed = FALSE)

## ----no-yaml------------------------------------------------------------------
ref_quick <- data.frame(
  id = 1:3,
  x = c(1.0, 2.0, 3.0),
  label = c("A", "B", "C")
)
cand_quick <- data.frame(
  id = 1:3,
  x = c(1.0, 2.0, 3.0),
  label = c("A", "B", "C")
)

# No path needed — rules are generated on the fly
result_quick <- compare_datasets_from_yaml(ref_quick, cand_quick, key = "id")
result_quick$all_passed

## ----read-rules---------------------------------------------------------------
loaded <- read_rules(rules_path)
loaded$defaults$na_equal
loaded$by_type$numeric
loaded$by_type$character

## ----byname-example-----------------------------------------------------------
ref_full <- data.frame(
  id = 1:4,
  # numeric: small absolute tolerance
  price = c(9.99, 19.99, 4.50, 149.00),
  # integer: exact
  quantity = c(10L, 5L, 20L, 1L),
  # needs trim + case
  description = c("Widget A", "Widget B", " Gadget", "TOOL"),
  # logical: exact
  in_stock = c(TRUE, TRUE, FALSE, TRUE),
  created = as.Date(c("2024-01-01", "2024-01-02", "2024-01-03", "2024-01-04"))
)

cand_full <- data.frame(
  id = 1:4,
  # row 1: diff = 0.005 < 0.01
  price = c(9.995, 19.99, 4.50, 149.00),
  quantity = c(10L, 5L, 20L, 1L),
  # case + spaces
  description = c("widget a", "Widget B", "Gadget", "tool"),
  in_stock = c(TRUE, TRUE, FALSE, TRUE),
  created = as.Date(c("2024-01-01", "2024-01-02", "2024-01-03", "2024-01-04"))
)

## ----byname-rules, results = "hide"-------------------------------------------
rules_full <- tempfile(fileext = ".yaml")
write_rules_template(
  ref_full,
  key = "id",
  path = rules_full,
  numeric_abs = 1e-9, # conservative default
  character_case_insensitive = FALSE, # strict default for character
  character_trim = FALSE
)

# Read, patch by_name, write back
rules_obj <- read_rules(rules_full)
rules_obj$by_name$price <- list(abs = 0.01) # ±0.01 for price
rules_obj$by_name$description <- list(case_insensitive = TRUE, trim = TRUE)
yaml::write_yaml(rules_obj, rules_full)

## ----byname-result------------------------------------------------------------
result_full <- compare_datasets_from_yaml(
  ref_full,
  cand_full,
  key = "id",
  path = rules_full
)
result_full$all_passed

# Verify the effective rules for each column
result_full$applied_rules$price
result_full$applied_rules$description
result_full$applied_rules$quantity

## ----abs-tolerance------------------------------------------------------------
ref_num <- data.frame(id = 1:3, price = c(1.00, 1000.00, 1e6))
# Every price is off by 0.005 (inside the 0.01 tolerance set below).
cand_ok <- data.frame(id = 1:3, price = c(1.005, 1000.005, 1e6 + 0.005))
# Every price is off by 0.02 (outside the 0.01 tolerance set below).
cand_nok <- data.frame(id = 1:3, price = c(1.02, 1000.02, 1e6 + 0.02))

rules_abs <- tempfile(fileext = ".yaml")
write_rules_template(
  ref_num,
  key = "id",
  path = rules_abs,
  numeric_abs = 0.01
)

compare_datasets_from_yaml(
  ref_num,
  cand_ok,
  key = "id",
  path = rules_abs
)$all_passed
compare_datasets_from_yaml(
  ref_num,
  cand_nok,
  key = "id",
  path = rules_abs
)$all_passed

## ----rel-tolerance------------------------------------------------------------
rules_rel <- tempfile(fileext = ".yaml")
write_rules_template(
  ref_num,
  key = "id",
  path = rules_rel,
  numeric_abs = 0,
  numeric_rel = 0.01
)

# ref = 1000, diff = 9, threshold = 0.01 × 1000 = 10 → PASS
cand_pct <- data.frame(id = 1:3, price = c(1.009, 1009.0, 1e6 * 1.009))
compare_datasets_from_yaml(
  ref_num,
  cand_pct,
  key = "id",
  path = rules_rel
)$all_passed

## ----ieee754------------------------------------------------------------------
# In double precision, this is slightly above 0.01
100.01 - 100.00

## ----warn-stop, eval = FALSE--------------------------------------------------
# result <- compare_datasets_from_yaml(
#   ref, cand,
#   key = "id",
#   warn_at = 0.05, # warn if > 5% of rows fail any step
#   stop_at = 0.20  # stop (error) if > 20% of rows fail any step
# )

## ----text-comparison----------------------------------------------------------
ref_txt <- data.frame(id = 1:4, label = c("Hello", "World", "Foo", "Bar"))
cand_txt <- data.frame(
  id = 1:4,
  label = c("hello", " World ", "FOO", "Baz") # case, spaces, mismatch
)

# Strict: rows 1-3 differ by case/whitespace and row 4 ("Baz" vs "Bar") is a
# genuine mismatch, so no label is an exact match
rules_strict <- tempfile(fileext = ".yaml")
write_rules_template(ref_txt, key = "id", path = rules_strict)
compare_datasets_from_yaml(
  ref_txt,
  cand_txt,
  key = "id",
  path = rules_strict
)$all_passed

# Relaxed: case + trim — only row 4 ("Baz" vs "Bar") fails
rules_relax <- tempfile(fileext = ".yaml")
write_rules_template(
  ref_txt,
  key = "id",
  path = rules_relax,
  character_case_insensitive = TRUE,
  character_trim = TRUE
)
compare_datasets_from_yaml(
  ref_txt,
  cand_txt,
  key = "id",
  path = rules_relax
)$all_passed

## ----row-validation-----------------------------------------------------------
ref_rows <- data.frame(id = 1:5, value = 1:5)
# NOTE: `cand_ok` is reassigned here; it no longer holds the price data
# created in the abs-tolerance chunk.
cand_ok <- data.frame(id = 1:5, value = 1:5) # 5 rows — exact match
cand_more <- data.frame(id = 1:7, value = 1:7) # 7 rows — 2 extra

rules_count <- tempfile(fileext = ".yaml")
write_rules_template(
  ref_rows,
  key = "id",
  path = rules_count,
  check_count_default = TRUE,
  expected_count_default = 5,
  row_count_tolerance_default = 0
)

compare_datasets_from_yaml(
  ref_rows,
  cand_ok,
  key = "id",
  path = rules_count
)$all_passed
compare_datasets_from_yaml(
  ref_rows,
  cand_more,
  key = "id",
  path = rules_count
)$all_passed

## ----row-tolerance------------------------------------------------------------
rules_tol <- tempfile(fileext = ".yaml")
write_rules_template(
  ref_rows,
  key = "id",
  path = rules_tol,
  check_count_default = TRUE,
  expected_count_default = 5,
  row_count_tolerance_default = 3 # accept 5 ± 3
)

# 7 rows: |7 - 5| = 2 ≤ 3 → PASS
compare_datasets_from_yaml(
  ref_rows,
  cand_more,
  key = "id",
  path = rules_tol
)$all_passed

## ----na-handling--------------------------------------------------------------
ref_na <- data.frame(id = 1:3, value = c(1.0, NA, 3.0))
cand_na <- data.frame(id = 1:3, value = c(1.0, NA, 3.0)) # identical NAs

# na_equal: yes (default) — NA == NA passes
rules_na_yes <- tempfile(fileext = ".yaml")
write_rules_template(
  ref_na,
  key = "id",
  path = rules_na_yes,
  na_equal_default = TRUE
)
compare_datasets_from_yaml(
  ref_na,
  cand_na,
  key = "id",
  path = rules_na_yes
)$all_passed

# na_equal: no — NA == NA fails
rules_na_no <- tempfile(fileext = ".yaml")
write_rules_template(
  ref_na,
  key = "id",
  path = rules_na_no,
  na_equal_default = FALSE
)
compare_datasets_from_yaml(
  ref_na,
  cand_na,
  key = "id",
  path = rules_na_no
)$all_passed

## ----ignore-columns-----------------------------------------------------------
ref_ign <- data.frame(id = 1:3, value = 1:3, updated_at = Sys.time())
cand_ign <- data.frame(
  id = 1:3,
  value = 1:3,
  updated_at = Sys.time() + 3600 # different timestamp
)

rules_ign <- tempfile(fileext = ".yaml")
write_rules_template(
  ref_ign,
  key = "id",
  path = rules_ign,
  ignore_columns_default = "updated_at"
)
compare_datasets_from_yaml(
  ref_ign,
  cand_ign,
  key = "id",
  path = rules_ign
)$all_passed

## ----col-analysis-------------------------------------------------------------
ref_cols <- data.frame(id = 1:2, a = 1:2, b = 1:2)
cand_cols <- data.frame(id = 1:2, a = 1:2, c = 1:2) # b missing, c extra

result_cols <- compare_datasets_from_yaml(ref_cols, cand_cols, key = "id")
result_cols$missing_in_candidate # b
result_cols$extra_in_candidate # c
result_cols$all_passed # FALSE: b is missing

## ----analyze-columns----------------------------------------------------------
analysis <- analyze_columns(
  ref_cols,
  cand_cols,
  ignore_columns = character(0)
)
str(analysis)

## ----with-key-----------------------------------------------------------------
ref_key <- data.frame(id = 1:3, value = c(10, 20, 30))
cand_key <- data.frame(id = c(3, 1, 2), value = c(30, 10, 20)) # shuffled

result_key <- compare_datasets_from_yaml(ref_key, cand_key, key = "id")
result_key$all_passed

## ----positional---------------------------------------------------------------
ref_pos <- data.frame(value = c(1.0, 2.0, 3.0))
cand_pos <- data.frame(value = c(1.0, 2.0, 3.0))

# No `key` argument is passed in this call.
result_pos <- compare_datasets_from_yaml(ref_pos, cand_pos)
result_pos$all_passed

## ----composite-key------------------------------------------------------------
ref_comp <- data.frame(
  year = c(2023, 2023, 2024),
  month = c(1, 2, 1),
  value = c(100, 200, 300)
)
# Same (year, month, value) triples as `ref_comp`, in a different row order.
cand_comp <- data.frame(
  year = c(2024, 2023, 2023),
  month = c(1, 2, 1),
  value = c(300, 200, 100)
)

result_comp <- compare_datasets_from_yaml(
  ref_comp,
  cand_comp,
  key = c("year", "month")
)
result_comp$all_passed

## ----key-override, results = "hide"-------------------------------------------
rules_key <- tempfile(fileext = ".yaml")
write_rules_template(ref_comp, key = "year", path = rules_key) # YAML says year

# Override at call time with the composite key
result_override <- compare_datasets_from_yaml(
  ref_comp,
  cand_comp,
  key = c("year", "month"), # overrides YAML
  path = rules_key
)
result_override$all_passed

## ----duplicate-keys, warning = TRUE-------------------------------------------
ref_dup <- data.frame(id = c(1, 1, 2), value = c(10, 11, 20))
cand_dup <- data.frame(id = c(1, 2), value = c(10, 20))

# Any warning raised by the comparison is re-emitted as a message.
tryCatch(
  compare_datasets_from_yaml(ref_dup, cand_dup, key = "id"),
  warning = function(w) message("Warning: ", conditionMessage(w))
)

## ----type-mismatch, warning = TRUE--------------------------------------------
ref_type <- data.frame(id = 1:2, year = c(2023L, 2024L)) # integer
cand_type <- data.frame(id = 1:2, year = c("2023", "2024")) # character

# Any warning raised by the comparison is re-emitted as a message.
tryCatch(
  compare_datasets_from_yaml(ref_type, cand_type, key = "id"),
  warning = function(w) message("Warning: ", conditionMessage(w))
)

## ----detect-types-------------------------------------------------------------
df_types <- data.frame(
  id = 1L,
  amount = 1.5,
  label = "x",
  flag = TRUE,
  day = Sys.Date(),
  timestamp = Sys.time()
)
detect_column_types(df_types)

## ----derive-rules-------------------------------------------------------------
rules_obj2 <- read_rules(rules_path)
merged <- derive_column_rules(ref, rules_obj2)
merged$revenue
merged$category

## ----preprocess---------------------------------------------------------------
df_raw <- data.frame(label = c(" Hello ", "WORLD", " Foo "))
rules_norm <- list(
  label = list(
    equal_mode = "normalized",
    case_insensitive = TRUE,
    trim = TRUE
  )
)
preprocess_dataframe(df_raw, rules_norm)

## ----tolerance-debug----------------------------------------------------------
cmp <- data.frame(
  value = c(1.005, 1.02, 1.0),
  value__reference = c(1.000, 1.00, 1.0)
)
rules_debug <- list(value = list(abs = 0.01, rel = 0))

cmp_annotated <- add_tolerance_columns(
  cmp,
  "value",
  rules_debug,
  ref_suffix = "__reference",
  na_equal = TRUE
)
cmp_annotated[, c("value__absdiff", "value__thresh", "value__ok")]

## ----lang-per-call, eval = FALSE----------------------------------------------
# result_fr <- compare_datasets_from_yaml(
#   ref, cand,
#   key = "id",
#   lang = "fr",
#   locale = "fr_FR"
# )

## ----lang-global, eval = FALSE------------------------------------------------
# options(datadiff.lang = "fr",
#         datadiff.locale = "fr_FR")
#
# # All calls now produce French reports without passing lang/locale every time
# result <- compare_datasets_from_yaml(ref, cand, key = "id", path = rules_path)

## ----lazy-dbplyr, eval = FALSE------------------------------------------------
# library(DBI)
# library(dplyr)
#
# con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
# DBI::dbWriteTable(con, "reference", ref)
# DBI::dbWriteTable(con, "candidate", cand)
#
# tbl_ref <- dplyr::tbl(con, "reference")
# tbl_cand <- dplyr::tbl(con, "candidate")
#
# result_lazy <- compare_datasets_from_yaml(
#   tbl_ref, tbl_cand,
#   key = "id",
#   path = rules_path
# )
#
# result_lazy$all_passed
# DBI::dbDisconnect(con)

## ----large-arrow, eval = FALSE------------------------------------------------
# library(arrow)
#
# ds_ref <- arrow::open_dataset("path/to/reference/")
# ds_cand <- arrow::open_dataset("path/to/candidate/")
#
# # Generate a template from the schema (no data loaded into RAM)
# write_rules_template(ds_ref, key = "id", path = "rules.yaml")
#
# result <- compare_datasets_from_yaml(
#   data_reference = ds_ref,
#   data_candidate = ds_cand,
#   key = "id",
#   path = "rules.yaml",
#   duckdb_memory_limit = "8GB" # tune to your machine's RAM
# )
#
# result$all_passed