## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE, eval = FALSE)

## ----one-call-----------------------------------------------------------------
# run_dq_check("customer_accounts", config_dir = "path/to/configs")

## ----install------------------------------------------------------------------
# devtools::install("path/to/dqr/dqcheckr")
# library(dqcheckr)

## ----type-inference-----------------------------------------------------------
# infer_col_type(c("2024-01-15", "2024-06-30"))  # "date"
# infer_col_type(c("100", "200", "N/A"))          # "character" (only 67% numeric)
# infer_col_type(c("100", "200", "N/A", rep("50", 17)))  # "numeric" (95% numeric)
# infer_col_type(c(NA, "", NA))                   # "unknown"
# # with a custom threshold:
# infer_col_type(c(rep("1", 17), "a", "b", "c"), threshold = 0.80)  # "numeric" (85%)

## ----run-check----------------------------------------------------------------
# result <- run_dq_check(
#   dataset_name = "customer_accounts",
#   config_dir   = "path/to/configs",   # contains dqcheckr.yml + customer_accounts.yml
#   open_report  = TRUE                 # open HTML in browser when interactive
# )
# 
# # Console output (always printed):
# # [dqcheckr] customer_accounts: FAIL - 1 warning(s), 2 failure(s). Report: reports/...html
# 
# result$status       # "PASS", "WARN", or "FAIL"
# result$report_path  # full path to the HTML file
# result$snapshot_id  # integer row ID in snapshots table

## ----individual-checks--------------------------------------------------------
# library(dqcheckr)
# 
# # Build a minimal config programmatically
# cfg <- list(
#   rules = list(
#     max_missing_rate     = 0.05,
#     max_non_numeric_rate = 0.01,
#     min_row_count        = 0
#   ),
#   column_rules     = list(
#     status = list(allowed_values = c("ACTIVE", "CLOSED"))
#   ),
#   key_columns      = "id",
#   expected_columns = NULL
# )
# 
# df <- read.csv("data/customer_accounts.csv", colClasses = "character")
# df[] <- lapply(df, trimws)
# 
# # Run a single check
# results <- check_missing_rate(df, cfg)
# 
# # Inspect
# results[[1]]$status    # "PASS" or "FAIL"
# results[[1]]$observed  # e.g. "3.2% missing (4 of 125)"
# 
# # Run all single-snapshot checks at once
# all_results <- run_qc_checks(df, cfg)
# 
# # Overall status
# overall_status(all_results)  # "FAIL" > "WARN" > "PASS" > "INFO"

## ----custom-file--------------------------------------------------------------
# # File: custom/customer_accounts_checks.R
# 
# custom_checks <- function(df) {
#   results <- list()
# 
#   # Rule: ACTIVE accounts must not have a zero balance
#   active_zero <- df[df$account_status == "ACTIVE" &
#                     !is.na(df$account_balance) &
#                     df$account_balance == "0", ]
#   n <- nrow(active_zero)
#   results <- c(results, list(dq_result(
#     check_id   = "CUST-01",
#     check_name = "No zero-balance active accounts",
#     column     = "account_balance",   # enables per-column storage in SQLite
#     status     = if (n > 0) "FAIL" else "PASS",
#     observed   = sprintf("%d ACTIVE account(s) with balance 0", n),
#     message    = if (n > 0)
#       sprintf("%d ACTIVE account(s) have a zero balance.", n)
#     else
#       "No ACTIVE accounts have a zero balance."
#   )))
# 
#   results
# }

## ----snapshot-query-----------------------------------------------------------
# library(DBI)
# library(RSQLite)
# 
# con <- dbConnect(SQLite(), "data/snapshots.sqlite")
# 
# # Recent runs for one dataset
# dbGetQuery(con,
#   "SELECT id, file_name, overall_status, check_fail_count, run_timestamp
#    FROM snapshots
#    WHERE dataset_name = 'customer_accounts'
#    ORDER BY id DESC
#    LIMIT 10")
# 
# # Column-level stats for the most recent run
# dbGetQuery(con,
#   "SELECT column_name, dq_check, value, threshold
#    FROM column_snapshots
#    WHERE snapshot_id = (
#      SELECT MAX(id) FROM snapshots WHERE dataset_name = 'customer_accounts'
#    )
#    ORDER BY column_name, dq_check")
# 
# dbDisconnect(con)

## ----cross-dataset------------------------------------------------------------
# dbGetQuery(con,
#   "SELECT dataset_name, COUNT(*) AS runs,
#           SUM(check_fail_count) AS total_failures
#    FROM snapshots
#    GROUP BY dataset_name")

## ----starwars-setup-----------------------------------------------------------
# # Config directory contains:
# #   dqcheckr.yml         — global thresholds
# #   starwars_csv.yml     — CSV dataset config
# #   starwars_fwf.yml     — FWF dataset config
# 
# # Run checks on both formats
# result_csv <- run_dq_check("starwars_csv", config_dir = "config", open_report = TRUE)
# result_fwf <- run_dq_check("starwars_fwf", config_dir = "config", open_report = TRUE)