## ----include = FALSE----------------------------------------------------------
# Chunk options for the rendered vignette: merge source and output into one
# block and prefix printed output with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library(dtaudit)
library(data.table)

## ----data---------------------------------------------------------------------
# Eight example orders. product_id 105 and 106 have no matching row in
# `products` below, so the two tables deliberately do not line up perfectly.
orders <- data.table(
  order_id = 1:8,
  customer = c("Alice", "Bob", "Alice", "Carol", "Bob", "Alice", "Dave", "Eve"),
  product_id = c(101L, 102L, 101L, 103L, 104L, 102L, 105L, 106L),
  amount = c(50, 30, 50, 75, 20, 35, 60, 45)
)

# Five example products. product_id 107 is never referenced by `orders`.
products <- data.table(
  product_id = c(101L, 102L, 103L, 104L, 107L),
  category = c("Electronics", "Books", "Clothing", "Books", "Food"),
  price = c(25.0, 15.0, 37.5, 10.0, 8.0)
)

## ----validate-pk--------------------------------------------------------------
# Check each table against its candidate key column.
validate_primary_keys(orders, "order_id")
validate_primary_keys(products, "product_id")

## ----validate-join------------------------------------------------------------
# Inspect the join on product_id before actually merging.
validate_join(orders, products, by = "product_id")

## ----validate-join-stat-------------------------------------------------------
# Same check, additionally passing one numeric column per side.
# NOTE(review): presumably these columns are totalled per match category --
# confirm against the validate_join() documentation.
validate_join(
  orders, products,
  by = "product_id",
  stat.x = "amount",
  stat.y = "price"
)

## ----filter-------------------------------------------------------------------
# Left join: every order is kept. Orders for products 105 and 106 have no
# match, so their category and price are NA in `merged`.
merged <- merge(orders, products, by = "product_id", all.x = TRUE)

# Keep only Electronics and Books, report dropped amount
result <- filter_keep(
  merged,
  category %in% c("Electronics", "Books"),
  stat = amount
)

## ----compare------------------------------------------------------------------
compare_datatables(orders, result)

## ----diagnose-nas-------------------------------------------------------------
# Introduce some NAs to demonstrate
# copy() so that `:=` below does not modify `merged` by reference.
merged_with_na <- copy(merged)

# Fix the RNG seed so the randomly chosen rows, and therefore the rendered
# output, are the same on every build. The draw may land on rows whose
# category is already NA from the unmatched join.
set.seed(42)
merged_with_na[sample(.N, 2), category := NA_character_]
diagnose_nas(merged_with_na)

## ----diagnose-strings---------------------------------------------------------
diagnose_strings(orders$customer)

## ----date-coverage------------------------------------------------------------
# Four dates with nothing in March or June 2024, checked against the window
# 2024-01-01 to 2024-06-30.
dates <- as.IDate(c("2024-01-15", "2024-02-20", "2024-04-10", "2024-05-05"))
check_date_coverage(dates, "2024-01-01", "2024-06-30")

## ----summary-table------------------------------------------------------------
get_summary_table(orders)

## ----audit-clean--------------------------------------------------------------
# Firm names with mixed case, legal suffixes and a missing value.
firms <- c("Apple Inc.", "MICROSOFT CORP", "Alphabet LLC", "apple", NA)
audit_clean(firms, clean_firm_name)