## ----environment, echo = FALSE, message = FALSE, warning=FALSE---------------- knitr::opts_chunk$set(collapse = TRUE, comment = "", out.width = "600px", dpi = 70) options(tibble.print_min = 4L, tibble.print_max = 4L) library(dlookr) library(dplyr) library(ggplot2) ## ----import_data, warning=FALSE----------------------------------------------- dim(flights) flights ## ----diagnose----------------------------------------------------------------- diagnose(flights) ## ----diagnoses---------------------------------------------------------------- # Select columns by name diagnose(flights, year, month, day) # Select all columns between year and day (include) diagnose(flights, year:day) # Select all columns except those from year to day (exclude) diagnose(flights, -(year:day)) ## ----diagnose_pipe------------------------------------------------------------ flights %>% diagnose() %>% select(-unique_count, -unique_rate) %>% filter(missing_count > 0) %>% arrange(desc(missing_count)) ## ----diagnose_pipe_numeric---------------------------------------------------- diagnose_numeric(flights) ## ----diagnose_pipe_numeric_pipe----------------------------------------------- diagnose_numeric(flights) %>% filter(minus > 0 | zero > 0) ## ----diagnose_category-------------------------------------------------------- diagnose_category(flights) ## ----diagnose_category_pipe--------------------------------------------------- diagnose_category(flights) %>% filter(is.na(levels)) ## ----diagnose_category_pipe2-------------------------------------------------- flights %>% diagnose_category(top = 500) %>% filter(ratio <= 0.01) ## ----diagnose_outlier--------------------------------------------------------- diagnose_outlier(flights) ## ----diagnose_outlier_pipe---------------------------------------------------- diagnose_outlier(flights) %>% filter(outliers_cnt > 0) ## ----diagnose_outlier_pipe2--------------------------------------------------- diagnose_outlier(flights) %>% filter(outliers_ratio > 5) %>% mutate(rate = outliers_mean / with_mean) %>% arrange(desc(rate)) %>% select(-outliers_cnt) ## ----plot_outlier, fig.align='center', fig.width = 6, fig.height = 4---------- flights %>% plot_outlier(arr_delay) ## ----plot_outlier_pipe, fig.align='center', fig.width = 6, fig.height = 4, eval=FALSE---- # flights %>% # plot_outlier(diagnose_outlier(flights) %>% # filter(outliers_ratio >= 5) %>% # select(variables) %>% # unlist()) ## ----plot_na_pareto1, fig.align='center', fig.width = 6, fig.height = 4------- mice::boys %>% plot_na_pareto(col = "blue") ## ----plot_na_pareto2, fig.align='center', fig.width = 6, fig.height = 4, eval=FALSE---- # mice::boys %>% # plot_na_pareto(only_na = TRUE, main = "Pareto Chart for mice::boys") ## ----plot_na_pareto3, fig.align='center', fig.width = 6, fig.height = 4, eval=FALSE---- # mice::boys %>% # plot_na_pareto(grade = list(High = 0.1, Middle = 0.6, Low = 1), relative = TRUE) ## ----plot_na_pareto4, fig.align='center', fig.width = 6, fig.height = 4, eval=FALSE---- # plot_na_pareto(mice::boys, only_na = TRUE, plot = FALSE) ## ----plot_na_hclust, fig.align='center', fig.width = 6, fig.height = 4-------- mice::boys %>% plot_na_hclust(main = "Distribution of missing value") ## ----plot_na_hclust1, fig.align='center', fig.width = 6, fig.height = 4------- mice::boys %>% plot_na_intersect() ## ----plot_na_hclust3, fig.align='center', fig.width = 6, fig.height = 4, eval=FALSE---- # mice::boys %>% # plot_na_intersect(n_vars = 5) ## ----plot_na_hclust4, fig.align='center', fig.width = 6, fig.height = 4, eval=FALSE---- # mice::boys %>% # plot_na_intersect(only_na = FALSE, n_intersacts = 7) ## ----diagnose_web_report, eval=FALSE------------------------------------------ # flights %>% # diagnose_web_report(subtitle = "flights", output_dir = "./", # output_file = "Diagn.html", theme = "blue") ## ----diag_web_title, echo=FALSE, out.width='80%', fig.align='center', fig.pos="!h", fig.cap="The part of the report"---- knitr::include_graphics('img/diag_web_title.jpg') ## ----diag_web_content, echo=FALSE, out.width='80%', fig.align='center', fig.pos="!h", fig.cap="The dynamic contents of the report"---- knitr::include_graphics('img/diag_web_content.jpg') ## ----diagnose_paged_report, eval=FALSE---------------------------------------- # flights %>% # diagnose_paged_report(subtitle = "flights", output_dir = "./", # output_file = "Diagn.pdf", theme = "blue") ## ----diag_paged_cover, echo=FALSE, out.width='80%', fig.align='center', fig.pos="!h", fig.cap="The part of the report"---- knitr::include_graphics('img/diag_paged_cover.jpg') ## ----diag_paged_cntent, echo=FALSE, out.width='80%', fig.align='center', fig.pos="!h", fig.cap="The dynamic contents of the report"---- knitr::include_graphics('img/diag_paged_content.jpg') ## ----dbi_table, warning=FALSE, message=FALSE, eval=FALSE---------------------- # library(dplyr) # # carseats <- Carseats # carseats[sample(seq(NROW(carseats)), 20), "Income"] <- NA # carseats[sample(seq(NROW(carseats)), 5), "Urban"] <- NA # # # connect DBMS # con_sqlite <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") # # # copy carseats to the DBMS with a table named TB_CARSEATS # copy_to(con_sqlite, carseats, name = "TB_CARSEATS", overwrite = TRUE) ## ----dbi_diag, eval=FALSE----------------------------------------------------- # # Diagnosis of all columns # con_sqlite %>% # tbl("TB_CARSEATS") %>% # diagnose() # # # Positions values select columns, and In-memory mode # con_sqlite %>% # tbl("TB_CARSEATS") %>% # diagnose(1, 3, 8, in_database = FALSE) # # # Positions values select columns, and In-memory mode and collect size is 200 # con_sqlite %>% # tbl("TB_CARSEATS") %>% # diagnose(-8, -9, -10, in_database = FALSE, collect_size = 200) ## ----dbi_category, eval=FALSE------------------------------------------------- # # Positions values select variables, and In-memory mode and collect size is 200 # con_sqlite %>% # tbl("TB_CARSEATS") %>% # diagnose_category(7, in_database = FALSE, collect_size = 200) # # # Positions values select variables # con_sqlite %>% # tbl("TB_CARSEATS") %>% # diagnose_category(-7) ## ----dbi_numeric, eval=FALSE-------------------------------------------------- # # Diagnosis of all numerical variables # con_sqlite %>% # tbl("TB_CARSEATS") %>% # diagnose_numeric() # # # Positive values select variables, and In-memory mode and collect size is 200 # con_sqlite %>% # tbl("TB_CARSEATS") %>% # diagnose_numeric(Sales, Income, collect_size = 200) ## ----dbi_outlier, eval=FALSE-------------------------------------------------- # con_sqlite %>% # tbl("TB_CARSEATS") %>% # diagnose_outlier() %>% # filter(outliers_ratio > 1) ## ----plot_outlier_dbi, fig.align='center', fig.width = 6, fig.height = 4, eval=FALSE, eval=FALSE---- # # Visualization of numerical variables with a ratio of # # outliers greater than 1% # # the result is same as a data.frame, but not display here. reference above in document. # con_sqlite %>% # tbl("TB_CARSEATS") %>% # plot_outlier(con_sqlite %>% # tbl("TB_CARSEATS") %>% # diagnose_outlier() %>% # filter(outliers_ratio > 1) %>% # select(variables) %>% # pull()) ## ----dbi_diag_report, eval=FALSE---------------------------------------------- # # create web report file. # con_sqlite %>% # tbl("TB_CARSEATS") %>% # diagnose_web_report() # # # create pdf file. file name is Diagn.pdf, and collect size is 350 # con_sqlite %>% # tbl("TB_CARSEATS") %>% # diagnose_paged_report(collect_size = 350, output_file = "Diagn.pdf")