## ----environment, echo = FALSE, message = FALSE, warning=FALSE---------------- knitr::opts_chunk$set(collapse = TRUE, comment = "", out.width = "600px", dpi = 70) options(tibble.print_min = 4L, tibble.print_max = 4L) library(dlookr) library(dplyr) library(ggplot2) ## ----import_data, warning=FALSE----------------------------------------------- str(Carseats) ## ----missing------------------------------------------------------------------ carseats <- Carseats suppressWarnings(RNGversion("3.5.0")) set.seed(123) carseats[sample(seq(NROW(carseats)), 20), "Income"] <- NA suppressWarnings(RNGversion("3.5.0")) set.seed(456) carseats[sample(seq(NROW(carseats)), 10), "Urban"] <- NA ## ----describe----------------------------------------------------------------- describe(carseats) ## ----describes2--------------------------------------------------------------- # Select columns by name describe(carseats, Sales, CompPrice, Income) # Select all columns between year and day (include) describe(carseats, Sales:Income) # Select all columns except those from year to day (exclude) describe(carseats, -(Sales:Income)) ## ----describe_pipe------------------------------------------------------------ carseats %>% describe() %>% select(described_variables, skewness, mean, p25, p50, p75) %>% filter(!is.na(skewness)) %>% arrange(desc(abs(skewness))) ## ----describe_pipe2----------------------------------------------------------- carseats %>% group_by(US) %>% describe(Sales, Income) ## ----describe_pipe3----------------------------------------------------------- carseats %>% group_by(US, Urban) %>% describe(Sales, Income) ## ----normality---------------------------------------------------------------- normality(carseats) ## ----normality2--------------------------------------------------------------- # Select columns by name normality(carseats, Sales, CompPrice, Income) # Select all columns between year and day (inclusive) normality(carseats, Sales:Income) # Select all columns except those from year to day (inclusive) normality(carseats, -(Sales:Income)) ## ----normality_pipe----------------------------------------------------------- library(dplyr) carseats %>% normality() %>% filter(p_value <= 0.01) %>% arrange(abs(p_value)) ## ----normality_pipe2---------------------------------------------------------- carseats %>% group_by(ShelveLoc, US) %>% normality(Income) %>% arrange(desc(p_value)) ## ----normality_pipe3---------------------------------------------------------- carseats %>% mutate(log_income = log(Income)) %>% group_by(ShelveLoc, US) %>% normality(log_income) %>% filter(p_value > 0.01) ## ----plot_normality, fig.align='center', fig.width = 6, fig.height = 4-------- # Select columns by name plot_normality(carseats, Sales, CompPrice) ## ----plot_normality2, fig.align='center', fig.width = 6, fig.height = 4, eval=FALSE---- # carseats %>% # filter(ShelveLoc == "Good") %>% # group_by(US) %>% # plot_normality(Income) ## ----correlate---------------------------------------------------------------- correlate(carseats) ## ----correlate2--------------------------------------------------------------- # Select columns by name correlate(carseats, Sales, CompPrice, Income) # Select all columns between year and day (include) correlate(carseats, Sales:Income) # Select all columns except those from year to day (exclude) correlate(carseats, -(Sales:Income)) ## ----correlate3--------------------------------------------------------------- carseats %>% correlate(Sales:Income) %>% filter(as.integer(var1) > as.integer(var2)) ## ----correlate4--------------------------------------------------------------- tab_corr <- carseats %>% filter(ShelveLoc == "Good") %>% group_by(Urban, US) %>% correlate(Sales) %>% filter(abs(coef_corr) > 0.5) tab_corr ## ----plot_correlate, fig.align='center', fig.width = 6, fig.height = 4-------- carseats %>% correlate() %>% plot() ## ----plot_correlate2, fig.align='center', fig.width = 6, fig.height = 4, eval=TRUE---- # Select columns by name correlate(carseats, Sales, Price) %>% plot() ## ----plot_correlate3, fig.align='center', fig.width = 6, fig.height = 4, warning=FALSE, eval=TRUE---- carseats %>% filter(ShelveLoc == "Good") %>% group_by(Urban) %>% correlate() %>% plot() ## ----target_by---------------------------------------------------------------- categ <- target_by(carseats, US) ## ----target_by2--------------------------------------------------------------- # If the variable of interest is a numerical variable cat_num <- relate(categ, Sales) cat_num summary(cat_num) ## ----target_by3, fig.align='center', fig.width = 6, fig.height = 4, warning=FALSE---- plot(cat_num) ## ----target_by4--------------------------------------------------------------- # If the variable of interest is a categorical variable cat_cat <- relate(categ, ShelveLoc) cat_cat summary(cat_cat) ## ----target_by5, fig.align='center', fig.width = 6, fig.height = 4, warning=FALSE---- plot(cat_cat) ## ----target_by6--------------------------------------------------------------- # If the variable of interest is a numerical variable num <- target_by(carseats, Sales) ## ----target_by7--------------------------------------------------------------- # If the variable of interest is a numerical variable num_num <- relate(num, Price) num_num summary(num_num) ## ----target_by8, fig.align='center', fig.width = 6, fig.height = 4, warning=FALSE---- plot(num_num) ## ----target_by8_2, fig.align='center', fig.width = 6, fig.height = 4, warning=FALSE---- plot(num_num, hex_thres = 350) ## ----target_by9--------------------------------------------------------------- # If the variable of interest is a categorical variable num_cat <- relate(num, ShelveLoc) num_cat summary(num_cat) ## ----target_by10, fig.align='center', fig.width = 6, fig.height = 4, warning=FALSE---- plot(num_cat) ## ----eda_web_report, eval=FALSE----------------------------------------------- # heartfailure %>% # eda_web_report(target = "death_event", subtitle = "heartfailure", # output_dir = "./", output_file = "EDA.html", theme = "blue") ## ----eda_web_title, echo=FALSE, out.width='80%', fig.align='center', fig.pos="!h", fig.cap="The part of the report"---- knitr::include_graphics('img/eda_web_title.jpg') ## ----eda_paged_report, eval=FALSE--------------------------------------------- # heartfailure %>% # eda_paged_report(target = "death_event", subtitle = "heartfailure", # output_dir = "./", output_file = "EDA.pdf", theme = "blue") ## ----eda_paged_cover, echo=FALSE, out.width='80%', fig.align='center', fig.pos="!h", fig.cap="The part of the report"---- knitr::include_graphics('img/eda_paged_cover.jpg') ## ----eda_paged_cntent, echo=FALSE, out.width='80%', fig.align='center', fig.pos="!h", fig.cap="The dynamic contents of the report"---- knitr::include_graphics('img/eda_paged_content.jpg') ## ----dbi_table, warning=FALSE, message=FALSE, eval=FALSE---------------------- # library(dplyr) # # carseats <- Carseats # carseats[sample(seq(NROW(carseats)), 20), "Income"] <- NA # carseats[sample(seq(NROW(carseats)), 5), "Urban"] <- NA # # # connect DBMS # con_sqlite <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") # # # copy carseats to the DBMS with a table named TB_CARSEATS # copy_to(con_sqlite, carseats, name = "TB_CARSEATS", overwrite = TRUE) ## ----dbi_describe, eval=FALSE------------------------------------------------- # # Positive values select variables # con_sqlite %>% # tbl("TB_CARSEATS") %>% # describe(Sales, CompPrice, Income) # # # Negative values to drop variables, and In-memory mode and collect size is 200 # con_sqlite %>% # tbl("TB_CARSEATS") %>% # describe(-Sales, -CompPrice, -Income, collect_size = 200) # # # Find the statistic of all numerical variables by 'ShelveLoc' and 'US', # # and extract only those with the 'ShelveLoc' variable level as "Good". # con_sqlite %>% # tbl("TB_CARSEATS") %>% # group_by(ShelveLoc, US) %>% # describe() %>% # filter(ShelveLoc == "Good") # # # extract only those with 'Urban' variable level is "Yes", # # and find 'Sales' statistics by 'ShelveLoc' and 'US' # con_sqlite %>% # tbl("TB_CARSEATS") %>% # filter(Urban == "Yes") %>% # group_by(ShelveLoc, US) %>% # describe(Sales) ## ----dbi_normality, eval=FALSE------------------------------------------------ # # Test all numerical variables by 'ShelveLoc' and 'US', # # and extract only those with the 'ShelveLoc' variable level is "Good". # con_sqlite %>% # tbl("TB_CARSEATS") %>% # group_by(ShelveLoc, US) %>% # normality() %>% # filter(ShelveLoc == "Good") # # # extract only those with 'Urban' variable level is "Yes", # # and test 'Sales' by 'ShelveLoc' and 'US' # con_sqlite %>% # tbl("TB_CARSEATS") %>% # filter(Urban == "Yes") %>% # group_by(ShelveLoc, US) %>% # normality(Sales) # # # Test log(Income) variables by 'ShelveLoc' and 'US', # # and extract only p.value greater than 0.01. # # # SQLite extension functions for log transformation # RSQLite::initExtension(con_sqlite) # # con_sqlite %>% # tbl("TB_CARSEATS") %>% # mutate(log_income = log(Income)) %>% # group_by(ShelveLoc, US) %>% # normality(log_income) %>% # filter(p_value > 0.01) ## ----plot_normality_dbi, fig.align='center', fig.width = 6, fig.height = 4, eval=FALSE, eval=FALSE---- # # Extract only those with the 'ShelveLoc' variable level is "Good", # # and plot 'Income' by 'US' # # The result is the same as the data.frame, but not displayed here. Reference above in document. # con_sqlite %>% # tbl("TB_CARSEATS") %>% # filter(ShelveLoc == "Good") %>% # group_by(US) %>% # plot_normality(Income) ## ----dbi_correlation, eval=FALSE---------------------------------------------- # # Correlation coefficient # # that eliminates redundant combination of variables # con_sqlite %>% # tbl("TB_CARSEATS") %>% # correlate() %>% # filter(as.integer(var1) > as.integer(var2)) # # con_sqlite %>% # tbl("TB_CARSEATS") %>% # correlate(Sales, Price) %>% # filter(as.integer(var1) > as.integer(var2)) # # # Compute the correlation coefficient of the Sales variable by 'ShelveLoc' # # and 'US' variables. And extract only those with absolute # # value of the correlation coefficient is more significant than 0.5 # con_sqlite %>% # tbl("TB_CARSEATS") %>% # group_by(ShelveLoc, US) %>% # correlate(Sales) %>% # filter(abs(coef_corr) >= 0.5) # # # Extract only those with the 'ShelveLoc' variable level is "Good", # # and compute the correlation coefficient of the 'Sales' variable # # by 'Urban' and 'US' variables. # # And the correlation coefficient is negative and smaller than 0.5 # con_sqlite %>% # tbl("TB_CARSEATS") %>% # filter(ShelveLoc == "Good") %>% # group_by(Urban, US) %>% # correlate(Sales) %>% # filter(coef_corr < 0) %>% # filter(abs(coef_corr) > 0.5) ## ----plot_correlation_dbi, fig.align='center', fig.width = 6, fig.height = 4, warning=FALSE, eval=FALSE---- # # Extract only those with 'ShelveLoc' variable level is "Good", # # and visualize correlation plot of 'Sales' variable by 'Urban' # # and 'US' variables. # # The result is the same as the data.frame, but not displayed here. Reference above in document. # con_sqlite %>% # tbl("TB_CARSEATS") %>% # filter(ShelveLoc == "Good") %>% # group_by(Urban) %>% # correlate() %>% # plot(Sales) ## ----dbi_ctarget_by, eval=FALSE----------------------------------------------- # # If the target variable is a categorical variable # categ <- target_by(con_sqlite %>% tbl("TB_CARSEATS") , US) # # # If the variable of interest is a numerical variable # cat_num <- relate(categ, Sales) # cat_num # summary(cat_num) ## ----plot_target_by_dbi, fig.align='center', fig.align='center', fig.width = 6, fig.height = 4, eval=FALSE---- # # The result is the same as the data.frame, but not displayed here. Reference above in document. # plot(cat_num) ## ----dbi_eda_report, eval=FALSE----------------------------------------------- # # create a web report file. # con_sqlite %>% # tbl("TB_CARSEATS") %>% # eda_web_report() # # # create a pdf file. the file name is EDA.pdf, and the collect size is 350 # con_sqlite %>% # tbl("TB_CARSEATS") %>% # eda_paged_report(collect_size = 350, output_file = "EDA.pdf")