## ---- include=FALSE-----------------------------------------------------------
# Snapshot the current global options and knitr chunk options before they are
# changed below.
old_opts <- options()
old_knitr_opts <- knitr::opts_chunk$get()

# Chunk options applied to the chunks of this document.
knitr::opts_chunk$set(
  fig.align = "center",
  out.width = "100%",
  comment = "#>",
  collapse = TRUE
)

# Printing options for the pillar package.
options(
  pillar.width = 85,
  pillar.max_dec_width = 3,
  pillar.sigfig = 2
)

## ----setup, include=FALSE-----------------------------------------------------
library(eHDPrep)

## -----------------------------------------------------------------------------
# Load the example dataset and print a compact overview of its columns.
data("example_data")
tibble::glimpse(x = example_data)

## ----echo=FALSE, fig.cap="Suggested workflow of low-level quality control functions in eHDPrep. Dashed lines and boxes represent optional steps.", fig.align='center'----
# Embed the workflow figure (markdown equivalent: ![](./images/Figure_1.png)).
knitr::include_graphics(path = "./images/Figure_1.png")

## ---- eval = FALSE------------------------------------------------------------
#  # Not run, just examples:
#  #excel
#  data <- import_dataset(file = "./dataset.xlsx", format = "excel")
#  #csv
#  data <- import_dataset(file = "./dataset.csv", format = "csv")
#  #tsv
#  data <- import_dataset(file = "./dataset.tsv", format = "tsv")
#  

## ----fig.show = "hide"--------------------------------------------------------
data("example_data")

# Consistency table: each row holds one consistency rule.
# The two rules below state: if a patient has a type of diabetes
# ("Type I" or "Type II"), they should have diabetes ("Yes").
ct <- tibble::tribble(
  ~varA,           ~varB,      ~lgl_test, ~varA_boundaries, ~varB_boundaries,
  "diabetes_type", "diabetes", NA,        "Type I",         "Yes",
  "diabetes_type", "diabetes", NA,        "Type II",        "Yes"
)

# Assess data quality, supplying the consistency rules defined above.
res <- assess_quality(
  data = example_data,
  id_var = patient_id,
  consis_tbl = ct
)

## -----------------------------------------------------------------------------
# Print the row-level and then the variable-level completeness elements of the
# quality assessment stored in `res`.
res$completeness$row_completeness
res$completeness$variable_completeness

## ---- fig.height=3, out.width = "100%", fig.cap="Percentage completeness (x-axis) by count (y-axis) for both rows (red) and variables (purple) of `example_data`."----
# Print the completeness plot stored in the assessment results.
res$completeness$completeness_plot

## ---- fig.height=5, out.width = "100%", fig.cap="Completeness heatmap for `example_data`. Yellow cells represent missing values and blue cells represent non-missing values."----
# plot.new() starts a new graphics frame before the stored heatmap is printed
# (presumably so the heatmap is drawn on a clean page -- TODO confirm).
plot.new()
res$completeness$completeness_heatmap

## -----------------------------------------------------------------------------
# Print the internal inconsistency element of the assessment results.
res$internal_inconsistency


## -----------------------------------------------------------------------------
# Print the element named for variables with zero entropy.
res$vars_with_zero_entropy

## -----------------------------------------------------------------------------
# Write the assumed variable classes of example_data to a temporary CSV file.
tmp <- tempfile(fileext = ".csv")
assume_var_classes(data = example_data, out_file = tmp)

# (user makes manual edits externally)

# Read the class table back in from the same file.
import_var_classes(file = tmp)

## ---- echo = FALSE------------------------------------------------------------
# Example class_tbl object: one row per variable, giving the data type
# assigned to it. Note that diabetes_type is classed as "factor" here.
data_types <- tibble::tribble(
  ~var,             ~datatype,
  "patient_id",     "id",
  "tumoursize",     "numeric",
  "t_stage",        "ordinal_tstage",
  "n_stage",        "ordinal_nstage",
  "diabetes",       "factor",
  "diabetes_type",  "factor",
  "hypertension",   "factor",
  "rural_urban",    "factor",
  "marital_status", "factor",
  "SNP_a",          "genotype",
  "SNP_b",          "genotype",
  "free_text",      "freetext"
)


## -----------------------------------------------------------------------------
# Show the class table.
data_types


## -----------------------------------------------------------------------------
# High-level quality control of example_data using the class table above.
apply_quality_ctrl(
  data = example_data,
  id_var = patient_id,
  class_tbl = data_types,
  bin_cats = c("No" = "Yes", "rural" = "urban"),
  min_freq = 0.6
)


## ---- echo=FALSE--------------------------------------------------------------
# Class table for the dataset in which the two diabetes variables have been
# replaced by a single "diabetes_merged" variable.
data_types_diabetes_m <- tibble::tribble(
  ~var,              ~datatype,
  "patient_id",      "id",
  "tumoursize",      "numeric",
  "t_stage",         "ordinal_tstage",
  "n_stage",         "ordinal_nstage",
  "diabetes_merged", "factor",
  "hypertension",    "factor",
  "rural_urban",     "factor",
  "marital_status",  "factor",
  "SNP_a",           "genotype",
  "SNP_b",           "genotype",
  "free_text",       "freetext"
)

## -----------------------------------------------------------------------------
# Show the class table.
data_types_diabetes_m

## -----------------------------------------------------------------------------
# library() fails immediately if magrittr is missing, whereas require() would
# only warn and let the script fail later at the first use of the pipe.
library(magrittr) # for pipe: %>%

# First merge the diabetes variables, then pass the data containing
# diabetes_merged to the high-level QC function.
post_QC_example_data <- example_data %>%
  merge_cols(
    primary_var = diabetes_type,
    secondary_var = diabetes,
    merge_var_name = "diabetes_merged",
    rm_in_vars = TRUE
  ) %>%
  apply_quality_ctrl(
    id_var = patient_id,
    class_tbl = data_types_diabetes_m,
    bin_cats = c("No" = "Yes", "rural" = "urban")
  )

post_QC_example_data


## -----------------------------------------------------------------------------
# Merge the diabetes variables, then apply high-level QC to the data containing
# diabetes_merged, this time with to_numeric_matrix switched on.
post_QC_example_data_m <- example_data %>%
  merge_cols(
    primary_var = diabetes_type,
    secondary_var = diabetes,
    merge_var_name = "diabetes_merged",
    rm_in_vars = TRUE
  ) %>%
  apply_quality_ctrl(
    id_var = patient_id,
    class_tbl = data_types_diabetes_m,
    bin_cats = c("No" = "Yes", "rural" = "urban"),
    # Relevant line:
    to_numeric_matrix = TRUE
  )

# concise summary of output:
tibble::glimpse(post_QC_example_data_m)


## -----------------------------------------------------------------------------
# Review quality control by comparing the data before and after it.
qc_review <- review_quality_ctrl(
  before_tbl = example_data,
  after_tbl = post_QC_example_data,
  id_var = patient_id
)


## -----------------------------------------------------------------------------
qc_review$variable_level_changes

## -----------------------------------------------------------------------------
qc_review$value_level_changes

# summary of above
# across() is namespace-qualified because dplyr is never attached in this
# script; every other dplyr function here is already called via dplyr::.
qc_review$value_level_changes %>%
  dplyr::distinct(dplyr::across(!patient_id))

## ---- out.width = "100%", fig.height=3, fig.cap = "Proportion of values modified per patient in `example_data` following quality control. This plot summarises the modifications made to the data during quality control."----
qc_review$value_level_changes_plt

## ---- eval = FALSE------------------------------------------------------------
#  #csv
#  export_dataset(x = post_QC_example_data,
#                 file = "./post_QC_example_data.csv",
#                 format = "csv")
#  #tsv
#  export_dataset(x = post_QC_example_data,
#                 file = "./post_QC_example_data.tsv",
#                 format = "tsv")

## ---- fig.height=3, fig.cap="Percentage completeness (x-axis) by count (y-axis) for variables of `example_data`."----
# Completeness per variable.
variable_completeness(example_data)

# Completeness per row; rows are identified by patient_id.
row_completeness(data = example_data, id_var = patient_id)

# Completeness plot for the variables.
plot_completeness(
  data = example_data,
  id_var = patient_id,
  plot = "variables"
)

## ---- fig.height=3, fig.cap="Percentage completeness (x-axis) by count (y-axis) for rows of `example_data`."----
# Completeness plot for the rows.
plot_completeness(
  data = example_data,
  id_var = patient_id,
  plot = "rows"
)

## ---- fig.height=5, fig.show='hold', fig.cap="Completeness heatmap of `example_data` after pre-defined strings have been converted to `NA`. Yellow cells represent missing values and blue cells represent non-missing values."----
# show_rownames is passed to pheatmap() through the `...` parameter
# (FALSE is spelled out: `F` is an ordinary variable that can be reassigned).
hm <- completeness_heatmap(
  data = strings_to_NA(example_data),
  id_var = patient_id,
  show_rownames = FALSE
)
# Start a new graphics frame before printing the heatmap object.
plot.new()
hm

## ---- fig.height=5, fig.show='hold', fig.cap="Completeness heatmap of `example_data` after pre-defined strings have been converted to `NA`. Yellow cells represent missing values and blue cells represent non-missing values. Variables are annotated by their data type."----
# Same heatmap, with the data_types class table supplied as annotation_tbl.
hm <- completeness_heatmap(
  data = strings_to_NA(example_data),
  id_var = patient_id,
  show_rownames = FALSE,
  annotation_tbl = data_types
)
plot.new()
hm


## ---- fig.height=3, fig.cap="Density plot comparing percentage row completeness of `example_data` before specified strings have been converted to `NA` (purple) and after (green)."----
# Compare example_data (tbl_a) with its strings_to_NA() version (tbl_b).
compare_completeness(
  tbl_a = example_data,
  tbl_b = strings_to_NA(example_data),
  dim = 1,
  tbl_a_lab = "example_data",
  tbl_b_lab = "strings_to_NA(\nexample_data\n)"
)

## -----------------------------------------------------------------------------
# Consistency rules, one per row: a diabetes_type of "Type I" or "Type II"
# is paired with a diabetes value of "Yes".
example_incon_rules <- tibble::tribble(
  ~varA,           ~varB,      ~lgl_test, ~varA_boundaries, ~varB_boundaries,
  "diabetes_type", "diabetes", NA,        "Type I",         "Yes",
  "diabetes_type", "diabetes", NA,        "Type II",        "Yes"
)
example_incon_rules

## -----------------------------------------------------------------------------
# validate the consistency rule table
validate_consistency_tbl(
  data = example_data,
  consis_tbl = example_incon_rules
)

## -----------------------------------------------------------------------------
# Apply the rules to the data.
identify_inconsistency(
  data = example_data,
  consis_tbl = example_incon_rules
)

## -----------------------------------------------------------------------------
# Merge diabetes_type (primary) and diabetes (secondary) into diabetes_merged.
merge <- merge_cols(
  data = example_data,
  primary_var = diabetes_type,
  secondary_var = diabetes,
  merge_var_name = "diabetes_merged"
)


## ---- fig.height=3, fig.cap="Comparison of information content between two input variables and each input variable's mutual information with the merged variable (output). This plot can inform variable merging strategies. Mutual information of `merge\\$diabetes` with `output` is lower than information content of `merge\\$diabetes` which informs the user that some information loss has occurred in this merging strategy."----
# Compare the two input variables against the merged (composite) variable.
merge_IC <- compare_info_content(
  input1 = merge$diabetes,
  input2 = merge$diabetes_type,
  composite = merge$diabetes_merged
)

merge_IC

# Plot the comparison computed above.
compare_info_content_plt(compare_info_content_res = merge_IC)

## -----------------------------------------------------------------------------
# default values
example_data_NAs1 <- strings_to_NA(data = example_data)

# predefined value "equivocal" is removed
unique(example_data_NAs1$t_stage)

# custom values (T1 does not represent missingness, just used as an example)
example_data_NAs2 <- strings_to_NA(
  data = example_data,
  strings_to_replace = "T1"
)

# custom value "T1" is removed
unique(example_data_NAs2$t_stage)

# numeric value is removed in patient_id
nums_to_NA(
  data = example_data,
  patient_id,
  nums_to_replace = c(1, 3)
)


## -----------------------------------------------------------------------------
# Encode marital_status and show only the columns whose names start with
# "marital_status".
encode_cats(data = example_data, marital_status) %>%
  dplyr::select(
    dplyr::starts_with("marital_status")
  )


## -----------------------------------------------------------------------------
# Encode n_stage as an ordinal variable with the given level order.
example_data %>%
  encode_ordinals(
    ord_levels = c("N0", "N1", "N2"),
    n_stage
  ) %>%
  dplyr::select(n_stage)

# demonstrating how ordered factors can be converted to numeric vectors
example_data %>%
  encode_ordinals(
    ord_levels = c("N0", "N1", "N2"),
    n_stage
  ) %>%
  dplyr::select(n_stage) %>%
  dplyr::mutate(
    dplyr::across(n_stage, as.numeric)
  )

## -----------------------------------------------------------------------------
# Encode the two SNP variables and show the columns starting with "SNP".
encode_genotypes(data = example_data, SNP_a, SNP_b) %>%
  dplyr::select(
    dplyr::starts_with("SNP")
  )

## -----------------------------------------------------------------------------
# Identify skipgrams in example_data$free_text
skipgrams <- skipgram_identify(
  x = example_data$free_text,
  ids = example_data$patient_id,
  num_of_words = 2,
  max_interrupt_words = 5
)
skipgrams

# Summarise frequency of skipgrams to consider which should be added to the
# data.
skipgram_freq(
  skipgram_tokens = skipgrams,
  min_freq = 0.5
)

# Append chosen skipgrams to example_data
## a) by minimum frequency
skipgram_append(
  skipgram_tokens = skipgrams,
  id_var = patient_id,
  min_freq = 0.6,
  data = example_data
)

## b) by specific skipgram(s)
skipgram_append(
  skipgram_tokens = skipgrams,
  id_var = patient_id,
  skipgrams2append = c("sixteen_week", "bad_strain"),
  data = example_data
)


## -----------------------------------------------------------------------------
# Run extract_freetext() on the free_text variable with min_freq = 0.6.
extract_freetext(
  data = example_data,
  id_var = patient_id,
  min_freq = 0.6,
  free_text
)

## -----------------------------------------------------------------------------
# merge data
# (logical arguments are spelled TRUE/FALSE: `T` and `F` are ordinary
# variables that can be reassigned)
example_data_merged <- merge_cols(
  data = example_data,
  primary_var = diabetes_type,
  secondary_var = diabetes,
  merge_var_name = "diabetes_merged",
  rm_in_vars = TRUE
)

# review this step's effects on the involved variables:
count_compare(
  before_tbl = example_data,
  after_tbl = example_data_merged,
  cols2compare = c("diabetes", "diabetes_type", "diabetes_merged"),
  kableout = FALSE
)

## ---- fig.height=3, fig.cap="Proportion of values modified per patient in `example_data` following conversion of specific values to `NA`."----
#variable level modifications
report_var_mods(
  before_tbl = example_data,
  after_tbl = example_data_merged
)

# value level modifications showing which exact missingness values
# were removed
mod_track(
  before_tbl = example_data,
  after_tbl = strings_to_NA(example_data),
  id_var = patient_id
)

# plot value level modifications
mod_track(
  before_tbl = example_data,
  after_tbl = strings_to_NA(example_data),
  id_var = patient_id,
  plot = TRUE
)


## -----------------------------------------------------------------------------
# example of data which has been quality controlled.
post_qc_data <- example_data %>%
  merge_cols(
    primary_var = diabetes_type,
    secondary_var = diabetes,
    merge_var_name = "diabetes_merged",
    rm_in_vars = TRUE
  ) %>%
  apply_quality_ctrl(
    id_var = patient_id,
    class_tbl = data_types_diabetes_m,
    bin_cats = c("No" = "Yes", "rural" = "urban"),
    min_freq = 0.6
  )

# Encode the quality controlled data with encode_as_num_mat() and summarise it.
post_qc_data %>%
  encode_as_num_mat(id_var = patient_id) %>%
  tibble::glimpse()


## -----------------------------------------------------------------------------
# Pass the quality controlled data to ordinal_label_levels().
post_qc_data %>%
  ordinal_label_levels()


## ----echo=FALSE, fig.cap="Workflow of low-level semantic enrichment functions in eHDPrep. The dashed lines and box represent an optional step.", fig.align='center'----
# Embed the semantic enrichment workflow figure.
knitr::include_graphics(path = "./images/Figure_6.png")

## ---- eval=T------------------------------------------------------------------
# First merge the diabetes variables, then pass the data containing
# diabetes_merged to the high-level QC function with to_numeric_matrix = TRUE.
post_qc_data <- example_data %>%
  merge_cols(
    primary_var = diabetes_type,
    secondary_var = diabetes,
    merge_var_name = "diabetes_merged",
    rm_in_vars = TRUE
  ) %>%
  apply_quality_ctrl(
    id_var = patient_id,
    class_tbl = data_types_diabetes_m,
    bin_cats = c("No" = "Yes", "rural" = "urban"),
    to_numeric_matrix = TRUE
  )


## -----------------------------------------------------------------------------
# Load and print the example ontology.
data(example_ontology)
example_ontology

## ---- out.width = "100%", fig.cap="Visualisation of `example_ontology` using the ggraph package."----
# ggplot2 supplies arrow(), unit(), aes(), theme_void(), theme() and
# coord_flip() used below. library() fails immediately if the package is
# missing, whereas require() would only warn and return FALSE.
library(ggplot2)
ggraph::ggraph(example_ontology, layout = "sugiyama") +
  ggraph::geom_edge_diagonal(
    arrow = arrow(length = unit(3, "mm")),
    colour = "slategray3"
  ) +
  ggraph::geom_node_label(
    aes(label = name),
    size = 2.5, repel = FALSE, hjust = "inward"
  ) +
  theme_void() +
  theme(legend.position = "none") +
  coord_flip()


## -----------------------------------------------------------------------------
# Load and print the example mapping file.
data(example_mapping_file)
example_mapping_file


## ---- warning=F---------------------------------------------------------------
# High-level semantic enrichment of the quality controlled data, using the
# example ontology and mapping file.
qc_se_data <- semantic_enrichment(
  data = post_qc_data,
  ontology = example_ontology,
  mapping_file = example_mapping_file,
  mode = "in",
  root = "root"
)

## -----------------------------------------------------------------------------
tibble::glimpse(qc_se_data)

## -----------------------------------------------------------------------------
# Three input variables alongside the columns whose names start with
# "MV_property_of_cancer".
qc_se_data %>%
  dplyr::select(
    tumoursize, t_stage, n_stage,
    dplyr::starts_with("MV_property_of_cancer")
  ) %>%
  tibble::glimpse()

## ---- echo = F----------------------------------------------------------------
# Some summary stats of SE

# number of aggregations
num_aggs <- qc_se_data %>%
  dplyr::select(dplyr::starts_with("MV_")) %>%
  length()

# number of meta-variables used (not above / 5 because of zero entropy vars)
# substr() drops the first three and the last four characters of each name.
num_MVs <- qc_se_data %>%
  dplyr::select(dplyr::starts_with("MV_")) %>%
  names() %>%
  substr(., 4, nchar(.) - 4) %>%
  unique() %>%
  length()

## -----------------------------------------------------------------------------
# Print the example edge table.
example_edge_tbl

## -----------------------------------------------------------------------------
# Convert the edge table to a graph and print it.
example_ontology <- edge_tbl_to_graph(example_edge_tbl)
example_ontology

## -----------------------------------------------------------------------------
# Join the dataset variables to the ontology using the mapping file.
joined_nw <- join_vars_to_ontol(
  ontol_graph = example_ontology,
  var2entity_tbl = example_mapping_file,
  root = "root",
  k = 0.5
)

## ---- out.width = "100%", fig.width=11, fig.cap="Visualisation of `example_ontology` using the ggraph package, coloured by the category of the node."----
# Plot the joined network with node labels coloured by node_category.
# `repel` is spelled FALSE: `F` is an ordinary variable that can be reassigned.
ggraph::ggraph(joined_nw, layout = "sugiyama") +
  ggraph::geom_edge_diagonal(
    arrow = arrow(length = unit(3, "mm")),
    colour = "slategray3"
  ) +
  ggraph::geom_node_label(
    aes(label = name, color = node_category),
    size = 2.5, repel = FALSE, hjust = "inward"
  ) +
  theme_void() +
  scale_color_brewer(palette = "Set2") +
  coord_flip() +
  theme(legend.position = c(0.08, 0.85))
    

## ---- out.width = "100%", fig.width=9, warning=FALSE, fig.cap="Visualisation of `example_ontology` using the ggraph package, coloured by the category of the node. Node size is proportional to node information content. Node labels denote node information content. Dataset variable nodes (right hand side of figure) are not visible as information content is only applicable to ontological entities."----
# Points are sized by information_content; labels show it rounded to two
# decimal places. Both are coloured by node_category.
ggraph::ggraph(joined_nw, layout = "sugiyama") +
  ggraph::geom_edge_diagonal(
    arrow = arrow(length = unit(3, "mm")),
    colour = "slategray3"
  ) +
  ggraph::geom_node_point(
    aes(color = node_category, size = information_content)
  ) +
  ggraph::geom_node_label(
    aes(label = round(information_content, 2), color = node_category),
    size = 2.5, hjust = "inward"
  ) +
  scale_color_brewer(palette = "Set2") +
  theme_void() +
  coord_flip()

## -----------------------------------------------------------------------------
# Join the variables to the ontology, then pass the result to
# metavariable_info().
metavariables_nw <- example_ontology %>%
  join_vars_to_ontol(
    var2entity_tbl = example_mapping_file,
    root = "root"
  ) %>%
  metavariable_info()

## ----out.width="100%", fig.width=9, fig.cap="Visualisation of `example_ontology` using the ggraph package. Ontological entities which link two or more dataset variables as descendants are labelled with numeric identifiers for the set of variables linked. Variable sets 5 and 8 are shown to have multiple common ancestors. This demonstrates the need to consider the information content of common ancestors so that the most informative common ancestor is used in the labelling of meta-variables."----
metavariables_nw %>%
  # annotations are also considered a set. This isn't helpful for this visualisation
  # Therefore, the sets of non-meta-variables are removed below
  tidygraph::mutate(variable_set = ifelse(!is_metavariable, NA, variable_set)) %>%
  tidygraph::mutate(variable_set = as.factor(variable_set)) %>%
  ggraph::ggraph(layout = "sugiyama") +
  ggraph::geom_edge_diagonal(
    arrow = arrow(length = unit(3, "mm")),
    colour = "slategray3"
  ) +
  # Meta-variable nodes are labelled and coloured by the numeric code of their
  # variable set; all other nodes use their name and node_category.
  # `repel` is spelled FALSE: `F` is an ordinary variable that can be reassigned.
  ggraph::geom_node_label(
    aes(
      label = ifelse(is_metavariable,
                     as.factor(as.numeric(variable_set)),
                     name),
      color = ifelse(is_metavariable,
                     as.character(as.numeric(variable_set)),
                     node_category)
    ),
    repel = FALSE, size = 2.5, hjust = "inward"
  ) +
  theme_void() +
  theme(legend.position = "none") +
  coord_flip()

## -----------------------------------------------------------------------------
# Pass the meta-variable network to metavariable_variable_descendants().
metavariable_variable_descendants(metavariables_nw)

## -----------------------------------------------------------------------------
# Low-level pipeline: join variables to the ontology, add meta-variable
# information, then aggregate using the quality controlled data.
qc_se_data <- example_ontology %>%
  join_vars_to_ontol(
    var2entity_tbl = example_mapping_file,
    root = "root"
  ) %>%
  metavariable_info() %>%
  metavariable_agg(data = post_qc_data)

## summary of output
tibble::glimpse(qc_se_data)


## ---- include=FALSE-----------------------------------------------------------
# restore original options
# old_opts and old_knitr_opts are the snapshots taken at the top of this
# script, before any options were changed.
options(old_opts)
knitr::opts_chunk$set(old_knitr_opts)