## ---- include = FALSE---------------------------------------------------------
# chunk options applied to the whole vignette: collapse = TRUE, and printed
# output lines are prefixed with "#>"
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)

## -----------------------------------------------------------------------------
# load synthesisr first, so a missing installation fails here rather than
# part-way through the example
library(synthesisr)

# system.file will look for the path to where synthesisr is installed
# by using the example bibliographic data files, you can reproduce the vignette
# the directory is given without a trailing slash, because directory names
# ending in a slash are not reliably recognised on Windows
# mustWork = TRUE stops with an error if the example data cannot be found,
# instead of silently returning an empty list of files
bibfiles <- list.files(
  system.file("extdata", package = "synthesisr", mustWork = TRUE),
  full.names = TRUE
)

# we can print the list of bibfiles to confirm what we will import
# in this example, we have bibliographic data exported from Scopus and
# Zoological Record
print(bibfiles)

# now we can use read_refs to read in our bibliographic data files
# we save them to a data.frame object (because return_df = TRUE) called
# imported_files
imported_files <- read_refs(
  filename = bibfiles,
  return_df = TRUE
)


## -----------------------------------------------------------------------------
# begin with the most conservative pass: articles with identical titles
# exact title matches are removed straight away, without manual review
df <- deduplicate(imported_files, match_by = "title", method = "exact")


## -----------------------------------------------------------------------------
# exact matching leaves some duplicate articles behind
# e.g. the titles of articles 91 and 114 appear identical when printed, but
# the dash-like symbol in title 91 is a special character, not punctuation,
# so the two titles were not classified as identical
df$title[c(91, 114)]

# likewise, the title for article 96 is missing a space
df$title[c(21, 96)]

# and title 47 has an extra space
df$title[c(47, 101)]

# to pick up these near-matches, we use string distance between titles
# to identify likely duplicates
duplicates_string <- find_duplicates(
  df$title,
  method = "string_osa",
  threshold = 7,
  rm_punctuation = TRUE,
  to_lower = TRUE
)

# pull out the entries flagged as likely duplicates, so that the titles
# can be reviewed manually to confirm they really are duplicates
manual_checks <- review_duplicates(df$title, duplicates_string)


## ---- include=FALSE, eval=TRUE------------------------------------------------
# cut the first column of manual_checks down to its first 60 characters,
# then display the shortened table
manual_checks[, 1] <- substring(manual_checks[, 1], first = 1, last = 60)
manual_checks

## -----------------------------------------------------------------------------
print(manual_checks)

# match #99 groups titles that are not duplicates, so both must be kept
# override_duplicates lets us manually mark them as unique
new_duplicates <- synthesisr::override_duplicates(
  duplicates_string,
  99
)

# finally, extract the unique references from our dataset
# this takes the dataset (df) and the matching articles (new_duplicates)
results <- extract_unique_references(
  df,
  new_duplicates
)


## ----paged.print=TRUE---------------------------------------------------------

# synthesisr can write the full dataset to a bibliographic file
# but in this example, we will just write the first citation
# we also want it to be a nice clean bibliographic file, so we remove NA data
# this makes it easier to view the output when working with a single article
# drop = FALSE keeps the result a one-row data.frame even when only a single
# field is non-missing; without it the selection would collapse to a vector
citation <- df[1, !is.na(df[1, ]), drop = FALSE]

format_citation(citation)

write_refs(citation, format = "bib", file = FALSE)