## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5 ) library(epidm) library(data.table) library(dplyr) ## ----ukpid-example, echo=TRUE, warning=FALSE---------------------------------- # 1) Create example data id_test <- data.frame( stringsAsFactors = FALSE, record_id = c(1L,2L,3L,4L, 5L,6L,7L,8L,9L,10L,11L,12L,13L,14L,15L, 16L,17L,18L,19L,20L,21L,22L,23L,24L), nhs_number = c(9435754422, 9435754422,NA,9435754422,5555555555,NA, 9435773982,NA,9999999999,NA,9435773982,NA, 9435802508,9435802508,NA,NA,9435802508,9435802508,NA, 3333333333,NA,9999999999,9435817777, 9435817777), local_patient_identifier = c(NA,"IG12067", NA,NA,"IG12067","IG12067","KR2535","KR2535", "KR2535",NA,NA,NA,"UK8734","UK8734",NA,NA, "UK8734","UK8734",NA,NA,"JH45204", "HS45202","HS45202","JH45204"), patient_birth_date = c("1993-07-16", "1993-07-16","1993-07-16","1993-07-16", "1993-07-16",NA,"1967-02-10",NA,"1967-02-10",NA,NA, "1967-02-10",NA,NA,"1952-10-22","1952-10-22", "1952-10-22",NA,"1947-09-14","1947-09-14", "1947-09-14","1947-09-14","1947-09-14", "1947-09-14"), sex = c("Male","Male", "Male","Male",NA,"Male","Female","Female", "Female","Female","Female","Female","Male", "Male","Male","Male","Male","Male","Male", "Male","Male","Male",NA,"Male"), forename = c(NA,"DENNIS", NA,NA,"DENNIS",NA,"ELLIE","ELLIE",NA, "ELLIE","ELLIE","ELLIE","IAN","IAN","MALCOLM", "IAN","IAN",NA,"GRANT","ALAN","ALAN","ALAN", "GRANT","ALAN"), surname = c(NA,"NEDRY", "NEDRY",NA,"NEDRY","NEDRY","SATTLER","SATTLER", NA,"SATTLER","SATTLER","SATTLER","M",NA, "IAN","MALCOLM","MALCOLM",NA,"ALAN","GRANT", "GRANT","GRANT","ALAN","GRANT"), postcode = c("HA4 0FF", "HA4 0FF","HA4 0FF",NA,"HA4 0FF","HA4 0FF", "L3 1DZ","L3 1DZ","L3 1DZ","L3 1DZ",NA,"L3 1DZ", "BN14 9EP",NA,"BN14 9EP",NA,NA,NA,"CW6 9TX", "CW6 9TX",NA,NA,NA,NA), specimen_date = c("2024-08-14", "2023-02-03","2023-02-07","2023-02-04", "2023-02-09","2024-08-14","2021-03-28","2021-03-28", "2021-03-28","2021-03-28","2021-03-28", "2021-03-28","2024-07-06","2024-07-06","2024-07-06", "2023-10-31","2023-10-31","2023-10-31", "2022-01-23","2022-01-24","2022-01-25","2022-01-26", "2022-01-27","2022-01-28") ) # 2) Run uk_patient_id() result_id <- uk_patient_id( id_test, id = list( nhs_number = 'nhs_number', hospital_number = 'local_patient_identifier', date_of_birth = 'patient_birth_date', sex_mfu = 'sex', forename = 'forename', surname = 'surname', postcode = 'postcode' ), .useStages = 1:11, # optional .keepStages = FALSE, # optional .keepValidNHS = FALSE # optional ) # 3) Show a preview print(head(result_id), row.names = FALSE) ## ----grouptime-example, echo=TRUE--------------------------------------------- # Events example (14‑day static window): #1) Create example data episode_test <- structure( list( pat_id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), species = c(rep("E. coli", 7), rep("K. pneumonia", 7)), spec_type = c(rep("Blood", 7), rep("Blood", 4), rep("Sputum", 3)), sp_date = structure( c( 18262, 18263, 18281, 18282, 18262, 18263, 18281, 18265, 18270, 18281, 18283, 18259, 18260, 18281 ), class = "Date" ) ), row.names = c(NA, -14L), class = "data.frame" ) # 2) Run group_time() for events using a 14-day static window ep_static <- group_time( x = episode_test, date_start = 'sp_date', window = 14, window_type = 'static', group_vars = c('pat_id', 'species', 'spec_type'), indx_varname = 'static_indx', # optional min_varname = 'min_date', # optional (defaults) max_varname = 'max_date' # optional (defaults) ) # 3) Show a preview print(head(ep_static), row.names = FALSE) # Intervals example (start + end dates):1) Create example interval data (start + end dates) #1) Create example data spell_test <- data.frame( id = c(rep(99, 6), rep(88, 4), rep(3, 3)), provider = c("YXZ", rep("ZXY", 5), rep("XYZ", 4), rep("YZX", 3)), spell_start = as.Date( c( "2020-03-01", "2020-07-07", "2020-02-08", "2020-04-28", "2020-03-15", "2020-07-01", "2020-01-01", "2020-01-12", "2019-12-25", "2020-03-28", "2020-01-01", NA, NA ) ), spell_end = as.Date( c( "2020-03-10", "2020-07-26", "2020-05-22", "2020-04-30", "2020-05-20", "2020-07-08", "2020-01-23", "2020-03-30", "2020-01-02", "2020-04-20", "2020-01-01", NA, NA ) ) ) # 2) Run group_time() for intervals (start + end dates) spell_episodes <- group_time( x = spell_test, date_start = 'spell_start', date_end = 'spell_end', group_vars = c('id', 'provider'), indx_varname = 'spell_id', # optional min_varname = 'spell_min_date', # optional max_varname = 'spell_max_date' # optional ) # 3) Show a preview print(head(spell_episodes), row.names = FALSE) ## ----full-example, echo=TRUE-------------------------------------------------- # Example data generation # Helper to make plausible 10-digit NHS-like strings mk_nhs <- function(n) { apply(matrix(sample(0:9, n * 10, replace = TRUE), nrow = n, byrow = TRUE), 1, paste0, collapse = "") } # A small "people" frame to borrow shared attributes from persons <- tibble::tibble( id_person = 1:6, nhsnumber = c(mk_nhs(5), NA_character_), # include one missing NHS forename = c("John", "Jane", "Sam", "Aisha", "Maya", "John"), surname = c("Smith", "Doe", "Patel", "Khan", "Brown", "Smyth"), # one spelling variant date_of_birth = as.Date(c("1980-03-14","1991-11-02","1985-07-28","2002-01-09","2010-05-30","1980-03-14")), sex = c("M","F","M","F","U","M"), postcode = c("SW1A 1AA","E1 6AN","B1 1AA","M1 1AE","CF10 1EP","SW1A1AA") # one without space ) # --- SGSS-like data: multiple specimens per person, some within 30 days ------ sgss_data <- persons |> # duplicate person 1 (two specimens), and include others slice(c(1, 1, 2, 3, 4, 5, 6)) |> mutate( CDR_OPIE_ID = row_number() + 1000L, earliest_specimen_date = as.Date("2023-10-01") + c(0, 10, 5, 15, 40, 3, 2), GROUP_A_STREP_PCR = c("Detected","Not detected","Detected","Detected","Not detected","Detected","Detected"), third = c("emm1","emm1","emm3","emm12","",NA,"emm89") ) |> select( CDR_OPIE_ID, nhsnumber, forename, surname, date_of_birth, sex, postcode, earliest_specimen_date, GROUP_A_STREP_PCR, third ) # --- CIMS-like data: case notifications, some within 30 days of SGSS ---------- cims_data <- persons |> # person 3 appears twice, person 1 appears once etc. slice(c(1, 2, 3, 3, 4, 6)) %>% mutate( Case_identifier = row_number() + 2000L, Date_entered = as.Date("2023-10-05") + c(0, 6, 18, 50, 35, 1), Infection = c("iGAS","Scarlet fever","iGAS","iGAS","Scarlet fever","iGAS") ) |> select( Case_identifier, nhsnumber, forename, surname, date_of_birth, sex, postcode, Date_entered, Infection ) # Example start: # Example of importing SGSS and CIMS data ready for linkage lnk.data_sgss <- sgss_data lnk.data_cims <- cims_data # Update SGSS column classes so they match with CIMS lnk.data_sgss <- lnk.data_sgss |> mutate( date_of_birth = as.Date(date_of_birth), forename = as.character(forename), surname = as.character(surname), postcode = as.character(postcode), patient_demog_sex = as.character(sex), earliest_specimen_date = as.Date(earliest_specimen_date), nhsnumber = as.character(nhsnumber) ) # Update CIMS column classes so they match SGSS lnk.data_cims <- lnk.data_cims |> mutate( date_of_birth = as.Date(date_of_birth), forename = as.character(forename), surname = as.character(surname), postcode = as.character(postcode), sex = as.character(sex), Date_entered = as.Date(Date_entered)) # Convert data.frame to data.table so can be fed into function lnk.dt_cims <- data.table::setDT(lnk.data_cims) lnk.dt_sgss <- data.table::setDT(lnk.data_sgss) # Stack the two data sets lnk.dt_combined <- bind_rows( mutate(lnk.dt_cims, data_source = "dt_cims"), mutate(lnk.dt_sgss, data_source = "dt_sgss") ) # Add common date field to be used during deduplication lnk.dt_combined <- lnk.dt_combined |> mutate(common_date = (coalesce(earliest_specimen_date, Date_entered))) # List of id fields to be used during normalisation of id fields # This is a parameter that is fed into the uk_patient_id() function id = list( nhs_number = 'nhsnumber', date_of_birth = 'date_of_birth', sex_mfu = 'sex', forename = 'forename', surname = 'surname', postcode = 'postcode' ) # Feeding combined SGSS and CIMS data into uk_patient_id() function to get # unique patient identifiers lnk.dt_combined_norm <- epidm::uk_patient_id( lnk.dt_combined, id, .useStages = c(1:6), .sortOrder = 'common_date', .forceCopy = TRUE, .keepValidNHS = FALSE, .keepStages = TRUE ) # Group records by `id` into rolling 30-day windows based on `common_date`, # using `dedupe_key` as the unique row index; the trailing [] forces evaluation and returns a data.table lnk.dt_combined_grouped <- epidm::group_time( x = lnk.dt_combined_norm, date_start = 'common_date', window = 30, window_type = 'rolling', indx_varname = 'dedup_key', group_vars = c( "id" ) )[] # Filter to pull out just CIMS data lnk.grouped_cims <- lnk.dt_combined_grouped |> filter(data_source == "dt_cims") |> select ( id, Case_identifier, date_of_birth, forename, surname, nhsnumber, postcode, Infection, sex, common_date, data_source, dedup_key ) # Filter to pull out just SGSS data lnk.grouped_sgss <- lnk.dt_combined_grouped |> filter(data_source == "dt_sgss") |> select ( CDR_OPIE_ID, id, date_of_birth, forename, surname, nhsnumber, postcode, GROUP_A_STREP_PCR, common_date, data_source, dedup_key, third ) # Taking one row from each split - gives one episode per dataset lnk.grouped_cims_deduped <- lnk.grouped_cims |> group_by(dedup_key) |> slice(1) |> ungroup() # De-duplicating SGSS- prioritising earliest emm typing with a relevant result lnk.grouped_sgss_deduped <- lnk.grouped_sgss |> group_by(dedup_key) |> arrange(dedup_key, common_date) |> slice(1) |> ungroup() # Join splits by dedup key- common field names with .x refer to CIMS and .y for SGSS lnk.split_join_cims_sgss <- full_join(lnk.grouped_cims_deduped, lnk.grouped_sgss_deduped, by = "dedup_key")