## ----setup, include = FALSE---------------------------------------------------
# Global chunk options for the vignette: collapse source and output,
# prefix printed output with "#>", and suppress warnings/messages.
knitr::opts_chunk$set(
  comment = "#>",
  collapse = TRUE,
  message = FALSE,
  warning = FALSE
)
library(subtools)
## ----read-srt-----------------------------------------------------------------
# Path to the SubRip example file shipped with the package
f_srt <- system.file("extdata", "ex_subrip.srt", package = "subtools")
subs <- f_srt |> read_subtitles()
subs
## ----read-vtt-----------------------------------------------------------------
# WebVTT input, with the format stated explicitly
vtt_path <- system.file("extdata", "ex_webvtt.vtt", package = "subtools")
read_subtitles(file = vtt_path, format = "webvtt")
## ----read-ass-----------------------------------------------------------------
# SubStation Alpha input, with the format stated explicitly
ass_path <- system.file("extdata", "ex_substation.ass", package = "subtools")
read_subtitles(file = ass_path, format = "substation")
## ----metadata-----------------------------------------------------------------
# Arbitrary metadata columns can be attached at read time
meta <- tibble::tibble(Season = 1L, Episode = 3L, Language = "en")
subs_meta <- read_subtitles(file = f_srt, metadata = meta)
subs_meta
## ----as-subtitle--------------------------------------------------------------
# A character vector of SubRip-formatted text, one element per line
srt_lines <- c(
  "1",
  "00:00:01,000 --> 00:00:03,500",
  "Hello, world.",
  "",
  "2",
  "00:00:04,000 --> 00:00:06,000",
  "This is subtools."
)
as_subtitle(srt_lines, format = "srt")
## ----info---------------------------------------------------------------------
s <- system.file("extdata", "ex_subrip.srt", package = "subtools") |>
  read_subtitles()
get_subtitles_info(s)
## ----raw-text-----------------------------------------------------------------
# Extract the plain transcript text from the subtitles object
transcript <- get_raw_text(s)
transcript
# With collapse, cues are joined into one string, one per line
cat(get_raw_text(s, collapse = "\n"))
## ----dplyr--------------------------------------------------------------------
library(dplyr)
# Lines spoken after the first 30 seconds
cutoff <- hms::as_hms("00:00:30")
filter(s, Timecode_in > cutoff)
# Duration of each subtitle cue (in seconds)
s |>
  mutate(duration_s = as.numeric(Timecode_out - Timecode_in)) |>
  select(ID, Text_content, duration_s)
## ----clean-tags---------------------------------------------------------------
tag_lines <- c(
  "1",
  "00:00:01,000 --> 00:00:03,000",
  "This is important.",
  "",
  "2",
  "00:00:04,000 --> 00:00:06,000",
  "Warning!"
)
# clean.tags = FALSE keeps tags so we can demonstrate cleaning
tagged <- as_subtitle(tag_lines, format = "srt", clean.tags = FALSE)
tagged$Text_content
clean_tags(tagged)$Text_content
## ----clean-captions-----------------------------------------------------------
bb_path <- system.file("extdata", "ex_breakingbad.srt", package = "subtools")
# Read without tag cleaning so the raw caption text is visible first
bb <- read_subtitles(file = bb_path, clean.tags = FALSE)
bb$Text_content
clean_captions(bb)$Text_content
## ----clean-patterns-----------------------------------------------------------
# Remove speaker labels such as "WALTER:" or "JESSE:"
labeled_lines <- c(
  "1", "00:00:01,000 --> 00:00:03,000", "WALTER: We need to cook.",
  "",
  "2", "00:00:04,000 --> 00:00:06,000", "JESSE: Yeah, Mr. White!"
)
s_labeled <- as_subtitle(labeled_lines, format = "srt", clean.tags = FALSE)
clean_patterns(s_labeled, pattern = "^[A-Z]+: ")$Text_content
## ----clean-chain--------------------------------------------------------------
# Cleaning helpers compose with the pipe; the final regex removes
# leading dialogue dashes
s_clean <- f_srt |>
  read_subtitles(clean.tags = FALSE) |>
  clean_tags() |>
  clean_captions() |>
  clean_patterns(pattern = "^-\\s*")
s_clean$Text_content
## ----bind-collapse------------------------------------------------------------
# Two episodes, each tagged with an Episode metadata column
s1 <- read_subtitles(
  system.file("extdata", "ex_subrip.srt", package = "subtools"),
  metadata = tibble::tibble(Episode = 1L)
)
s2 <- read_subtitles(
  system.file("extdata", "ex_rushmore.srt", package = "subtools"),
  metadata = tibble::tibble(Episode = 2L)
)
# By default bind_subtitles() stacks everything into one table
combined <- bind_subtitles(s1, s2)
nrow(combined)
range(combined$Timecode_in)
## ----bind-list----------------------------------------------------------------
# collapse = FALSE keeps each episode as a separate element instead of
# stacking them into a single table
multi <- bind_subtitles(s1, s2, collapse = FALSE)
class(multi)
print(multi)
## ----info-multi---------------------------------------------------------------
get_subtitles_info(multi)
## ----read-series-demo, eval=FALSE---------------------------------------------
# # Read a single season
# season1 <- read_subtitles_season(dir = "BreakingBad/Season_01/")
#
# # Read an entire series (all seasons)
# bb_all <- read_subtitles_serie(dir = "BreakingBad/")
#
# # Read multiple series at once
# collection <- read_subtitles_multiseries(dir = "Series_Collection/")
## ----move---------------------------------------------------------------------
# Shift every cue 2.5 seconds later
subs_shifted <- move_subtitles(subs, lag = 2.5)
# Compare first cue before and after
subs$Timecode_in[1]
subs_shifted$Timecode_in[1]
## ----move-multi---------------------------------------------------------------
# A negative lag shifts cues earlier; also works on the multi-episode object
multi_shifted <- move_subtitles(multi, lag = -1.0)
multi_shifted[[1]]$Timecode_in[1]
## ----write, eval=FALSE--------------------------------------------------------
# write_subtitles(x = subs_shifted, file = "synced_episode.srt")
## ----unnest-words-------------------------------------------------------------
# Tokenize the subtitle text, one token per row
words <- unnest_tokens(subs)
words
## ----unnest-ngrams------------------------------------------------------------
# Bigrams: overlapping two-word windows
bigrams <- unnest_tokens(
  subs,
  output = Word,
  input = Text_content,
  token = "ngrams",
  n = 2
)
bigrams$Word
## ----word-freq----------------------------------------------------------------
library(dplyr)
# Ten most frequent tokens across the subtitle file
words |>
  count(Text_content, sort = TRUE) |>
  slice_head(n = 10)
## ----cross-episode------------------------------------------------------------
# Small helper to avoid repeating the read-with-metadata boilerplate
read_episode <- function(fname, ep) {
  read_subtitles(
    file = system.file("extdata", fname, package = "subtools"),
    metadata = tibble::tibble(Episode = ep)
  )
}
ep1 <- read_episode("ex_breakingbad.srt", 1L)
ep2 <- read_episode("ex_rushmore.srt", 2L)
ep3 <- read_episode("ex_webvtt.vtt", 3L)
corpus <- bind_subtitles(ep1, ep2, ep3)
# Token counts per episode
token_counts <- corpus |>
  unnest_tokens() |>
  count(Episode, Text_content, sort = TRUE)
# Top 5 tokens within each episode
token_counts |>
  slice_max(n, n = 5, by = Episode)
## ----tfidf--------------------------------------------------------------------
# tf-idf scores tokens by how specific they are to each episode
token_counts |>
  tidytext::bind_tf_idf(term = Text_content, document = Episode, n = n) |>
  arrange(Episode, desc(tf_idf)) |>
  slice_max(tf_idf, n = 5, by = Episode)
## ----timeline, fig.width = 7, fig.height = 3----------------------------------
# Convert cue start times (hms, stored in seconds) to minutes
words_ep1 <- ep1 |>
  unnest_tokens() |>
  mutate(minute = as.numeric(Timecode_in) / 60)
# ggplot2 is optional; only draw the plot when it is installed
if (requireNamespace("ggplot2", quietly = TRUE)) {
  library(ggplot2)
  p <- ggplot(words_ep1, aes(x = minute)) +
    geom_histogram(binwidth = 0.5, fill = "steelblue", colour = "white") +
    labs(
      title = "Word density over time",
      x = "Time (minutes)",
      y = "Word count"
    ) +
    theme_minimal()
  print(p)
}