This vignette demonstrates how to detect input languages and translate only non-English text, integrating seamlessly with tidyverse workflows. This approach is particularly useful for cleaning multilingual datasets and processing mixed-language content efficiently.
Language detection and conditional translation allow you to identify the language of each text entry and translate only the entries that are not already in English.
Let’s start with simple language detection:
# Sample texts in different languages
sample_texts <- c(
  "Hello, how are you today?", # English
  "Bonjour, comment allez-vous?", # French
  "Hola, ¿cómo estás hoy?", # Spanish
  "Guten Tag, wie geht es Ihnen?", # German
  "Ciao, come stai oggi?" # Italian
)

# Detect the language of each sample text.
# vapply() is used instead of sapply(): it guarantees a character vector
# is returned (one code per input), even for empty or degenerate input.
detected_languages <- vapply(sample_texts, language_detect, character(1))
print("Detected languages:")
print(detected_languages)
Create a function that only translates non-English text:
#' Translate text to English only when it is not already English.
#'
#' @param text A character string to inspect and conditionally translate.
#' @param target_language Language code passed to google_translate()
#'   (default "en").
#' @return A list with elements `original`, `translated`,
#'   `was_translated` (logical), and `detected_language`.
translate_if_not_english <- function(text, target_language = "en") {
  # Detect language of the input text
  detected_lang <- language_detect(text)

  # Anchor the match so only "en" or regional variants such as "en-US"
  # count as English. A bare grepl("en", ...) would also match codes
  # like "ben" (Bengali) or "ven" (Venda), skipping their translation.
  is_english <- grepl("^en([-_]|$)", detected_lang, ignore.case = TRUE)

  if (is_english) {
    # Already English: return the text unchanged
    list(
      original = text,
      translated = text,
      was_translated = FALSE,
      detected_language = detected_lang
    )
  } else {
    # Not English: translate to the target language
    translated_text <- google_translate(
      text,
      target_language = target_language,
      source_language = "auto"
    )
    list(
      original = text,
      translated = translated_text,
      was_translated = TRUE,
      detected_language = detected_lang
    )
  }
}
# Exercise the conditional translator on a French sentence
french_text <- "Bonjour, j'aimerais acheter un billet."
outcome <- translate_if_not_english(french_text)

print("Conditional translation result:")
print(paste("Original:", outcome$original))
print(paste("Translated:", outcome$translated))
print(paste("Was translated:", outcome$was_translated))
print(paste("Detected language:", outcome$detected_language))
Here’s a practical example with a tibble containing mixed-language rows:
# Build a demo dataset with mixed languages, typical of user-generated
# content such as reviews and feedback forms.
feedback_texts <- c(
  "Great product, very satisfied!", # English
  "Excelente producto, muy satisfecho!", # Spanish
  "Produit fantastique, je le recommande!", # French
  "This service exceeded my expectations.", # English
  "Der Service war wirklich hervorragend.", # German
  "Servizio eccellente, davvero impressionante!", # Italian
  "The delivery was fast and reliable.", # English
  "La livraison était rapide et fiable." # French
)

mixed_data <- tibble(
  id = seq_len(8),
  user_feedback = feedback_texts,
  rating = c(5, 5, 4, 5, 4, 5, 4, 4),
  category = rep(c("product", "service"), 4)
)

print("Original mixed-language dataset:")
print(mixed_data)
Now let’s detect languages and conditionally translate:
# Run one feedback entry through the conditional translator and
# return the result as a one-row tibble (ready for row-binding).
process_feedback <- function(text) {
  res <- translate_if_not_english(text)
  tibble(
    original_text = res$original,
    english_text = res$translated,
    was_translated = res$was_translated,
    detected_language = res$detected_language
  )
}
# Process every feedback entry, then attach the per-row translation
# results as new columns alongside the original data.
processed_results <- mixed_data$user_feedback %>%
  purrr::map_dfr(process_feedback)
enhanced_data <- bind_cols(mixed_data, processed_results)

print("Enhanced dataset with language detection and translation:")
print(enhanced_data)
For more sophisticated data processing workflows:
library(stringr)
# Enhanced processing function with more details
# Detect language, conditionally translate, and record a status for
# each row of a data frame.
#
# @param df A data frame containing a character column to process.
# @param text_column Name of that column, supplied as a string.
# @return `df` with added columns: detected_lang, needs_translation,
#   english_text, translation_status.
enhanced_language_processing <- function(df, text_column) {
  df %>%
    mutate(
      # Detect language per entry; fall back to "unknown" on API errors
      detected_lang = map_chr(
        .data[[text_column]],
        ~ tryCatch(language_detect(.x), error = function(e) "unknown")
      ),
      # Anchored match: a plain "en" pattern would also hit codes such
      # as "ben" (Bengali), wrongly skipping their translation.
      needs_translation = !str_detect(detected_lang, "^en([-_]|$)"),
      # Translate only non-English text; keep the original on failure
      english_text = map2_chr(
        .data[[text_column]], needs_translation,
        ~ if (.y) {
          tryCatch(google_translate(.x, target_language = "en"),
                   error = function(e) .x)
        } else {
          .x
        }
      ),
      # Classify what happened to each row
      translation_status = case_when(
        detected_lang == "unknown" ~ "detection_failed",
        !needs_translation ~ "already_english",
        english_text != .data[[text_column]] ~ "translated",
        TRUE ~ "translation_failed"
      )
    )
}
# Run the enhanced pipeline and show the per-row status columns
result_data <- enhanced_language_processing(mixed_data, "user_feedback")

print("Advanced processing results:")
result_data %>%
  select(id, detected_lang, needs_translation, translation_status) %>%
  print()
Process large datasets efficiently by filtering and batching:
# Larger demo dataset: alternating English and non-English entries,
# plus timestamp/priority metadata to mimic a realistic feed.
large_dataset <- tibble(
  id = seq_len(20),
  content = c(
    "Amazing service quality", # EN
    "Fantástico servicio al cliente", # ES
    "Service client exceptionnel", # FR
    "Great user experience", # EN
    "Esperienza utente eccellente", # IT
    "Ausgezeichnete Benutzerführung", # DE
    "Fast shipping and delivery", # EN
    "Livraison rapide et efficace", # FR
    "Excellent product quality", # EN
    "Qualità del prodotto superiore", # IT
    "Easy to use interface", # EN
    "Interfaz muy fácil de usar", # ES
    "Highly recommend this product", # EN
    "Je recommande vivement ce produit", # FR
    "Outstanding customer support", # EN
    "Soporte al cliente sobresaliente", # ES
    "Very satisfied with purchase", # EN
    "Sehr zufrieden mit dem Kauf", # DE
    "Will definitely buy again", # EN
    "Sicuramente acquisterò di nuovo" # IT
  ),
  # Jitter timestamps around "now"; priorities drawn at random
  timestamp = Sys.time() + sample(-1000:1000, 20),
  priority = sample(c("high", "medium", "low"), 20, replace = TRUE)
)
# Efficient batch processing workflow
batch_process_languages <- function(df, text_col, batch_size = 5) {
# First, detect languages for all entries
df_with_detection <- df %>%
mutate(
row_id = row_number(),
detected_lang = map_chr(!!rlang::sym(text_col),
~ tryCatch(language_detect(.x), error = function(e) "en")),
is_english = str_detect(detected_lang, "en")
)
# Separate English and non-English content
english_content <- df_with_detection %>% filter(is_english)
non_english_content <- df_with_detection %>% filter(!is_english)
# Process non-English content in batches
if (nrow(non_english_content) > 0) {
non_english_content <- non_english_content %>%
mutate(
batch_id = ceiling(row_number() / batch_size),
english_text = map_chr(!!rlang::sym(text_col),
~ tryCatch(google_translate(.x, target_language = "en"),
error = function(e) .x))
)
} else {
non_english_content <- non_english_content %>%
mutate(batch_id = integer(0), english_text = character(0))
}
# For English content, keep original text
english_content <- english_content %>%
mutate(
batch_id = NA_integer_,
english_text = !!rlang::sym(text_col)
)
# Combine results
result <- bind_rows(english_content, non_english_content) %>%
arrange(row_id) %>%
select(-row_id)
return(result)
}
# Run batch processing on the large dataset
processed_large <- batch_process_languages(large_dataset, "content", batch_size = 3)

# Summarise how much translation work was performed
summary_stats <- processed_large %>%
  summarise(
    total_entries = n(),
    english_entries = sum(is_english),
    translated_entries = sum(!is_english),
    translation_rate = mean(!is_english),
    unique_languages = n_distinct(detected_lang)
  )

print("Processing summary:")
print(summary_stats)

print("Sample of processed data:")
processed_large %>%
  select(id, detected_lang, is_english, content, english_text) %>%
  head(10) %>%
  print()
Language detection and conditional translation provide powerful tools for cleaning and standardizing multilingual datasets. By integrating with tidyverse workflows, you can efficiently process mixed-language content, enabling consistent analysis and insights across diverse linguistic data sources.