---
title: "Language Detection and Conditional Translation"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Language Detection and Conditional Translation}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
# Vignette-wide chunk defaults: chunks are shown but not evaluated, printed
# results are prefixed with "#>", and source/output are collapsed together.
# NOTE(review): eval = FALSE is presumably set because the examples call an
# online translation service -- confirm.
knitr::opts_chunk$set(eval = FALSE, comment = "#>", collapse = TRUE)
```

```{r setup}
library(polyglotr)
library(dplyr)
library(tibble)
library(purrr)

```

# Language Detection and Conditional Translation

This vignette demonstrates how to detect input languages and translate only non-English text, integrating seamlessly with tidyverse workflows. This approach is particularly useful for cleaning multilingual datasets and processing mixed-language content efficiently.

## Overview

Language detection and conditional translation allow you to:

- Automatically identify the language of text inputs
- Selectively translate only non-English content
- Process mixed-language datasets efficiently
- Integrate with tidyverse workflows for data manipulation
- Clean and standardize multilingual datasets

## Basic Language Detection

Let's start with simple language detection:

```{r basic_detection}
# Sample greetings, one per language (the language is noted on each line)
sample_texts <- c(
  "Hello, how are you today?",           # English
  "Bonjour, comment allez-vous?",        # French
  "Hola, ¿cómo estás hoy?",             # Spanish
  "Guten Tag, wie geht es Ihnen?",      # German
  "Ciao, come stai oggi?"               # Italian
)

# Detect the language of every text. vapply() is used instead of sapply() so
# the result is always a character vector with exactly one entry per input
# (named by the input text); anything else fails loudly instead of silently
# turning into a list or matrix.
detected_languages <- vapply(sample_texts, language_detect, character(1))
print("Detected languages:")
print(detected_languages)
```

## Conditional Translation Function

Create a function that only translates non-English text:

```{r conditional_translation}
# Translate `text` only when it is not already English.
#
# Arguments:
#   text            A single character string to inspect.
#   target_language Language code handed to google_translate() when the text is
#                   not English (default "en").
#
# Returns a list with four elements:
#   original          the input text, unchanged
#   translated        the translation, or the input text if it was English
#   was_translated    TRUE when google_translate() was called
#   detected_language whatever language_detect() returned for the text
translate_if_not_english <- function(text, target_language = "en") {
  detected_lang <- language_detect(text)

  # Compare the primary language subtag exactly rather than searching for the
  # substring "en", which would also match any other label that merely
  # contains those letters. Regional variants such as "en-GB" / "en_US" are
  # reduced to "en" first.
  # NOTE(review): the exact format returned by language_detect() is not visible
  # here, so spelled-out variants are accepted as well -- confirm.
  lang_tag <- tolower(sub("[-_].*$", "", as.character(detected_lang)))

  # isTRUE() makes an inconclusive detection (NA, empty, or more than one
  # value) count as "not English", so the text is sent for translation
  # instead of crashing the `if` below.
  is_english <- isTRUE(lang_tag %in% c("en", "eng", "english"))

  if (is_english) {
    translated_text <- text
  } else {
    translated_text <- google_translate(
      text,
      target_language = target_language,
      source_language = "auto"
    )
  }

  list(
    original = text,
    translated = translated_text,
    was_translated = !is_english,
    detected_language = detected_lang
  )
}

# Try the helper on a French sentence
test_text_fr <- "Bonjour, j'aimerais acheter un billet."
result <- translate_if_not_english(test_text_fr)

# Report every field of the returned list on its own line
print("Conditional translation result:")
print(paste("Original:", result[["original"]]))
print(paste("Translated:", result[["translated"]]))
print(paste("Was translated:", result[["was_translated"]]))
print(paste("Detected language:", result[["detected_language"]]))
```

## Working with Tibbles and Mixed-Language Data

Here's a practical example with a tibble containing mixed-language rows:

```{r mixed_language_tibble}
# Eight rows of user feedback in several languages, each with a rating and an
# alternating product/service category
mixed_data <- tibble(
  id = seq_len(8),
  user_feedback = c(
    "Great product, very satisfied!",                    # English
    "Excelente producto, muy satisfecho!",               # Spanish
    "Produit fantastique, je le recommande!",            # French
    "This service exceeded my expectations.",            # English
    "Der Service war wirklich hervorragend.",            # German
    "Servizio eccellente, davvero impressionante!",      # Italian
    "The delivery was fast and reliable.",               # English
    "La livraison était rapide et fiable."               # French
  ),
  rating = c(5, 5, 4, 5, 4, 5, 4, 4),
  category = rep(c("product", "service"), times = 4)
)

print("Original mixed-language dataset:")
print(mixed_data)
```

Now let's detect languages and conditionally translate:

```{r detect_and_translate}
# Run the conditional translation on one feedback string and reshape the
# returned list into a one-row tibble
process_feedback <- function(text) {
  outcome <- translate_if_not_english(text)

  tibble(
    original_text = outcome[["original"]],
    english_text = outcome[["translated"]],
    was_translated = outcome[["was_translated"]],
    detected_language = outcome[["detected_language"]]
  )
}

# Process every feedback string, then stack the one-row tibbles
processed_results <- mixed_data$user_feedback %>%
  purrr::map(process_feedback) %>%
  bind_rows()

# Attach the new columns to the original rows (same order, same row count)
enhanced_data <- bind_cols(mixed_data, processed_results)

print("Enhanced dataset with language detection and translation:")
print(enhanced_data)
```

## Advanced Tidyverse Integration

For more sophisticated data processing workflows:

```{r advanced_tidyverse}
library(stringr)

# Detect the language of a text column and translate its non-English rows.
#
# Arguments:
#   df          A data frame or tibble.
#   text_column Name of the text column in `df`, given as a string.
#
# Returns `df` with four added columns:
#   detected_lang      result of language_detect(), or "unknown" if it errored
#   needs_translation  TRUE unless the detected language is English
#   english_text       translated text, or the original text when no
#                      translation was needed or the translation errored
#   translation_status "detection_failed", "already_english", "translated" or
#                      "translation_failed"
enhanced_language_processing <- function(df, text_column) {
  df %>%
    mutate(
      # Detect language for each text entry; errors become "unknown"
      detected_lang = map_chr(
        .data[[text_column]],
        ~ tryCatch(language_detect(.x), error = function(e) "unknown")
      ),

      # Compare the primary language subtag exactly instead of searching for
      # the substring "en". %in% never yields NA, so a missing detection is
      # treated as "needs translation" rather than breaking the `if` below.
      # NOTE(review): spelled-out variants are accepted because the exact
      # format returned by language_detect() is not visible here -- confirm.
      needs_translation = !(
        tolower(sub("[-_].*$", "", detected_lang)) %in%
          c("en", "eng", "english")
      ),

      # Translate only non-English text; keep the original text on error
      english_text = map2_chr(
        .data[[text_column]], needs_translation,
        ~ if (.y) {
          tryCatch(
            google_translate(.x, target_language = "en"),
            error = function(e) .x
          )
        } else {
          .x
        }
      ),

      # Summarise what happened to each row. Conditions are checked in order,
      # so a failed (or missing) detection wins over everything else.
      translation_status = case_when(
        is.na(detected_lang) | detected_lang == "unknown" ~ "detection_failed",
        !needs_translation ~ "already_english",
        english_text != .data[[text_column]] ~ "translated",
        TRUE ~ "translation_failed"
      )
    )
}

# Run the enhanced pipeline on the feedback column
result_data <- enhanced_language_processing(
  mixed_data,
  text_column = "user_feedback"
)

# Show only the columns that describe the detection/translation outcome
print("Advanced processing results:")
result_data %>%
  select(id, detected_lang, needs_translation, translation_status) %>%
  print()
```

## Batch Processing with Language Filtering

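Process larger datasets efficiently by detecting the language of every row first, translating only the non-English rows, and labelling those rows with batch IDs: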

```{r batch_filtering}
# Twenty short reviews alternating between English and other languages, with a
# randomly offset timestamp and a randomly drawn priority for each row
large_dataset <- tibble(
  id = seq_len(20),
  content = c(
    # Mix of English and non-English content
    "Amazing service quality",                           # EN
    "Fantástico servicio al cliente",                    # ES
    "Service client exceptionnel",                       # FR
    "Great user experience",                             # EN
    "Esperienza utente eccellente",                      # IT
    "Ausgezeichnete Benutzerführung",                    # DE
    "Fast shipping and delivery",                        # EN
    "Livraison rapide et efficace",                      # FR
    "Excellent product quality",                         # EN
    "Qualità del prodotto superiore",                    # IT
    "Easy to use interface",                             # EN
    "Interfaz muy fácil de usar",                        # ES
    "Highly recommend this product",                     # EN
    "Je recommande vivement ce produit",                 # FR
    "Outstanding customer support",                      # EN
    "Soporte al cliente sobresaliente",                  # ES
    "Very satisfied with purchase",                      # EN
    "Sehr zufrieden mit dem Kauf",                       # DE
    "Will definitely buy again",                         # EN
    "Sicuramente acquisterò di nuovo"                    # IT
  ),
  timestamp = Sys.time() + sample(-1000:1000, size = 20),
  priority = sample(c("high", "medium", "low"), size = 20, replace = TRUE)
)

# Detect languages for a text column, then translate only the non-English rows.
#
# Arguments:
#   df         A data frame or tibble.
#   text_col   Name of the text column in `df`, given as a string.
#   batch_size Number of non-English rows that share one `batch_id`
#              (a single number >= 1, default 5).
#
# Returns `df` in its original row order with four added columns:
#   detected_lang result of language_detect() for the row
#   is_english    TRUE when the detected language is English
#   batch_id      integer batch label for non-English rows, NA for English rows
#   english_text  translated text for non-English rows, original text otherwise
#
# Note: `batch_id` only labels consecutive groups of non-English rows; the
# translation itself is still requested one row at a time.
batch_process_languages <- function(df, text_col, batch_size = 5) {
  # A zero, negative or missing batch size would produce meaningless batch ids
  stopifnot(
    is.numeric(batch_size),
    length(batch_size) == 1L,
    !is.na(batch_size),
    batch_size >= 1
  )

  # First, detect languages for all entries
  df_with_detection <- df %>%
    mutate(
      row_id = row_number(),
      # A detection error is recorded as "en", so that row is left untranslated
      detected_lang = map_chr(
        .data[[text_col]],
        ~ tryCatch(language_detect(.x), error = function(e) "en")
      ),
      # Exact comparison on the primary language subtag. %in% never yields NA,
      # so every row lands in exactly one of the two filters below; an NA here
      # would make filter() drop the row from both halves.
      # NOTE(review): spelled-out variants are accepted because the exact
      # format returned by language_detect() is not visible here -- confirm.
      is_english = tolower(sub("[-_].*$", "", detected_lang)) %in%
        c("en", "eng", "english")
    )

  # English rows keep their original text and get no batch label
  english_content <- df_with_detection %>%
    filter(is_english) %>%
    mutate(
      batch_id = NA_integer_,
      english_text = .data[[text_col]]
    )

  # Non-English rows are labelled in batches and translated; on a translation
  # error the original text is kept. This also works when there are no
  # non-English rows (both new columns are then simply empty).
  non_english_content <- df_with_detection %>%
    filter(!is_english) %>%
    mutate(
      # as.integer() keeps the column type the same as in `english_content`
      batch_id = as.integer(ceiling(row_number() / batch_size)),
      english_text = map_chr(
        .data[[text_col]],
        ~ tryCatch(
          google_translate(.x, target_language = "en"),
          error = function(e) .x
        )
      )
    )

  # Combine both halves and restore the original row order
  bind_rows(english_content, non_english_content) %>%
    arrange(row_id) %>%
    select(-row_id)
}

# Run the workflow on the larger dataset, labelling batches of three
processed_large <- batch_process_languages(
  large_dataset,
  text_col = "content",
  batch_size = 3
)

# Headline counts: how many rows were English, how many were not, and how many
# distinct languages were detected
summary_stats <- summarise(
  processed_large,
  total_entries = n(),
  english_entries = sum(is_english),
  translated_entries = sum(!is_english),
  translation_rate = mean(!is_english),
  unique_languages = n_distinct(detected_lang)
)

print("Processing summary:")
print(summary_stats)

# First ten rows, original text next to its English version
print("Sample of processed data:")
processed_large %>%
  select(id, detected_lang, is_english, content, english_text) %>%
  head(10) %>%
  print()
```

## Best Practices

1. **Validate language detection** - Check detection confidence when possible
2. **Handle errors gracefully** - Implement fallback strategies for failed translations
3. **Batch process efficiently** - Group similar operations to minimize API calls
4. **Monitor quality** - Track translation success rates and errors
5. **Cache results** - Store language detection and translation results to avoid redundant API calls
6. **Test with edge cases** - Handle empty strings, special characters, and mixed content

## Conclusion

Language detection and conditional translation provide powerful tools for cleaning and standardizing multilingual datasets. By integrating with tidyverse workflows, you can efficiently process mixed-language content, enabling consistent analysis and insights across diverse linguistic data sources.