## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)
library(reticulate)

# Replace the path below with the path of your Python environment,
# then uncomment the command below.
# Tip: BERTOPICR_VENV should be the folder that contains `pyvenv.cfg`.
# Sys.setenv(
#   BERTOPICR_VENV = "C:/Users/teodo/Documents/R/bertopic/bertopic4r",
#   NOT_CRAN = "true"
# )

# 1. Define the Python modules you need
required_modules <- c("bertopic", "umap", "hdbscan", "sklearn", "numpy",
                      "plotly", "datetime", "sentence_transformers",
                      "openai", "ollama")

# macOS: if reticulate fails to load Python libraries, run once per session.
if (identical(Sys.info()[["sysname"]], "Darwin")) {
  bertopicr::configure_macos_homebrew_zlib()
}

# Optional: point reticulate at a user-specified virtualenv
venv <- Sys.getenv("BERTOPICR_VENV")
if (nzchar(venv)) {
  venv_cfg <- file.path(venv, "pyvenv.cfg")
  if (file.exists(venv_cfg)) {
    reticulate::use_virtualenv(venv, required = TRUE)
  } else {
    message("Warning: BERTOPICR_VENV does not point to a valid virtualenv: ", venv)
  }
}

# Try to find Python, but don't crash if it is missing (e.g. on another user's machine)
if (!reticulate::py_available(initialize = TRUE)) {
  try(reticulate::use_python(Sys.which("python"), required = FALSE), silent = TRUE)
}

# 2. Check whether the required modules are installed
python_ready <- tryCatch({
  # Attempt to initialize Python and check the modules
  py_available(initialize = TRUE) &&
    all(vapply(required_modules, py_module_available, logical(1)))
}, error = function(e) FALSE)

# 3. Only evaluate chunks when Python is ready and NOT_CRAN is set
run_chunks <- python_ready && identical(Sys.getenv("NOT_CRAN"), "true")
knitr::opts_chunk$set(eval = run_chunks)

if (!python_ready) {
  message("Warning: Required Python modules (e.g. bertopic, umap) not found. ",
          "Vignette code will not run.")
} else {
  message("Python environment ready: ", reticulate::py_config()$python)
  if (!identical(Sys.getenv("NOT_CRAN"), "true")) {
    message("Note: Set NOT_CRAN=true to run Python-dependent chunks locally.")
  }
}
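## ----eval=FALSE-----------------------------------------------------------------
# # Optional sketch (not run): one way to build the Python environment that the
# # setup chunk above probes for, using reticulate. The environment name
# # "bertopic4r" and the pip package list are assumptions; adjust them to your
# # setup and point BERTOPICR_VENV at the resulting virtualenv folder
# # (the one containing pyvenv.cfg).
# reticulate::virtualenv_create("bertopic4r")
# reticulate::virtualenv_install(
#   envname  = "bertopic4r",
#   packages = c("bertopic", "umap-learn", "hdbscan", "scikit-learn", "numpy",
#                "plotly", "sentence-transformers", "openai", "ollama")
# )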
## -----------------------------------------------------------------------------
# library(dplyr)
# library(tidyr)
# library(purrr)
# library(utils)
# library(tibble)
# library(readr)
# library(tictoc)
# library(htmltools)
# library(bertopicr)

## ----eval=run_chunks----------------------------------------------------------
# # Import the necessary Python modules
# py <- import_builtins()
# np <- import("numpy")
# umap <- import("umap")
# UMAP <- umap$UMAP
# hdbscan <- import("hdbscan")
# HDBSCAN <- hdbscan$HDBSCAN
# sklearn <- import("sklearn")
# CountVectorizer <- sklearn$feature_extraction$text$CountVectorizer
# bertopic <- import("bertopic")
# plotly <- import("plotly")
# datetime <- import("datetime")

## -----------------------------------------------------------------------------
# rds_path <- file.path("inst/extdata", "spiegel_sample.rds")
# dataset <- read_rds(rds_path)
# names(dataset)
# dim(dataset)

## -----------------------------------------------------------------------------
# stopwords_path <- file.path("inst/extdata", "all_stopwords.txt")
# all_stopwords <- read_lines(stopwords_path)

## -----------------------------------------------------------------------------
# texts_cleaned <- dataset$text_clean
# titles <- dataset$doc_id
# timestamps <- as.list(dataset$date)
# # timestamps <- as.integer(dataset$year)
# # texts_cleaned[[1]]

## ----eval=run_chunks----------------------------------------------------------
# # Embed the documents
# sentence_transformers <- import("sentence_transformers")
# SentenceTransformer <- sentence_transformers$SentenceTransformer
# # Choose an appropriate embedding model
# embedding_model <- SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
# embeddings <- embedding_model$encode(texts_cleaned, show_progress_bar = TRUE)
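## ----eval=FALSE-----------------------------------------------------------------
# # Optional sketch: encoding several thousand documents is the slowest step, so
# # it can be worth caching the embeddings on disk and re-encoding only when the
# # cache is missing. The cache path is an assumption; reticulate returns the
# # embeddings as an R matrix, which saveRDS()/readRDS() handle directly.
# emb_cache <- file.path("inst/extdata", "spiegel_embeddings.rds")
# if (file.exists(emb_cache)) {
#   embeddings <- readRDS(emb_cache)
# } else {
#   embeddings <- embedding_model$encode(texts_cleaned, show_progress_bar = TRUE)
#   saveRDS(embeddings, emb_cache)
# }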
## ----eval=run_chunks----------------------------------------------------------
# # Initialize the UMAP and HDBSCAN models
# umap_model <- UMAP(n_neighbors = 15L, n_components = 5L, min_dist = 0.0,
#                    metric = 'cosine', random_state = 42L)

## ----eval=run_chunks----------------------------------------------------------
# hdbscan_model <- HDBSCAN(min_cluster_size = 50L, min_samples = 20L,
#                          metric = 'euclidean', cluster_selection_method = 'eom',
#                          gen_min_span_tree = TRUE, prediction_data = TRUE,
#                          core_dist_n_jobs = 1L)

## ----eval=run_chunks----------------------------------------------------------
# # Initialize the CountVectorizer
# vectorizer_model <- CountVectorizer(min_df = 2L, ngram_range = tuple(1L, 3L),
#                                     max_features = 10000L, max_df = 50L,
#                                     stop_words = all_stopwords)
# sentence_vectors <- vectorizer_model$fit_transform(texts_cleaned)
# sentence_vectors_dense <- np$array(sentence_vectors)
# sentence_vectors_dense <- py_to_r(sentence_vectors_dense)

## ----eval=run_chunks && identical(Sys.getenv("BERTOPICR_ENABLE_REPR"), "true")----
# # Initialize the representation models
# keybert_model <- bertopic$representation$KeyBERTInspired()
# openai <- import("openai")
# OpenAI <- openai$OpenAI
# ollama <- import("ollama")
# # lmstudio <- import("lmstudio")
#
# # Point to the local server (ollama or lm-studio)
# client <- OpenAI(base_url = 'http://localhost:11434/v1', api_key = 'ollama')
# # client <- OpenAI(base_url = 'http://localhost:1234/v1', api_key = 'lm-studio')
#
# prompt <- "
# I have a topic that contains the following documents:
# [DOCUMENTS]
# The topic is described by the following keywords: [KEYWORDS]
#
# Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
# topic:
# "
#
# # Download an appropriate LLM to be hosted by ollama or lm-studio
# openai_model <- bertopic$representation$OpenAI(client,
#                                                model = "gpt-oss:20b",
#                                                exponential_backoff = TRUE,
#                                                chat = TRUE,
#                                                prompt = prompt)
#
# # Download a language model from spacy.io before using it here
# # Below a German spaCy model is used
# pos_model <- bertopic$representation$PartOfSpeech("de_core_news_lg")
# # Diversity is set relatively high to reduce repetition of keyword word forms
# mmr_model <- bertopic$representation$MaximalMarginalRelevance(diversity = 0.5)
#
# # Combine all representation models
# representation_model <- list(
#   "KeyBERT" = keybert_model,
#   "OpenAI" = openai_model,
#   "MMR" = mmr_model,
#   "POS" = pos_model
# )

## ----eval=run_chunks----------------------------------------------------------
# # We can define a number of topics of interest
# zeroshot_topic_list <- list("german national identity", "minority issues in germany")

## ----eval=run_chunks----------------------------------------------------------
# # Initialize the BERTopic model with the pipeline models and hyperparameters
# BERTopic <- bertopic$BERTopic
# topic_model <- BERTopic(
#   embedding_model = embedding_model,
#   umap_model = umap_model,
#   hdbscan_model = hdbscan_model,
#   vectorizer_model = vectorizer_model,
#   # zeroshot_topic_list = zeroshot_topic_list,
#   # zeroshot_min_similarity = 0.85,
#   representation_model = representation_model,
#   calculate_probabilities = TRUE,
#   top_n_words = 200L,  # if you need more top words, insert the desired number here!
#   verbose = TRUE
# )

## ----eval=run_chunks----------------------------------------------------------
# tictoc::tic()
#
# # Fit the model and transform the texts
# fit_transform <- topic_model$fit_transform(texts_cleaned, embeddings)
# topics <- fit_transform[[1]]
#
# # Now transform the texts to get the updated probabilities
# transform_result <- topic_model$transform(texts_cleaned)
# probs <- transform_result[[2]]  # Extract the updated probabilities
#
# tictoc::toc()
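## ----eval=FALSE-----------------------------------------------------------------
# # Optional sketch: a quick base-R sanity check of the fitted model. Topic -1 is
# # HDBSCAN's outlier bucket, so its share shows how many documents were left
# # unassigned. unlist() covers the case where reticulate returns `topics` as a list.
# topic_sizes <- sort(table(unlist(topics)), decreasing = TRUE)
# head(topic_sizes)
# mean(unlist(topics) == -1)  # proportion of outlier documents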
## ----eval=run_chunks----------------------------------------------------------
# # Converting R Date objects to Python datetime
# datetime <- import("datetime")
#
# # timestamps <- as.list(dataset$date)
# # timestamps <- as.integer(dataset$year)
#
# # Convert each R date object to an ISO 8601 string
# timestamps <- lapply(timestamps, function(x) {
#   format(x, "%Y-%m-%dT%H:%M:%S")  # ISO 8601 format
# })
#
# # Dynamic topic model
# topics_over_time <- topic_model$topics_over_time(texts_cleaned, timestamps,
#                                                  nr_bins = 20L,
#                                                  global_tuning = TRUE,
#                                                  evolution_tuning = TRUE)

## -----------------------------------------------------------------------------
# # Combine the results with additional columns
# results <- dataset |>
#   mutate(Topic = topics,
#          Probability = apply(probs, 1, max))  # take the highest probability for each document
#
# results <- results |>
#   mutate(row_id = row_number()) |>
#   select(row_id, everything())
#
# head(results, 10) |> rmarkdown::paged_table()

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# results |>
#   saveRDS("inst/extdata/spiegel_topic_results_df.rds", version = 2)
# results |>
#   write_csv("inst/extdata/spiegel_topic_results_df.csv")

## ----eval=run_chunks----------------------------------------------------------
# library(bertopicr)
# document_info_df <- get_document_info_df(model = topic_model,
#                                          texts = texts_cleaned,
#                                          drop_expanded_columns = TRUE)
# document_info_df |> head() |> rmarkdown::paged_table()

## ----eval=run_chunks----------------------------------------------------------
# # Create a data frame similar to df_docs
# df_docs <- tibble(Topic = results$Topic,
#                   Document = results$text_clean,
#                   probs = results$Probability)
# rep_docs <- get_most_representative_docs(df = df_docs,
#                                          topic_nr = 3,
#                                          n_docs = 5)
# unique(rep_docs)

## ----eval=run_chunks----------------------------------------------------------
# topic_info_df <- get_topic_info_df(model = topic_model,
#                                    drop_expanded_columns = TRUE)
# head(topic_info_df) |> rmarkdown::paged_table()

## ----eval=run_chunks----------------------------------------------------------
# topics_df <- get_topics_df(model = topic_model)
# head(topics_df, 10)

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# visualize_barchart(model = topic_model,
#                    filename = "topics_topwords_interactive_barchart.html",  # default
#                    open_file = FALSE)  # TRUE opens the output in a browser

## ----eval=run_chunks----------------------------------------------------------
# library(ggplot2)
#
# barchart <- topics_df |>
#   group_by(Topic) |>
#   filter(Topic >= 0 & Topic <= 8) |>
#   slice_head(n = 5) |>
#   mutate(Topic = paste("Topic", as.character(Topic)),
#          Word = reorder(Word, Score)) |>
#   ggplot(aes(Score, Word, fill = Topic)) +
#   geom_col() +
#   facet_wrap(~ Topic, scales = "free") +
#   theme(legend.position = "none")
#
# # Disabled to avoid potential conflicts
# # library(plotly)
# # ggplotly(barchart)

## ----eval=run_chunks----------------------------------------------------------
# find_topics_df(model = topic_model,
#                queries = "migration",  # user input
#                top_n = 10,             # default
#                return_tibble = TRUE)   # default

## ----eval=run_chunks----------------------------------------------------------
# find_topics_df(model = topic_model,
#                queries = c("migranten", "asylanten"),
#                top_n = 5)

## ----eval=run_chunks----------------------------------------------------------
# get_topic_df(model = topic_model,
#              topic_number = 0,
#              top_n = 5,              # default is 10
#              return_tibble = TRUE)   # default

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# # Default filename: topic_dist_interactive.html
# visualize_distribution(model = topic_model,
#                        text_id = 1,            # user input
#                        probabilities = probs)  # see model training

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# visualize_topics(model = topic_model,
#                  filename = "intertopic_distance_map")  # default name

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# visualize_heatmap(model = topic_model,
#                   filename = "topics_similarity_heatmap",
#                   auto_open = FALSE)

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# visualize_hierarchy(model = topic_model,
#                     hierarchical_topics = NULL,    # default
#                     filename = "topic_hierarchy",  # default name, html extension
#                     auto_open = FALSE)             # TRUE opens the output in a browser

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# hierarchical_topics <- topic_model$hierarchical_topics(texts_cleaned)
# visualize_hierarchy(model = topic_model,
#                     hierarchical_topics = hierarchical_topics,
#                     filename = "topic_hierarchy",  # default name, html extension
#                     auto_open = FALSE)             # TRUE opens the output in a browser
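## ----eval=FALSE-----------------------------------------------------------------
# # Optional sketch: recent BERTopic releases can also print the hierarchy computed
# # above as a plain-text tree, which is handy in a console or log. This assumes the
# # installed BERTopic version provides get_topic_tree().
# topic_tree <- topic_model$get_topic_tree(hierarchical_topics)
# cat(topic_tree)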
## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# # Reduce the dimensionality of the embeddings using UMAP
# reduced_embeddings <- umap$UMAP(n_neighbors = 10L, n_components = 2L,
#                                 min_dist = 0.0, metric = 'cosine')$fit_transform(embeddings)
#
# visualize_documents(model = topic_model,
#                     texts = texts_cleaned,
#                     reduced_embeddings = reduced_embeddings,
#                     filename = "visualize_documents",  # default, html extension
#                     auto_open = FALSE)                 # TRUE opens the output in a browser

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# # Reduce the dimensionality of the embeddings using UMAP (n_components = 2L !!!)
# reduced_embeddings <- umap$UMAP(n_neighbors = 10L, n_components = 2L,
#                                 min_dist = 0.0, metric = 'cosine')$fit_transform(embeddings)
#
# visualize_documents_2d(model = topic_model,
#                        texts = texts_cleaned,
#                        reduced_embeddings = reduced_embeddings,
#                        custom_labels = FALSE,   # default
#                        hide_annotation = TRUE,  # default
#                        tooltips = c("Topic", "Name", "Probability", "Text"),  # default
#                        filename = "visualize_documents_2d",  # default name
#                        auto_open = FALSE)       # TRUE opens the output in a browser

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# # Reduce the dimensionality of the embeddings using UMAP
# reduced_embeddings <- umap$UMAP(n_neighbors = 10L, n_components = 3L,
#                                 min_dist = 0.0, metric = 'cosine')$fit_transform(embeddings)
#
# visualize_documents_3d(model = topic_model,
#                        texts = texts_cleaned,
#                        reduced_embeddings = reduced_embeddings,
#                        custom_labels = FALSE,   # default
#                        hide_annotation = TRUE,  # default
#                        tooltips = c("Topic", "Name", "Probability", "Text"),  # default
#                        filename = "visualize_documents_3d",  # default name
#                        auto_open = FALSE)       # TRUE opens the output in a browser

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# visualize_topics_over_time(model = topic_model,
#                            # see the Topic Dynamics section above
#                            topics_over_time_model = topics_over_time,
#                            top_n_topics = 10,              # default is 20
#                            filename = "topics_over_time")  # default, html extension

## ----eval=run_chunks && identical(Sys.getenv("NOT_CRAN"), "true")-------------
# classes <- as.list(dataset$genre)  # text types
# topics_per_class <- topic_model$topics_per_class(texts_cleaned, classes = classes)
#
# visualize_topics_per_class(model = topic_model,
#                            topics_per_class = topics_per_class,
#                            start = 0,                      # default
#                            end = 10,                       # default
#                            filename = "topics_per_class",  # default, html extension
#                            auto_open = FALSE)              # TRUE opens the output in a browser
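## ----eval=FALSE-----------------------------------------------------------------
# # Optional sketch: persist the fitted model so a later session can skip refitting.
# # The path is an assumption; BERTopic's default pickle serialization stores the
# # whole pipeline, and BERTopic$load() restores it via reticulate.
# topic_model$save("inst/extdata/spiegel_topic_model")
# # ...and in a later session:
# # topic_model <- bertopic$BERTopic$load("inst/extdata/spiegel_topic_model")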
## ----eval=run_chunks----------------------------------------------------------
# BERTopic200 <- bertopic$BERTopic
# topic_model200 <- BERTopic200(
#   embedding_model = embedding_model,
#   umap_model = umap_model,
#   hdbscan_model = hdbscan_model,
#   vectorizer_model = vectorizer_model,
#   # zeroshot_topic_list = zeroshot_topic_list,
#   # zeroshot_min_similarity = 0.85,
#   representation_model = representation_model,
#   calculate_probabilities = TRUE,
#   top_n_words = 200L,  # !!!
#   verbose = TRUE
# )
#
# tictoc::tic()
#
# # Fit the model
# py_fit <- topic_model200$fit(texts_cleaned, embeddings)
#
# # Ask Python for all representations of the desired topic; with top_n_words = 200L
# # set above, the main representation holds up to 200 (word, score) pairs
# py_topic200 <- py_fit$get_topic(1L, full = TRUE)
# # names(py_topic200)
#
# rep_list <- py_topic200[["Main"]]  # list of (word, score) pairs
#
# tictoc::toc()

## ----eval=run_chunks----------------------------------------------------------
# df_wc <- data.frame(
#   name = sapply(rep_list, `[[`, 1),
#   freq = as.numeric(sapply(rep_list, `[[`, 2)),
#   stringsAsFactors = FALSE
# )
#
# library(wordcloud2)
# source("inst/extdata/wordcloud2a.R")
#
# wordcloud2a(
#   data = df_wc,
#   size = 0.5,
#   minSize = 0,
#   gridSize = 1,
#   fontFamily = "Segoe UI",
#   fontWeight = "bold",
#   color = "random-dark",
#   backgroundColor = "white",
#   shape = "circle",
#   ellipticity = 0.65
# )
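## ----eval=FALSE-----------------------------------------------------------------
# # Optional sketch: wordcloud2 objects are htmlwidgets, so the cloud can be written
# # to a standalone HTML file for sharing. The object name and filename are
# # assumptions; styling arguments are kept minimal here.
# wc <- wordcloud2a(data = df_wc, size = 0.5, color = "random-dark")
# htmlwidgets::saveWidget(wc, "topic1_wordcloud.html", selfcontained = TRUE)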