## ---- echo=FALSE--------------------------------------------------------------
# Set the rmarkdown option `rmarkdown.html_vignette.check_title` to FALSE for
# this session. NOTE(review): presumably this silences the html_vignette
# title check while the vignette is built -- confirm against rmarkdown docs.
options(rmarkdown.html_vignette.check_title = FALSE)

## ---- eval=FALSE--------------------------------------------------------------
#  install.packages("mallet")

## ---- eval=FALSE--------------------------------------------------------------
#  options(java.parameters = "-Xmx4g")

## -----------------------------------------------------------------------------
library(mallet)

## -----------------------------------------------------------------------------
# The example folder is the "stoplists" directory shipped inside the mallet
# package. Point `directory` somewhere else to load a different set of txt files.
directory <- system.file("stoplists", package = "mallet")

files_in_directory <- list.files(directory, full.names = TRUE)

# Read each file and glue its lines back together with newlines, producing one
# (unnamed) character element per file.
txt_file_content <- vapply(
  files_in_directory,
  function(path) paste(readLines(path), collapse = "\n"),
  character(1),
  USE.NAMES = FALSE
)
# str() gives a compact overview of what was read
str(txt_file_content)

## -----------------------------------------------------------------------------
library(dplyr)
# Load the `sotu` example data set and print the first two entries of its
# `text` column.
data(sotu)
sotu$text[1:2]

## -----------------------------------------------------------------------------
# Print what mallet_supported_stoplists() reports, then keep the file path
# returned for the "en" stoplist for use in the import step.
mallet_supported_stoplists()
stopwords_en_file_path <- mallet_stoplist_file_path("en")

## -----------------------------------------------------------------------------
# Import with every option spelled out: row names as document ids, the text
# column as documents, the English stoplist file, and a custom token pattern.
sotu.instances <- mallet.import(
  id.array = row.names(sotu),
  text.array = sotu$text,
  stoplist = stopwords_en_file_path,
  token.regexp = "\\p{L}[\\p{L}\\p{P}]+\\p{L}"
)

## -----------------------------------------------------------------------------
# Shortest form: only the texts are supplied, all other arguments are left at
# whatever mallet.import() uses by default.
sotu.instances.short <- mallet.import(text.array = sotu$text)

## -----------------------------------------------------------------------------
# Here the stoplist is passed as the character vector read from the stoplist
# file rather than as a file path; this overwrites the previous short import.
stop_vector <- readLines(stopwords_en_file_path)
sotu.instances.short <- mallet.import(
  text.array = sotu$text,
  stoplist = stop_vector
)

## -----------------------------------------------------------------------------
# Create the LDA trainer object with 10 topics and the given alpha.sum / beta.
topic.model <- MalletLDA(
  num.topics = 10,
  alpha.sum = 1,
  beta = 0.1
)

## -----------------------------------------------------------------------------
# Hand the imported instances to the trainer.
topic.model$loadDocuments(sotu.instances)

## -----------------------------------------------------------------------------
# Pull the vocabulary out of the model and show its first entries.
vocabulary <- topic.model$getVocabulary()
head(vocabulary)

## -----------------------------------------------------------------------------
# Word frequency table as returned by mallet.word.freqs(); show the first rows.
word_freqs <- mallet.word.freqs(topic.model)
head(word_freqs)

## -----------------------------------------------------------------------------
# NOTE(review): the two arguments are presumably the optimization interval and
# the burn-in -- confirm against the MalletLDA documentation.
topic.model$setAlphaOptimization(20, 50)

## -----------------------------------------------------------------------------
# Train the model; the argument is presumably the number of iterations.
topic.model$train(200)

## -----------------------------------------------------------------------------
# Follow-up maximize() call with argument 10 (presumably an iteration count).
topic.model$maximize(10)

## -----------------------------------------------------------------------------
# Extract the document-topic and topic-word matrices, both smoothed and
# normalized.
doc.topics <- mallet.doc.topics(
  topic.model,
  smoothed = TRUE,
  normalized = TRUE
)
topic.words <- mallet.topic.words(
  topic.model,
  smoothed = TRUE,
  normalized = TRUE
)

## -----------------------------------------------------------------------------
# Top five words for topic 2, using row 2 of topic.words as the word weights.
mallet.top.words(
  topic.model,
  word.weights = topic.words[2, ],
  num.top.words = 5
)

## -----------------------------------------------------------------------------
# Indices of the documents whose value in column 2 of doc.topics (topic 2)
# exceeds 0.5.
docs <- which(doc.topics[, 2] > 0.50)
# Character length of each of those documents.
doc_size <- nchar(sotu[["text"]])[docs]
# Pick the longest of them. which.max() returns the position of the first
# maximum in a single pass (no full sort needed) and returns an empty index
# when `docs` is empty, so no NA index is produced in that case.
idx <- docs[which.max(doc_size)]
# Print the selected text (an empty character vector if nothing qualified).
sotu[["text"]][idx]


## -----------------------------------------------------------------------------
# Topic-word values computed only from the documents whose `year` is after
# 1975 (the logical vector selects the subset).
post1975_topic_words <- mallet.subset.topic.words(
  topic.model,
  sotu$year > 1975
)
# Top five words for topic 2 within that subset.
mallet.top.words(
  topic.model,
  word.weights = post1975_topic_words[2, ],
  num.top.words = 5
)

## ---- fig.height=5, fig.width=5-----------------------------------------------
# Short label for every topic, built from its two top words.
topic_labels <- mallet.topic.labels(topic.model, num.top.words = 2)
# Cluster the topics from the document-topic and topic-word matrices;
# `balance = 0.5` is passed through to mallet.topic.hclust().
topic_clusters <- mallet.topic.hclust(doc.topics, topic.words, balance = 0.5)
# Draw the clustering with the topic labels and no x-axis title. The stray
# trailing comma of the earlier call (an empty extra argument) is removed.
plot(topic_clusters, labels = topic_labels, xlab = "")

## -----------------------------------------------------------------------------
# Write the model state to a gzip-named file inside the session temp directory.
state_file <- file.path(tempdir(), "temp_mallet_state.gz")
save.mallet.state(
  topic.model = topic.model,
  state.file = state_file
)

## -----------------------------------------------------------------------------
# Keep the raw (unsmoothed, unnormalized) document-topic values for comparison
# with the reloaded models.
doc.topics.counts <- mallet.doc.topics(
  topic.model,
  smoothed = FALSE,
  normalized = FALSE
)

# Drop the trained model object from the workspace.
rm(topic.model)

## -----------------------------------------------------------------------------
# Build a fresh trainer with the same settings, give it the same documents and
# load the saved state file into it.
new.topic.model <- MalletLDA(
  num.topics = 10,
  alpha.sum = 1,
  beta = 0.1
)
new.topic.model$loadDocuments(sotu.instances)
load.mallet.state(
  topic.model = new.topic.model,
  state.file = state_file
)

# Print the same 3 x 6 corner of the stored values and of the reloaded model's
# values so they can be compared by eye.
doc.topics.counts[seq_len(3), seq_len(6)]
mallet.doc.topics(
  new.topic.model,
  smoothed = FALSE,
  normalized = FALSE
)[seq_len(3), seq_len(6)]

## -----------------------------------------------------------------------------
# Save the whole model object to a file in the session temp directory and read
# it back into a new object.
model_file <- file.path(tempdir(), "temp_mallet.model")
mallet.topic.model.save(new.topic.model, model_file)
read.topic.model <- mallet.topic.model.read(model_file)

# Print the same 3 x 6 corner of the stored values and of the re-read model's
# values so they can be compared by eye.
doc.topics.counts[seq_len(3), seq_len(6)]
mallet.doc.topics(
  read.topic.model,
  smoothed = FALSE,
  normalized = FALSE
)[seq_len(3), seq_len(6)]