## ----setup, include=FALSE, cache=FALSE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ options(width = 1000) knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, comment = NA, eval = (require(topicmodels))) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- library(udpipe) ud_model <- udpipe_download_model(language = "french") ## ---- echo=FALSE---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, comment = NA, eval = !ud_model$download_failed) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- data(brussels_reviews) comments <- subset(brussels_reviews, language %in% "fr") ud_model <- udpipe_load_model(ud_model$file_model) x <- udpipe_annotate(ud_model, x = comments$feedback, doc_id = comments$id) x <- as.data.frame(x) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- str(x) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ## Define the identifier at which we will build a topic model x$topic_level_id <- unique_identifier(x, fields = c("doc_id", "paragraph_id", "sentence_id")) ## Get a data.frame with 1 row per id/lemma dtf <- subset(x, upos %in% c("NOUN")) dtf <- document_term_frequencies(dtf, document = "topic_level_id", term = "lemma") head(dtf) ## Create a document/term/matrix for building a topic model dtm <- document_term_matrix(x = dtf) ## Remove words which do not occur that much dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 5) head(dtm_colsums(dtm_clean)) ## Remove nouns which you really do not like (mostly too common nouns) dtm_clean <- dtm_remove_terms(dtm_clean, terms = c("appartement", "appart", "eter")) ## Or keep of these nouns the top 50 based on mean term-frequency-inverse document frequency dtm_clean <- dtm_remove_tfidf(dtm_clean, top = 50) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- library(topicmodels) m <- LDA(dtm_clean, k = 4, method = "Gibbs", control = list(nstart = 5, burnin = 2000, best = TRUE, seed = 1:5)) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- scores <- predict(m, newdata = dtm, type = "topics", labels = c("labela", "labelb", "labelc", "xyz")) str(scores) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- predict(m, type = "terms", min_posterior = 0.05, min_terms = 3) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ## Build document term matrix on nouns/adjectives only dtf <- subset(x, upos %in% c("NOUN", "ADJ") & !lemma %in% c("appartement", "appart", "eter", "tres")) dtf <- document_term_frequencies(dtf, document = "topic_level_id", term = "lemma") dtm <- document_term_matrix(x = dtf) dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 5) ## Build topic model + get topic terminology m <- LDA(dtm_clean, k = 4, method = "Gibbs", control = list(nstart = 5, burnin = 2000, best = TRUE, seed = 1:5)) topicterminology <- predict(m, type = "terms", min_posterior = 0.025, min_terms = 5) scores <- predict(m, newdata = dtm, type = "topics") ## ----eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # library(igraph) # library(ggraph) # library(ggplot2) # x_topics <- merge(x, scores, by.x="topic_level_id", by.y="doc_id") # wordnetwork <- subset(x_topics, topic %in% 1 & lemma %in% topicterminology[[1]]$term) # wordnetwork <- cooccurrence(wordnetwork, group = c("topic_level_id"), term = "lemma") # wordnetwork <- graph_from_data_frame(wordnetwork) # ggraph(wordnetwork, layout = "fr") + # geom_edge_link(aes(width = cooc, edge_alpha = cooc), edge_colour = "pink") + # geom_node_text(aes(label = name), col = "darkgreen", size = 4) + # theme_graph(base_family = "Arial Narrow") + # labs(title = "Words in topic 1 ", subtitle = "Nouns & Adjective cooccurrence") ## ----eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # topicterminology <- predict(m, type = "terms", min_posterior = 0.05, min_terms = 10) # termcorrs <- subset(x_topics, topic %in% 1 & lemma %in% topicterminology[[1]]$term) # termcorrs <- document_term_frequencies(termcorrs, document = "topic_level_id", term = "lemma") # termcorrs <- document_term_matrix(termcorrs) # termcorrs <- dtm_cor(termcorrs) # termcorrs[lower.tri(termcorrs)] <- NA # diag(termcorrs) <- NA # library(qgraph) # qgraph(termcorrs, layout = "spring", labels = colnames(termcorrs), directed = FALSE, # borders = FALSE, label.scale = FALSE, label.cex = 1, node.width = 0.5) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ## Find keywords with RAKE keyw_rake <- keywords_rake(x, term = "token", group = c("doc_id", "paragraph_id", "sentence_id"), relevant = x$upos %in% c("NOUN", "ADJ"), ngram_max = 3, n_min = 5) ## Find simple noun phrases x$phrase_tag <- as_phrasemachine(x$upos, type = "upos") keyw_nounphrases <- keywords_phrases(x$phrase_tag, term = x$token, pattern = "(A|N)*N(P+D*(A|N)*N)*", is_regex = TRUE, detailed = FALSE) keyw_nounphrases <- subset(keyw_nounphrases, ngram > 1) ## Recode terms to keywords x$term <- x$token x$term <- txt_recode_ngram(x$term, compound = keyw_rake$keyword, ngram = keyw_rake$ngram) x$term <- txt_recode_ngram(x$term, compound = keyw_nounphrases$keyword, ngram = keyw_nounphrases$ngram) ## Keep keyword or just plain nouns x$term <- ifelse(x$upos %in% "NOUN", x$term, ifelse(x$term %in% c(keyw_rake$keyword, keyw_nounphrases$keyword), x$term, NA)) ## Build document/term/matrix dtm <- document_term_frequencies(x, document = "topic_level_id", term = "term") dtm <- document_term_matrix(x = dtm) dtm <- dtm_remove_lowfreq(dtm, minfreq = 5) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- m <- LDA(dtm, k = 3, method = "Gibbs", control = list(nstart = 5, burnin = 2000, best = TRUE, seed = 1:5)) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- topicterminology <- predict(m, type = "terms", min_posterior = 0.10, min_terms = 3) topicterminology ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ## Recode tokens to keywords, if it is not in the list of tokens, set to NA x$mwe <- txt_recode_ngram(x$token, compound = keyw_rake$keyword, ngram = keyw_rake$ngram) x$mwe <- ifelse(x$mwe %in% keyw_rake$keyword, x$mwe, NA) ## nouns x$term_noun <- ifelse(x$upos %in% "NOUN", x$token, NA) ## Build document/term/matrix dtm <- document_term_frequencies(x, document = "topic_level_id", term = c("term_noun", "mwe")) dtm <- document_term_matrix(x = dtm) dtm <- dtm_remove_lowfreq(dtm, minfreq = 3) m <- LDA(dtm, k = 3, method = "Gibbs", control = list(nstart = 5, burnin = 2000, best = TRUE, seed = 1:5)) ## ---- results='hide', echo=FALSE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ invisible(if(file.exists("french-gsd-ud-2.4-190531.udpipe")) file.remove(c("french-gsd-ud-2.4-190531.udpipe")))