## ----setup, include=FALSE, cache=FALSE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ options(width = 1000) knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, comment = NA, eval = TRUE, fig.width = 5, fig.height = 5) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- library(udpipe) ud_model <- udpipe_download_model(language = "spanish") ## ---- echo=FALSE---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, comment = NA, eval = !ud_model$download_failed) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- data(brussels_reviews) comments <- subset(brussels_reviews, language %in% "es") ud_model <- udpipe_load_model(ud_model$file_model) x <- udpipe_annotate(ud_model, x = comments$feedback, doc_id = comments$id) x <- as.data.frame(x) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- library(lattice) stats <- txt_freq(x$upos) stats$key <- factor(stats$key, levels = rev(stats$key)) barchart(key ~ freq, data = stats, col = "cadetblue", main = "UPOS (Universal Parts of Speech)\n frequency of occurrence", xlab = "Freq") ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ## NOUNS stats <- subset(x, upos %in% c("NOUN")) stats <- txt_freq(stats$token) stats$key <- factor(stats$key, levels = rev(stats$key)) barchart(key ~ freq, data = head(stats, 20), col = "cadetblue", main = "Most occurring nouns", xlab = "Freq") ## ADJECTIVES stats <- subset(x, upos %in% c("ADJ")) stats <- txt_freq(stats$token) stats$key <- factor(stats$key, levels = rev(stats$key)) barchart(key ~ freq, data = head(stats, 20), col = "cadetblue", main = "Most occurring adjectives", xlab = "Freq") ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ## Using RAKE stats <- keywords_rake(x = x, term = "lemma", group = "doc_id", relevant = x$upos %in% c("NOUN", "ADJ")) stats$key <- factor(stats$keyword, levels = rev(stats$keyword)) barchart(key ~ rake, data = head(subset(stats, freq > 3), 20), col = "cadetblue", main = "Keywords identified by RAKE", xlab = "Rake") ## Using Pointwise Mutual Information Collocations x$word <- tolower(x$token) stats <- keywords_collocation(x = x, term = "word", group = "doc_id") stats$key <- factor(stats$keyword, levels = rev(stats$keyword)) barchart(key ~ pmi, data = head(subset(stats, freq > 3), 20), col = "cadetblue", main = "Keywords identified by PMI Collocation", xlab = "PMI (Pointwise Mutual Information)") ## Using a sequence of POS tags (noun phrases / verb phrases) x$phrase_tag <- as_phrasemachine(x$upos, type = "upos") stats <- keywords_phrases(x = x$phrase_tag, term = tolower(x$token), pattern = "(A|N)*N(P+D*(A|N)*N)*", is_regex = TRUE, detailed = FALSE) stats <- subset(stats, ngram > 1 & freq > 3) stats$key <- factor(stats$keyword, levels = rev(stats$keyword)) barchart(key ~ freq, data = head(stats, 20), col = "cadetblue", main = "Keywords - simple noun phrases", xlab = "Frequency") ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- cooc <- cooccurrence(x = subset(x, upos %in% c("NOUN", "ADJ")), term = "lemma", group = c("doc_id", "paragraph_id", "sentence_id")) head(cooc) ## ----eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # library(igraph) # library(ggraph) # library(ggplot2) # wordnetwork <- head(cooc, 30) # wordnetwork <- graph_from_data_frame(wordnetwork) # ggraph(wordnetwork, layout = "fr") + # geom_edge_link(aes(width = cooc, edge_alpha = cooc), edge_colour = "pink") + # geom_node_text(aes(label = name), col = "darkgreen", size = 4) + # theme_graph(base_family = "Arial Narrow") + # theme(legend.position = "none") + # labs(title = "Cooccurrences within sentence", subtitle = "Nouns & Adjective") ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- cooc <- cooccurrence(x$lemma, relevant = x$upos %in% c("NOUN", "ADJ"), skipgram = 1) head(cooc) ## ----eval=FALSE----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # library(igraph) # library(ggraph) # library(ggplot2) # wordnetwork <- head(cooc, 15) # wordnetwork <- graph_from_data_frame(wordnetwork) # ggraph(wordnetwork, layout = "fr") + # geom_edge_link(aes(width = cooc, edge_alpha = cooc)) + # geom_node_text(aes(label = name), col = "darkgreen", size = 4) + # theme_graph(base_family = "Arial Narrow") + # labs(title = "Words following one another", subtitle = "Nouns & Adjective") ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- x$id <- unique_identifier(x, fields = c("sentence_id", "doc_id")) dtm <- subset(x, upos %in% c("NOUN", "ADJ")) dtm <- document_term_frequencies(dtm, document = "id", term = "lemma") dtm <- document_term_matrix(dtm) dtm <- dtm_remove_lowfreq(dtm, minfreq = 5) termcorrelations <- dtm_cor(dtm) y <- as_cooccurrence(termcorrelations) y <- subset(y, term1 < term2 & abs(cooc) > 0.2) y <- y[order(abs(y$cooc), decreasing = TRUE), ] head(y) ## ---- results='hide', echo=FALSE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ invisible(if(file.exists(ud_model$file)) file.remove(c(ud_model$file)))