## ----setup, include=FALSE----------------------------------------------------- # CRAN will not have spaCy installed, so create static vignette knitr::opts_chunk$set(eval = FALSE) ## ----echo = FALSE, message=FALSE---------------------------------------------- # library(magrittr) # library(dplyr) # library(ggplot2) # library(cleanNLP) # library(sotu) ## ----------------------------------------------------------------------------- # cnlp_init_spacy() ## ----------------------------------------------------------------------------- # input <- sotu_meta # input$text <- sotu_text ## ----------------------------------------------------------------------------- # anno <- cnlp_annotate(input) ## ----------------------------------------------------------------------------- # anno$token %>% # group_by(doc_id, sid) %>% # summarize(sent_len = n()) %$% # quantile(sent_len, seq(0,1,0.1)) ## ----------------------------------------------------------------------------- # anno$token %>% # filter(upos == "NOUN") %>% # group_by(lemma) %>% # summarize(count = n()) %>% # top_n(n = 42, count) %>% # arrange(desc(count)) %>% # use_series(lemma) ## ----fig.height=6, fig.width=7------------------------------------------------ # anno$token %>% # group_by(doc_id) %>% # summarize(n = n()) %>% # left_join(anno$document, by="doc_id") %>% # ggplot(aes(year, n)) + # geom_line(color = grey(0.8)) + # geom_point(aes(color = sotu_type)) + # geom_smooth(method="loess", formula = y ~ x) + # theme_minimal() ## ----------------------------------------------------------------------------- # anno$entity %>% # filter(entity_type == "LOC") %>% # group_by(entity) %>% # summarize(count = n()) %>% # top_n(n = 44, count) %>% # arrange(desc(count)) %>% # use_series(entity) ## ----------------------------------------------------------------------------- # anno$token %>% # left_join( # anno$token, # c("doc_id"="doc_id", "sid"="sid", "tid"="tid_source"), # suffix=c("", "_source") # ) %>% # left_join(anno$document, by="doc_id") %>% # filter(year == 2001) %>% # filter(relation == "dobj") %>% # select(doc_id = doc_id, start = token, word = token_source) %>% # left_join(word_frequency, by="word") %>% # filter(frequency < 0.001) %>% # select(doc_id, start, word) %$% # sprintf("%s => %s", start, word) ## ----------------------------------------------------------------------------- # anno$token %>% # left_join( # anno$token, c("doc_id"="doc_id", "sid"="sid", "tid"="tid_source"), # suffix=c("", "_source") # ) %>% # left_join(anno$document, by="doc_id") %>% # filter(year == 2002) %>% # filter(relation == "dobj") %>% # select(doc_id = doc_id, start = token, word = token_source) %>% # left_join(word_frequency, by="word") %>% # filter(frequency < 0.001) %>% # select(doc_id, start, word) %$% # sprintf("%s => %s", start, word) ## ----------------------------------------------------------------------------- # pca <- anno$token %>% # filter(xpos %in% c("NN", "NNS")) %>% # cnlp_utils_tfidf(min_df = 0.05, max_df = 0.95, tf_weight = "dnorm") %>% # cnlp_utils_pca() # pca <- bind_cols(anno$document, pca) # pca ## ----fig.height=6, fig.width=7------------------------------------------------ # ggplot(pca, aes(PC1, PC2)) + # geom_point(aes(color = cut(year, 10, dig.lab = 4)), alpha = 0.35, size = 4) + # ggrepel::geom_text_repel(data = filter(pca, !duplicated(president)), # aes(label = president), color = grey(0.4), cex = 3) + # labs(color = "Years") + # scale_color_viridis_d(end = 0.9, option = "C") + # theme(axis.title.x = element_text(size = 14), # axis.title.y = element_text(size = 14), # axis.text.x = element_blank(), # axis.text.y = element_blank()) + # theme_void() ## ----message=FALSE, warning=FALSE--------------------------------------------- # library(topicmodels) # mat <- anno$token %>% # filter(xpos %in% c("NN", "NNS")) %>% # cnlp_utils_tf(min_df = 0.05, max_df = 0.95) # # tm <- LDA(mat, k = 16) ## ----fig.height=9, fig.width=7------------------------------------------------ # terms <- posterior(tm)$terms # topics <- posterior(tm)$topics # topic_df <- tibble(topic = as.integer(col(topics)), # doc_id = anno$document$doc_id[as.integer(row(topics))], # val = as.numeric(topics)) %>% # left_join(anno$document, by="doc_id") # top_terms <- apply(terms, 1, # function(v) { # paste(colnames(mat)[order(v, decreasing = TRUE)[1:5]], collapse = ", ") # }) # top_terms <- as.character(top_terms) # # index <- rank(-1 * tapply(topic_df$year * topic_df$val, topic_df$topic, which.max)) # topic_df$topic_new <- index[topic_df$topic] # top_terms_df <- tibble(top_terms, topic = 1:length(top_terms)) # top_terms_df$topic_new <- index[top_terms_df$topic] # # ggplot(topic_df, aes(year, topic_new)) + # geom_point(aes(size = val, color = factor(topic_new))) + # geom_text(data = top_terms_df, x = mean(topic_df$year), # size = 5, aes(y = topic_new + 0.4, label = top_terms, color = factor(topic_new)), # show.legend = FALSE) + # scale_color_viridis_d(end = 0.7, option = "C") + # theme(axis.text.y=element_blank(), # axis.title.y=element_blank(), # legend.position="bottom", # axis.title.x = element_text(size = 16), # axis.text.x = element_text(size = 14)) + # labs(size = "Posterior probability") + # theme_minimal() + # scale_y_continuous(breaks=FALSE) + # ylab("") + # xlab("Year") + # guides(colour = FALSE) ## ----------------------------------------------------------------------------- # df <- anno$token %>% # left_join(anno$document, by="doc_id") %>% # filter(year > 2000) %>% # mutate(new_id = paste(doc_id, sid, sep = "-")) %>% # filter(xpos %in% c("NN", "NNS")) # mat <- cnlp_utils_tf(df, doc_var = "new_id") # dim(mat) ## ----------------------------------------------------------------------------- # meta <- tibble(new_id = rownames(mat)) %>% # left_join(df[!duplicated(df$new_id),], by="new_id") %>% # mutate(y = as.numeric(president == "Barack Obama")) %>% # mutate(train = year %in% seq(2001, 2016, by = 2)) ## ----message=FALSE------------------------------------------------------------ # library(glmnet) # model <- cv.glmnet(mat[meta$train,], meta$y[meta$train], family = "binomial") ## ----fig.height=6, fig.width=7------------------------------------------------ # meta$pred <- predict(model, newx = mat, type = "response", s = model$lambda.1se) # ggplot(meta, aes(factor(year),pred)) + # geom_boxplot(aes(fill = relevel(factor(president), "George W. Bush"))) + # labs(fill = "President") + xlab("year") + ylab("predicted probability") + # scale_fill_viridis_d(alpha = 0.6, end = 0.75, option = "C") + # coord_flip() + # theme(axis.title.x = element_text(size = 12), # axis.text.x = element_text(size = 10), # axis.title.y = element_text(size = 12), # axis.text.y = element_text(size = 10)) + # theme_minimal() + # ylab("Predicted probability") + # xlab("Year") ## ----------------------------------------------------------------------------- # beta <- coef(model, s = model[["lambda"]][11])[-1] # sprintf("%s (%d)", colnames(mat), sign(beta))[beta != 0]