## ---- echo = FALSE------------------------------------------------------------
knitr::opts_chunk$set(collapse = FALSE, comment = "##", tidy = FALSE)

## ---- eval=FALSE--------------------------------------------------------------
# install.packages("conText")

## ---- eval=FALSE--------------------------------------------------------------
# devtools::install_github("prodriguezsosa/conText")

## ---- message=FALSE-----------------------------------------------------------
library(conText)

# other libraries used in this guide
library(quanteda)
library(dplyr)
library(text2vec)

## ---- message = FALSE---------------------------------------------------------
# tokenize corpus, removing unnecessary (i.e. semantically uninformative) elements
toks <- tokens(cr_sample_corpus,
               remove_punct = TRUE,
               remove_symbols = TRUE,
               remove_numbers = TRUE,
               remove_separators = TRUE)

# clean out stopwords and words with 2 or fewer characters
toks_nostop <- tokens_select(toks,
                             pattern = stopwords("en"),
                             selection = "remove",
                             min_nchar = 3)

# only use features that appear at least 5 times in the corpus
feats <- dfm(toks_nostop, tolower = TRUE, verbose = FALSE) %>%
  dfm_trim(min_termfreq = 5) %>%
  featnames()

# leave the pads so that non-adjacent words will not become adjacent
toks_nostop_feats <- tokens_select(toks_nostop, feats, padding = TRUE)

## ---- message = FALSE---------------------------------------------------------
# build a tokenized corpus of contexts surrounding the target term "immigration"
immig_toks <- tokens_context(x = toks_nostop_feats, pattern = "immigr*", window = 6L)

head(docvars(immig_toks), 3)

## ---- message = FALSE---------------------------------------------------------
# build document-feature matrix
immig_dfm <- dfm(immig_toks)
immig_dfm[1:3, 1:3]
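## ---- message = FALSE---------------------------------------------------------
# a minimal sketch (an addition, not part of the original guide): `immig_toks`
# is a regular quanteda tokens object, so its docvars remain accessible; here
# we assume the `party` docvar carried over from `cr_sample_corpus` and
# tabulate how many contexts each party contributes
table(docvars(immig_toks, "party"))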
## ---- message = FALSE---------------------------------------------------------
# build a document-embedding matrix
immig_dem <- dem(x = immig_dfm,
                 pre_trained = cr_glove_subset,
                 transform = TRUE,
                 transform_matrix = cr_transform,
                 verbose = TRUE)

# each document inherits its corresponding docvars
#head(immig_dem@docvars)

# you can check which documents were not embedded due to lack of overlapping
# features (in this example all documents are embedded)
# note: 'quanteda' functions like `docvars()` and `docnames()` don't work on
# `dem` objects, so you will have to call the attributes directly.
#setdiff(docnames(immig_dfm), immig_dem@Dimnames$docs)

# vector of features used to create the embeddings
#head(immig_dem@features)

## ---- message = FALSE---------------------------------------------------------
# to get a single "corpus-wide" embedding, take the column average
immig_wv <- matrix(colMeans(immig_dem), ncol = ncol(immig_dem)) %>%
  `rownames<-`("immigration")
dim(immig_wv)

## ---- message = FALSE---------------------------------------------------------
# to get group-specific embeddings, average within party
immig_wv_party <- dem_group(immig_dem, groups = immig_dem@docvars$party)
dim(immig_wv_party)

## -----------------------------------------------------------------------------
# find nearest neighbors by party
# setting as_list = FALSE combines each group's results into a single tibble
# (useful for joint plotting)
immig_nns <- nns(immig_wv_party,
                 pre_trained = cr_glove_subset,
                 N = 5,
                 candidates = immig_wv_party@features,
                 as_list = TRUE)

# check out results for Republican party
immig_nns[["R"]]

## -----------------------------------------------------------------------------
# compute the cosine similarity between each party's embedding and a specific set of features
cos_sim(immig_wv_party,
        pre_trained = cr_glove_subset,
        features = c('reform', 'enforcement'),
        as_list = FALSE)

## -----------------------------------------------------------------------------
# compute the ratio of cosine similarities between each party's embedding and
# a shared set of candidate features (values above 1 indicate a feature is
# closer to the numerator group, here "R")
nns_ratio(x = immig_wv_party,
          N = 10,
          numerator = "R",
          candidates = immig_wv_party@features,
          pre_trained = cr_glove_subset,
          verbose = FALSE)

## -----------------------------------------------------------------------------
# compute the cosine similarity between each party's embedding and a set of tokenized contexts
immig_ncs <- ncs(x = immig_wv_party,
                 contexts_dem = immig_dem,
                 contexts = immig_toks,
                 N = 5,
                 as_list = TRUE)

# nearest contexts to Republican embedding of target term
# note, these may include contexts originating from Democrat speakers
immig_ncs[["R"]]

# you can limit candidate contexts to those of a specific party
immig_ncs <- ncs(x = immig_wv_party["R",],
                 contexts_dem = immig_dem[immig_dem@docvars$party == "R",],
                 contexts = immig_toks,
                 N = 5,
                 as_list = FALSE)
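## ---- message = FALSE---------------------------------------------------------
# a minimal sketch (an addition to the original guide): quantify how close the
# two party embeddings are to each other with text2vec::sim2(), mirroring the
# cosine-similarity idiom used further below; assumes `immig_wv_party` has
# rows "D" and "R" as shown above
sim2(x = matrix(immig_wv_party["D",], nrow = 1),
     y = matrix(immig_wv_party["R",], nrow = 1),
     method = 'cosine', norm = 'l2')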
## -----------------------------------------------------------------------------
# extract candidate features from the dem object
immig_feats <- immig_wv_party@features

# check spelling; toupper avoids proper names being flagged as misspelled
if (requireNamespace("hunspell", quietly = TRUE)) {
  library(hunspell) # spellcheck library
  spellcheck <- hunspell_check(toupper(immig_feats), dict = hunspell::dictionary("en_US"))
  immig_feats <- immig_feats[spellcheck]
}

# find nearest neighbors by party using stemming
immig_nns_stem <- nns(immig_wv_party,
                      pre_trained = cr_glove_subset,
                      N = 5,
                      candidates = immig_feats,
                      stem = TRUE,
                      as_list = TRUE)

# check out results for Republican party
immig_nns_stem[["R"]]

## ---- message = FALSE---------------------------------------------------------
# build a corpus of contexts surrounding a set of target terms
mkws_toks <- tokens_context(x = toks_nostop_feats,
                            pattern = c("immigration", "welfare", "immigration reform", "economy"),
                            window = 6L,
                            verbose = FALSE)

# create document-feature matrix
mkws_dfm <- dfm(mkws_toks)

# create document-embedding matrix using a la carte
mkws_dem <- dem(x = mkws_dfm,
                pre_trained = cr_glove_subset,
                transform = TRUE,
                transform_matrix = cr_transform,
                verbose = FALSE)

# get embeddings for each pattern
mkws_wvs <- dem_group(mkws_dem, groups = mkws_dem@docvars$pattern)

# find nearest neighbors for each keyword
mkws_nns <- nns(mkws_wvs,
                pre_trained = cr_glove_subset,
                N = 5,
                candidates = mkws_wvs@features,
                as_list = TRUE)

# to check results for a given pattern
mkws_nns[["immigration reform"]]

## ---- message = FALSE---------------------------------------------------------
# build a corpus of contexts surrounding the immigration-related words
topical_toks <- tokens_context(x = toks_nostop_feats,
                               pattern = c("immigration", "immigrant", "immigration reform"),
                               window = 6L,
                               verbose = FALSE)

# create document-feature matrix
topical_dfm <- dfm(topical_toks)

# create document-embedding matrix using a la carte
topical_dem <- dem(x = topical_dfm,
                   pre_trained = cr_glove_subset,
                   transform = TRUE,
                   transform_matrix = cr_transform,
                   verbose = FALSE)

# get "topical" embeddings for each party
topical_wvs <- dem_group(topical_dem, groups = topical_dem@docvars$party)

# find nearest neighbors for each party
nns(topical_wvs,
    pre_trained = cr_glove_subset,
    N = 5,
    candidates = topical_wvs@features,
    stem = TRUE,
    as_list = FALSE)

## ---- message = FALSE---------------------------------------------------------
# we limit candidates to features in our corpus
feats <- featnames(dfm(immig_toks))

# compare nearest neighbors between groups
set.seed(2021L)
immig_party_nns <- get_nns(x = immig_toks,
                           N = 10,
                           groups = docvars(immig_toks, 'party'),
                           candidates = feats,
                           pre_trained = cr_glove_subset,
                           transform = TRUE,
                           transform_matrix = cr_transform,
                           bootstrap = TRUE,
                           num_bootstraps = 100,
                           confidence_level = 0.95,
                           as_list = TRUE)

# nearest neighbors of "immigration" for Republican party
immig_party_nns[["R"]]

## ---- message = FALSE---------------------------------------------------------
# compute the cosine similarity between each group's embedding and a specific set of features
set.seed(2021L)
get_cos_sim(x = immig_toks,
            groups = docvars(immig_toks, 'party'),
            features = c("reform", "enforce"),
            pre_trained = cr_glove_subset,
            transform = TRUE,
            transform_matrix = cr_transform,
            bootstrap = TRUE,
            num_bootstraps = 100,
            as_list = FALSE)
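## ---- message = FALSE---------------------------------------------------------
# a minimal sketch (an addition to the original guide): compare the two parties'
# nearest-neighbor lists directly by checking which features they share; this
# assumes the tibbles in `immig_party_nns` carry a `feature` column, as in the
# nns() output shown earlier
intersect(immig_party_nns[["R"]]$feature, immig_party_nns[["D"]]$feature)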
## ---- message = FALSE---------------------------------------------------------
# we limit candidates to features in our corpus
feats <- featnames(dfm(immig_toks))

# compute ratio
set.seed(2021L)
immig_nns_ratio <- get_nns_ratio(x = immig_toks,
                                 N = 10,
                                 groups = docvars(immig_toks, 'party'),
                                 numerator = "R",
                                 candidates = feats,
                                 pre_trained = cr_glove_subset,
                                 transform = TRUE,
                                 transform_matrix = cr_transform,
                                 bootstrap = TRUE,
                                 num_bootstraps = 100,
                                 permute = TRUE,
                                 num_permutations = 100,
                                 verbose = FALSE)

head(immig_nns_ratio)

## ---- eval=TRUE-----------------------------------------------------------------
plot_nns_ratio(x = immig_nns_ratio, alpha = 0.01, horizontal = TRUE)

## ---- message = FALSE---------------------------------------------------------
# compare nearest contexts between groups
set.seed(2021L)
immig_party_ncs <- get_ncs(x = immig_toks,
                           N = 10,
                           groups = docvars(immig_toks, 'party'),
                           pre_trained = cr_glove_subset,
                           transform = TRUE,
                           transform_matrix = cr_transform,
                           bootstrap = TRUE,
                           num_bootstraps = 100,
                           as_list = TRUE)

# nearest contexts of "immigration" for Republican party
immig_party_ncs[["R"]]

## ---- message = FALSE---------------------------------------------------------
# two factor covariates
set.seed(2021L)
model1 <- conText(formula = immigration ~ party + gender,
                  data = toks_nostop_feats,
                  pre_trained = cr_glove_subset,
                  transform = TRUE,
                  transform_matrix = cr_transform,
                  bootstrap = TRUE,
                  num_bootstraps = 100,
                  permute = TRUE,
                  num_permutations = 100,
                  window = 6,
                  case_insensitive = TRUE,
                  verbose = FALSE)

# notice, non-binary covariates are automatically "dummified"
rownames(model1)

## ---- message = FALSE---------------------------------------------------------
# D-dimensional beta coefficients
# the intercept in this case is the ALC embedding for female Democrats
# beta coefficients can be combined to get each group's ALC embedding
DF_wv <- model1['(Intercept)',] # (D)emocrat - (F)emale
DM_wv <- model1['(Intercept)',] + model1['gender_M',] # (D)emocrat - (M)ale
RF_wv <- model1['(Intercept)',] + model1['party_R',] # (R)epublican - (F)emale
RM_wv <- model1['(Intercept)',] + model1['party_R',] + model1['gender_M',] # (R)epublican - (M)ale

# nearest neighbors
nns(rbind(DF_wv, DM_wv),
    N = 10,
    pre_trained = cr_glove_subset,
    candidates = model1@features)

## ---- message = FALSE---------------------------------------------------------
model1@normed_coefficients
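## ---- message = FALSE---------------------------------------------------------
# a minimal sketch (an addition to the original guide): the same recombination
# of beta coefficients works for the Republican group embeddings defined above
nns(rbind(RF_wv, RM_wv),
    N = 10,
    pre_trained = cr_glove_subset,
    candidates = model1@features)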
## ---- message = FALSE---------------------------------------------------------
# continuous covariate
set.seed(2021L)
model2 <- conText(formula = immigration ~ nominate_dim1,
                  data = toks_nostop_feats,
                  pre_trained = cr_glove_subset,
                  transform = TRUE,
                  transform_matrix = cr_transform,
                  bootstrap = TRUE,
                  num_bootstraps = 100,
                  permute = TRUE,
                  num_permutations = 100,
                  window = 6,
                  case_insensitive = TRUE,
                  verbose = FALSE)

# look at percentiles of nominate
percentiles <- quantile(docvars(cr_sample_corpus)$nominate_dim1, probs = seq(0.05, 0.95, 0.05))
percentile_wvs <- lapply(percentiles, function(i) model2["(Intercept)",] + i*model2["nominate_dim1",]) %>%
  do.call(rbind, .)
percentile_sim <- cos_sim(x = percentile_wvs,
                          pre_trained = cr_glove_subset,
                          features = c("reform", "enforce"),
                          as_list = TRUE)

# check output
rbind(head(percentile_sim[["reform"]], 5), tail(percentile_sim[["reform"]], 5))
rbind(head(percentile_sim[["enforce"]], 5), tail(percentile_sim[["enforce"]], 5))

## ---- message=FALSE-----------------------------------------------------------
library(text2vec)

#---------------------------------
# estimate glove model
#---------------------------------

# construct the feature co-occurrence matrix for our toks_nostop_feats object (see above)
toks_fcm <- fcm(toks_nostop_feats,
                context = "window",
                window = 6,
                count = "frequency",
                tri = FALSE) # important to set tri = FALSE

# estimate glove model using text2vec
glove <- GlobalVectors$new(rank = 300, x_max = 10, learning_rate = 0.05)
wv_main <- glove$fit_transform(toks_fcm,
                               n_iter = 10,
                               convergence_tol = 1e-3,
                               n_threads = 2) # set to 'parallel::detectCores()' to use all available cores
wv_context <- glove$components
local_glove <- wv_main + t(wv_context) # word vectors

# qualitative check
find_nns(local_glove['immigration',], pre_trained = local_glove, N = 5, candidates = feats)

## ---- message = FALSE---------------------------------------------------------
# compute transform
# weighting = 'log' works well for smaller corpora
# for large corpora use a numeric value e.g. weighting = 500
# see: https://arxiv.org/pdf/1805.05388.pdf
local_transform <- compute_transform(x = toks_fcm, pre_trained = local_glove, weighting = 'log')

## ---- message = FALSE---------------------------------------------------------
#---------------------------------
# check
#---------------------------------

# create document-embedding matrix using our locally trained GloVe embeddings and transformation matrix
immig_dem_local <- dem(x = immig_dfm,
                       pre_trained = local_glove,
                       transform = TRUE,
                       transform_matrix = local_transform,
                       verbose = TRUE)

# take the column average to get a single "corpus-wide" embedding
immig_wv_local <- colMeans(immig_dem_local)

# find nearest neighbors for the overall immigration embedding
find_nns(immig_wv_local, pre_trained = local_glove, N = 10, candidates = immig_dem_local@features)

# we can also compare to the corresponding pre-trained embedding
sim2(x = matrix(immig_wv_local, nrow = 1),
     y = matrix(local_glove['immigration',], nrow = 1),
     method = 'cosine', norm = 'l2')

## ---- message = FALSE---------------------------------------------------------
# create feature co-occurrence matrix for each party (set tri = FALSE to work with fem)
fcm_D <- fcm(toks_nostop_feats[docvars(toks_nostop_feats, 'party') == "D",],
             context = "window", window = 6, count = "frequency", tri = FALSE)
fcm_R <- fcm(toks_nostop_feats[docvars(toks_nostop_feats, 'party') == "R",],
             context = "window", window = 6, count = "frequency", tri = FALSE)

## ---- message = FALSE---------------------------------------------------------
# compute feature-embedding matrix
fem_D <- fem(fcm_D, pre_trained = cr_glove_subset, transform = TRUE, transform_matrix = cr_transform, verbose = FALSE)
fem_R <- fem(fcm_R, pre_trained = cr_glove_subset, transform = TRUE, transform_matrix = cr_transform, verbose = FALSE)

# fem_D will contain an embedding for each feature
fem_D[1:5, 1:3]

## ---- message = FALSE---------------------------------------------------------
# compute "horizontal" cosine similarity
feat_comp <- feature_sim(x = fem_R, y = fem_D)

# least similar features
head(feat_comp)

# most similar features
tail(feat_comp)
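## ---- message = FALSE---------------------------------------------------------
# a minimal sketch (an addition to the original guide): look up the cross-party
# similarity of a substantive feature directly; assumes `feat_comp` is a data
# frame with `feature` and `value` columns, as the head()/tail() output suggests
feat_comp[feat_comp$feature == "immigration", ]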
## ---- message = FALSE---------------------------------------------------------
# identify documents with 100 words or fewer
short_toks <- toks_nostop_feats[sapply(toks_nostop_feats, length) <= 100,]

# run regression on these full (short) documents; the "." on the left-hand side
# of the formula tells conText to embed each document in its entirety rather
# than contexts around a target term
model3 <- conText(formula = . ~ party,
                  data = short_toks,
                  pre_trained = cr_glove_subset,
                  transform = TRUE,
                  transform_matrix = cr_transform,
                  bootstrap = TRUE,
                  num_bootstraps = 100,
                  permute = TRUE,
                  num_permutations = 100,
                  window = 6,
                  case_insensitive = TRUE,
                  verbose = FALSE)
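## ---- message = FALSE---------------------------------------------------------
# a minimal sketch (an addition to the original guide): as with model1 above,
# inspect the normed coefficient table to gauge the magnitude and significance
# of the party difference over full documents
model3@normed_coefficients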