## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----texts-overview----------------------------------------------------------- library(tipitaka.critical) dim(texts) names(texts) ## ----texts-example------------------------------------------------------------ # The Brahmajala Sutta (DN 1) dn1 <- texts[texts$id == "dn1", ] dn1$title # First 120 characters of surface text cat(substr(dn1$text, 1, 120), "...\n") # Same passage, lemmatized cat(substr(dn1$text_lemmatized, 1, 120), "...\n") ## ----texts-collections-------------------------------------------------------- table(texts$pitaka) table(texts$collection) ## ----lemmas-overview---------------------------------------------------------- dim(lemmas) head(lemmas) ## ----lemmas-top--------------------------------------------------------------- totals <- tapply(lemmas$n, lemmas$word, sum) head(sort(totals, decreasing = TRUE), 15) ## ----lemmas-by-collection----------------------------------------------------- dn_lemmas <- lemmas[lemmas$collection == "dn", ] dn_totals <- tapply(dn_lemmas$n, dn_lemmas$word, sum) head(sort(dn_totals, decreasing = TRUE), 10) ## ----search------------------------------------------------------------------- # Where does "nibbana" appear most frequently? nibbana <- search_lemma("nibbana") head(nibbana[, c("id", "collection", "n", "freq")]) ## ----search-dhamma------------------------------------------------------------ # "dhamma" across collections dhamma <- search_lemma("dhamma") tapply(dhamma$n, dhamma$collection, sum) ## ----dtm-overview------------------------------------------------------------- dim(dtm) class(dtm) # Sparsity (proportion of zero entries) 1 - length(dtm@x) / prod(dim(dtm)) ## ----dn-cluster, fig.width=7, fig.height=4------------------------------------ dn_ids <- texts$id[texts$collection == "dn"] dn_dtm <- dtm[dn_ids, ] # Drop empty columns dn_dtm <- dn_dtm[, colSums(dn_dtm) > 0] d <- dist(as.matrix(dn_dtm)) hc <- hclust(d, method = "ward.D2") plot(hc, main = "Digha Nikaya — Hierarchical Clustering", xlab = "", sub = "", cex = 0.7) ## ----pca, fig.width=7, fig.height=6------------------------------------------- # Select top 500 lemmas by total frequency col_sums <- colSums(dtm) top_terms <- names(sort(col_sums, decreasing = TRUE))[1:500] dtm_sub <- as.matrix(dtm[, top_terms]) # PCA pca <- prcomp(dtm_sub, center = TRUE, scale. = FALSE) pct_var <- summary(pca)$importance[2, 1:2] * 100 # Color by collection coll_colors <- c( abhidhamma = "#E41A1C", an = "#377EB8", dn = "#4DAF4A", kn = "#FF7F00", mn = "#984EA3", sn = "#A65628", vinaya = "#F781BF" ) pt_col <- coll_colors[texts$collection] plot(pca$x[, 1], pca$x[, 2], col = adjustcolor(pt_col, alpha.f = 0.5), pch = 16, cex = 0.6, xlab = paste0("PC1 (", round(pct_var[1], 1), "%)"), ylab = paste0("PC2 (", round(pct_var[2], 1), "%)"), main = "PCA of All Tipitaka Texts") legend("topright", c("Abhidhamma", "AN", "DN", "KN", "MN", "SN", "Vinaya"), col = coll_colors, pch = 16, cex = 0.8) ## ----canon-cluster, fig.width=7, fig.height=10-------------------------------- # Create group IDs at an intermediate level group_id <- texts$id # SN: sn1.1 -> sn1 (by samyutta) sn_mask <- texts$collection == "sn" group_id[sn_mask] <- sub("\\..*", "", group_id[sn_mask]) # AN: an1.1 -> an1 (by nipata) an_mask <- texts$collection == "an" group_id[an_mask] <- sub("\\..*", "", group_id[an_mask]) # KN: dhp1-20 -> dhp, snp1.1 -> snp, etc. (by text) kn_mask <- texts$collection == "kn" group_id[kn_mask] <- sub("[0-9].*", "", group_id[kn_mask]) # Aggregate DTM by group (mean of member frequencies) groups <- unique(group_id) group_dtm <- matrix(0, length(groups), length(top_terms)) group_coll <- character(length(groups)) for (i in seq_along(groups)) { rows <- which(group_id == groups[i]) if (length(rows) == 1) { group_dtm[i, ] <- dtm_sub[rows, ] } else { group_dtm[i, ] <- colMeans(dtm_sub[rows, ]) } group_coll[i] <- texts$collection[rows[1]] } rownames(group_dtm) <- groups # Cluster d <- dist(group_dtm) hc <- hclust(d, method = "ward.D2") # Color labels by collection label_col <- coll_colors[group_coll[hc$order]] dend <- as.dendrogram(hc) # Apply colors to leaf labels color_labels <- function(n, col_vec) { if (is.leaf(n)) { i <- match(attr(n, "label"), groups[hc$order]) attr(n, "nodePar") <- list(pch = NA, lab.col = col_vec[i], lab.cex = 0.45) } n } dend <- dendrapply(dend, color_labels, col_vec = label_col) oldpar <- par(mar = c(2, 1, 2, 8)) plot(dend, horiz = TRUE, main = "Tipitaka — Hierarchical Clustering", xlab = "") legend("topleft", c("Abhidhamma", "AN", "DN", "KN", "MN", "SN", "Vinaya"), text.col = coll_colors, cex = 0.7, bty = "n") par(oldpar)