PubMatrixR: Literature Co-occurrence Analysis

A comprehensive guide to analyzing publication relationships

ToledoEM

2026-03-07

library(PubMatrixR)
library(knitr)
library(kableExtra)
library(dplyr)
library(pheatmap)
library(ggplot2)

Introduction

PubMatrixR is an R package designed to analyze literature co-occurrence patterns by systematically searching PubMed and PMC databases. This vignette demonstrates how to:

Acknowledgments

This package is a heavily fork of the original PubMatrixR by tslaird. Our gratitude to the original author.

Preparing Gene Sets

Making the gene lists from MSigDB

For this example, we’ll extract genes related to WNT signaling and obesity from the MSigDB database:

msigdf is optional and not required to build this vignette. The example below is shown for reference only and is not executed during package checks.

# Extract WNT-related genes
A <- msigdf::msigdf.human %>%
  dplyr::filter(grepl(geneset, pattern = "wnt", ignore.case = TRUE)) %>%
  dplyr::pull(symbol) %>%
  unique()

# Extract obesity-related genes
B <- msigdf::msigdf.human %>%
  dplyr::filter(grepl(geneset, pattern = "obesity", ignore.case = TRUE)) %>%
  dplyr::pull(symbol) %>%
  unique()

# Sample genes for demonstration (making them equal in length)
A <- sample(A, 10, replace = FALSE)
B <- sample(B, 10, replace = FALSE)

Fallback Example Dataset

When MSigDB is not available, we use these representative gene sets:

# WNT signaling pathway genes
A <- c("WNT1", "WNT2", "WNT3A", "WNT5A", "WNT7B", "CTNNB1", "DVL1")

# Obesity-related genes
B <- c("LEPR", "ADIPOQ", "PPARG", "TNF", "IL6", "ADRB2", "INSR")

Literature Analysis

Running PubMatrixR

Now we’ll search for co-occurrences between our gene sets in PubMed literature:

# Intentionally not run during package checks: this performs live NCBI queries.
# Run actual PubMatrix analysis
current_year <- as.integer(format(Sys.Date(), "%Y"))
result <- PubMatrix(
  A = A,
  B = B,
  Database = "pubmed",
  daterange = c(2020, current_year),
  outfile = "pubmatrix_result"
)
# Offline deterministic example used for vignette rendering/package checks.
result <- outer(seq_along(B), seq_along(A), function(i, j) {
  8 + (i * 6) + (j * 5) + ((i + j) %% 4) * 3 + ((i * j) %% 5)
})
result <- as.data.frame(result, check.names = FALSE)
colnames(result) <- A
rownames(result) <- B

Results Table

The co-occurrence matrix shows the number of publications mentioning each pair of genes:

kable(result,
  caption = "Co-occurrence Matrix: WNT Genes vs Obesity Genes (Publication Counts)",
  align = "c",
  format = if (knitr::pandoc_to() == "html") "html" else "markdown"
) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE,
    position = "center"
  ) %>%
  kableExtra::add_header_above(c(" " = 1, "Obesity Genes" = length(B)))
Co-occurrence Matrix: WNT Genes vs Obesity Genes (Publication Counts)
Obesity Genes
WNT1 WNT2 WNT3A WNT5A WNT7B CTNNB1 DVL1
LEPR 26 35 32 41 45 54 51
ADIPOQ 36 34 39 49 54 52 62
PPARG 34 40 51 57 51 62 68
TNF 44 51 58 53 60 72 79
IL6 49 57 53 61 69 77 73
ADRB2 59 56 65 74 78 75 84
INSR 57 67 72 82 75 85 95

Visualization

Publication Count Bar Plots

Let’s first examine which genes have the highest publication counts:

# Create data frame for List A genes (rows) colored by List B genes (columns)
a_genes_data <- data.frame(
  gene = rownames(result),
  total_pubs = rowSums(result),
  stringsAsFactors = FALSE
)

# Add color coding based on max overlap with B genes
a_genes_data$max_b_gene <- apply(result, 1, function(x) colnames(result)[which.max(x)])
a_genes_data$max_overlap <- apply(result, 1, max)

# Create data frame for List B genes (columns) colored by List A genes (rows)
b_genes_data <- data.frame(
  gene = colnames(result),
  total_pubs = colSums(result),
  stringsAsFactors = FALSE
)

# Add color coding based on max overlap with A genes
b_genes_data$max_a_gene <- apply(result, 2, function(x) rownames(result)[which.max(x)])
b_genes_data$max_overlap <- apply(result, 2, max)

bar_plot_theme <- theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 15),
    plot.subtitle = element_text(size = 10),
    axis.title = element_text(size = 11),
    axis.text = element_text(size = 10),
    panel.grid.major.y = element_blank(),
    legend.position = "bottom",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 9),
    legend.box = "vertical"
  )

n_fill_a <- length(unique(a_genes_data$max_b_gene))
n_fill_b <- length(unique(b_genes_data$max_a_gene))

# Plot A genes colored by their strongest B gene partner
p1 <- ggplot(a_genes_data, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_b_gene)) +
  geom_col(width = 0.75) +
  coord_flip() +
  labs(
    title = "List A Genes by Publication Count",
    subtitle = "Colored by strongest List B partner",
    x = "Genes (List A)",
    y = "Total Publications",
    fill = "Strongest B Partner"
  ) +
  scale_fill_viridis_d(option = "D", end = 0.9) +
  guides(fill = guide_legend(nrow = max(1, ceiling(n_fill_a / 4)), byrow = TRUE)) +
  bar_plot_theme


# Plot B genes colored by their strongest A gene partner
p2 <- ggplot(b_genes_data, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_a_gene)) +
  geom_col(width = 0.75) +
  coord_flip() +
  labs(
    title = "List B Genes by Publication Count",
    subtitle = "Colored by strongest List A partner",
    x = "Genes (List B)",
    y = "Total Publications",
    fill = "Strongest A Partner"
  ) +
  scale_fill_viridis_d(option = "D", end = 0.9) +
  guides(fill = guide_legend(nrow = max(1, ceiling(n_fill_b / 4)), byrow = TRUE)) +
  bar_plot_theme


print(p1)

print(p2)

Heatmap with Overlap Percentages

The heatmap displays overlap percentages calculated from the publication co-occurrence counts:

plot_pubmatrix_heatmap(result,
  title = "WNT-Obesity Overlap (%)",
  show_numbers = TRUE,
  cellwidth = 44,
  cellheight = 32,
  width = 12,
  height = 10
)
Overlap percentage heatmap with values displayed in each cell

Overlap percentage heatmap with values displayed in each cell

Clean Heatmap

For a cleaner visualization without numbers, useful for presentations:

plot_pubmatrix_heatmap(result,
  title = "WNT-Obesity Co-occurrence (Clean)",
  show_numbers = FALSE,
  cellwidth = 44,
  cellheight = 32,
  width = 12,
  height = 10
)
Co-occurrence heatmap without numbers for better visual clarity

Co-occurrence heatmap without numbers for better visual clarity

Asymmetric lists

# Intentionally not run during package checks: this performs live NCBI queries.
A <- c("NCOR2", "NCSTN", "NKD1", "NOTCH1", "NOTCH4", "NUMB", "PPARD", "PSEN2", "PTCH1", "RBPJ", "SKP2", "TCF7", "TP53")


B <- c("EIF1", "EIF1AX", "EIF2B1", "EIF2B2", "EIF2B3", "EIF2B4", "EIF2B5", "EIF2S1", "EIF2S2", "EIF2S3", "ELAVL1")


# Run actual PubMatrix analysis
current_year <- as.integer(format(Sys.Date(), "%Y"))
result <- PubMatrix(
  A = A,
  B = B,
  Database = "pubmed",
  daterange = c(2020, current_year),
  outfile = "pubmatrix_result"
)
# Offline deterministic example used for vignette rendering/package checks.
A <- c("NCOR2", "NCSTN", "NKD1", "NOTCH1", "NOTCH4", "NUMB", "PPARD", "PSEN2", "PTCH1", "RBPJ", "SKP2", "TCF7", "TP53")
B <- c("EIF1", "EIF1AX", "EIF2B1", "EIF2B2", "EIF2B3", "EIF2B4", "EIF2B5", "EIF2S1", "EIF2S2", "EIF2S3", "ELAVL1")

result <- outer(seq_along(B), seq_along(A), function(i, j) {
  12 + (i * 4) + (j * 3) + ((i * j) %% 9) + ((i + 2 * j) %% 6)
})
result <- as.data.frame(result, check.names = FALSE)
colnames(result) <- A
rownames(result) <- B

Results Table

The co-occurrence matrix shows the number of publications mentioning each pair of genes:

kable(result,
  caption = "Co-occurrence Matrix: Longer Lists (Publication Counts)",
  align = "c",
  format = if (knitr::pandoc_to() == "html") "html" else "markdown"
) %>%
  kableExtra::kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE,
    position = "center"
  ) %>%
  kableExtra::add_header_above(c(" " = 1, "A Genes" = length(A)))
Co-occurrence Matrix: Longer Lists (Publication Counts)
A Genes
NCOR2 NCSTN NKD1 NOTCH1 NOTCH4 NUMB PPARD PSEN2 PTCH1 RBPJ SKP2 TCF7 TP53
EIF1 23 29 29 35 41 41 47 53 44 50 56 56 62
EIF1AX 29 30 37 44 36 43 50 51 49 56 57 64 71
EIF2B1 35 37 36 44 46 45 53 55 54 62 64 63 71
EIF2B2 35 44 44 47 47 56 50 59 59 62 71 71 74
EIF2B3 41 42 52 47 57 58 62 63 64 68 69 79 74
EIF2B4 47 49 45 56 58 54 65 67 63 74 76 72 83
EIF2B5 53 56 53 56 68 65 68 71 68 80 83 80 83
EIF2S1 59 57 61 65 63 67 71 69 73 86 84 88 92
EIF2S2 56 55 60 65 64 69 74 73 78 83 82 87 92
EIF2S3 56 62 68 68 74 80 80 86 83 83 89 95 95
ELAVL1 62 69 76 77 75 82 83 90 88 89 96 103 104

Bar Plots for Asymmetric Lists

# Create data frame for List A genes (rows) colored by List B genes (columns)
a_genes_data2 <- data.frame(
  gene = rownames(result),
  total_pubs = rowSums(result),
  stringsAsFactors = FALSE
)

# Add color coding based on max overlap with B genes
a_genes_data2$max_b_gene <- apply(result, 1, function(x) colnames(result)[which.max(x)])
a_genes_data2$max_overlap <- apply(result, 1, max)

# Create data frame for List B genes (columns) colored by List A genes (rows)
b_genes_data2 <- data.frame(
  gene = colnames(result),
  total_pubs = colSums(result),
  stringsAsFactors = FALSE
)

# Add color coding based on max overlap with A genes
b_genes_data2$max_a_gene <- apply(result, 2, function(x) rownames(result)[which.max(x)])
b_genes_data2$max_overlap <- apply(result, 2, max)

bar_plot_theme <- theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 15),
    plot.subtitle = element_text(size = 10),
    axis.title = element_text(size = 11),
    axis.text = element_text(size = 10),
    panel.grid.major.y = element_blank(),
    legend.position = "bottom",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8.5),
    legend.box = "vertical"
  )

n_fill_a2 <- length(unique(a_genes_data2$max_b_gene))
n_fill_b2 <- length(unique(b_genes_data2$max_a_gene))

# Plot A genes colored by their strongest B gene partner
p3 <- ggplot(a_genes_data2, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_b_gene)) +
  geom_col(width = 0.75) +
  coord_flip() +
  labs(
    title = "List A Genes by Publication Count (Asymmetric)",
    subtitle = "Colored by strongest List B partner",
    x = "Genes (List A)",
    y = "Total Publications",
    fill = "Strongest B Partner"
  ) +
  scale_fill_viridis_d(option = "D", end = 0.9) +
  guides(fill = guide_legend(nrow = max(1, ceiling(n_fill_a2 / 4)), byrow = TRUE)) +
  bar_plot_theme


# Plot B genes colored by their strongest A gene partner
p4 <- ggplot(b_genes_data2, aes(x = reorder(gene, total_pubs), y = total_pubs, fill = max_a_gene)) +
  geom_col(width = 0.75) +
  coord_flip() +
  labs(
    title = "List B Genes by Publication Count (Asymmetric)",
    subtitle = "Colored by strongest List A partner",
    x = "Genes (List B)",
    y = "Total Publications",
    fill = "Strongest A Partner"
  ) +
  scale_fill_viridis_d(option = "D", end = 0.9) +
  guides(fill = guide_legend(nrow = max(1, ceiling(n_fill_b2 / 4)), byrow = TRUE)) +
  bar_plot_theme


print(p3)

print(p4)

Heatmap with Overlap Percentages

The heatmap displays overlap percentages calculated from the publication co-occurrence counts:

plot_pubmatrix_heatmap(result,
  title = "Asymmetric Lists Overlap (%)",
  show_numbers = TRUE,
  cellwidth = 44,
  cellheight = 32,
  width = 12,
  height = 10
)
Overlap percentage heatmap with values displayed in each cell

Overlap percentage heatmap with values displayed in each cell

Clean Heatmap

For a cleaner visualization without numbers, useful for presentations:

plot_pubmatrix_heatmap(result,
  title = "Asymmetric Lists Co-occurrence (Clean)",
  show_numbers = FALSE,
  cellwidth = 44,
  cellheight = 32,
  width = 12,
  height = 10
)
Co-occurrence heatmap without numbers for better visual clarity

Co-occurrence heatmap without numbers for better visual clarity

Summary

This vignette demonstrated:

  1. Gene Set Preparation: Using MSigDB or manual curation to create meaningful gene lists
  2. Literature Analysis: Running PubMatrixR to generate co-occurrence matrices
  3. Data Visualization: Creating publication-ready heatmaps with custom color schemes and bar plots
  4. Results Interpretation: Understanding co-occurrence patterns in the literature

The resulting matrices and visualizations can help identify: - Strong literature connections between gene sets - Potential research gaps (low co-occurrence pairs) - Patterns in publication trends over time - Most studied genes and their strongest literature partners

System Information

sessionInfo()
#> R version 4.5.2 (2025-10-31)
#> Platform: aarch64-apple-darwin20
#> Running under: macOS Tahoe 26.3.1
#> 
#> Matrix products: default
#> BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
#> 
#> locale:
#> [1] C/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
#> 
#> time zone: Europe/London
#> tzcode source: internal
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] ggplot2_4.0.2    pheatmap_1.0.13  dplyr_1.2.0      kableExtra_1.4.0
#> [5] knitr_1.51       PubMatrixR_1.0.0
#> 
#> loaded via a namespace (and not attached):
#>  [1] gtable_0.3.6       jsonlite_2.0.0     compiler_4.5.2     tidyselect_1.2.1  
#>  [5] xml2_1.5.2         stringr_1.6.0      parallel_4.5.2     jquerylib_0.1.4   
#>  [9] systemfonts_1.3.1  scales_1.4.0       textshaping_1.0.4  yaml_2.3.12       
#> [13] fastmap_1.2.0      R6_2.6.1           labeling_0.4.3     generics_0.1.4    
#> [17] tibble_3.3.1       svglite_2.2.2      bslib_0.10.0       pillar_1.11.1     
#> [21] RColorBrewer_1.1-3 readODS_2.3.2      rlang_1.1.7        cachem_1.1.0      
#> [25] stringi_1.8.7      xfun_0.56          S7_0.2.1           sass_0.4.10       
#> [29] otel_0.2.0         viridisLite_0.4.3  cli_3.6.5          withr_3.0.2       
#> [33] magrittr_2.0.4     grid_4.5.2         digest_0.6.39      rstudioapi_0.18.0 
#> [37] pbapply_1.7-4      lifecycle_1.0.5    vctrs_0.7.1        evaluate_1.0.5    
#> [41] glue_1.8.0         farver_2.1.2       rmarkdown_2.30     tools_4.5.2       
#> [45] pkgconfig_2.0.3    htmltools_0.5.9