## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) # Guard all jvecfor calls: skip if Java or JAR is unavailable has_java <- nzchar(Sys.which("java")) has_jar <- length(dir(system.file("java", package = "jvecfor"), pattern = "^jvecfor-.*\\.jar$")) > 0L run_eg <- has_java && has_jar ## ----check-java, eval=FALSE--------------------------------------------------- # system2("java", "--version") # should print openjdk 20 or higher ## ----options-table, echo=FALSE------------------------------------------------ opts <- data.frame( Option = c("`jvecfor.verbose`", "`jvecfor.jar`"), Default = c("`FALSE`", "`NULL`"), Effect = c( "Pass `--verbose` to jvecfor, printing HNSW build progress to stderr.", "Use a custom JAR instead of the one bundled in `inst/java/`." ), stringsAsFactors = FALSE ) knitr::kable(opts) ## ----options-example, eval=FALSE---------------------------------------------- # options(jvecfor.verbose = TRUE) # global verbose mode # options(jvecfor.jar = "/path/to/custom/jvecfor.jar") # custom JAR ## ----setup-jar, eval=FALSE---------------------------------------------------- # # Auto-finds target/jvecfor-*.jar relative to the working directory # jvecfor_setup() # # # Or provide the path explicitly # jvecfor_setup(jar_path = "/path/to/jvecfor-x.y.z.jar") ## ----quick-start, eval=run_eg------------------------------------------------- library(jvecfor) # Simulate 300 cells × 30 principal components set.seed(42) pca <- matrix(rnorm(300 * 30), nrow = 300, ncol = 30) # Find 15 nearest neighbours (HNSW-DiskANN approximate, Euclidean) nn <- fastFindKNN(pca, k = 15) str(nn) ## ----output-structure, eval=run_eg-------------------------------------------- dim(nn$index) # 300 x 15 dim(nn$distance) # 300 x 15 class(nn$index) # "integer" — 1-based, same as BiocNeighbors class(nn$distance) # "numeric" # No zeros in index (1-based) and all values within [1, nrow(pca)] range(nn$index) ## ----ann-vs-knn, eval=run_eg-------------------------------------------------- # Approximate (HNSW-DiskANN) — default, fast, O(log n) query nn_ann <- fastFindKNN(pca, k = 15, type = "ann") # Exact (VP-tree) — deterministic, O(n log n) query nn_knn <- fastFindKNN(pca, k = 15, type = "knn") # Self-consistency check: exact search always returns itself as nearest # (excluding self) — no duplicate indices per row stopifnot(all(apply(nn_knn$index, 1, function(x) !anyDuplicated(x)))) ## ----metric-euclidean, eval=run_eg-------------------------------------------- nn_euc <- fastFindKNN(pca, k = 15, metric = "euclidean") ## ----metric-cosine, eval=run_eg----------------------------------------------- # L2-normalise rows norms <- sqrt(rowSums(pca^2)) pca_norm <- sweep(pca, 1, norms, "/") nn_cos <- fastFindKNN(pca_norm, k = 15, metric = "cosine") ## ----metric-dotproduct, eval=run_eg------------------------------------------- # ANN with dot-product similarity nn_dot <- fastFindKNN(pca_norm, k = 15, metric = "dot_product", type = "ann") ## ----metric-dotproduct-error, eval=run_eg, error=TRUE------------------------- try({ # Attempting exact + dot_product raises an error fastFindKNN(pca, k = 15, metric = "dot_product", type = "knn") }) ## ----no-distance, eval=run_eg------------------------------------------------- nn_idx <- fastFindKNN(pca, k = 15, get.distance = FALSE) is.null(nn_idx$distance) # TRUE ## ----hnsw-params-table, echo=FALSE-------------------------------------------- hnsw_tbl <- data.frame( "Parameter" = c( "`M`", "`ef.search`", "`oversample.factor`", "`pq.subspaces`" ), "Default" = c("16", "0 (auto)", "1.0", "0 (off)"), "Tuning guidance" = c( paste( "Connections per node. Increase to 32-64", "for high-dimensional (>50 dims) data;", "doubles memory and build time." ), paste( "Beam width. Auto = max(k+1, 3k).", "Increase (e.g. 100-500) to trade speed for recall." ), paste( "Fetch ceil(ef x factor) candidates, keep top k.", "Values 1.5-3.0 improve recall with proportional cost." ), paste( "PQ subspaces. See section 5.3.", "4-8x speedup with minimal recall loss." ) ), check.names = FALSE, stringsAsFactors = FALSE ) knitr::kable(hnsw_tbl) ## ----high-recall, eval=run_eg------------------------------------------------- nn_highrecall <- fastFindKNN( pca, k = 15, M = 32L, ef.search = 150L, oversample.factor = 2.0 ) dim(nn_highrecall$index) ## ----pq-demo, eval=run_eg----------------------------------------------------- # 300 × 20 matrix — use 10 PQ subspaces (= 20 / 2) set.seed(1) pca20 <- matrix(rnorm(300 * 20), 300, 20) nn_default <- fastFindKNN(pca20, k = 15) # no PQ nn_pq <- fastFindKNN(pca20, k = 15, pq.subspaces = 10L) # PQ enabled # Recall overlap (fraction of PQ neighbors that match the default) shared <- mapply(function(a, b) length(intersect(a, b)), split(nn_default$index, row(nn_default$index)), split(nn_pq$index, row(nn_pq$index))) message(sprintf("Mean PQ recall vs. default: %.1f%%", 100 * mean(shared / 15))) ## ----pq-timing, eval=FALSE---------------------------------------------------- # # Illustrative — run on your own large matrix # nn_default_time <- system.time(fastFindKNN(large_pca, k = 15)) # nn_pq_time <- system.time( # fastFindKNN(large_pca, k = 15, # pq.subspaces = ncol(large_pca) %/% 2L) # ) # nn_default_time["elapsed"] / nn_pq_time["elapsed"] # expect 4–8× ## ----threads, eval=run_eg----------------------------------------------------- # Single-threaded (reproducible, good for shared HPC nodes) nn_1t <- fastFindKNN(pca, k = 15, num.threads = 1L) # Explicit thread count nn_4t <- fastFindKNN(pca, k = 15, num.threads = 4L) # Use verbose to confirm the thread count jvecfor actually uses nn_v <- fastFindKNN(pca, k = 15, num.threads = 2L, verbose = TRUE) ## ----snn-graph, eval=run_eg--------------------------------------------------- library(igraph) g_snn <- fastMakeSNNGraph(pca, k = 15) class(g_snn) igraph::vcount(g_snn) # 300 — one vertex per cell igraph::ecount(g_snn) # weighted undirected edges ## ----snn-type-table, echo=FALSE----------------------------------------------- snn_tbl <- data.frame( "snn.type" = c( '`"rank"` (default)', '`"jaccard"`', '`"number"`' ), "Edge weight" = c( "Rank-based Jaccard similarity", "Jaccard similarity", "Count of shared neighbors" ), "Use case" = c( "Seurat-compatible; robust to hub nodes", "Standard set-overlap measure", "Simple; unnormalised" ), check.names = FALSE, stringsAsFactors = FALSE ) knitr::kable(snn_tbl) ## ----snn-types, eval=run_eg--------------------------------------------------- g_rank <- fastMakeSNNGraph(pca, k = 15, snn.type = "rank") g_jaccard <- fastMakeSNNGraph(pca, k = 15, snn.type = "jaccard") g_number <- fastMakeSNNGraph(pca, k = 15, snn.type = "number") # Edge weight ranges differ between types summary(igraph::E(g_rank)$weight) summary(igraph::E(g_jaccard)$weight) ## ----knn-graph, eval=run_eg--------------------------------------------------- # Undirected KNN graph (mutual edges) g_knn_u <- fastMakeKNNGraph(pca, k = 15, directed = FALSE) # Directed KNN graph (each cell points to its k neighbors) g_knn_d <- fastMakeKNNGraph(pca, k = 15, directed = TRUE) igraph::is_directed(g_knn_d) # TRUE ## ----louvain, eval=run_eg----------------------------------------------------- louvain <- igraph::cluster_louvain(g_snn) message( "Number of detected communities: ", length(igraph::communities(louvain)) ) membership_vec <- igraph::membership(louvain) table(membership_vec) ## ----leiden, eval=FALSE------------------------------------------------------- # leiden <- igraph::cluster_leiden(g_snn, resolution_parameter = 0.5) # table(igraph::membership(leiden)) ## ----workflow-simulate, eval=run_eg------------------------------------------- # Simulate 300 cells (100 per cluster) with separable cluster centres in PC1–PC2 set.seed(42) n_per <- 100 centres <- list(c(5, 0), c(-5, 0), c(0, 5)) pca_clust <- do.call(rbind, lapply(seq_along(centres), function(i) { m <- matrix(rnorm(n_per * 30), n_per, 30) m[, 1] <- m[, 1] + centres[[i]][1] m[, 2] <- m[, 2] + centres[[i]][2] m })) true_labels <- rep(1:3, each = n_per) ## ----workflow-knn, eval=run_eg------------------------------------------------ # Step 1: find 15 nearest neighbours nn_wf <- fastFindKNN(pca_clust, k = 15) ## ----workflow-graph, eval=run_eg---------------------------------------------- # Step 2: build SNN graph g_wf <- fastMakeSNNGraph(pca_clust, k = 15) ## ----workflow-cluster, eval=run_eg-------------------------------------------- # Step 3: Louvain community detection lou_wf <- igraph::cluster_louvain(g_wf) detected <- igraph::membership(lou_wf) # Step 4: cross-tabulate detected vs. true labels print(table(Detected = detected, True = true_labels)) ## ----workflow-plot-cap, include=FALSE, eval=run_eg---------------------------- wf_cap <- paste( "Louvain clusters (colours) in PC1-PC2 space.", "Three well-separated populations are recovered." ) ## ----workflow-plot, eval=run_eg, fig.width=5, fig.height=5, fig.cap=wf_cap---- # Visualise in the first two PCs (base R — no extra dependencies) cluster_cols <- c("#E64B35", "#4DBBD5", "#00A087", "#3C5488", "#F39B7F") plot( pca_clust[, 1], pca_clust[, 2], col = cluster_cols[detected], pch = 19, cex = 0.7, xlab = "PC 1", ylab = "PC 2", main = "Louvain clusters on SNN graph" ) legend("topright", legend = paste("Cluster", sort(unique(detected))), col = cluster_cols[sort(unique(detected))], pch = 19, bty = "n", cex = 0.85) ## ----sce-workflow, eval=FALSE------------------------------------------------- # library(SingleCellExperiment) # # # Assume sce is a SingleCellExperiment with PCA computed # # (e.g. via scater::runPCA) # pca_mat <- reducedDim(sce, "PCA") # # # KNN search # nn_sce <- fastFindKNN(pca_mat, k = 15) # # # Graph-based clustering # g_sce <- fastMakeSNNGraph(pca_mat, k = 15) # lou_sce <- igraph::cluster_louvain(g_sce) # sce$cluster <- as.factor(igraph::membership(lou_sce)) ## ----biocneighbors-compare, eval=FALSE---------------------------------------- # library(BiocNeighbors) # # # BiocNeighbors (Annoy backend, the default ANN method) # nn_bn <- BiocNeighbors::findKNN(pca, k = 15, BNPARAM = AnnoyParam()) # # # jvecfor (HNSW-DiskANN backend) — standalone function # nn_jv <- jvecfor::fastFindKNN(pca, k = 15) # # # Same structure # identical(names(nn_bn), names(nn_jv)) # TRUE — both have $index, $distance # identical(dim(nn_bn$index), dim(nn_jv$index)) # TRUE ## ----bnparam-dropin, eval=FALSE----------------------------------------------- # library(BiocNeighbors) # # # Use jvecfor through the standard BiocNeighbors interface # nn <- findKNN(pca, k = 15, BNPARAM = JvecforParam()) # # # Works with scran, scater, and other BNPARAM-aware packages: # # library(scran) # # g <- buildSNNGraph(sce, BNPARAM = JvecforParam()) # # # # library(scater) # # sce <- runUMAP(sce, BNPARAM = JvecforParam()) # # # Customise algorithm parameters via the constructor # nn2 <- findKNN(pca, k = 15, # BNPARAM = JvecforParam( # type = "knn", # exact VP-tree search # distance = "Cosine", # M = 32L # )) ## ----timing-compare, eval=FALSE----------------------------------------------- # # Replace `pca_large` with your n × p matrix (e.g. n = 100K cells, p = 50 PCs) # t_bn <- system.time( # BiocNeighbors::findKNN(pca_large, k = 15, BNPARAM = AnnoyParam()) # ) # t_jv <- system.time( # fastFindKNN(pca_large, k = 15) # ) # # message(sprintf("BiocNeighbors: %.1f s", t_bn["elapsed"])) # message(sprintf("jvecfor: %.1f s", t_jv["elapsed"])) # message(sprintf("Speedup: %.1fx", t_bn["elapsed"] / t_jv["elapsed"])) ## ----errors-table, echo=FALSE------------------------------------------------- err_tbl <- data.frame( "Error message" = c( "`Java not found on PATH`", "`Java >= 20 is required (found Java X)`", paste("`jvecfor JAR not found.", "Run jvecfor_setup()`"), "`jvecfor exited with status N`", "Very slow first call", "Unexpected neighbour count" ), "Likely cause" = c( "`java` not in shell `PATH`", "Outdated JDK", "`inst/java/` empty", "JVM crash or invalid input", "JVM cold-start overhead", "`k >= nrow(X)`" ), "Resolution" = c( paste("Install OpenJDK 20+ (adoptium.net);", "re-launch R after updating PATH."), paste("Upgrade to OpenJDK 20+;", "verify with `java --version`."), "Run `jvecfor_setup()` or reinstall.", "Set `verbose=TRUE` to inspect Java stderr.", "Expected; subsequent calls are faster.", paste("Reduce `k`; need at least", "`k + 1` observations.") ), check.names = FALSE, stringsAsFactors = FALSE ) knitr::kable(err_tbl) ## ----verbose-debug, eval=run_eg----------------------------------------------- options(jvecfor.verbose = TRUE) nn_debug <- fastFindKNN(pca[1:50, ], k = 5) options(jvecfor.verbose = FALSE) # reset ## ----java-check, eval=FALSE--------------------------------------------------- # # Which java is on PATH? # Sys.which("java") # # # Version reported by that java binary # system2("java", "--version", stderr = TRUE) # # # Where is the bundled JAR? # system.file("java", package = "jvecfor") ## ----tryCatch-example, eval=FALSE--------------------------------------------- # nn <- tryCatch( # fastFindKNN(pca, k = 15), # error = function(e) { # warning("jvecfor unavailable: ", conditionMessage(e), # "\nFalling back to BiocNeighbors.") # BiocNeighbors::findKNN(pca, k = 15) # } # ) ## ----session-info------------------------------------------------------------- sessionInfo()