# Global knitr configuration
knitr::opts_chunk$set(echo = TRUE, collapse = TRUE, message = FALSE, warning = FALSE)
# Load libraries required for the pipeline workflow
library(lncRna)
library(rtracklayer)
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: generics
##
## Attaching package: 'generics'
## The following objects are masked from 'package:base':
##
## as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
## setequal, union
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
## as.data.frame, basename, cbind, colnames, dirname, do.call,
## duplicated, eval, evalq, get, grep, grepl, is.unsorted, lapply,
## mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## rank, rbind, rownames, sapply, saveRDS, table, tapply, unique,
## unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## I, expand.grid, unname
## Loading required package: IRanges
## Loading required package: Seqinfo
library(GenomicRanges)
library(IRanges)
library(gprofiler2)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:rtracklayer':
##
## export
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(fmsb)
library(patchwork)
The lncRna package provides a comprehensive toolkit for the identification, analysis, and functional annotation of long non-coding RNAs (lncRNAs) from RNA-Seq data. The workflow covers several key stages:
calculateCM framework.In a typical analysis, you would load your own data. For this demonstration, we create mock data objects that mimic real-world transcriptomic outputs.
We start by creating a representative genomic structure and partitioning our sequences into independent training and testing subsets using the built-in split utility.
# 1. Sample GRanges object (mimicking an imported GTF file)
mockGtf <- GRanges(
seqnames = "chr1",
ranges = IRanges(start = c(100, 200, 400, 500), width = c(50, 60, 100, 20)),
strand = "+",
type = c("exon", "exon", "transcript", "exon"),
gene_id = c("G1", "G1", "G2", "G2"),
gene_biotype = c("protein_coding", "protein_coding", "lncRNA", "lncRNA"),
transcript_id = c("tx1", "tx1", "tx2", "tx2"),
transcript_biotype = c("protein_coding", "protein_coding", "lncRNA", "lncRNA"),
exon_number = c("1", "2", NA, "1")
)
# 2. Partitioning sequences using createTrainTestSets
# This ensures we have ground truth sets for performance evaluation
all_ids <- c("tx1", "tx2", "tx3", "tx4", "tx5", "tx6")
set.seed(123)
sets <- createTrainTestSets(sequences = all_ids, percentTrain = 0.5, prefix = "demo")
# Define independent test sets (nc = non-coding, cds = coding)
ncTestSet <- sets$demo.test
cdsTestSet <- sets$demo.train
We extract key structural features to distinguish lncRNAs from other biotypes based on the input GTF.
# Calculate transcript length and exon count
transcriptStats <- getGtfStats(gtfObject = mockGtf)
# Extract biotype information at the gene level
geneBiotypes <- getBiotypes(refGtf = mockGtf, level = "gene")
print(head(transcriptStats))
## transcript_id exons trans_length
## 1 tx1 2 110
## 2 tx2 1 20
This stage integrates results from various coding potential predictors and evaluates their reliability.
We aggregate results from external tools and visualize the agreement (consensus) between them.
# Mocking aggregated results from CPC2, PLEK, and CPAT
mockCodPotList <- list(
seqIDs = c("tx1", "tx2", "tx3", "tx4", "tx5", "tx6"),
tools = list(
CPC2 = c(0, 1, 1, 0, 1, 0),
PLEK = c(0, 1, 0, 0, 1, 1),
CPAT = c(0, 1, 1, 1, 0, 0)
)
)
# Visualize tool agreement (Venn)
if (requireNamespace("venn", quietly = TRUE)) {
plotVennCodPot(codPot = mockCodPotList)
}
We evaluate all possible tool combinations to identify the most accurate identification strategy for the current dataset.
# 1. Prepare evaluation sets by annotating predictions with ground truth
evaluationSummary <- prepareEvaluationSets(
codPotList = mockCodPotList,
ncTest = ncTestSet,
cdsTest = cdsTestSet
)
# 2. Evaluate performance for each tool combination (e.g., "CPC2+PLEK")
combSummary <- evaluateToolCombinations(summaryList = evaluationSummary)
# 3. Calculate summary metrics
combStats <- bestToolCombination(combinationSummaryList = combSummary)
print(combStats)
## CPC2+PLEK CPC2+CPAT PLEK+CPAT CPC2+PLEK+CPAT
## Accuracy 0.5000 0.1667 0.3333 0.3333
## Kappa 0.0000 -0.6667 -0.3333 -0.3333
## AccuracyLower 0.1181 0.0042 0.0433 0.0433
## AccuracyUpper 0.8819 0.6412 0.7772 0.7772
## AccuracyNull 0.5000 0.5000 0.5000 0.5000
## AccuracyPValue 0.6562 0.9844 0.8906 0.8906
## McnemarPValue 1.0000 1.0000 0.6171 0.6171
## Sensitivity 0.3333 0.0000 0.0000 0.0000
## Specificity 0.6667 0.3333 0.6667 0.6667
## Pos Pred Value 0.5000 0.0000 0.0000 0.0000
## Neg Pred Value 0.5000 0.2500 0.4000 0.4000
## Precision 0.5000 0.0000 0.0000 0.0000
## Recall 0.3333 0.0000 0.0000 0.0000
## F1 0.4000 NA NA NA
## Prevalence 0.5000 0.5000 0.5000 0.5000
## Detection Rate 0.1667 0.0000 0.0000 0.0000
## Detection Prevalence 0.3333 0.3333 0.1667 0.1667
## Balanced Accuracy 0.5000 0.1667 0.3333 0.3333
The calculateCM function serves as the central hub for the visualization pipeline. It converts raw predictions into Confusion Matrix objects and allows for threshold-based filtering.
# Filter for methods with Accuracy >= 0.5 (low threshold for demo purposes)
# Use metricsData to provide values for filtering
allCms <- calculateCM(
combinationSummaryList = combSummary,
metricsData = combStats,
threshold = 0.5,
returnOnlyHigh = TRUE,
metricToExtract = "Accuracy",
printMetricThresholds = TRUE
)
Radar and Clock plots provide a holistic view of tool performance across multiple metrics.
# Comparison of top combinations (Radar Plot)
methodsToPlot <- names(allCms)[1:min(3, length(allCms))]
plotRadarMetrics(cmList = allCms, methods = methodsToPlot, plotTitle = "Top Methods Comparison")
# Detailed metrics view (Clock Plot) in 'multiple' layout
plotClockMetrics(cmList = allCms, plotTitle = "Performance Metrics Breakdown", layout = "multiple")
The final stage of the pipeline identifies where and how the identified lncRNAs act in the cell.
We identify cis-interactions based on genomic proximity and trans-interactions based on expression correlation.
# Identify Cis-interactions (genomic proximity)
feelncFile <- tempfile()
write.table(data.frame(isBest=1, lncRNA_gene="LNC1", partnerRNA_gene="GENE_A", distance=500),
feelncFile, sep="\t", row.names=FALSE)
cisInteractions <- findCisInteractions(FEELncClassesFile = feelncFile, maxDist = 1000)
# Identify Trans-interactions via expression correlation
mockExpr <- matrix(rnorm(50), nrow = 5,
dimnames = list(c("LNC1", "LNC2", "GENE_A", "GENE_B", "GENE_C"), paste0("S", 1:10)))
mockExpr["LNC1",] <- mockExpr["GENE_A",] + rnorm(10, 0, 0.1)
transInteractions <- findTransInteractions(exprMatrix = mockExpr, rval = 0.8, pval = 0.05)
We link interactions to functional enrichment terms and visualize the flow using an interactive Sankey diagram.
# Sample enrichment result (mimicking gprofiler2::gost)
mockGost <- data.frame(term_id="GO:01", term_name="stress response", intersection="GENE_A")
# Annotate interactions with functional terms
interactionsProcessed <- annotateInteractions(gostResult = mockGost,
interactionTable = transInteractions, type = "trans")
# Generate interactive Sankey diagram
plotSankeyInteractions(interactionData = interactionsProcessed, groupBy = "term",
selectIds = "stress response", title = "Functional Interaction Flow")
sessionInfo()
## R version 4.6.0 RC (2026-04-17 r89917)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.4 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.23-bioc/R/lib/libRblas.so
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/New_York
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] patchwork_1.3.2 fmsb_0.7.6 plotly_4.12.0
## [4] ggplot2_4.0.3 gprofiler2_0.2.4 rtracklayer_1.71.3
## [7] GenomicRanges_1.63.2 Seqinfo_1.1.0 IRanges_2.45.0
## [10] S4Vectors_0.49.2 BiocGenerics_0.57.1 generics_0.1.4
## [13] lncRna_0.99.3 BiocStyle_2.39.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-9 gridExtra_2.3
## [3] rlang_1.2.0 magrittr_2.0.5
## [5] otel_0.2.0 matrixStats_1.5.0
## [7] compiler_4.6.0 venn_1.13
## [9] vctrs_0.7.3 stringr_1.6.0
## [11] pkgconfig_2.0.3 crayon_1.5.3
## [13] fastmap_1.2.0 magick_2.9.1
## [15] backports_1.5.1 XVector_0.51.0
## [17] labeling_0.4.3 Rsamtools_2.27.2
## [19] rmarkdown_2.31 tinytex_0.59
## [21] purrr_1.2.2 xfun_0.57
## [23] cachem_1.1.0 cigarillo_1.1.0
## [25] jsonlite_2.0.0 DelayedArray_0.37.1
## [27] BiocParallel_1.45.0 parallel_4.6.0
## [29] cluster_2.1.8.2 R6_2.6.1
## [31] bslib_0.10.0 stringi_1.8.7
## [33] RColorBrewer_1.1-3 rpart_4.1.27
## [35] jquerylib_0.1.4 Rcpp_1.1.1-1.1
## [37] bookdown_0.46 SummarizedExperiment_1.41.1
## [39] knitr_1.51 base64enc_0.1-6
## [41] Matrix_1.7-5 nnet_7.3-20
## [43] tidyselect_1.2.1 rstudioapi_0.18.0
## [45] dichromat_2.0-0.1 abind_1.4-8
## [47] yaml_2.3.12 codetools_0.2-20
## [49] curl_7.1.0 admisc_0.40
## [51] lattice_0.22-9 tibble_3.3.1
## [53] Biobase_2.71.0 withr_3.0.2
## [55] S7_0.2.2 evaluate_1.0.5
## [57] foreign_0.8-91 Biostrings_2.79.5
## [59] pillar_1.11.1 BiocManager_1.30.27
## [61] MatrixGenerics_1.23.0 checkmate_2.3.4
## [63] RCurl_1.98-1.18 scales_1.4.0
## [65] glue_1.8.1 scatterplot3d_0.3-45
## [67] Hmisc_5.2-5 lazyeval_0.2.3
## [69] tools_4.6.0 BiocIO_1.21.0
## [71] data.table_1.18.2.1 GenomicAlignments_1.47.0
## [73] XML_3.99-0.23 grid_4.6.0
## [75] tidyr_1.3.2 crosstalk_1.2.2
## [77] colorspace_2.1-2 htmlTable_2.5.0
## [79] restfulr_0.0.16 Formula_1.2-5
## [81] cli_3.6.6 Polychrome_1.5.4
## [83] S4Arrays_1.11.1 viridisLite_0.4.3
## [85] dplyr_1.2.1 gtable_0.3.6
## [87] sass_0.4.10 digest_0.6.39
## [89] SparseArray_1.11.13 rjson_0.2.23
## [91] htmlwidgets_1.6.4 farver_2.1.2
## [93] htmltools_0.5.9 lifecycle_1.0.5
## [95] httr_1.4.8