## ----include = FALSE----------------------------------------------------------
library(rmarkdown)
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)


## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)

## ----echo = T, results = 'hide',message=FALSE---------------------------------
#install.packages("rnaCrosslinkOO")
# Load the rnaCrosslink-OO Library 
library(rnaCrosslinkOO)

## ----echo = T, results = 'hide',message=FALSE---------------------------------
# Here are the other libraries on which rnaCrosslinkOO relies
#library(seqinr)
#library(GenomicRanges)
#library(ggplot2)
#library(reshape2)
#library(MASS)
#library(ggplot2)
#library(doParallel)
#library(igraph)
#library(R4RNA)
#library(RColorBrewer)
#library(heatmap3)
#library(mixtools)
#library(TopDom)
library(tidyverse)
#library(RRNA)
#library(ggrepel)

## -----------------------------------------------------------------------------
# Set up the sample table
sampleTableRow1 = c(system.file("extdata", 
                                "s1.txt", 
                                package="rnaCrosslinkOO"), "s", "1", "s1")
sampleTableRow2 = c(system.file("extdata", 
                                "c1.txt", 
                                package="rnaCrosslinkOO"), "c", "1", "c1")
sampleTable2 = rbind.data.frame(sampleTableRow1, sampleTableRow2)

# add the column names 
colnames(sampleTable2) = c("file", "group", "sample", "sampleName")

sampleTable2

## -----------------------------------------------------------------------------
rna = c("ENSG000000XXXXX_NR003286-2_RN18S1_rRNA")

## -----------------------------------------------------------------------------
path18SFata <- system.file("extdata", 
                           "18S.fasta", 
                           package="rnaCrosslinkOO")

rnaRefs = list()
rnaRefs[[rna]] = read.fasta(path18SFata)


## -----------------------------------------------------------------------------
path18SFata <- system.file("extdata", 
                           "ribovision18S.txt", 
                           package="rnaCrosslinkOO")
known18S = read.table(path18SFata,
                      header = F)

## ----eval = F-----------------------------------------------------------------
#  pathShape <- system.file("extdata",
#                           "reactivities.txt",
#                           package="rnaCrosslinkOO")
#  shape = read.table(pathShape,
#                        header = F)

## ----eval=FALSE---------------------------------------------------------------
#  rnaCrosslinkQC(sampleTable2,
#                 directory = ".",
#                 topTranscripts = F)
#  

## -----------------------------------------------------------------------------
# load the object
cds = rnaCrosslinkDataSet(rnas = rna,
                      sampleTable = sampleTable2,
                      subset = "all",
                      sample = "all")
# be aware there are extra options here
# including:
# subset - allows you to choose specific read sizes (this affects resolution and accuracy)
# sample - Choose the same ammount of reads for each sample (useful for comparisons)

## -----------------------------------------------------------------------------
# Check status of instance 
cds

## -----------------------------------------------------------------------------
# Returns the size of the RNA
rnaSize(cds)

## -----------------------------------------------------------------------------
# Returns the sample table 
sampleTable(cds)

## -----------------------------------------------------------------------------
# Returns indexes of the samples in the control and not control groups
group(cds)

## -----------------------------------------------------------------------------
# Get the sample names of the instance
sampleNames(cds)

## ----eval=FALSE,echo=TRUE-----------------------------------------------------
#  # Return the InputFiles slot
#  InputFiles(cds)

## ----eval=FALSE, echo=TRUE----------------------------------------------------
#  # Return the matrixList slot
#  matrixList(cds)

## -----------------------------------------------------------------------------
data = getData(x    = cds,              # The object      
               data =  "InputFiles",       # The Type of data to return     
               type = "original")[[1]] # The stage of the analysis for the return data

head(data)

## ----eval = F-----------------------------------------------------------------
#  # Returns the RNAs with highest number of assigned reads
#  # regardless of whether it is an Inter or Intra - RNA interaction.
#  topTranscripts(cds,
#                 2)  # The number of entries to return
#  

## ----eval = F-----------------------------------------------------------------
#  # Returns the RNAs that interact with the RNA of interest
#  topInteracters(cds, # The rnaCrosslinkDataSet instance
#                 1)   # The number of entries to return

## ----eval = F-----------------------------------------------------------------
#  # Returns the Interacions with the highest number of assigned reads
#  topInteractions(cds, # The rnaCrosslinkDataSet instance
#                  2)   # The number of entries to return
#  

## -----------------------------------------------------------------------------
features = featureInfo(cds) # The rnaCrosslinkDataSet instance


# Counts for features at the transcript level
features$transcript

# Counts for features at the family level (last field with  "_" delimited IDs)
features$family


## -----------------------------------------------------------------------------
# Cluster the reads
clusteredCds = clusterrnaCrosslink(cds = cds,     # The rnaCrosslinkDataSet instance 
                               cores = 1,         # The number of cores
                               stepCount = 2,     # The number of steps in the random walk
                               clusterCutoff = 2) # The minimum number of reads for a cluster to be considered

## -----------------------------------------------------------------------------
# Check status of instance 
clusteredCds

## -----------------------------------------------------------------------------
# Returns the number of clusters in each sample
clusterNumbers(clusteredCds)

## -----------------------------------------------------------------------------
# Returns the number reads in clusters
readNumbers( clusteredCds)

## ----eval=FALSE---------------------------------------------------------------
#  getData(clusteredCds,        # The object
#          "clusterTableList",  # The Type of data to return
#          "original")[[1]]     # The stage of the analysis for the return data

## ----eval=FALSE---------------------------------------------------------------
#  getData(clusteredCds,         # The object
#          "clusterGrangesList", # The Type of data to return
#          "original")[[1]]      # The stage of the analysis for the return data

## ----eval = F-----------------------------------------------------------------
#  # Trim the Clusters
#  trimmedClusters = trimClusters(clusteredCds = clusteredCds, # The rnaCrosslinkDataSet instance
#                                 trimFactor = 1,              # The cutoff for cluster trimming (see above)
#                                 clusterCutoff = 0)          # The minimum number of reads for a cluster to be considered

## ----eval = F-----------------------------------------------------------------
#  # Check status of instance
#  trimmedClusters

## ----eval = F-----------------------------------------------------------------
#  # Returns the number of clusters in each sample
#  clusterNumbers(trimmedClusters)

## ----eval = F-----------------------------------------------------------------
#  # Returns the number reads in clusters
#  readNumbers( trimmedClusters)

## -----------------------------------------------------------------------------
#plotClusterAgreement(trimmedClusters,
#                     "trimmedClusters")

## -----------------------------------------------------------------------------
#plotClusterAgreementHeat(trimmedClusters,
#                         "trimmedClusters")

## ----error=FALSE,eval = FALSE, results=FALSE,message=FALSE--------------------
#  # Fold the RNA in part of whole
#  foldedCds = foldrnaCrosslink(trimmedClusters,
#                           rnaRefs = rnaRefs,
#                           start = 1600,
#                           end = 1869,
#                           shape = 0,
#                           ensembl = 20,
#                           constraintNumber  = 30,
#                           evCutoff = 5)

## ----eval = FALSE-------------------------------------------------------------
#  # Check status of instance
#  foldedCds

## -----------------------------------------------------------------------------
# Plot heatmaps for each sample
#plotMatrices(cds = cds,         # The rnaCrosslinkDataSet instance 
#             type = "original", # The "analysis stage"
#             directory = 0,     # The directory for output (0 for standard out)
#             a = 1,             # Start coord for x-axis
#             b = rnaSize(cds),  # End coord for x-axis
#             c = 1,             # Start coord for y-axis
#             d = rnaSize(cds),  # End coord for y-axis
#             h = 5)             # The height of the image (if saved)


## -----------------------------------------------------------------------------
plotCombinedMatrix(cds,
           type1 = "original",
           type2 = "noHost",
           b = rnaSize(cds),
           d = rnaSize(cds))

## -----------------------------------------------------------------------------
clusteredCds

## -----------------------------------------------------------------------------
# Plot heatmaps for all samples combined and all controls combined
plotMatricesAverage(cds = clusteredCds, # The rnaCrosslinkDataSet instance 
             type1 = "originalClusters", # The "analysis stage" to plot on the upper half of the heatmap
             type2 = "noHost",        # The "analysis stage" to plot on the lower half of the heatmap
             directory = 0,     # The directory for output (0 for standard out)
             a = 1,             # Start coord for x-axis
             b = rnaSize(cds),  # End coord for x-axis
             c = 1,             # Start coord for y-axis
             d = rnaSize(cds),  # End coord for y-axis
             h = 5)             # The height of the image (if saved)

## ----eval = FALSE-------------------------------------------------------------
#  domainDF = data.frame()
#  for(j in c(20,30,40,50,60,70)){
#      #for(i in which(sampleTable(cds)$group == "s")){
#  
#      timeMats = as.matrix(getData(x = cds,
#                                   data = "matrixList",
#                                   type = "noHost")[[1]])
#  
#      timeMats = timeMats/ (sum(timeMats)/1000000)
#      tmp = tempfile()
#      write.table(timeMats, file = tmp,quote = F,row.names = F, col.names = F)
#  
#      tdData2 = readHiC(
#          file = tmp,
#          chr = "rna18s",
#          binSize = 10,
#          debug = getOption("TopDom.debug", FALSE)
#      )
#  
#      tdData =  TopDom(
#          tdData2 ,
#          window.size = j,
#          outFile = NULL,
#          statFilter = TRUE,
#          debug = getOption("TopDom.debug", FALSE)
#      )
#  
#      td = tdData$domain
#      td$sample = sampleTable(cds)$sampleName[1]
#      td$window = j
#      domainDF = rbind.data.frame(td, domainDF)
#  
#  }
#  
#  
#  
#  ggplot(domainDF) +
#      geom_segment(aes(x = from.coord/10,
#                       xend = to.coord/10, y = as.factor(sub("s","",sample)),
#                       yend = (as.factor(sub("s","",sample)) ), colour = tag),
#                   size  = 20, alpha = 0.8) +
#      facet_grid(window~.)+
#      theme_bw()
#  

## ----eval = FALSE-------------------------------------------------------------
#  plotEnsemblePCA(foldedCds,
#                  labels = T, # plot labels for structures
#                  split = T)  # split samples over different facets (T/f)
#  

## ----eval = FALSE-------------------------------------------------------------
#  plotComparisonArc(foldedCds = foldedCds,
#                    s1 = "c1",            # The sample of the 1st structure
#                    s2 = "s1",            # The sample of the 2nd structure
#                    n1 = 13,               # The number of the 1st structure
#                    n2 = 16)               # The number of the 2nd structure

## ----eval = F-----------------------------------------------------------------
#  plotStructure(foldedCds = foldedCds,
#                rnaRefs = rnaRefs,
#                s = "s1",          # The sample of the structure
#                n = 1)             # The number of the structure

## ----eval=FALSE---------------------------------------------------------------
#  
#  getInteractions(cds,
#                  "ENSG00000XXXXXX_NR003287-2_RN28S1_rRNA") %>%
#      mutate(sample =sub("\\d$","",sample) )%>%
#      group_by(rna,Position,sample)%>%
#      summarise(sum =  sum(depth)) %>%
#      ggplot()+
#      geom_area(aes(x = Position,
#                    y = sum,
#                    fill = sample),
#                stat = "identity")+
#      facet_grid(sample~.) +
#      theme_bw()
#  

## ----eval=FALSE---------------------------------------------------------------
#  getReverseInteractions(cds,
#                         rna) %>%
#      mutate(sample =sub("\\d$","",sample) )%>%
#      group_by(rna,Position,sample)%>%
#      summarise(sum =  sum(depth)) %>%
#      ggplot()+
#      geom_area(aes(x = Position,
#                    y = sum,
#                    fill = sample),
#                      stat = "identity")+
#      facet_grid(sample~.)+
#      theme_bw()

## -----------------------------------------------------------------------------
plotInteractions(cds,
                 rna = "ENSG000000XXXXX_NR003286-2_RN18S1_rRNA",
                 interactor = "ENSG00000XXXXXX_NR003287-2_RN28S1_rRNA",
                 b = "max",
                 d = "max")

## ----eval = F-----------------------------------------------------------------
#  plotInteractionsAverage(cds,
#                   rna = "ENSG000000XXXXX_NR003286-2_RN18S1_rRNA",
#                   interactor = "ENSG00000XXXXXX_NR003287-2_RN28S1_rRNA",
#                   b = "max",
#                   d = "max")

## ----eval = F-----------------------------------------------------------------
#  expansionSize = 5
#  knownMat = matrix(0, nrow = rnaSize(cds), ncol = rnaSize(cds))
#  for(i in 1:nrow(known18S)){
#      knownMat[ (known18S$V1[i]-expansionSize):(known18S$V1[i]+expansionSize),
#                (known18S$V2[i]-expansionSize):(known18S$V2[i]+expansionSize)] =
#          knownMat[(known18S$V1[i]-expansionSize):(known18S$V1[i]+expansionSize),
#                   (known18S$V2[i]-expansionSize):(known18S$V2[i]+expansionSize)] +1
#  }
#  knownMat = knownMat + t(knownMat)
#  

## ----eval = F-----------------------------------------------------------------
#  # use compare known to gett he known and not know clusters
#  knowClusteredCds = compareKnown(trimmedClusters, # The rnaCrosslinkDataSet instance
#                                  knownMat, # A contact matrix of know interactions
#                                  "trimmedClusters") # The analysis stage of clustering to compare
#  
#  knowClusteredCds

## ----eval = F-----------------------------------------------------------------
#  # Plot heatmaps for all samples combined and all controls combined
#  plotMatricesAverage(cds = knowClusteredCds, # The rnaCrosslinkDataSet instance
#               type1 = "KnownAndNovel", # The "analysis stage"
#               directory = 0,     # The directory for output (0 for standard out)
#               a = 1,             # Start coord for x-axis
#               b = rnaSize(cds),  # End coord for x-axis
#               c = 1,             # Start coord for y-axis
#               d = rnaSize(cds),  # End coord for y-axis
#               h = 5)             # The hight of the image (if saved)
#  

## ----eval = F-----------------------------------------------------------------
#  # Get the number of clusters for each analysis Stage
#  clusterNumbers(knowClusteredCds)

## ----eval = F-----------------------------------------------------------------
#  # Get the number of reads in each cluster for each analysis stage
#  readNumbers(knowClusteredCds)

## ----eval = FALSE-------------------------------------------------------------
#  head(compareKnownStructures(foldedCds,
#                              known18S)) # the comarison set

## ----eval = FALSE-------------------------------------------------------------
#  ggplot(compareKnownStructures(foldedCds, known18S)) +
#      geom_hline(yintercept = c(0.5,0.25,0.75,0,1),
#                 colour = "grey",
#                 alpha = 0.2)+
#      geom_vline(xintercept = c(0.5,0.25,0.75,0,1),
#                 colour = "grey",
#                 alpha = 0.2)+
#      geom_point(aes(x = sensitivity,
#                     y = precision,
#                     size = as.numeric(as.character(unlist(foldedCds@dgs))),
#                     colour = str_sub(structureID,
#                                      start = 1 ,
#                                      end = 2))) +
#      xlim(0,1)+
#      ylim(0,1)+
#      theme_classic()
#  

## ----eval = FALSE-------------------------------------------------------------
#  # make an example dataset
#  cds = makeExamplernaCrosslinkDataSet()
#  cds

## ----eval = FALSE-------------------------------------------------------------
#  # Have a look at the reads for each sample
#  plotMatricesAverage(cds = cds,
#                      type1 = "original",
#                      directory = 0,
#                      a = 1,
#                      b = rnaSize(cds),
#                      c = 1,
#                      d = rnaSize(cds),
#                      h = 5)
#  

## ----eval = FALSE-------------------------------------------------------------
#  topInteracters(cds,2)

## ----eval = FALSE-------------------------------------------------------------
#  featureInfo(cds)

## ----eval = FALSE-------------------------------------------------------------
#  clusteredCds = clusterrnaCrosslink(cds = cds,
#                                 cores = 1,
#                                 stepCount = 2,
#                                 clusterCutoff = 0)
#  
#  
#  
#  
#  
#  
#  trimmedClusters = trimClusters(clusteredCds = clusteredCds,
#                                 trimFactor = 1,
#                                 clusterCutoff = 0)

## ----eval = FALSE-------------------------------------------------------------
#  plotMatricesAverage(cds = clusteredCds,
#                      type1 = "originalClusters",
#                      directory = 0,
#                      a = 1,
#                      b = rnaSize(cds),
#                      c = 1,
#                      d = rnaSize(cds),
#                      h = 5)

## ----eval = FALSE-------------------------------------------------------------
#  plotMatricesAverage(cds = trimmedClusters,
#                      type1 = "trimmedClusters",
#                      directory = 0,
#                      a = 1,
#                      b = rnaSize(cds),
#                      c = 1,
#                      d = rnaSize(cds),
#                      h = 5)
#  

## ----eval = FALSE-------------------------------------------------------------
#  
#  fasta = paste(c(rep('A',25),
#                  rep('T',25),
#                  rep('A',10),
#                  rep('T',23)),collapse = "")
#  
#  header = '>transcript1'
#  
#  
#  fastaFile = tempfile()
#  writeLines(paste(header,fasta,sep = "\n"),con = fastaFile)
#  
#  
#  rnaRefs = list()
#  rnaRefs[[rnas(cds)]] = read.fasta(fastaFile)
#  rnaRefs
#  
#  
#  
#  foldedCds = foldrnaCrosslink(trimmedClusters,
#                           rnaRefs = rnaRefs,
#                           start = 1,
#                           end = 83,
#                           shape = 0,
#                           ensembl = 5,
#                           constraintNumber  = 1,
#                           evCutoff = 1)
#  

## ----eval = FALSE-------------------------------------------------------------
#  # make an example table of "know" interactions
#  file = data.frame(V1 = c(6),
#                    V2 = c(80))
#  
#  compareKnownStructures(foldedCds,file)
#  

## ----eval = FALSE-------------------------------------------------------------
#  
#  plotEnsemblePCA(foldedCds, labels = T,split = T)

## ----eval = FALSE-------------------------------------------------------------
#  plotComparisonArc(foldedCds = foldedCds,"s1","c1",2,3)

## ----eval = FALSE-------------------------------------------------------------
#  plotStructure(foldedCds = foldedCds,"c1",1)
#  
#