---
title: 'Example data for TCGA Workflow: Analyze cancer genomics and epigenomics data using Bioconductor packages'
author: Tiago C. Silva, Antonio Colaprico, Catharina Olsen, Fulvio D’Angelo, Gianluca
  Bontempi Michele Ceccarelli , and Houtan Noushmehr
date: "`r Sys.Date()`"
output:  BiocStyle::html_document
vignette: >
  %\VignetteIndexEntry{'Example data for TCGA Workflow: Analyze cancer genomics and epigenomics data using Bioconductor packages'}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

# Data Introduction

This package provides a dataset for those wishing to try out the
[TCGA Workflow: Analyze cancer genomics and epigenomics data using Bioconductor packages](https://f1000research.com/articles/5-1542/v2) [@10.12688/f1000research.8923.2]. 
The data in this package are a subset of the TCGA data
for LGG (Lower grade glioma) and GBM (Glioblastoma multiforme) samples.

# Loading the data

```{r, fig.show='hold'}
library(TCGAWorkflowData)
data("elmerExample")
data("TCGA_LGG_Transcriptome_20_samples")
data("TCGA_GBM_Transcriptome_20_samples")
data("histoneMarks")
data("biogrid")
data("genes_GR")
data("maf_lgg_gbm")
```

# Data creation

The following commands were used to create the data included with this package.

## Genes information

Download gene information for hg19 using TCGAbiolinks, which uses biomart parckage.

```{r , eval = FALSE, message=FALSE,warning=FALSE, include=TRUE}
library(GenomicRanges)
library(TCGAbiolinks)
##############################
## Recurrent CNV annotation ## 
##############################
# Get gene information from GENCODE using biomart
genes <- TCGAbiolinks:::get.GRCh.bioMart(genome = "hg19") 
genes <- genes[genes$external_gene_id != "" & genes$chromosome_name %in% c(1:22,"X","Y"),]
genes[genes$chromosome_name == "X", "chromosome_name"] <- 23
genes[genes$chromosome_name == "Y", "chromosome_name"] <- 24
genes$chromosome_name <- sapply(genes$chromosome_name,as.integer)
genes <- genes[order(genes$start_position),]
genes <- genes[order(genes$chromosome_name),]
genes <- genes[,c("external_gene_id", "chromosome_name", "start_position","end_position")]
colnames(genes) <- c("GeneSymbol","Chr","Start","End")
genes_GR <- makeGRangesFromDataFrame(genes,keep.extra.columns = TRUE)
save(genes_GR,genes,file = "genes_GR.rda", compress = "xz")
```

## GISTIC results

Download and save a subset of GBM GISTIC results from GDAC firehose.

```{R, eval=FALSE, message=FALSE, warning=FALSE, include=TRUE}
library(RTCGAToolbox)
# Download GISTIC results
lastAnalyseDate <- getFirehoseAnalyzeDates(1)
gistic <- getFirehoseData(
  dataset = "GBM",
  gistic2_Date = lastAnalyseDate,
  GISTIC = TRUE
)

# get GISTIC results
gistic_allbygene <- getData(
  gistic,
  type = "GISTIC", 
  platform = "AllByGene"
)

gistic_thresholedbygene <- getData(
  gistic,
  type = "GISTIC", 
  platform = "ThresholdedByGene"
)

save(
  gistic_allbygene,
  gistic_thresholedbygene,
  file = "GBMGistic.rda", compress = "xz"
)
```


## Gene expression data

The following code will download 20  LGG (Lower grade glioma) and 20 GBM (Glioblastoma multiforme) 
samples that have gene expression data. The Gene expression data is the raw expression signal for expression of a gene.

```{r , eval=FALSE, include=TRUE, results='asis'}
query_exp_lgg <- GDCquery(
  project = "TCGA-LGG",
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification", 
  workflow.type = "STAR - Counts"
)
# Get only first 20 samples to make example faster
query_exp_lgg$results[[1]] <- query_exp_lgg$results[[1]][1:20,]
GDCdownload(query_exp_lgg)
exp_lgg <- GDCprepare(
  query = query_exp_lgg
)

query_exp_gbm <- GDCquery(
  project = "TCGA-GBM",
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification", 
  workflow.type = "STAR - Counts"
)
# Get only first 20 samples to make example faster
query_exp_gbm$results[[1]] <- query_exp_gbm$results[[1]][1:20,]
GDCdownload(query_exp_gbm)
exp_gbm <- GDCprepare(
  query = query_exp_gbm
)
```


## DNA methylation and Gene expression data

The following code will select 10  LGG (Lower grade glioma) and 10 GBM (Glioblastoma multiforme)  samples that have both 
DNA methylation  and gene expression data. This objects will be then prepared 
to the format accept by the Biocondcutor package 
`r Biocpkg("ELMER")`([link])(http://bioconductor.org/packages/release/bioc/html/ELMER.html).
The DNA methylation will have only probes in chromossome 9  in order to make the example 
of the workflow faster. For a real analysis, all chromossomes should be used.
The Gene expression data is the normalized results for expression of a gene.


```{r , eval=FALSE, include=TRUE, results='asis'}
#----------- 8.3 Identification of Regulatory Enhancers   -------
library(TCGAbiolinks)
# Samples: primary solid tumor w/ DNA methylation and gene expression
matched_met_exp <- function(project, n = NULL){
  # get primary solid tumor samples: DNA methylation
  message("Download DNA methylation information")
  met450k <- GDCquery(
    project = project,
    data.category = "DNA Methylation",
    platform = "Illumina Human Methylation 450",
    data.type = "Methylation Beta Value",
    sample.type = c("Primary Tumor")
  )
  met450k.tp <-  met450k$results[[1]]$cases
  
  # get primary solid tumor samples: RNAseq
  message("Download gene expression information")
  exp <- GDCquery(
    project = project,
    data.category = "Transcriptome Profiling",
    data.type = "Gene Expression Quantification", 
    workflow.type = "STAR - Counts"
    sample.type = c("Primary Tumor")
  )
  
  exp.tp <- exp$results[[1]]$cases
  # Get patients with samples in both platforms
  patients <- unique(substr(exp.tp,1,15)[substr(exp.tp,1,12) %in% substr(met450k.tp,1,12)] )
  if(!is.null(n)) patients <- patients[1:n] # get only n samples
  return(patients)
}
lgg.samples <- matched_met_exp("TCGA-LGG", n = 10)
gbm.samples <- matched_met_exp("TCGA-GBM", n = 10)
samples <- c(lgg.samples,gbm.samples)

#-----------------------------------
# 1 - Methylation
# ----------------------------------
query.met <- GDCquery(
  project = c("TCGA-LGG","TCGA-GBM"),
  data.category = "DNA Methylation",
  platform = "Illumina Human Methylation 450",
  data.type = "Methylation Beta Value",
  barcode = samples
)
GDCdownload(query.met)
met <- GDCprepare(query.met, save = FALSE)
met <- subset(met,subset = as.character(GenomicRanges::seqnames(met)) %in% c("chr9"))

#-----------------------------------
# 2 - Expression
# ----------------------------------
query.exp <- GDCquery(
  project = c("TCGA-LGG","TCGA-GBM"),
  data.category = "Transcriptome Profiling",
  data.type = "Gene Expression Quantification", 
  workflow.type = "STAR - Counts"
  sample.type = c("Primary Tumor")
  barcode =  samples
)
GDCdownload(query.exp)
exp <- GDCprepare(query.exp, save = FALSE)
save(exp, met, gbm.samples, lgg.samples, file = "elmerExample.rda", compress = "xz")
```

## Mutation data

The following code will download Mutation annotation files (aligned against the genoem of reference hg38) 
for LGG and GBM samples and merge them into one single single file. The GDC Somatic Mutation Calling Workflow 
used is the mutect2. For more information please check [GDC](https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline/).

```{r , eval=FALSE, include=TRUE, results='asis'}
library(TCGAbiolinks)
query <- GDCquery(
  project = c("TCGA-LGG","TCGA-GBM"), 
  data.category = "Simple Nucleotide Variation", 
  access = "open",
  data.type = "Masked Somatic Mutation", 
  workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
)
GDCdownload(query)
maf <- GDCprepare(query)
save(maf,file = "maf_lgg_gbm.rda",compress = "xz")
```

## Biogrid data

Download biogrid information.

```{r , eval=FALSE, include=TRUE, results='asis'}
### read biogrid info
### Check last version in https://thebiogrid.org/download.php 
file <- "https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/BIOGRID-ALL-LATEST.tab2.zip"
if(!file.exists(gsub("zip","txt",basename(file)))){
  downloader::download(file,basename(file))
  unzip(basename(file),junkpaths =TRUE)
}

tmp.biogrid <- vroom::vroom(
  dir(pattern = "BIOGRID-ALL.*\\.txt")
)
save(tmp.biogrid, file = "biogrid.rda", compress = "xz")
```

## Histone marks

The code below was used to download histone marks specific for brain tissue using
the AnnotationHub package that can access the Roadmap database. 

```{r results='hide', eval=FALSE, echo=FALSE, message=FALSE,warning=FALSE}
library(ChIPseeker)
library(AnnotationHub)
library(pbapply)
library(ggplot2)
#------------------ Working with ChipSeq data ---------------
# Step 1: download histone marks for a brain and non-brain samples.
#------------------------------------------------------------
# loading annotation hub database
ah = AnnotationHub()

# Searching for brain consolidated epigenomes in the roadmap database
bpChipEpi_brain <- query(ah , c("EpigenomeRoadMap", "narrowPeak", "chip", "consolidated","brain","E068"))

# Get chip-seq data
histone.marks <- pblapply(names(bpChipEpi_brain), function(x) {ah[[x]]})
names(histone.marks) <- names(bpChipEpi_brain)
save(histone.marks, file = "histoneMarks.rda", compress = "xz")
```

# Session info
```{r sessionInfo, results='asis', echo=FALSE}
pander::pander(sessionInfo(), compact = FALSE)
```