---
title: "yriMulti -- HapMap YRI population, multiassay interfaces"
author: "Vincent J. Carey, stvjc at channing.harvard.edu"
date: "May 2016"
output:
  BiocStyle::pdf_document:
    toc: yes
    number_sections: yes
  BiocStyle::html_document:
    highlight: pygments
    number_sections: yes
    theme: united
    toc: yes
vignette: >
  %\VignetteIndexEntry{yriMulti -- HapMap YRI population, multiassay interfaces}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}  
---

```{r style, echo = FALSE, results = 'asis'}
suppressMessages({
suppressWarnings({
suppressPackageStartupMessages({
BiocStyle::markdown()
})
})
})
```
```{r bib, echo = FALSE, results = 'hide'}
suppressMessages({
suppressWarnings({
suppressPackageStartupMessages({
library(knitcitations)
library(bibtex)
library(yriMulti)
library(dsQTL)
library(geuvPack)
library(erma)
library(VariantAnnotation)
library(gQTLstats)
library(Homo.sapiens)
})
})
})
allbib = read.bibtex("allbib.bib")
```

# Introduction

The EBV-transformed B-cells from Yoruban donors are assayed
for genotype and various genomic features in a number
of prominent studies.  This package helps work with relevant
datasets and data structures as use cases for 
MultiAssayExperiment package development.  A particular concern
is accommodation of distributed genotype data, in this case,
based on the 1000 genomes VCF files in an S3 bucket.

# Basic data resources

## Expression data

We will use the RNA-seq expression data in the `r Biocpkg("geuvPack")`
package.

```{r fakeload1,echo=FALSE,results="hide"}
if (!exists("geuFPKM")) data(geuFPKM)
```
```{r lkexpr,eval=TRUE}
library(geuvPack)
data(geuFPKM)
```{r do1}
geuFPKM
```

## Methylation data

We have added 450k data from 
`r citet(allbib[["Banovich2014"]])`  paper to the yriMulti package.

```{r fakeload2,echo=FALSE,results="hide"}
if (!exists("banovichSE")) data(banovichSE)
```
```{r lkmul,eval=TRUE}
library(yriMulti)
data(banovichSE)
```{r do2}
banovichSE
```

## DnaseI hypersensitivity data

```{r lkdhsf,echo=FALSE,results="hide"}
if (!exists("DHStop5_hg19")) data(DHStop5_hg19)
```
```{r lkdhs,eval=TRUE}
library(dsQTL)
if (!exists("DHStop5_hg19")) data(DHStop5_hg19)
```
```{r lkddd}
DHStop5_hg19
```

## Genotype data

We take advantage of a function (`gtpath`) that generates paths to S3-resident
VCF from the 1000 genomes project.

```{r lkgeno,cache=TRUE}
litvcf = readVcf(gtpath(20), 
      param=ScanVcfParam(which=GRanges("20", 
           IRanges(3.7e7,3.701e7))), genome="hg19")
```
```{r outca}
litvcf
length(colnames(litvcf))
length(intersect(colnames(litvcf), colnames(banovichSE)))
length(intersect(colnames(litvcf), colnames(geuFPKM)))
length(intersect(colnames(litvcf), colnames(DHStop5_hg19)))
```

# Some computations focused on methylation-expression association

The `yriMulti` package is currently a scratch-pad for some
integrative infrastructure thoughts.

##  Gene-centric selection

With `mexGR`, a GRanges instance is formed with methylation scores
for CpG near a gene.  The assay data are placed in the mcols,
with one range devoted to the expression measures.

```{r lkmex,cache=TRUE}
m1 = mexGR(banovichSE, geuFPKM, symbol="ORMDL3")
m1
mcols(m1)[1:4,1:4]
table(mcols(m1)$type)
```

## DNA-methylation association with expression

`bindelms` computes the regressions of the selected gene's expression
values on the methylation scores.  We have
options to transform the expression value
(parameter `ytx` is a function) and can indicate the radius around
the gene coding region to search for CpG (parameter `gradius`).

We'll examine a region around gene BRCA2 for a CpG whose
methylation score is negatively associated with BRCA2 expression.

```{r lkbi,cache=TRUE}
b1 = bindelms(geuFPKM, banovichSE, symbol="BRCA2", ytx=log,
   gradius=20000)
b1
mcols(b1)[1:3,]
summary(mcols(b1)$t)
mintind = which.min(mcols(b1)$t)
mincpg = names(b1)[mintind]
mincpg
```
```{r lkevm,fig=TRUE}
plotEvM(b1)
```

# Using the MultiAssayExperiment infrastructure 

## Construction of the MultiAssayExperiment

We will use a unified object design to reproduce the
BRCA2 display just obtained.

We need a list of relevant objects and a phenodata component.

```{r lkbcma, eval=TRUE}
library(MultiAssayExperiment)
myobs = list(geuvRNAseq=geuFPKM, yri450k=banovichSE, yriDHS=DHStop5_hg19)
cold = colData(geuFPKM)
suppressWarnings({
library(MultiAssayExperiment)
mm = MultiAssayExperiment(myobs, as.data.frame(cold))
})
mm
```

## Restriction by range

We compute the BRCA2 'gene range'.

```{r lkbrc, eval=TRUE}
library(erma)
brr = range(genemodel("BRCA2"))
brr
```

Subset the multiassay structure to features in the vicinity of this range.
```{r dorsub, eval=TRUE}
.subsetByRanges = function(ma, r) {
  subsetByRow(ma,r)
}
newmm = .subsetByRanges(mm, brr+20000)
newmm
```

We now have all the relevant features and samples.  In fact we have more
genes than we really wanted.  But we will proceed with this selection.

## All pairwise regressions

We will introduce a formula idiom to specify a collection
of models of interest.  

```{r makeco}
library(doParallel)
registerDoSEQ()
allLM_pw = function(fmla, mae, xtx=force, ytx=force) {
#
# formula specifies dependent and independent assays
# form all regressions of ytx(dep) on xtx(indep) for all
# pairs of dependent and independent variables defined by
# assays for samples held in common
#
lf = as.list(fmla)
nms = lapply(lf, as.character)
yel = experiments(mae)[[nms[[2]]]]
xel = experiments(mae)[[nms[[3]]]]
sy = colnames(yel)
sx = colnames(xel)
sb = intersect(sy,sx)
yel = yel[,sb]
xel = xel[,sb]
vdf = as.matrix(expand.grid( rownames(yel),
    rownames(xel), stringsAsFactors=FALSE ))
allf = apply(vdf, 1, function(x) as.formula(paste(x, collapse="~")))
alllm = foreach (i = 1:length(allf)) %dopar% {
  df = data.frame(ytx(assay(yel)[vdf[i,1],]), xtx(assay(xel)[vdf[i,2],]))
  names(df) = vdf[i,]
  lm(allf[[i]], data=df)
  }
names(alllm) = apply(vdf,1,function(x) paste(x, collapse="~"))
allts = lapply(alllm, function(x) summary(x)$coef[2, "t value"])
list(mods=alllm, tslopes=allts)
}
pwplot = function(fmla1, fmla2, mae, ytx=force, xtx=force, ...) {
#
# use fmla1 with assays as components to identify
#    two assays to regard as sources of y and x
# fmla2 indicates which features to plot
#
lf = as.list(fmla1)
nms = lapply(lf, as.character)
yel = experiments(mae)[[nms[[2]]]]
xel = experiments(mae)[[nms[[3]]]]
sy = colnames(yel)
sx = colnames(xel)
sb = intersect(sy,sx)
yel = yel[,sb]
xel = xel[,sb]
lf2 = lapply(as.list(fmla2), as.character)
ndf = data.frame( ytx(assay(yel)[ lf2[[2]], ]), xtx(assay(xel)[ lf2[[3]], ]) )
names(ndf) = c(lf2[[2]], lf2[[3]])
plot(fmla2, ndf, ...)
}

```

```{r lkpwl, eval=TRUE}
pp = allLM_pw(geuvRNAseq~yri450k, newmm, ytx=log) 
names(pp)
summary(pp[[1]][[1]])
which.min(unlist(pp[[2]])) # not BRCA2 but FRY
```

The formula idiom can be used to isolate assays and features.
```{r lkf,fig=TRUE, eval=TRUE}
pwplot(geuvRNAseq~yri450k, ENSG00000139618.9~cg20073910, newmm, ytx=log)
```

## Integrating dense genotypes in a VcfStack instance

We set up a reference to a collection of VCF.
We'll use 1000 genomes VCF for chr21, and chr22.
At present (16 Oct 2016) chrY must be kept out of the
stack as it does not have the full set of samples for
other chromosomes.
```{r redoVcfStackFunc,echo=FALSE,results="hide"}
VcfStack2 <- function(files=NULL, seqinfo=NULL, colData=NULL)
{
    if (is.null(files)) {
        files <- VcfFileList()
        header <- NULL
    } else {
        if (class(files) != "VcfFileList")
            files = VcfFileList(files)
#        files = indexVcf(files)
        header <- scanVcfHeader(files[[1]])
    
    }   

    if (is.null(seqinfo)) {
        seqinfo <- if (length(files)) {
            seqinfo(files)
        } else Seqinfo()
    }   

    if (is.null(colData) && length(files)) {
        colData <- DataFrame(row.names=samples(header))
    } else {
        colData <- as(colData, "DataFrame")
    }   

    if (is.null(rownames(colData)) && length(files))
         stop("specify rownames in 'colData'")

    new("VcfStack", files=files, colData=colData, seqinfo=seqinfo)
}
```
```{r mkvs, eval=TRUE}
library(gQTLstats)
library(VariantAnnotation)
library(GenomicFiles)
pa = paths1kg(paste0("chr", c(21:22))) #,"Y")))
sn = vcfSamples(scanVcfHeader(TabixFile(pa[1])))
library(Homo.sapiens)  # necessary?
stopifnot(requireNamespace("GenomeInfoDb")) # indexVcf with S3 bucket? ->
ob = RangedVcfStack(VcfStack2(pa, seqinfo(Homo.sapiens)))
cd = DataFrame(id1kg=sn)
rownames(cd) = sn
colData(ob) = cd
```

Now we set up a region of interest and bind it to the stack.
This yields an instance of `RangedVcfStack`, for which we 
have `samples`, `features`, and `assay` methods defined in
gQTLstats.

```{r mkrvs, eval=TRUE}
myr = GRanges("chr22", IRanges(20e6,20.01e6))
rowRanges(ob) = myr
colData(ob) = DataFrame(colData(ob), zz=runif(nrow(colData(ob))))
hasInternetConnectivity = function()
 !is.null(nsl("www.r-project.org"))
#if (hasInternetConnectivity()) lka = assay(ob)
myobs = list(geuvRNAseq=geuFPKM, yri450k=banovichSE, yriDHS=DHStop5_hg19,
    yriGeno=ob)
suppressWarnings({
mm = MultiAssayExperiment(myobs)
})
mm
```


<h2 id="bib-sec">References</h2>

```{r results='asis',echo=FALSE}
bibliography() #style="markdown")
```