deseq.ensembl.rmd

```{r setup, echo=FALSE}
# Global knitr chunk defaults: tidy and highlight code, hide the chunk source,
# messages and warnings, and draw 6 x 6 inch figures.
# The figure-alignment option is spelled `fig.align` in knitr's chunk options;
# the former `figalign` spelling is not a documented option, so the request to
# centre figures was not applied.
# `knitr::` is used so the call does not depend on knitr being attached.
knitr::opts_chunk$set(
  tidy = TRUE,
  echo = FALSE,
  highlight = TRUE,
  fig.align = "center",
  fig.height = 6,
  fig.width = 6,
  message = FALSE,
  error = FALSE,
  warning = FALSE
)
```

## LIBRARIES
- using DESeq for normalization, dispersion adjustment and significance testing

```{r libraries}
library(DESeq)
library(plyr)
library(reshape)
library(ggplot2)
library(xtable)
library(biomaRt)
# Handle to the Ensembl BioMart mouse gene dataset; passed to getBM() further
# down to look up gene symbols for Ensembl gene IDs.
ensembl <- useMart(biomart = "ensembl", dataset = "mmusculus_gene_ensembl")
```

## VARIABLES
- using trimmed (11bp from 5' end) reads aligned with Tophat against the Ensembl iGenome genes.gtf gene annotation file
- FDR of 0.1

```{r variables, cache=FALSE}
# Directory that contains the covariate description file (covars.desc).
dataDir <- "/data/"
# Directory that contains one sub-directory of HTSeq counts per sample.
# NOTE(review): unlike dataDir this has no trailing slash - confirm that the
# count-file paths built from it resolve as intended.
resultsDir <- "/results"
# Sample identifiers; each one names a count-file directory and a column of
# the count table.
samples <- c(
  "foo1", "foo2", "foo3", "foo4", "foo5", "foo6", "foo7",
  "foo8", "foo9", "foo10", "foo11", "foo12", "foo13", "foo14"
)
# Adjusted p-value threshold below which a gene is reported as significant.
pvalcutoff <- 0.1
```

```{r functions, cache=FALSE}
# Scatter the per-gene dispersion estimates against the mean normalized count
# of each gene on log-log axes, then overlay the fitted dispersion function as
# a red line.
#
# cds: object understood by counts() and fitInfo(); in this document a DESeq
#      CountDataSet whose dispersions have already been estimated.
plotDispEsts <- function( cds ) {
  dispersion.fit <- fitInfo(cds)
  mean.normalized <- rowMeans(counts(cds, normalized = TRUE))
  plot(
    mean.normalized,
    dispersion.fit$perGeneDispEsts,
    pch = ".",
    log = "xy",
    ylab = "dispersion",
    xlab = "mean normalized counts"
  )
  # Evaluate the fitted curve on 300 points spanning 10^-0.5 .. 10^5.
  curve.x <- 10^seq(-0.5, 5, length.out = 300)
  lines(curve.x, dispersion.fit$dispFun(curve.x), col = "red")
}

# Build a plotmath label describing the linear regression of rep.2 on rep.1.
#
# df: data frame with numeric columns `rep.1` and `rep.2`.
#
# Returns a length-one character vector holding a plotmath expression of the
# form  y = a + b . x, r^2 = r2  (intercept and slope to 2 significant digits,
# r-squared to 3), suitable for geom_text(..., parse = TRUE).
lm_eqn <- function(df) {
  m <- lm(rep.2 ~ rep.1, df)
  # coef() returns a *named* vector and format() keeps those names; a named
  # value substituted into the expression can be deparsed as
  # c(`(Intercept)` = "...") instead of a plain string, which corrupts the
  # label. Strip the names before formatting.
  coefs <- unname(coef(m))
  eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
                   list(a = format(coefs[1], digits = 2),
                        b = format(coefs[2], digits = 2),
                        r2 = format(summary(m)$r.squared, digits = 3)))
  as.character(as.expression(eq))
}

# Quantile-quantile plot of observed versus expected -log10 p-values.
#
# pvaldf:     data frame with columns `pval` (raw p-values) and `padj`
#             (adjusted p-values).
# cutoffpval: adjusted p-value threshold; points with padj below it are drawn
#             in red, the others in black.
# samples:    label appended to the plot title.
#
# Returns a ggplot object.
qq <- function(pvaldf, cutoffpval, samples) {
  title <- paste("Quantile-quantile plot of p-values", samples, sep = " - ")
  # Rows without a p-value cannot be plotted; keeping them would inflate the
  # number of tests used for the expected quantiles and turn max(o) into NA.
  pvaldf <- pvaldf[!is.na(pvaldf$pval), , drop = FALSE]
  pvaldf <- pvaldf[order(pvaldf$pval, decreasing = FALSE), , drop = FALSE]
  pvals <- as.vector(unlist(pvaldf$pval))
  padjs <- as.numeric(as.vector(unlist(pvaldf$padj)))
  qqdata <- data.frame(
    e = -log10(seq_along(pvals) / length(pvals)),
    o = -log10(pvals),
    colors = as.vector(ifelse(padjs < cutoffpval, "sig", "nonsig")),
    stringsAsFactors = FALSE
  )
  # Axis limits start at 0; a p-value of exactly 0 gives an infinite observed
  # value, which is ignored when choosing the upper limit.
  e.max <- max(qqdata$e, 0)
  o.max <- max(qqdata$o[is.finite(qqdata$o)], 0)
  # Colours and legend labels are tied to the class names rather than to their
  # position, so the legend stays correct when only one class is present.
  ggplot(qqdata, aes(x = e, y = o, colour = colors)) +
    geom_point() +
    geom_abline(intercept = 0, slope = 1, colour = "darkgrey") +
    ggtitle(title) +
    scale_x_continuous(name = expression(Expected~~-log[10](italic(p))),
                       limits = c(0, e.max)) +
    scale_y_continuous(name = expression(Observed~~-log[10](italic(p))),
                       limits = c(0, o.max)) +
    scale_colour_manual(name = "BFH adjusted pvalue",
                        values = c(nonsig = "black", sig = "red"),
                        breaks = c("nonsig", "sig"),
                        labels = c(paste("q>", cutoffpval, sep = ""),
                                   paste("q<", cutoffpval, sep = "")))
}

# M-A style plot of a differential-expression result table.
#
# res:        data frame with columns `baseMean`, `foldChange` and `padj`.
# cutoffpval: adjusted p-value threshold; genes with padj below it are drawn
#             in opaque red, the others in translucent black.
# samples:    label appended to the plot title.
#
# The x axis is the natural log of baseMean and the y axis is log2 of
# foldChange. Returns a ggplot object.
plotDE <- function(res, cutoffpval, samples ) {
  title <- paste("M-A plot of", samples, sep = " - ")
  res$colors <- ifelse(res$padj < cutoffpval, "sig", "nonsig")
  # Colours and legend labels are tied to the class names rather than to their
  # position, so the legend stays correct when only one class is present.
  ggplot(data = res,
         aes(x = log(baseMean), y = log2(foldChange), colour = colors)) +
    geom_point(size = 3) +
    scale_colour_manual(name = "BFH adjusted pvalue",
                        values = c(nonsig = "#00000033", sig = "#FF0000FF"),
                        breaks = c("nonsig", "sig"),
                        labels = c(paste("q>", cutoffpval, sep = ""),
                                   paste("q<", cutoffpval, sep = ""))) +
    ggtitle(title)
}
```

## DATALOAD and PREP
- HTseq counts are generated for individual genes (using the Ensembl GTF annotation from iGenomes) from Tophat alignments
- covariates can include multiple factors, sample names in the first column
- processed and loaded all data into a DESeq CountDataSet

```{r dataload_and_reshape, cache=FALSE}
# Sample covariates: the first column of covars.desc supplies the row names
# (sample names). file.path() inserts the separator itself, so dataDir and
# resultsDir work with or without a trailing slash.
covars <- read.table(file.path(dataDir, "covars.desc"), header = TRUE, row.names = 1)
# One label per sample built from the first three covariates; samples sharing
# a label are treated as one replicate set further down.
covars$replicate.setname <- paste(covars[, 1], covars[, 2], covars[, 3], sep = "-")
# Read one HTSeq count table per sample from <resultsDir>/<sample>/.
# NOTE(review): HTSeq-count normally appends summary rows (no_feature,
# ambiguous, ...) to its output; they are not removed here - confirm whether
# the input files still contain them.
counts <- llply(samples, function(n) {
  read.table(file.path(resultsDir, n, "HTseq-counts.tab"))
})
names(counts) <- samples
# Long format (gene V1, count value, sample L1), then wide: one row per gene
# and one column per sample.
counts <- melt(counts)
counts <- cast(counts, V1 ~ L1)
row.names(counts) <- counts$V1
counts <- as.data.frame(counts[, names(counts) %in% samples])
# Put count columns and covariate rows in the same, explicit sample order.
# The column order produced by cast() need not match the row order of
# covars.desc, and a mismatch would attach covariates to the wrong samples.
stopifnot(all(samples %in% names(counts)), all(samples %in% row.names(covars)))
counts <- counts[, samples, drop = FALSE]
covars <- covars[samples, , drop = FALSE]
counts.ds <- newCountDataSet(counts, covars)
```

## NORMALIZATION
- estimate library size adjustments and gene variances/dispersions for all samples
- takes the most conservative approach to reducing the variability of the dispersion estimates

```{r estimate_sizefactors, results='hide', cache=FALSE}
# Per-sample library size factors, then a quick look at the normalized counts.
counts.ds <- estimateSizeFactors(counts.ds)
head(counts(counts.ds, normalized = TRUE))
# Dispersion estimation over all samples:
#   modelFrame  - samples whose rows in this data frame are identical are
#                 marked as replicates
#   sharingMode - "maximum" is the most conservative way of sharing
#                 information across genes to reduce the variability of the
#                 dispersion estimates
counts.ds <- estimateDispersions(
  counts.ds,
  method = "pooled",
  modelFrame = covars[, 1:3],
  sharingMode = "maximum",
  fitType = "parametric"
)
# Matrix of size-factor-normalized counts, reused by the QC plots.
counts.norm <- counts(counts.ds, normalized = TRUE)
```

### "Sanity" checks for dispersion results, (from DESeq vignette)
- plot dispersion estimates versus empirical values for different expression levels 
  - black dots are empirical values and the red line is the fitted values
- viewed the dispersions values used in subsequent testing and verified that disp_pooled contains the maximum of the two value vectors


```{r plotdispersions, cache=FALSE, fig.cap=""}
# Empirical versus fitted dispersions for the full data set.
plotDispEsts(cds = counts.ds)
# First rows of the per-gene feature data stored on the count data set.
head(x = fData(counts.ds))
# Structure of the stored dispersion fit.
str(object = fitInfo(counts.ds))
```
**Figure 1 - Empirical and fitted dispersion values plotted against mean expression strength**


## QC
### Replicate based
#### Scatter plots and linear regressions
- linear regression equation is on each plot
- blue line = linear regression predicted values
- red line = perfect correlation values

```{r replicate_scatterplots, fig.width=18, fig.height=18, cache=FALSE, fig.cap=""}
pd <- pData(counts.ds)
# Long format: X1 = gene, X2 = sample, value = normalized count.
counts.norm.melt <- melt(counts.norm)
# Row of the sample table that describes the sample of each melted record.
sample.row <- match(counts.norm.melt$X2, row.names(pd))
# Label every record with its replicate ("rep.<replicate>") and its set name.
counts.norm.melt$replicate <- paste("rep", pd$replicate[sample.row], sep = ".")
counts.norm.melt$setname <- pd$replicate.setname[sample.row]
# One row per gene and set, one column per replicate, on a log10 scale.
counts.norm.melt.rep <- cast(counts.norm.melt, X1 + setname ~ replicate)
counts.norm.melt.rep$rep.1 <- log10(counts.norm.melt.rep$rep.1)
counts.norm.melt.rep$rep.2 <- log10(counts.norm.melt.rep$rep.2)
# Keep only the rows where both replicates are finite after the log transform.
counts.norm.melt.rep.finite <- subset(
  counts.norm.melt.rep,
  is.finite(rep.1) & is.finite(rep.2)
)
# Regression equation label for each set, computed by lm_eqn.
labeldata <- ddply(counts.norm.melt.rep.finite, .(setname), lm_eqn)
# Build the plot one layer at a time, in the order the layers are drawn.
p <- ggplot(counts.norm.melt.rep.finite, aes(x = rep.1, y = rep.2))
p <- p + geom_point(size = 1, alpha = 0.2)
p <- p + geom_smooth(method = "lm")
p <- p + xlab("replicate1 - log10(counts)")
p <- p + ylab("replicate2 - log10(counts)")
p <- p + geom_abline(intercept = 0, color = "red", alpha = 0.2, slope = 1)
p <- p + facet_wrap(~setname, ncol = 3)
p <- p + geom_text(data = labeldata, aes(x = 1, y = 5, label = V1), parse = TRUE, size = 4)
print(p)
```

**Figure 2 - Scatterplots and linear regressions of replicate counts**

---

### Replicate M-A plots
- values outside the orange lines (drawn at M = ±2) show more than 4-fold changes in expression between replicates
- "M" axis is the log 2 fold change between replicates and the "A" axis is the average log 2 normalized counts in the two samples


```{r MvA_plots, fig.width=18, fig.height=18, cache=FALSE, fig.cap=""}
# A: mean of the log2 normalized counts of the first two values recorded for
# each gene within each set.
mean.counts.norm.melt <- aggregate(
  value ~ setname + X1,
  data = counts.norm.melt,
  FUN = function(v) 0.5 * (log2(v[1]) + log2(v[2]))
)
# M: log2 of the first value minus log2 of the second value.
fc.counts.norm.melt <- aggregate(
  value ~ setname + X1,
  data = counts.norm.melt,
  FUN = function(v) log2(v[1]) - log2(v[2])
)
# Combine both summaries per gene and set.
replicate.counts.norm.melt <- merge(
  mean.counts.norm.melt,
  fc.counts.norm.melt,
  by = c("X1", "setname")
)
names(replicate.counts.norm.melt) <- c("gene", "set", "mean", "fc")
# One panel per set, with orange guide lines at M = 2 and M = -2.
ggplot(data = replicate.counts.norm.melt, aes(x = mean, y = fc)) +
  geom_point(size = 1, alpha = 0.1) +
  facet_wrap(~set, ncol = 3) +
  ylab("M") +
  xlab("A") +
  geom_hline(yintercept = c(2, -2), color = "orange", alpha = 0.4)
```
**Figure 3 - M-A plots of replicate counts**

## ANALYSES
- it is not possible to work with the entire dataset to calculate the library size adjustments and dispersions and then perform pairwise comparisons between samples as DESeq cannot transition between a multivariate, multisample mode to a pairwise mode
- instead, for each pairwise analysis, take  the appropriate subset of the raw data and recalculate the size adjustments and dispersions for this subset
- using this data, perform independent filtering to eliminate genes that have no, or little, chance of showing significant evidence
  - this should result in increased detection power, in terms of false discovery rate.
  - filtering is based on the sum of counts from all samples, as below a certain number of counts it is not possible to get a pvalue below a desired cutoff
    - here you discard the genes in the bottom 40% of the distribution

```{r subset_pairwise_comparisons, cache=FALSE}
# Each element names the two replicate sets (values of replicate.setname) to
# compare against each other.
setnames <- list(c("foo1", "foo2"), c("foo3", "foo4"))
# For every pair: subset the raw counts, re-estimate size factors and
# dispersions, filter low-count genes, test, and collect the significant genes
# together with a Q-Q plot and an M-A plot.
# Each element of sig.results is list(results = <data frame>,
# qqplots = <ggplot>, DEplots = <ggplot>).
sig.results <- llply(setnames, function(n) {
  n <- unlist(n)
  ## subset data to the samples of the two sets; the set name becomes the
  ## condition of the new data set. The sample table is read from counts.ds
  ## here so this chunk does not rely on an object created by another chunk.
  sample.info <- pData(counts.ds)
  in.comparison <- sample.info$replicate.setname %in% n
  counts.ds.subset <- newCountDataSet(
    counts(counts.ds[, in.comparison]),
    sample.info$replicate.setname[in.comparison]
  )
  ## recalculate sizeFactors and Dispersions
  counts.ds.subset <- estimateSizeFactors(counts.ds.subset)
  counts.ds.subset <- estimateDispersions(counts.ds.subset)
  ## independent filtering - drop lowest counted genes to improve multiple test correction
  ## get sum of counts for all samples for each gene
  rowcounts <- rowSums(counts(counts.ds.subset))
  ## keep genes whose row sum exceeds the 40% quantile
  use <- rowcounts > quantile(rowcounts, 0.4)
  counts.ds.subset.filtered <- counts.ds.subset[use, ]
  ## perform significance testing
  res.filtered <- nbinomTest(counts.ds.subset.filtered, n[1], n[2])
  ## rows passing the adjusted p-value cutoff (which() skips NA padj)
  sig <- which(res.filtered$padj < pvalcutoff)
  ## get normalized counts for significant hits, relabel samples with condition rather than sampleID
  ## drop = FALSE keeps a matrix even when exactly one gene is significant
  results.1 <- counts(counts.ds.subset.filtered, normalized = TRUE)[sig, , drop = FALSE]
  dimnames(results.1)[[2]] <- as.character(pData(counts.ds.subset)$condition)
  results.1 <- results.1[, order(dimnames(results.1)[[2]]), drop = FALSE]
  ## get means and pvalues for significant hits and put together with counts
  results.2 <- res.filtered[sig, ]
  results <- cbind(results.1, results.2)
  ## remove only the column named exactly "id"; a pattern match would also
  ## remove any condition column whose name merely contains "id"
  results <- results[, names(results) != "id", drop = FALSE]
  results$ensembl_gene_id <- row.names(results)
  if (nrow(results) > 0) {
    ## NOTE(review): newer Ensembl releases may expose the gene symbol under
    ## a different attribute name (e.g. external_gene_name) - confirm against
    ## the mart in use.
    gene_symbols <- getBM(attributes = c('external_gene_id', 'ensembl_gene_id'),
                          filters = 'ensembl_gene_id',
                          values = row.names(results),
                          mart = ensembl)
  } else {
    ## nothing significant: skip the BioMart query and use an empty lookup
    ## table with the same columns, so the result keeps the usual layout
    gene_symbols <- data.frame(external_gene_id = character(0),
                               ensembl_gene_id = character(0),
                               stringsAsFactors = FALSE)
  }
  results <- merge(gene_symbols, results, all = TRUE)
  ## output some plots
  comparison.label <- paste(n[1], "v", n[2])
  qqplots <- qq(res.filtered[, c("pval", "padj")], pvalcutoff, comparison.label)
  DEplots <- plotDE(res.filtered, pvalcutoff, comparison.label)
  list(results = results, qqplots = qqplots, DEplots = DEplots)
})
```

**2 pairwise comparisons in total**
  
- for each of these comparisons, significant genes which passed a 10% false discovery rate are highlighted
- for each comparison, there are 3 ways to visualize these significant results
  - QQplot, with significantly varying transcripts shaded in red
  - MA-plot, with significantly varying transcripts shaded in red
  - table of transcripts with significantly different expression levels 

---

### 1

```{r out1, fig.width=11, fig.height=6, out.width='.45\\textwidth', cache=FALSE}
# Comparison 1: Q-Q plot of the p-values, then the M-A plot.
sig.results[[1]][["qqplots"]]
sig.results[[1]][["DEplots"]]
```
**Figure 4 - title**  
   
**Table 1 - title** 
```{r tables1, results='asis', cache=FALSE}
# Comparison 1: significant genes rendered as an HTML table without row names.
out1 <- xtable(sig.results[[1]][["results"]])
print(out1, type = "html", include.rownames = FALSE)
```

---

### 2

```{r out2, fig.width=11, fig.height=6, out.width='.45\\textwidth', cache=FALSE}
# Comparison 2: Q-Q plot of the p-values, then the M-A plot.
sig.results[[2]][["qqplots"]]
sig.results[[2]][["DEplots"]]
```
**Figure 5 - title**  
  
**Table 2 - title**
```{r tables2, results='asis', cache=FALSE}
# Comparison 2: significant genes rendered as an HTML table without row names.
out2 <- xtable(sig.results[[2]][["results"]])
print(out2, type = "html", include.rownames = FALSE)
```
 Replicate this code for as many pairwise comparisons as you want.