pipeline.Rmd

---
title: "Report title here"
author:
- Multi-line comments
- can be added here
output:
  md_document:
    toc: yes
  html_notebook:
    highlight: tango
    theme: flatly
    toc: yes
    toc_float:
      collapsed: no
---
```{r Setup, include=F}
### Set species of data
#species <- "human"
species <- "mouse"

dataName <- "yourData" #replace with appropriate data name (see expected directory structure below)

### Load libraries
library(parallel) # base R; makeCluster()/clusterExport()/parLapplyLB() are used by the clustering chunk
library(MASS)
library(Matrix)
library(RColorBrewer)
library(scales)
library(viridis)
library(scran) # from Bioconductor
### Species-specific annotation database. egDB holds the package name as a string
### and is resolved with get(egDB) when gene symbols are mapped for cell cycle scoring.
if (species == "human") {
  library(org.Hs.eg.db) # from Bioconductor (for human data)
  egDB <- "org.Hs.eg.db"
} else if (species == "mouse") {
  library(org.Mm.eg.db) # from Bioconductor (for mouse data)
  egDB <- "org.Mm.eg.db"
} else {
  # Mito gene IDs and cell cycle prediction might need your attention.
  # Warn instead of silently continuing, so the later failures are traceable.
  warning("Unsupported species '",species,"': egDB was not set. ",
          "Mitochondrial gene IDs and cell cycle prediction need manual configuration.",
          call.=FALSE)
}
library(Seurat) #see http://satijalab.org/seurat/install.html

### Initialize some variables included in the report text, so that the report can be generated from interim saves.
forText_dropMito <- forText_mitoMads <- forText_genesRemain <- forText_genesLost <- forText_mitoPct <- NA
```

```{r Load_raw_data, include=F}
### Make output directory
if (!dir.exists(paste0("../",dataName,"/output"))) { dir.create(paste0("../",dataName,"/output")) }

### One method for loading in the data, a little more memory efficient.
### Input paths are built explicitly rather than via setwd(), so this chunk neither
### depends on nor changes the working directory (later chunks use paths relative
### to the report's own directory).
temp_inDir <- paste0("../",dataName,"/input") # data folder
temp_cells <- scan(file.path(temp_inDir,paste0(dataName,"_CELLNAMES.txt")),
                   what=character(),sep="\t") #column names
temp_genes <- scan(file.path(temp_inDir,paste0(dataName,"_GENENAMES.txt")),
                   what=character(),sep="\t") #row names
eb_raw <- Matrix(scan(file.path(temp_inDir,paste0(dataName,"_MATRIX.txt")),
                      what=integer(),sep="\t"), #matrix of integer (count) values
                 nrow=length(temp_genes),byrow=TRUE,sparse=TRUE)
colnames(eb_raw) <- temp_cells
rownames(eb_raw) <- temp_genes

### Alternative, simpler method
#eb_raw <- Matrix(as.matrix(read.table(file.path(temp_inDir,paste0(dataName,"_DGE.txt")),
#                                      header=TRUE,sep="\t",row.names=1)),sparse=TRUE)

### Remove duplicated gene names (a couple genes are in under their MGI and HGNC symbols)
### Names are compared case-insensitively by exact string equality (no regex, so
### symbols containing "." or other metacharacters cannot match unrelated genes).
### For each duplicated name, the row detected in the fewest cells is dropped;
### rows are removed by index so identically spelled duplicates lose only one copy.
temp_upper <- toupper(rownames(eb_raw))
temp_dupNames <- unique(temp_upper[duplicated(temp_upper)])
temp_r <- vapply(temp_dupNames,function(X) {
  temp_idx <- which(temp_upper == X)
  temp_idx[which.min(Matrix::rowSums(eb_raw[temp_idx,,drop=FALSE] > 0))]
},integer(1),USE.NAMES=FALSE)
if (length(temp_r) > 0) { eb_raw <- eb_raw[-temp_r,] }

### Remove rows and columns with no data.
eb_raw <- eb_raw[Matrix::rowSums(eb_raw) > 0,]
eb_raw <- eb_raw[,Matrix::colSums(eb_raw) > 0]

# Drop every temporary object created above (anything with "temp" in its name).
rm(list=ls()[grepl("temp",ls())])
```

# Filtering raw data
## Raw cell distributions
**Genes detected as a function of library size per cell:**
```{r Figs_rawData_cellStats, echo=F,fig.height=8.4,fig.width=8.4,fig.show="hold"}
### Per-cell statistics: total counts (library size) and number of genes with a
### non-zero count. Both are computed with sparse-aware column sums, so the count
### matrix is never converted to a dense matrix.
cS <- data.frame(libSize=Matrix::colSums(eb_raw),
                 geneDetect=Matrix::colSums(eb_raw > 0))
p_hi <- 1e-3 #p-value for filtering doublets
p_lo <- 1e-2 #p-value for filtering poor libraries

### Fit a negative binomial to each statistic and take the upper-tail quantile at
### p_hi and the lower-tail quantile at p_lo as the filtering cutoffs.
fitLibSize <- fitdistr(cS$libSize,"negative binomial")
c_hiL <- qnbinom(p_hi,size=fitLibSize$estimate["size"],
                 mu=fitLibSize$estimate["mu"],lower.tail=FALSE)
c_loL <- qnbinom(p_lo,size=fitLibSize$estimate["size"],
                 mu=fitLibSize$estimate["mu"],lower.tail=TRUE)
fitGeneDetect <- fitdistr(cS$geneDetect,"negative binomial")
c_hiG <- qnbinom(p_hi,size=fitGeneDetect$estimate["size"],
                 mu=fitGeneDetect$estimate["mu"],lower.tail=FALSE)
c_loG <- qnbinom(p_lo,size=fitGeneDetect$estimate["size"],
                 mu=fitGeneDetect$estimate["mu"],lower.tail=TRUE)
temp_doublets <- (cS$libSize > c_hiL) | (cS$geneDetect > c_hiG) #doublets IDed based on high library size or genes detected
temp_crapLibs <- (cS$libSize < c_loL) | (cS$geneDetect < c_loG) #poor libraries IDed based on low library size or genes detected

### Keep only cells passing both filters; flag the genes still detected afterwards.
eb_rawF <- eb_raw[,!(temp_doublets | temp_crapLibs)]
temp_postFgenes <- Matrix::rowSums(eb_rawF) > 0

### Visualize cell filtering: scatterplot of genes detected vs. library size,
### with the marginal distribution of each statistic (and its fitted negative
### binomial) drawn above and to the right.
temp_filtered <- temp_doublets | temp_crapLibs # cells removed by either filter
layout(matrix(c(2,1,0,3),nrow=2),widths=c(7,1.4),heights=c(1.4,7))

# Main panel: every cell in translucent black, removed cells crossed out in red.
par(mar=c(3,3,0,0),mgp=2:0)
plot(geneDetect~libSize,data=cS,
     xlab="Library Size",ylab="Genes Detected",
     xlim=range(cS$libSize),ylim=range(cS$geneDetect),
     cex=1.2,pch=21,col=alpha("black",0.2),bg=alpha("black",0.1))
points(geneDetect~libSize,data=cS[temp_filtered,],pch=4,cex=1.2,col="red")
temp_geneLegend <- c(paste("Total genes:",nrow(eb_raw)),
                     paste("Post-filter:",sum(temp_postFgenes)))
legend("topleft",bty="n",inset=c(-.02,0),legend=temp_geneLegend)
temp_cellLegend <- c(paste(sep="\n","Poor-quality libraries",
                           paste0("(p<",p_lo,"): ",sum(temp_crapLibs))),
                     paste(sep="\n","Predicted doublets",
                           paste0("(p<",p_hi,"): ",sum(temp_doublets))))
legend("bottomright",bty="n",pch=4,col="red",legend=temp_cellLegend)

# Top margin: library size histogram, density of 10000 draws from the fitted
# negative binomial, and both cutoffs as dashed lines.
par(mar=c(0,3,1,0))
temp_simLib <- density(rnbinom(10000,size=fitLibSize$estimate["size"],mu=fitLibSize$estimate["mu"]))
hist(cS$libSize,breaks=100,freq=F,col="grey",main=NULL,xaxt="n",ylab="Density")
lines(temp_simLib,lwd=2,col=alpha("red",0.5))
abline(v=c(c_hiL,c_loL),lty=2,lwd=2,col="darkred")

# Right margin: the same for genes detected, drawn sideways as a horizontal
# barplot. Values on the data scale are mapped linearly onto the barplot's bar
# positions so the fitted density and cutoffs line up with the bars.
par(mar=c(3,0,0,1))
temp_simGene <- density(rnbinom(10000,size=fitGeneDetect$estimate["size"],mu=fitGeneDetect$estimate["mu"]))
temp_geneHist <- hist(cS$geneDetect,breaks=100,plot=F)
temp_barPos <- barplot(temp_geneHist$density,horiz=T,space=0,col="grey",main=NULL,xlab="Density")
temp_scale <- (max(temp_barPos) - min(temp_barPos)) / (max(temp_geneHist$mids) - min(temp_geneHist$mids))
temp_offset <- min(temp_barPos) - min(temp_geneHist$mids) * temp_scale
lines(x=temp_simGene$y,y=temp_simGene$x * temp_scale + temp_offset,
      lwd=2,col=alpha("red",0.5))
abline(h=c(c_hiG,c_loG) * temp_scale + temp_offset,lty=2,lwd=2,col="darkred")


```

## Doublet filtering 
In order to filter out predicted doublets and poor-quality libraries, library sizes and numbers of genes detected per cell were each fit to a negative binomial distribution, and cells unlikely to have been sampled from those distributions were removed (red x).

Data processing will now be performed based on a published workflow from the Marioni group (Lun *et al.*, F1000Research 2016. http://dx.doi.org/10.12688/f1000research.9501.2).

```{r create_scran_object, include=F}
### Build the single-cell expression container (eb1p) from the filtered count
### matrix; the QC, cell cycle and normalization chunks below operate on it.
eb1p <- newSCESet(countData=eb_rawF)

### Clean up objects to free memory
### (disabled: uncomment to drop the raw matrices once eb1p has been created)
#rm(list=c("cS","eb_rawF","eb_raw"))
#gc()
```

## Mitochondrial gene content
```{r prefilter_scran_cells, echo=F,fig.height=4.2,fig.width=8.4}
### Regular expression identifying mitochondrial genes by symbol prefix.
### For other species, define mitoGenePrefix before this chunk runs.
if (species == "human") {
  mitoGenePrefix <- "^MT-"
} else if (species == "mouse") {
  mitoGenePrefix <- "^mt-"
} else if (!exists("mitoGenePrefix")) {
  # Mito gene IDs and cell cycle prediction might need your attention.
  stop("No mitochondrial gene prefix is defined for species '",species,
       "'. Set mitoGenePrefix (a regular expression) before this chunk.",call.=FALSE)
}

### Per-cell QC metrics, with mitochondrial genes as the "Mt" feature control set.
eb1p <- calculateQCMetrics(eb1p,feature_controls=list(Mt=grepl(mitoGenePrefix,rownames(eb1p))))
forText_mitoPct <- round(median(pData(eb1p)$pct_counts_feature_controls_Mt))

### Flag cells whose mitochondrial percentage is more than drop_mitoMads median
### absolute deviations above the median.
drop_mitoMads <- 4
drop_mito <- isOutlier(eb1p$pct_counts_feature_controls_Mt,nmads=drop_mitoMads,type="higher")

### Remove flagged cells, then genes no longer detected in any remaining cell.
eb1 <- eb1p[,!drop_mito]
eb1 <- eb1[rowSums(counts(eb1)) > 0,]

### Values interpolated into the report text below.
forText_dropMito <- sum(drop_mito)
forText_mitoMads <- drop_mitoMads
forText_genesLost <- nrow(eb1p) - nrow(eb1)
forText_genesRemain <- nrow(eb1)

### Visualize filtering step
qcCells <- pData(eb1p) # per-cell QC table, before mitochondrial filtering
mitoCutoff <- median(qcCells$pct_counts_feature_controls_Mt) +
  drop_mitoMads * mad(qcCells$pct_counts_feature_controls_Mt)
layout(matrix(c(1,2,4,3,0,5),nrow=2),widths=c(4.3,4.2,0.5),heights=c(0.5,4))

# Panel 1 (top left): empty plot that only carries the figure title.
par(mar=c(0,3,3,1),mgp=2:0)
plot.new()
title(main="Filtering cells based on\nmitochondrial gene proportion")

# Panel 2 (bottom left): genes detected vs. library size; removed cells in red.
par(mar=c(3,3,0,1))
plot(total_features~total_counts,data=qcCells[!drop_mito,],
     xlab="Library Size",ylab="Genes detected",
     xlim=range(qcCells$total_counts),ylim=range(qcCells$total_features),
     cex=1.2,pch=21,col=alpha("black",0.2),bg=alpha("black",0.1))
points(total_features~total_counts,data=qcCells[drop_mito,],
       cex=1.2,pch=21,col=alpha("red",0.5),bg=alpha("red",0.3))
legend("topleft",bty="n",pch=21,col=alpha("red",0.5),pt.bg=alpha("red",0.3),
       legend=paste(drop_mitoMads,"MADs above median"))

# Panel 3 (bottom middle): mitochondrial percentage vs. library size, with the
# outlier cutoff (median + drop_mitoMads * MAD) as a dashed line.
par(mar=c(3,3,0,0))
plot(pct_counts_feature_controls_Mt~total_counts,data=qcCells,
     xlab="Library Size",ylab="Mitochondrial Transcript Percent",
     cex=1.2,pch=21,col=alpha("black",0.2),bg=alpha("black",0.1))
abline(h=mitoCutoff,lwd=2,lty=2,col=alpha("red",0.5))
legend("topright",lty=2,lwd=2,col=alpha("red",0.5),bty="n",
       legend=paste(drop_mitoMads,"MADs above median"))

# Panel 4 (top middle): library size histogram above panel 3.
par(mar=c(0,3,0,0))
hist(pData(eb1p)$total_counts,breaks=100,col="grey",main=NULL,xaxt="n")

# Panel 5 (bottom right): sideways histogram of mitochondrial percentage.
par(mar=c(3,0,0,0))
mitoHist <- hist(qcCells$pct_counts_feature_controls_Mt,breaks=100,plot=F)
barplot(mitoHist$counts,horiz=T,space=0,col="grey",main=NULL,xlab="Frequency")


```
The majority of cells have less than `r forText_mitoPct`% of transcripts from mitochondrial genes.  This is evidence of generally good-quality data.  A cutoff of `r forText_mitoMads` median absolute deviations above the median mitochondrial content was used to remove `r forText_dropMito` cells with higher mitochondrial gene proportion than the data set as a whole.  These cells were predominantly at the low end of the gene detection and library size distributions.  This resulted in the loss of `r forText_genesLost` genes from the total number of genes detected in the dataset, leaving `r forText_genesRemain` genes detected in the dataset.

```{r, include=F}
#rm(eb1p)
#gc()
```


## Cell cycle annotation
```{r cell_cycle_annotation, echo=F,fig.height=4.2,fig.width=8.4}
### Cell cycle scores are expensive to compute, so they are cached in the output
### folder and reloaded on later runs.
cycFile <- paste0("../",dataName,"/output/",dataName,"_cycScores.RData")
if (!file.exists(cycFile)) {
  # Use the marker-pair set matching the configured species (scran ships
  # "<species>_cycle_markers.rds" files in its exdata folder).
  cycPairsFile <- system.file("exdata",paste0(species,"_cycle_markers.rds"),package="scran")
  if (!nzchar(cycPairsFile)) {
    stop("scran provides no cell cycle marker pairs for species '",species,"'.",call.=FALSE)
  }
  # Map gene symbols to Ensembl IDs, which is what the marker pairs are keyed by.
  # select() is namespaced so it cannot resolve to MASS::select.
  anno <- AnnotationDbi::select(get(egDB),keys=rownames(eb1),keytype="SYMBOL",columns="ENSEMBL")
  # match() keeps the first Ensembl ID when a symbol maps to several.
  cycScores <- cyclone(eb1,gene.names=anno$ENSEMBL[match(rownames(eb1),anno$SYMBOL)],
                       pairs=readRDS(cycPairsFile))
  cycScores$phases <- as.factor(cycScores$phases)
  save(cycScores,file=cycFile)
} else {
  load(cycFile)
}

### Visualize cell cycle phase per cell
### Elements of cycScores are addressed by their full names (scores, phases)
### rather than relying on partial matching of `$`.
phaseCols <- viridis(3)[c(3,1,2)]         # one colour per phase level (legend, density lines)
phaseColsEdge <- viridis(3,.5)[c(3,1,2)]  # translucent point outlines
phaseColsFill <- viridis(3,0.2)[c(3,1,2)] # translucent point fills

# Per-phase densities of library size and genes detected, for the marginal plots.
cycDlibSize <- tapply(pData(eb1)$total_counts,cycScores$phases,function(X) density(X))
cycDgeneDetect <- tapply(pData(eb1)$total_features,cycScores$phases,function(X) density(X))

layout(matrix(c(2,1,0,3,5,4,0,6),2),
       widths=c(3.6,.6,3.6,.6),heights=c(.6,3.6))

# Left half: G2/M score vs. G1 score per cell, coloured by predicted phase,
# with histograms of each score in the margins.
par(mar=c(3,3,0,0),mgp=2:0)
plot(cycScores$scores$G1,cycScores$scores$G2M,pch=21,cex=1.2,
     col=phaseColsEdge[cycScores$phases],
     bg=phaseColsFill[cycScores$phases],
     xlab="G1 score", ylab="G2/M score")
par(mar=c(0,3,0,0))
hist(cycScores$scores$G1,breaks=100,col="grey",main=NULL,xaxt="n")
par(mar=c(3,0,0,0))
barplot(hist(cycScores$scores$G2M,breaks=100,plot=FALSE)$counts,
        horiz=TRUE,space=0,col="grey",main=NULL,xlab="Frequency")

# Right half: genes detected vs. library size, coloured by predicted phase,
# with per-phase density curves in the margins.
par(mar=c(3,3,0,0),mgp=2:0)
plot(total_features~total_counts,data=pData(eb1),pch=21,cex=1.2,
     col=phaseColsEdge[cycScores$phases],
     bg=phaseColsFill[cycScores$phases],
     xlab="Library Size",ylab="Genes detected",
     main=NULL)
legend("bottomright",bty="n",pch=21,legend=levels(cycScores$phases),xpd=NA,
       col=phaseCols,pt.bg=viridis(3,0.5)[c(3,1,2)])
par(mar=c(0,3,0,0))
plot(x=NULL,y=NULL,ylab="Density",xaxt="n",
     xlim=range(pData(eb1)$total_counts),
     ylim=c(min(sapply(cycDlibSize,function(X) min(X$y))),
            max(sapply(cycDlibSize,function(X) max(X$y)))))
for (x in seq_along(cycDlibSize)) {
  lines(cycDlibSize[[x]],col=phaseCols[x],lwd=3)
}
par(mar=c(3,0,0,0))
plot(x=NULL,y=NULL,xlab="Density",yaxt="n",
     xlim=c(min(sapply(cycDgeneDetect,function(X) min(X$y))),
            max(sapply(cycDgeneDetect,function(X) max(X$y)))),
     ylim=range(pData(eb1)$total_features))
for (x in seq_along(cycDgeneDetect)) {
  # axes swapped: density on x, genes detected on y
  lines(x=cycDgeneDetect[[x]]$y,y=cycDgeneDetect[[x]]$x,col=phaseCols[x],lwd=3)
}

### Data frame of cell cycle distribution
### (left as a top-level expression so the one-row table is printed in the report)
CellCyclePhase <- sapply(table(cycScores$phases)/length(cycScores$phases),percent)
as.data.frame(rbind(CellCyclePhase))
```
Cell cycle prediction performed by Cyclone (Scialdone *et al.*, Methods 2015.  http://dx.doi.org/10.1016/j.ymeth.2015.06.021)

## Filter out low abundance genes
Noisy genes must be removed to prevent them from skewing normalization.  The filtering method in *Seurat* removes only genes detected in very few cells, which is sufficient for normalization while removing as few genes as possible.

```{r prefilter_scran_genes_DR, echo=F,fig.height=4.6,fig.width=8.4,fig.show="hold"}
### Per-gene statistics on the raw counts:
###   DR    - detection rate (proportion of cells in which the gene is detected)
###   MDTC  - mean transcript count over the cells detecting the gene
###   MTC   - mean transcript count over all cells
###   sumTC - total transcript count
###   cellMax - highest count of the gene in any single cell
geneInfo <- fData(eb1)
geneStatsR <- data.frame(DR=geneInfo$n_cells_exprs/ncol(eb1),
                         MDTC=geneInfo$total_feature_counts/geneInfo$n_cells_exprs,
                         MTC=geneInfo$total_feature_counts/ncol(eb1),
                         sumTC=geneInfo$total_feature_counts)
geneStatsR$cellMax <- apply(counts(eb1),1,max)

lowCellNum <- 3 # filter out genes detected in less than this many cells
DRcut <- lowCellNum/ncol(eb1)
drop_lowCell <- geneStatsR$DR < DRcut
eb1F1 <- eb1[!drop_lowCell,]

### Visualize gene expression distributions
# Colour index (1-100) per gene from its log10 max count; genes are plotted in
# order of increasing cellMax, so iH is computed on the sorted values.
iH <- 101-cut(log10(geneStatsR[order(geneStatsR$cellMax,decreasing=FALSE),"cellMax"]),breaks=100,labels=FALSE)
layout(matrix(c(2,1,4,0,3,0,6,5,8,0,7,0),3),c(3.6,.6,3.6,.6),c(.6,3.6,.4))

# Draws one group of four panels (in layout order): scatter of log10(yName)
# against log10(DR), histogram of log10(DR) on top, sideways histogram of
# log10(yName) on the right, and the colour bar for max count underneath.
plotGeneFilterPanels <- function(yName,yLabel,withLegend) {
  panelFormula <- as.formula(paste0("log10(",yName,")~log10(DR)"))
  par(mar=c(3,3,0,0),mgp=2:0)
  plot(panelFormula,data=geneStatsR[order(geneStatsR$cellMax,decreasing=FALSE),],
       xlim=log10(range(geneStatsR$DR)),ylim=log10(range(geneStatsR[[yName]])),
       pch=21,col=viridis(100,0.5)[iH],bg=viridis(100,0.3)[iH],
       xlab=expression(Log[10]~"Proportion of cells detecting gene"),
       ylab=yLabel)
  # genes removed by the low-cell-number filter
  points(panelFormula,data=geneStatsR[drop_lowCell,],
         pch=4,col=alpha("red",0.5),cex=1.2)
  if (withLegend) {
    legend("topleft",bty="n",pch=c(4,NA),col=c("red",NA),
           legend=c(paste("Genes in <",lowCellNum,"cells"),
                    paste(sum(!drop_lowCell),"genes remain")))
  }
  par(mar=c(0,3,.1,0))
  hist(log10(geneStatsR$DR),breaks=100,col="grey",main=NULL,xaxt="n")
  # cutoff drawn half a cell below DRcut, i.e. between "lowCellNum - 1" and "lowCellNum" cells
  abline(v=log10(DRcut-1/(2*ncol(eb1))),lty=2,lwd=2,col=alpha("red",0.5))
  par(mar=c(3,0,0,.1))
  barplot(hist(log10(geneStatsR[[yName]]),breaks=100,plot=FALSE)$counts,
          horiz=TRUE,space=0,col="grey",main=NULL,xlab="Frequency")
  par(mar=c(0.1,3,.5,0))
  barplot(rep(1,100),col=viridis(100,begin=1,end=0),space=0,border=NA,axes=FALSE,ylim=c(-3,1))
  text(c(1,50,100),rep(-1,3),labels=c(bquote(10^.(log10(min(geneStatsR$cellMax)))),
                                      expression(Log[10]~bold(max)~transcript~count),
                                      bquote(10^.(round(log10(max(geneStatsR$cellMax)),1)))))
}

# Left group: mean count over all cells; right group: mean count over detecting cells.
plotGeneFilterPanels("MTC",expression(Log[10]~"Mean transcript count (MTC)"),TRUE)
plotGeneFilterPanels("MDTC",expression(Log[10]~"Mean transcript count of detected genes (MDTC)"),FALSE)


```

# Normalization

The next step is normalization.  Marioni and colleagues proposed a normalization technique that attempts to generate cell-specific size factors that are robust to differential expression between genes in a heterogeneous sample, unlike simple library-size normalization (http://f1000.com/fulltext/doi/10.1186/s13059-016-0947-7).  This method correlates strongly with library size normalization for homogeneous samples, but solves a series of linear equations to deconvolute cell-specific size factors for normalization.  In order to better handle heterogeneous data, they suggest separating the data by simple hierarchical clustering of a Spearman correlation-based distance metric so that they can normalize the separate subpopulations separately to prevent the suppression of true differential expression during normalization.  

Normalization is carried out by assigning a size factor to each cell by the pooling and deconvolution method, then dividing each count by the size factor of its cell, adding a pseudocount of one, and log-transforming the result.  Log-transforming the data stabilizes variance by reducing the impact of a few highly variable genes.  

Following this, it is suggested to investigate sources of technical variance, but without spike-ins or any annotated possible sources of variation, this step is not possible.  


```{r normalize_by_deconvolution_quickCluster, echo=F,fig.height=4.6,fig.width=8.4,fig.show="hold"}
### Normalized data are cached in the output folder; reuse them when present.
normFile <- paste0("../",dataName,"/output/",dataName,"_eb1Fnorm.RData")
if (file.exists(normFile)) {
  load(normFile)
} else {
  qClust <- quickCluster(eb1F1) # skip for homogenous samples.  Might help with batch effect though?
  # qClust only exists if the line above was run (it may be commented out).
  clustered <- exists("qClust")
  if (clustered) {
    names(qClust) <- colnames(eb1F1)
    forText_numQClust <- length(levels(qClust))
    eb1F1 <- computeSumFactors(eb1F1,clusters=qClust,positive=T)
  } else {
    eb1F1 <- computeSumFactors(eb1F1,positive=T)
  }

  # Cells with a non-positive size factor cannot be normalized and are dropped.
  badSizeFactor <- sizeFactors(eb1F1) <= 0
  dropNorm <- sum(badSizeFactor)
  eb1F <- normalize(eb1F1[,!badSizeFactor])

  # Any cell still containing NA after normalization has its whole expression
  # column set to the smallest non-NA value in the matrix.
  naCells <- apply(exprs(eb1F),2,anyNA)
  if (any(naCells)) {
    exprs(eb1F)[,naCells] <- min(apply(exprs(eb1F),1,min,na.rm=T))
  }

  toSave <- if (clustered) {
    c("dropNorm","forText_numQClust","qClust","eb1F")
  } else {
    c("dropNorm","eb1F")
  }
  save(list=toSave,file=normFile)
}

### Visualize normalization output (and effect of clustering if applicable)
# Per-gene statistics on the normalized expression values (same columns as geneStatsR).
normExprs <- exprs(eb1F)
geneStatsN <- data.frame(DR=apply(normExprs,1,function(X) sum(X > 0))/ncol(eb1F),
                         MDTC=apply(normExprs,1,function(X) mean(X[X > 0])),
                         MTC=rowMeans(normExprs),
                         sumTC=rowSums(normExprs),
                         cellMax=apply(normExprs,1,max))

if (exists("qClust")) {
  # Evenly spaced hues on the HCL colour wheel, for more clusters than the
  # RColorBrewer palettes below can provide.
  gg_colour_hue <- function(n) {
    hues <- seq(15, 375, length = n + 1)
    hcl(h = hues, l = 65, c = 100)[1:n]
  }

  # One colour per quick-cluster: Dark2 up to 8, Set3 up to 12, HCL hues beyond.
  nQClust <- length(levels(qClust))
  clustCols <- if (nQClust <= 8) {
    brewer.pal(nQClust,"Dark2")
  } else if (nQClust <= 12) {
    brewer.pal(nQClust,"Set3")
  } else {
    gg_colour_hue(nQClust)
  }

  # Per-cluster densities of library size and genes detected ...
  D1x <- tapply(pData(eb1F1)$total_counts,qClust,function(V) density(V))
  D1y <- tapply(pData(eb1F1)$total_features,qClust,function(V) density(V))
  # ... and of the cell cycle scores (missing scores are set to 0 first).
  cycScores$scores$G1[is.na(cycScores$scores$G1)] <- 0
  cycScores$scores$G2M[is.na(cycScores$scores$G2M)] <- 0
  D2x <- tapply(cycScores$scores$G1,qClust,function(V) density(V))
  D2y <- tapply(cycScores$scores$G2M,qClust,function(V) density(V))
}

### Figure: cells coloured by quick-cluster (if clustering was done), first on
### library size vs. genes detected, then on the cell cycle scores, each with
### marginal distributions. Cells that could not be normalized are crossed out.
hasQClust <- exists("qClust")
notNormalized <- !colnames(eb1F1) %in% colnames(eb1F) # cells dropped during normalization

# Grey panel background, applied when the palette has 9-12 colours (the Set3
# case) - presumably because those colours are hard to see on white.
shadePanel <- function() {
  if (length(clustCols) %in% 9:12) {
    usr <- par("usr")
    rect(usr[1],usr[3],usr[2],usr[4],col="grey50")
  }
}
# Marginal panel above a scatterplot: one density curve per cluster.
topDensities <- function(dens,valueRange) {
  par(mar=c(0,3,.1,0))
  plot(x=NULL,y=NULL,ylab="Density",xaxt="n",
       xlim=valueRange,
       ylim=c(min(sapply(dens,function(X) min(X$y))),
              max(sapply(dens,function(X) max(X$y)))))
  shadePanel()
  for (i in seq_along(dens)) {
    lines(dens[[i]],col=clustCols[i],lwd=2)
  }
}
# Marginal panel right of a scatterplot: same curves with the axes swapped.
sideDensities <- function(dens,valueRange) {
  par(mar=c(3,0,0,.1))
  plot(x=NULL,y=NULL,xlab="Density",yaxt="n",
       xlim=c(min(sapply(dens,function(X) min(X$y))),
              max(sapply(dens,function(X) max(X$y)))),
       ylim=valueRange)
  shadePanel()
  for (i in seq_along(dens)) {
    lines(x=dens[[i]]$y,y=dens[[i]]$x,col=clustCols[i],lwd=2)
  }
}

layout(matrix(c(2,1,7,0,3,7,5,4,7,0,6,7),3),widths=c(3.6,.6,3.6,.6),heights=c(.6,3.6,.4))

# Panel 1: genes detected vs. library size.
par(mar=c(3,3,0,0),mgp=2:0)
plot(x=NULL,y=NULL,xlim=range(pData(eb1F1)$total_counts),
     ylim=range(pData(eb1F1)$total_features),
     xlab="Library Size", ylab="Genes Detected")
if (hasQClust) {
  if (length(clustCols) %in% 9:12) {
    shadePanel()
    mtext("Quick clustering for normalization",side=3,line=-1.5,col="white")
  } else {
    mtext("Quick clustering for normalization",side=3,line=-1.5)
  }
  points(pData(eb1F1)$total_counts,pData(eb1F1)$total_features,
         pch=21,col=alpha(clustCols[qClust],0.4),
         bg=alpha(clustCols[qClust],0.2))
} else {
  points(pData(eb1F1)$total_counts,pData(eb1F1)$total_features,
         pch=21,col=alpha("black",0.3),bg=alpha("black",0.1))
}
points(pData(eb1F1)$total_counts[notNormalized],
       pData(eb1F1)$total_features[notNormalized],
       pch=4,col="red")

# Panels 2-3: marginals of panel 1 (per-cluster densities, or plain histograms).
if (hasQClust) {
  topDensities(D1x,range(pData(eb1F1)$total_counts))
  sideDensities(D1y,range(pData(eb1F1)$total_features))
} else {
  par(mar=c(0,3,.1,0))
  hist(pData(eb1F1)$total_counts,breaks=100,col="grey",main=NULL,xaxt="n")
  par(mar=c(3,0,0,.1))
  barplot(hist(pData(eb1F1)$total_features,breaks=100,plot=FALSE)$counts,
          horiz=TRUE,space=0,col="grey",main=NULL,xlab="Frequency")
}

# Panel 4: G2/M score vs. G1 score.
par(mar=c(3,3,0,0),mgp=2:0)
plot(x=NULL,y=NULL,xlim=0:1,ylim=0:1,xlab="G1 score", ylab="G2/M score")
if (hasQClust) {
  shadePanel()
  points(cycScores$scores$G1,cycScores$scores$G2M,
         pch=21,col=alpha(clustCols[qClust],0.4),
         bg=alpha(clustCols[qClust],0.2))
} else {
  points(cycScores$scores$G1,cycScores$scores$G2M,
         pch=21,col=alpha("black",0.3),bg=alpha("black",0.1))
}
points(cycScores$scores$G1[notNormalized],
       cycScores$scores$G2M[notNormalized],
       pch=4,col="red")

# Panels 5-6: marginals of panel 4.
if (hasQClust) {
  topDensities(D2x,range(cycScores$scores$G1))
  sideDensities(D2y,range(cycScores$scores$G2M))
} else {
  par(mar=c(0,3,.1,0))
  hist(cycScores$scores$G1,breaks=100,col="grey",main=NULL,xaxt="n")
  par(mar=c(3,0,0,.1))
  barplot(hist(cycScores$scores$G2M,breaks=100,plot=FALSE)$counts,
          horiz=TRUE,space=0,col="grey",main=NULL,xlab="Frequency")
}

# Panel 7 (bottom strip): legends only.
par(mar=c(0.1,3,0,3))
plot.new()
legend("top",pch=4,col="red",bty="n",
       legend=paste(dropNorm,"cells could not be normalized."))
if (hasQClust) {
  # number of cells per quick-cluster, keyed by cluster colour
  legend("bottom",bty="n",horiz=TRUE,
         legend=c("Cells:",as.vector(table(qClust))),
         pch=c(NA,rep(21,times=length(levels(qClust)))),
         col=c(NA,alpha(clustCols,0.5)),
         pt.bg=c(NA,alpha(clustCols,0.3)))
}


### Figure: size factor vs. library size per cell (left) and normalized
### per-gene expression vs. detection rate (right), each with marginal
### histograms and a colour bar.
layout(matrix(c(2,1,4,0,3,0,6,5,8,0,7,0),3),widths=c(3.6,.6,3.6,.6),heights=c(.6,3.6,.4))

# Colour bar running from viridis(…, begin=1) to viridis(…, end=0), labelled at
# its left end, centre and right end.
drawColourBar <- function(lowLabel,midLabel,highLabel) {
  par(mar=c(0.1,3,.5,0))
  barplot(rep(1,100),col=viridis(100,begin=1,end=0),
          space=0,border=NA,axes=F,ylim=c(-3,1))
  text(c(1,50,100),rep(-1,3),labels=c(lowLabel,midLabel,highLabel))
}

## Left group: cells, ordered and coloured by number of genes detected.
cellOrder <- order(pData(eb1F)$total_features)
par(mar=c(3,3,0,0),mgp=2:0)
iH <- 101-cut(sort(pData(eb1F)$total_features),breaks=100,labels=F)
plot(pData(eb1F)[cellOrder,"total_counts"],
     sizeFactors(eb1F)[cellOrder],log="xy",
     pch=21,col=viridis(100,0.5)[iH],bg=viridis(100,0.3)[iH],
     xlab="Library Size (log scale)",ylab="Size Factor (log scale)")
par(mar=c(0,3,.1,0))
hist(log10(pData(eb1F)[order(pData(eb1F)$total_features),"total_counts"]),
     breaks=100,col="grey",main=NULL,xaxt="n")
par(mar=c(3,0,0,.1))
sizeFactorHist <- hist(log10(sizeFactors(eb1F)[cellOrder]),breaks=100,plot=F)
barplot(sizeFactorHist$counts,horiz=T,space=0,col="grey",main=NULL,xlab="Frequency")
drawColourBar(min(pData(eb1F)$total_features),
              "Genes detected per cell",
              max(pData(eb1F)$total_features))

## Right group: genes, ordered and coloured by their maximum normalized expression.
iH <- 101-cut(sort(geneStatsN$cellMax),breaks=100,labels=F)
par(mar=c(3,3,0,0),mgp=2:0)
plot(MDTC~DR,data=geneStatsN[order(geneStatsN$cellMax),],
     pch=21,col=viridis(100,0.3)[iH],bg=viridis(100,0.2)[iH],
     xlab="Proportion of cells detecting gene",
     ylab="Mean normalized gene expression of detected genes")
par(mar=c(0,3,.1,0))
hist(geneStatsN$DR,breaks=100,col="grey",main=NULL,xaxt="n")
par(mar=c(3,0,0,.1))
mdtcHist <- hist(geneStatsN$MDTC,breaks=100,plot=F)
barplot(mdtcHist$counts,horiz=T,space=0,col="grey",main=NULL,xlab="Frequency")
drawColourBar(round(min(geneStatsN$cellMax),2),
              "Max normalized gene expression per cell",
              round(max(geneStatsN$cellMax),2))
```
Cells generally fail to normalize because of poor information content (small library size, weak gene expression relative to other cells).

```{r, include=F}
### Free memory: remove every object in the workspace except cycScores, eb1F,
### dataName and anything whose name contains "forText" (the values interpolated
### into the report text), then run the garbage collector.
rm(list=ls()[!(ls() %in% c("cycScores","eb1F","dataName") | grepl("forText",ls()))])
gc()
```


# Clustering by SNN-cliq  
Seurat implements an interpretation of SNN-Cliq (https://doi.org/10.1093/bioinformatics/btv088) for clustering of single-cell expression data.  They use PCs to define the distance metric, then embed the cells in a graph where edges between cells (nodes) are weighted based on their similarity (euclidean distance in PCA space).  These edge weights are refined based on Jaccard distance (overlap in local neighbourhoods), and then communities ("quasi-cliques") are identified in the graph using a smart local moving algorithm (SLM, http://dx.doi.org/10.1088/1742-5468/2008/10/P10008) to optimize the modularity measure of the defined communities in the graph.  
Cluster resolution is assessed by testing for differential expression between each cluster and all other clusters using a likelihood-ratio test designed for single-cell qPCR assays (https://doi.org/10.1093/bioinformatics/bts714).  Cluster distributions are modelled as mixture models, with a discrete component modelling the likelihood of detecting a gene, and a log-normal component representing gene expression of detected genes.  The number of uniquely differentially expressed genes (at a false discovery rate of 1% vs all other clusters, tested pairwise) was used to determine the optimal cluster resolution.

``` {r seurat_clustering, include=F}
### Cluster the cells at increasing resolution until at least one cluster has at
### most one uniquely differentially expressed gene. Skipped entirely when the
### final result file already exists. Per-resolution DE results and cluster
### gene statistics are saved to the output folder as they are computed.
if (!file.exists(paste0("../",dataName,"/output/",dataName,"_eb1S.RData"))) {
  pDat <- pData(eb1F)[,c("total_counts","total_features")]
  eb1S <- new("seurat",raw.data=exprs(eb1F))
  rm(eb1F)
  gc()
  # eb1Ssm is a slimmer copy (scale.data removed) that is shipped to the workers.
  eb1Ssm <- eb1S <- Setup(eb1S,project=dataName,min.cells=0,min.genes=0,do.logNormalize=FALSE,save.raw=FALSE)
  eb1Ssm@scale.data <- NULL
  eb1S@var.genes <- rownames(eb1S@data) # use all genes for PCA
  eb1S <- PCA(eb1S,pc.genes=eb1S@var.genes,do.print=FALSE)
  print("PCs calculated")
  eb1S <- JackStraw(eb1S)
  #PCElbowPlot(eb1S,40)
  #JackStrawPlot(eb1S)
  # Strip the "PCn " prefix from the PC.Score labels of the JackStraw plot and
  # read the remainder as a number (presumably the per-PC p-value - verify).
  temp <- as.numeric(sub("PC[0-9]+ ","",levels(JackStrawPlot(eb1S,PCs=1:30)[["data"]]$PC.Score)))
  maxPCt <- max(which(temp < 1e-4))
  PCuse <- seq(1,maxPCt) #based on JackStraw output
  print("Significant PCs determined")
  eb1S <- RunTSNE(eb1S,dims.use=PCuse,do.fast=TRUE)
  print("tSNE projection calculated")

  # WARNING: Memory usage will multiply with the number of cores used.
  # Leave two cores free, but always start at least one worker (detectCores()
  # may return NA or a value <= 2).
  CL <- parallel::makeCluster(max(1,parallel::detectCores() - 2,na.rm=TRUE))
  # The workers are shut down even if an error interrupts the loop below.
  tryCatch({
    tempCL <- parallel::clusterEvalQ(CL,library(Seurat))
    parallel::clusterExport(CL,"eb1Ssm")
    rm(eb1Ssm)
    gc()

    minDEgenes <- uniqDE <- list()
    k <- 0; minUniqDE <- 100
    while (minUniqDE > 1) {
      # Resolution step: 0.4 while clusters are well separated, 0.2 once the
      # smallest number of unique DE genes has fallen to 30 or fewer.
      if (minUniqDE <= 30) {
        k <- k + 0.2
      } else {
        k <- k + 0.4
      }
      print(paste0("~~~~~~~~~~~~ Clustering at res.",k," ~~~~~~~~~~~~"))
      # Build the SNN graph on the first pass, reuse it afterwards.
      if (!any(grepl("res",colnames(eb1S@data.info)))) {
        eb1S <- FindClusters(eb1S,pc.use=PCuse,print.output=FALSE,save.SNN=TRUE,resolution=k)
      } else {
        eb1S <- FindClusters(eb1S,pc.use=PCuse,print.output=FALSE,reuse.SNN=TRUE,resolution=k)
      }
      # Last column of data.info (presumably the one just added by FindClusters).
      res <- colnames(eb1S@data.info)[length(colnames(eb1S@data.info))]
      # Shift cluster ids up by one and store them as the cell identities.
      temp_ident <- eb1S@ident <- eb1S@data.info[,res] <- as.factor(as.integer(eb1S@data.info[,res]) + 1)
      names(temp_ident) <- names(eb1S@ident) <- rownames(eb1S@data.info)
      # Update the identities of the worker-side copy of the data.
      tempCL <- parallel::clusterExport(CL,"temp_ident")
      tempCL <- parallel::clusterEvalQ(CL,{ eb1Ssm@ident <- temp_ident; rm(temp_ident); gc() })
      print(paste(length(levels(eb1S@ident)),"clusters identified"))

      # DE of each cluster vs. all other cells (eb1Ssm is resolved on the workers).
      deGall <- parallel::parLapplyLB(cl=CL,levels(eb1S@ident),function(X) {
        FindMarkers(eb1Ssm,X,test.use="bimod",thresh.use=0.25,min.pct=0.1)
      })
      for (i in seq_along(deGall)) {
        deGall[[i]]$fdr <- p.adjust(deGall[[i]]$p_val,"fdr")
        deGall[[i]] <- deGall[[i]][deGall[[i]]$fdr <= 1e-2 & deGall[[i]]$avg_diff > 0,] # postive DE at FDR of 1%
      }
      print("DE vs tissue calculated")

      # DE between every pair of clusters; pairs are labelled "a~b".
      compClust <- apply(combn(levels(eb1S@ident),2),2,function(X) paste(X,collapse="~"))
      deGvs <- parallel::parLapplyLB(CL,strsplit(compClust,"~"),function(X) {
        FindMarkers(eb1Ssm,ident.1=X[1],ident.2=X[2],test.use="bimod",thresh.use=0.25,min.pct=0.1)
      })
      for (i in seq_along(deGvs)) {
        deGvs[[i]]$fdr <- p.adjust(deGvs[[i]]$p_val,"fdr")
        deGvs[[i]]$clustPair <- factor(compClust[i])
        deGvs[[i]]$gene <- rownames(deGvs[[i]])
        deGvs[[i]] <- deGvs[[i]][deGvs[[i]]$fdr <= 1e-2,] # FDR at 1%
      }
      print("DE vs other clusters calculated")

      names(deGvs) <- compClust
      deGvs <- do.call(rbind,deGvs[order(names(deGvs))])
      # Cluster of the pair in which the gene is higher: first if avg_diff >= 0, else second.
      deGvs$posClust <- as.factor(mapply(function(a,b) strsplit(as.character(a),"~")[[1]][(b < 0)+1],
                                         deGvs$clustPair,deGvs$avg_diff))
      # A gene is uniquely DE in a cluster when it is higher there in the
      # comparison against every other cluster. The count is taken from the
      # clustering itself, and clusters that never come out higher are kept as
      # empty groups, so they count as having zero unique DE genes.
      nOtherClust <- length(levels(eb1S@ident)) - 1
      posClustAll <- factor(as.character(deGvs$posClust),levels=levels(eb1S@ident))
      uniqDE[[res]] <- tapply(deGvs$gene,posClustAll,function(X) {
        geneCounts <- table(X)
        names(geneCounts)[geneCounts == nOtherClust]
      })

      print("Number of DE genes to neighbouring clusters:")
      print(summary(sapply(uniqDE[[res]],length)))
      minDEgenes[[res]] <- minUniqDE <- min(sapply(uniqDE[[res]],length))

      # Per-cluster gene statistics: detection rate, mean of detected values
      # (0 if never detected) and overall mean.
      DR <- apply(eb1S@data,1,function(X) tapply(X,eb1S@ident,function(Y) sum(Y>0)/length(Y)))
      MDTC <- apply(eb1S@data,1,
                    function(X) tapply(X,eb1S@ident,
                                       function(Y) {
                                         temp <- mean(Y[Y>0])
                                         if (is.na(temp)) { temp <- 0 }
                                         return(temp)
                                       }))
      MTC <- apply(eb1S@data,1,function(X) tapply(X,eb1S@ident,mean))
      CGS <- lapply(levels(eb1S@ident), function(X) data.frame(DR=DR[X,],MDTC=MDTC[X,],MTC=MTC[X,]))

      save(deGvs,file=paste0("../",dataName,"/output/",dataName,"_precalc_",gsub(".","",res,fixed=TRUE),"_deGvs.RData"))
      save(deGall,file=paste0("../",dataName,"/output/",dataName,"_precalc_",gsub(".","",res,fixed=TRUE),"_deGall.RData"))
      save(CGS,file=paste0("../",dataName,"/output/",dataName,"_precalc_",gsub(".","",res,fixed=TRUE),"_CGS.RData"))
      rm(list=c("deGvs","deGall","CGS","DR","MTC","MDTC","posClustAll","nOtherClust"))
      gc()
      print(paste("Saved data at",res))
    }
  },finally=parallel::stopCluster(CL))
  save(eb1S,maxPCt,minDEgenes,uniqDE,pDat,file=paste0("../",dataName,"/output/",dataName,"_eb1S.RData"))
}
```