utils_fastqc.R

## `utils_fastqc.R' contains utility functions to analyze outputs from the
## FastQC program (http://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
## Copyright (C) 2014-2015 Institut National de la Recherche Agronomique (INRA)
## License: GPL-3+
## Persons: Timothée Flutre [cre,aut], Nicolas Rode [ctb]
## Version: see below
## Download: https://github.com/timflutre/quantgen
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.

## Version string of this collection of utility functions.
utils_fastqc.version <- "2.2.0" # http://semver.org/

##' Reads a "fastqc_data.txt" file generated by FastQC.
##'
##' Heavily inspired from readFastQC() in the Repitools package.
##' Every "#" character is removed, the ">>END_MODULE" lines are discarded,
##' and the remaining lines are split into modules at each line starting with
##' ">>". Inside a module, the first line after the title is used as column
##' names and the following tab-separated lines as rows. A column is converted
##' to numeric only if all its values can be converted without producing NA.
##' @param file the name of the file which the data are to be read from
##' @return named list with one data frame per module (names are the module titles with spaces replaced by underscores); a module made of its title only gives an empty data frame, and a module with a header but no data row gives a zero-row data frame having the header as column names
##' @author Timothée Flutre [cre,aut]
read.fastqc.txt <- function(file){
  stopifnot(file.exists(file))

  ## read the file only once; an empty file cannot be a FastQC report
  lines <- readLines(file)
  stopifnot(length(lines) > 0,
            grepl(pattern="##FastQC", x=lines[1]))

  lines <- gsub("#", "", lines)
  lines <- lines[!grepl(">>END_MODULE", lines)]

  ## each line starting with ">>" opens a new module; the first group holds
  ## the file header (lines before the first module) and is dropped
  modules <- split(lines, cumsum(grepl("^>>", lines)))[-1]
  names(modules) <- vapply(modules, function(x){
    gsub("^>>", "", gsub("\t.*", "", gsub(" ", "_", x[1])))
  }, character(1))

  lapply(modules, function(x){
    if(length(x) == 1) # title only
      return(data.frame())

    fields <- strsplit(x[-1], split="\t")
    header <- fields[[1]]
    rows <- fields[-1]

    ## header without any data row: return an empty table with the right
    ## columns instead of failing when looping over columns
    if(length(rows) == 0){
      tab <- as.data.frame(matrix(character(0), nrow=0, ncol=length(header)),
                           stringsAsFactors=FALSE)
      colnames(tab) <- header
      return(tab)
    }

    tab <- as.data.frame(do.call(rbind, rows), stringsAsFactors=FALSE)
    for(i in seq_len(ncol(tab))){
      as.num <- suppressWarnings(as.numeric(tab[[i]]))
      if(! anyNA(as.num)) # keep as character if any value is not a number
        tab[[i]] <- as.num
    }
    colnames(tab) <- header
    tab
  })
}

##' Loads several zip archives generated by FastQC.
##'
##' Each zip archive is decompressed in its own temporary directory, which is
##' removed (recursively) once the archive has been processed, whether reading
##' succeeded or not. The file "fastqc_data.txt" is looked for in the
##' sub-directory named after the archive (without the ".zip" extension) and
##' is parsed with read.fastqc.txt(). An archive which cannot be unzipped or
##' read, or which raises a warning while doing so, is reported via message()
##' and omitted from the result.
##' @param path character vector of the path to the directory containing the zip archives (will be followed by "/" and \code{glob})
##' @param glob character vector with wildcard(s) to find zip archives
##' @param verbose verbosity level
##' @return list of lists (one per zip archive successfully read), named after the archives with the suffix "_fastqc.zip" removed
##' @author Timothée Flutre [cre,aut], Nicolas Rode [ctb]
read.fastq.zips <- function(path=".", glob="*_fastqc.zip", verbose=0){
  zip.archives <- Sys.glob(paste(path, glob, sep="/"))
  if(length(zip.archives) == 0)
    stop("not a single zip archive was found", call.=FALSE)
  message(paste("nb of zip archives detected:", length(zip.archives)))

  all.qc <- lapply(zip.archives, function(zip.archive){
    qc <- NULL
    f.base <- sub("\\.zip$", "", basename(zip.archive))

    zipdir <- tempfile()
    dir.create(zipdir)
    ## unlink() only removes a directory when recursive=TRUE; register the
    ## clean-up so that it also runs if something below fails
    on.exit(unlink(zipdir, recursive=TRUE), add=TRUE)

    if(verbose > 0)
      message(paste0("try to unzip ", zip.archive))
    ## retval is NULL when unzipping raised a warning or an error
    retval <- tryCatch(
        {
          unzip(zip.archive, exdir=zipdir)
        },
        warning = function(w){
          message(paste(basename(zip.archive), "could no be unzipped."))
          message("Original warning message:")
          message(paste0(w, ""))
          NULL
        },
        error = function(e){
          message(paste(basename(zip.archive), "could no be unzipped."))
          message("Original error message:")
          message(paste0(e, ""))
          NULL
        })

    if(! is.null(retval)){
      if(verbose > 0)
        message(paste0("try to read fastqc_data.txt"))
      ## the value of tryCatch() is assigned directly: an assignment made
      ## inside a handler would stay local to the handler
      qc <- tryCatch(
          {
            read.fastqc.txt(file.path(zipdir, f.base, "fastqc_data.txt"))
          },
          warning = function(w){
            message(paste0(f.base, ".txt could no be found."))
            message("Original warning message:")
            message(paste0(w, ""))
            NULL
          },
          error = function(e){
            message(paste0(f.base, ".txt could no be found."))
            message("Original error message:")
            message(paste0(e, ""))
            NULL
          })
    }

    qc
  })

  names(all.qc) <- sub("_fastqc\\.zip$", "", basename(zip.archives))

  ## discard the archives which could not be unzipped or read
  is.missing <- vapply(all.qc, is.null, logical(1))
  return(all.qc[! is.missing])
}

##' Extracts the total number of sequences of each entry in a set of zip archives generated by FastQC.
##'
##' To be used after read.fastq.zips(). For each entry, the "Value" of the row
##' whose "Measure" is "Total Sequences" is taken from the "Basic_Statistics"
##' table and converted to numeric.
##' @param all.qc return value from read.fastq.zips()
##' @return numeric vector named after the entries of all.qc
##' @author Timothée Flutre [cre,aut]
nreads.fastqc <- function(all.qc){
  stopifnot(is.list(all.qc), ! is.null(names(all.qc)))
  get.total <- function(qc){
    stats <- qc[["Basic_Statistics"]]
    is.total <- stats$Measure == "Total Sequences"
    as.numeric(stats$Value[is.total])
  }
  sapply(all.qc, get.total)
}

##' Draws a bar plot of the number of sequences per entry, sorted in increasing order.
##'
##' To be used after nreads.fastqc(). The entry names are written below the
##' bars, rotated by 45 degrees. Note that the margins of the current device
##' are set via par(mar=c(10, 7, 4, 1)) and are not restored afterwards.
##' @param x named numeric vector with the number of sequences per entry
##' @param main an overall title for the plot
##' @param cex numeric character expansion factor for x-axis labels
##' @return None
##' @author Timothée Flutre [cre,aut]
barplot.nreads.fastqc <- function(x, main="", cex=1){
  stopifnot(is.vector(x), is.numeric(x), ! is.null(names(x)))
  sorted <- sort(x)
  par(mar=c(10, 7, 4, 1))
  mids <- barplot(sorted, xaxt="n", xlab="", ylab="Number of sequences",
                  main=main)
  ## tick marks only: the labels are drawn rotated with text()
  axis(1, at=mids, labels=FALSE)
  text(mids, par("usr")[3], srt=45, adj=1.1, labels=names(sorted),
       xpd=TRUE, cex=cex)
}

##' Returns the number, or percentage, of sequences per quality score per entry in a set of zip archives generated by FastQC.
##'
##' To be used after read.fastq.zips(). The counts are taken from the
##' "Per_sequence_quality_scores" table of each entry (columns "Quality" and
##' "Count"). Only quality scores from 1 to 50 are kept; scores outside this
##' range are discarded with a warning. Quality scores absent from an entry
##' are left as NA.
##' @param all.qc return value from read.fastq.zips()
##' @param perc return percentage of sequences if TRUE, number of sequences otherwise
##' @param nreads return value from nreads.fastqc(), required if perc=TRUE (used by position, hence it should be in the same order as all.qc)
##' @return numeric matrix with entries in rows and number (percentage) of sequences per quality in columns ("Q=1" to "Q=50")
##' @author Timothée Flutre [cre,aut]
quals.fastqc <- function(all.qc, perc=FALSE, nreads=NULL){
  stopifnot(is.list(all.qc), ! is.null(names(all.qc)))
  N <- length(all.qc)
  if(perc)
    stopifnot(! is.null(nreads), is.numeric(nreads), length(nreads) == N)

  nb.quals <- 50
  qual <- matrix(NA, nrow=N, ncol=nb.quals,
                 dimnames=list(names(all.qc),
                               paste0("Q=", seq_len(nb.quals))))

  for(i in seq_len(N)){
    scores <- all.qc[[i]][["Per_sequence_quality_scores"]]
    q <- scores[,"Quality"]
    ## the quality is used as column index, hence it must lie in 1..nb.quals
    keep <- ! is.na(q) & q >= 1 & q <= nb.quals
    if(! all(keep))
      warning(paste0("entry '", names(all.qc)[i], "': ", sum(! keep),
                     " quality score(s) outside [1,", nb.quals,
                     "] discarded"), call.=FALSE)
    qual[i, q[keep]] <- scores[keep, "Count"]
  }

  ## dividing a matrix by a vector of length nrow divides row i by nreads[i]
  if(perc && N > 0)
    qual <- (qual / unname(nreads)) * 100

  return(qual)
}

##' Plot the number, or percentage, of sequences per quality score with one curve per dataset.
##'
##' To be used after quals.fastqc(). The datasets (rows of qual) are drawn by
##' groups of at most max.datasets.per.plot, one plot per group. Inside a
##' plot, the k-th dataset is drawn with color k and plotting symbol
##' ((k-1) modulo 25) + 1, and the legend uses the same colors and symbols.
##' The x-axis spans the columns of qual having at least one non-NA value.
##' @param qual return value from quals.fastqc()
##' @param perc value of perc used when qual was generated by quals.fastqc() (only used to choose the y-axis label)
##' @param ylim lower and upper limits of the y-axis (will be min and max of qual by default)
##' @param max.datasets.per.plot max number of datasets on the same plot
##' @param main an overall title for the plot
##' @param legend.x x coordinate to position the legend (no legend if NULL)
##' @param legend.y y coordinate to position the legend
##' @param legend.cex numeric character expansion factor for legend labels
##' @param add.2nd.yaxis add a 2nd y-axis on the right side of the plot
##' @return None
##' @author Timothée Flutre [cre,aut], Nicolas Rode [ctb]
plot.nbseq.qual <- function(qual,
                            perc=FALSE,
                            ylim=NULL,
                            max.datasets.per.plot=25,
                            main="Quality control",
                            legend.x="topleft",
                            legend.y=NULL,
                            legend.cex=1,
                            add.2nd.yaxis=TRUE){
  stopifnot(is.matrix(qual),
            ! is.null(rownames(qual)),
            nrow(qual) > 0,
            any(! is.na(qual)),
            max.datasets.per.plot >= 1)

  xlab <- "Phred quality"
  ylab <- if(perc) "Percentage of sequences" else "Number of sequences"

  ## determine the lowest and highest qualities for the x-axis
  quals.with.data <- which(colSums(! is.na(qual)) > 0)
  xlim <- range(quals.with.data)

  ## determine the lowest and highest counts for the y-axis
  if(is.null(ylim))
    ylim <- range(qual, na.rm=TRUE)

  ## plot the data, by groups of at most max.datasets.per.plot datasets
  ## (a single group, hence a single plot, if there are few enough datasets)
  nb.datasets <- nrow(qual)
  for(first in seq(from=1, to=nb.datasets, by=max.datasets.per.plot)){
    rows <- first:min(first + max.datasets.per.plot - 1, nb.datasets)
    cols <- seq_along(rows)
    ## symbols cycle through 1..25; computed once so that curves and legend
    ## always agree
    pchs <- ((cols - 1) %% 25) + 1

    plot(x=0, y=0, xlim=xlim, ylim=ylim,
         xlab=xlab, ylab=ylab, main=main,
         type="n", bty="n")
    for(k in cols){
      idx <- which(! is.na(qual[rows[k],]))
      points(x=idx, y=qual[rows[k], idx], col=cols[k], pch=pchs[k], type="b")
    }
    if(add.2nd.yaxis)
      axis(side=4)
    if(! is.null(legend.x))
      legend(x=legend.x, y=legend.y, cex=legend.cex, bty="n",
             legend=rownames(qual)[rows],
             col=cols,
             pch=pchs)
  }

  invisible(NULL)
}

##' Gathers the content of a given adapter along the sequences for each entry in a set of zip archives generated by FastQC.
##'
##' To be used after read.fastq.zips(). The values are taken from the
##' "Adapter_Content" table of each entry. The columns of the output are the
##' positions of the first entry having the largest number of positions; the
##' rows of entries with fewer positions are completed with zeros on the right.
##' @param all.qc return value from read.fastq.zips()
##' @param adp name of the adapter to extract (default="Illumina Universal Adapter")
##' @return numeric matrix with entries in rows and positions along sequences in columns
##' @author Timothée Flutre [cre,aut], Nicolas Rode [ctb]
adp.contents.fastqc <- function(all.qc, adp="Illumina Universal Adapter"){
  stopifnot(is.list(all.qc), ! is.null(names(all.qc)))
  nb.entries <- length(all.qc)

  ## number of positions reported for each entry
  nb.pos <- vapply(all.qc, function(qc){
    length(qc$Adapter_Content[["Position"]])
  }, integer(1), USE.NAMES=FALSE)

  ## the first entry with the most positions provides the column names
  longest <- which.max(nb.pos)
  positions <- all.qc[[longest]]$Adapter_Content[["Position"]]

  adp.content <- matrix(NA, nrow=nb.entries, ncol=length(positions),
                        dimnames=list(names(all.qc), positions))
  for(i in seq_len(nb.entries)){
    tab <- all.qc[[i]]$Adapter_Content
    stopifnot(adp %in% names(tab))
    values <- tab[[adp]]
    nb.missing <- nb.pos[longest] - nb.pos[i]
    if(nb.missing != 0) # shorter entry: complete with zeros
      values <- c(values, rep(0, nb.missing))
    adp.content[i,] <- values
  }

  return(adp.content)
}

##' Gathers the N counts along the sequences for each entry in a set of zip archives generated by FastQC.
##'
##' To be used after read.fastq.zips(). The values are taken from the columns
##' "Base" and "N-Count" of the "Per_base_N_content" table of each entry. The
##' columns of the output are the positions of the first entry having the
##' largest number of positions; the rows of entries with fewer positions are
##' completed with zeros on the right.
##' @param all.qc return value from read.fastq.zips()
##' @return numeric matrix with entries in rows and positions along sequences in columns
##' @author Timothée Flutre [cre,aut]
baseNs.fastqc <- function(all.qc){
  stopifnot(is.list(all.qc), ! is.null(names(all.qc)))
  nb.entries <- length(all.qc)

  ## number of positions reported for each entry
  nb.pos <- vapply(all.qc, function(qc){
    length(qc$Per_base_N_content[,"Base"])
  }, integer(1), USE.NAMES=FALSE)

  ## the first entry with the most positions provides the column names
  longest <- which.max(nb.pos)
  positions <- all.qc[[longest]]$Per_base_N_content[,"Base"]

  baseN <- matrix(NA, nrow=nb.entries, ncol=length(positions),
                  dimnames=list(names(all.qc), positions))
  for(i in seq_len(nb.entries)){
    counts <- all.qc[[i]]$Per_base_N_content[,"N-Count"]
    nb.missing <- nb.pos[longest] - nb.pos[i]
    if(nb.missing != 0) # shorter entry: complete with zeros
      counts <- c(counts, rep(0, nb.missing))
    baseN[i,] <- counts
  }

  return(baseN)
}

##' Plot a variable content (adapter or N) as percentage along the sequences per entry in a set of zip archives generated by FastQC.
##'
##' To be used after adp.contents.fastqc() or baseNs.fastqc(). The datasets
##' (rows of content) are drawn by groups of at most max.datasets.per.plot,
##' one plot per group. Inside a plot, the k-th dataset is drawn with color k
##' and plotting symbol ((k-1) modulo 25) + 1, and the legend uses the same
##' colors and symbols. The x coordinate of a column is the number before the
##' first "-" in its name (e.g. 10 for "10-14").
##' @param content return value from adp.contents.fastqc() or baseNs.fastqc()
##' @param max.datasets.per.plot max number of datasets on the same plot
##' @param lowest.perc lowest percentage of content for the y-axis (min of content by default, ignoring NA)
##' @param highest.perc highest percentage of content for the y-axis (max of content by default, ignoring NA)
##' @param ylab a title for the y axis
##' @param main an overall title for the plot
##' @param legend.x x coordinate to position the legend (no legend if NULL)
##' @param legend.y y coordinate to position the legend
##' @param legend.cex numeric character expansion factor for legend labels
##' @param add.2nd.yaxis add a 2nd y-axis on the right side of the plot
##' @return None
##' @author Timothée Flutre [cre,aut], Nicolas Rode [ctb]
plot.content <- function(content,
                         max.datasets.per.plot=25,
                         lowest.perc=NULL,
                         highest.perc=NULL,
                         ylab="Content (%)",
                         main="Quality control",
                         legend.x="topleft",
                         legend.y=NULL,
                         legend.cex=1,
                         add.2nd.yaxis=TRUE){
  stopifnot(is.matrix(content),
            ! is.null(rownames(content)),
            ! is.null(colnames(content)),
            nrow(content) > 0,
            ncol(content) > 0,
            max.datasets.per.plot >= 1)

  xlab <- "Positions (bp)"

  ## determine the range of positions for the x-axis
  positions <- vapply(strsplit(colnames(content), "-"),
                      function(x){as.numeric(x[1])}, numeric(1))
  xlim <- c(positions[1], positions[length(positions)])

  ## determine the lowest and highest content percentage for the y-axis
  if(is.null(lowest.perc))
    lowest.perc <- min(content, na.rm=TRUE)
  if(is.null(highest.perc))
    highest.perc <- max(content, na.rm=TRUE)
  ylim <- c(lowest.perc, highest.perc)

  ## plot the data, by groups of at most max.datasets.per.plot datasets
  ## (a single group, hence a single plot, if there are few enough datasets)
  nb.datasets <- nrow(content)
  for(first in seq(from=1, to=nb.datasets, by=max.datasets.per.plot)){
    rows <- first:min(first + max.datasets.per.plot - 1, nb.datasets)
    cols <- seq_along(rows)
    ## symbols cycle through 1..25; computed once so that curves and legend
    ## always agree
    pchs <- ((cols - 1) %% 25) + 1

    plot(x=0, y=0, xlim=xlim, ylim=ylim,
         xlab=xlab, ylab=ylab, main=main,
         type="n", bty="n")
    for(k in cols)
      points(x=positions, y=content[rows[k],], col=cols[k], pch=pchs[k],
             type="b")
    if(add.2nd.yaxis)
      axis(side=4)
    if(! is.null(legend.x))
      legend(x=legend.x, y=legend.y, cex=legend.cex, bty="n",
             legend=rownames(content)[rows],
             col=cols,
             pch=pchs)
  }

  invisible(NULL)
}

##' Returns the sequence length distribution per entry in a set of zip archives generated by FastQC.
##'
##' To be used after read.fastq.zips(). The values are taken from the columns
##' "Length" and "Count" of the "Sequence_Length_Distribution" table of each
##' entry. A length given as a range (e.g. "30-34") is represented by the
##' number before the "-". The columns of the output are the sorted distinct
##' lengths over all entries, written without scientific notation; a length
##' absent from an entry has a count of 0.
##' @param all.qc return value from read.fastq.zips()
##' @return numeric matrix with entries in rows and sequence lengths in columns
##' @author Timothée Flutre [cre,aut], Nicolas Rode [ctb]
seq.lengths.fastqc <- function(all.qc){
  stopifnot(is.list(all.qc), ! is.null(names(all.qc)), length(all.qc) > 0)
  N <- length(all.qc)

  ## lengths of each entry as numbers (start of the range if any)
  starts <- lapply(all.qc, function(qc){
    len <- as.character(qc$Sequence_Length_Distribution[["Length"]])
    as.numeric(sub("-.*$", "", len))
  })
  positions <- sort(unique(unlist(starts, use.names=FALSE)))

  ## as.character(100000) is "1e+05": format the column names explicitly and
  ## locate the columns by value rather than by name, so that large lengths
  ## (e.g. from long reads) are handled
  seq.lengths <- matrix(0, nrow=N, ncol=length(positions),
                        dimnames=list(names(all.qc),
                                      format(positions, scientific=FALSE,
                                             trim=TRUE)))
  for(i in seq_len(N)){
    counts <- all.qc[[i]]$Sequence_Length_Distribution[["Count"]]
    seq.lengths[i, match(starts[[i]], positions)] <- counts
  }

  return(seq.lengths)
}

##' Plot the distribution of sequence lengths per entry in a set of zip archives generated by FastQC.
##'
##' To be used after seq.lengths.fastqc(). The datasets (rows of seq.length)
##' are drawn by groups of at most max.datasets.per.plot, one plot per group.
##' Inside a plot, the k-th dataset is drawn with color k and plotting symbol
##' ((k-1) modulo 25) + 1, and the legend uses the same colors and symbols.
##' The x coordinate of a column is the number before the first "-" in its
##' name.
##' @param seq.length matrix with datasets in rows and number of sequences per length in columns
##' @param max.datasets.per.plot max number of datasets on the same plot
##' @param lowest.len lower limit of the y-axis, i.e. lowest number of sequences (min of seq.length by default)
##' @param highest.len upper limit of the y-axis, i.e. highest number of sequences (max of seq.length by default)
##' @param main an overall title for the plot
##' @param ylab label for the y-axis
##' @param legend.x x coordinate to position the legend (no legend if NULL)
##' @param legend.y y coordinate to position the legend
##' @param legend.cex numeric character expansion factor for legend labels
##' @param add.2nd.yaxis add a 2nd y-axis on the right side of the plot
##' @return None
##' @author Timothée Flutre [cre,aut], Nicolas Rode [ctb]
plot.seq.length <- function(seq.length,
                            max.datasets.per.plot=25,
                            lowest.len=NULL,
                            highest.len=NULL,
                            main="Quality control",
                            ylab="Number of sequences",
                            legend.x="topleft",
                            legend.y=NULL,
                            legend.cex=1,
                            add.2nd.yaxis=TRUE){
  stopifnot(is.matrix(seq.length),
            ! is.null(rownames(seq.length)),
            ! is.null(colnames(seq.length)),
            nrow(seq.length) > 0,
            ncol(seq.length) > 0,
            max.datasets.per.plot >= 1)

  xlab <- "Sequence lengths (bp)"

  ## determine the range of lengths for the x-axis
  lengths <- vapply(strsplit(colnames(seq.length), "-"),
                    function(x){as.numeric(x[1])}, numeric(1))
  xlim <- c(lengths[1], lengths[length(lengths)])

  ## determine the lowest and highest counts of seq lengths for the y-axis
  if(is.null(lowest.len)){
    lowest.len <- min(seq.length, na.rm=TRUE)
    ## e.g. log10 of a zero count gives -Inf, unusable as axis limit
    if(is.infinite(lowest.len))
      stop("did you give log10(seq.length)? maybe use also lowest.len=0")
  }
  if(is.null(highest.len))
    highest.len <- max(seq.length, na.rm=TRUE)
  ylim <- c(lowest.len, highest.len)

  ## plot the data, by groups of at most max.datasets.per.plot datasets
  ## (a single group, hence a single plot, if there are few enough datasets)
  nb.datasets <- nrow(seq.length)
  for(first in seq(from=1, to=nb.datasets, by=max.datasets.per.plot)){
    rows <- first:min(first + max.datasets.per.plot - 1, nb.datasets)
    cols <- seq_along(rows)
    ## symbols cycle through 1..25; computed once so that curves and legend
    ## always agree
    pchs <- ((cols - 1) %% 25) + 1

    plot(x=0, y=0, xlim=xlim, ylim=ylim,
         xlab=xlab, ylab=ylab, main=main,
         type="n", bty="n")
    for(k in cols)
      points(x=lengths, y=seq.length[rows[k],], col=cols[k], pch=pchs[k],
             type="b")
    if(add.2nd.yaxis)
      axis(side=4)
    if(! is.null(legend.x))
      legend(x=legend.x, y=legend.y, cex=legend.cex, bty="n",
             legend=rownames(seq.length)[rows],
             col=cols,
             pch=pchs)
  }

  invisible(NULL)
}