From c8495a4154e06ff6e025692514457dd10488fde7 Mon Sep 17 00:00:00 2001 From: Adam Dinan Date: Mon, 22 Jan 2024 13:55:24 -0600 Subject: [PATCH] Remove GFF requirement when kallisto pseudoaligner used --- .github/workflows/ci.yml | 1 + README.md | 2 +- bin/count_reads_BIOTYPES.R | 346 +++++++++++++++++++++++++++++++++++ bin/merge_kallisto_counts.py | 98 +++++----- bin/normalise_counts.R | 7 +- main.nf | 7 +- modules/kallisto.nf | 8 +- 7 files changed, 413 insertions(+), 56 deletions(-) create mode 100644 bin/count_reads_BIOTYPES.R diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 07462c7..a3b0213 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -43,6 +43,7 @@ jobs: - name: Run pipeline with test data run: | nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --aligner kallisto --outdir ./results - name: Login to Docker Hub uses: docker/login-action@v1 diff --git a/README.md b/README.md index 855d18c..f2bd999 100644 --- a/README.md +++ b/README.md @@ -39,13 +39,13 @@ nextflow run BactSeq --data_dir [dir] --sample_file [file] --ref_genome [file] - Mandatory arguments: --data_dir [file] Path to directory containing FastQ files. --ref_genome [file] Path to FASTA file containing reference genome sequence (bwa) or multi-FASTA file containing coding gene sequences (kallisto). - --ref_ann [file] Path to GFF file containing reference genome annotation. --sample_file [file] Path to file containing sample information. -profile [str] Configuration profile to use. Available: conda, docker, singularity. Other options: --aligner [str] (Pseudo-)aligner to be used. Options: `bwa`, `kallisto`. Default = bwa. + --ref_ann [file] Path to GFF file containing reference genome annotation. Required only if bwa aligner used. --cont_tabl [file] Path to tsv file containing contrasts to be performed for differential expression. --fragment_len [str] Estimated average fragment length for kallisto transcript quantification (only required for single-end reads). Default = 150. --fragment_sd [str] Estimated standard deviation of fragment length for kallisto transcript quantification (only required for single-end reads). Default = 20. diff --git a/bin/count_reads_BIOTYPES.R b/bin/count_reads_BIOTYPES.R new file mode 100644 index 0000000..0d6508d --- /dev/null +++ b/bin/count_reads_BIOTYPES.R @@ -0,0 +1,346 @@ +#!/usr/bin/env Rscript + +if (!require("optparse")){ + install.packages("optparse",repos = "http://cran.us.r-project.org") +} +if (!require("tidyverse")){ + install.packages("tidyverse",repos = "http://cran.us.r-project.org") +} +if (!require("scales")){ + install.packages("scales",repos = "http://cran.us.r-project.org") +} +if (!require("RColorBrewer")){ + install.packages("RColorBrewer",repos = "http://cran.us.r-project.org") +} + + +library(optparse) +library(tidyverse) +library(scales) +library(RColorBrewer) +library(reshape2) + + +option_list <- list( + make_option(c("-m", "--metadata"), type="character", default=NULL, + help="sample metadata tsv file", metavar="character"), + make_option(c("-r", "--ref_gene_f"), type="character", default=NULL, + help="Gene annotations in reference strain", + metavar="character"), + make_option(c("-g", "--gene_counts_f"), type="character", default=NULL, + help="Gene annotations in reference strain", + metavar="character"), + make_option(c("-p", "--is_paired"), type="character", default=NULL, + help="are the reads paired-end? default = FALSE", + metavar="character") +) + +opt_parser <- OptionParser(option_list=option_list) +opt <- parse_args(opt_parser) + +meta_f <- opt$metadata +ref_gene_f <- opt$ref_gene_f +gene_counts_f <- opt$gene_counts_f +ispaired <- if(opt$is_paired == "TRUE") TRUE else FALSE + + +##------------------------------------------------------------------------------ +## Read data +##------------------------------------------------------------------------------ +meta_tab <- read.table(meta_f, header = TRUE, sep = "\t") +## columns: sample file1 file2 group rep_no paired + + +## cat the counts files +# system("cat *.counts > merged_counts.txt") + +total_counts_list <- lapply(meta_tab$sample, function(x){ + # total_counts <- read.table(paste0(x,".counts"), header = FALSE, sep = "\t") + # total_counts$sample <- x + # colnames(total_counts) <- c("chr","chr_size","mapped","blank","sample") + # total_counts + mapped_count <- read.table(paste0(x,".counts"), header = FALSE) + colnames(mapped_count) <- "mapped" + mapped_count$sample <- x + mapped_count +}) +merged_total_counts <- as.data.frame(do.call(rbind, total_counts_list)) + + + +##------------------------------------------------------------------------------ +## Read genome annotation +##------------------------------------------------------------------------------ +# ref_annot <- ape::read.gff(gff_f, na.strings = c(".", "?"), GFF3 = TRUE) + +# ref_annot <- subset(ref_annot, type=="gene") + +# gene_attr <- stringr::str_split(ref_annot$attributes,";") +# locus_tags <- unlist(lapply(gene_attr, function(x){x[grepl("locus_tag", x)]})) +# gene_biotypes <- unlist(lapply(gene_attr, function(x){x[grepl("gene_biotype", x)]})) +# common_gene_names <- unlist(lapply(gene_attr, function(x){ +# x <- x[grepl("gene=", x)] +# x[identical(x, character(0))] <- "" +# x +# })) +# gene_lengths <- (ref_annot$end - ref_annot$start) + 1 + +# ref_gene_df <- data.frame( +# locus_tag = locus_tags, +# biotype = gene_biotypes, +# gene_name = common_gene_names, +# gene_length = gene_lengths +# ) +# ref_gene_df$locus_tag <- gsub("locus_tag=","",ref_gene_df$locus_tag) +# ref_gene_df$biotype <- gsub("gene_biotype=","",ref_gene_df$biotype) +# ref_gene_df$gene_name <- gsub("gene=","",ref_gene_df$gene_name) + +# write.table( +# ref_gene_df, "ref_gene_df.tsv", col.names = TRUE, row.names = FALSE, +# sep = "\t", quote = FALSE +# ) + +ref_gene_df <- read_tsv(ref_gene_f) + + +##------------------------------------------------------------------------------ +## Count reads mapping to genes +##------------------------------------------------------------------------------ +# bamfilesCount <- paste0(meta_tab$sample, ".bam") + +# # 0 (unstranded), 1 (stranded) and 2 (reversely stranded) +# strand <- switch(as.character(strandedness), +# "unstranded" = 0, +# "forward" = 1, +# "reverse" = 2, +# stop("Invalid input") +# ) + +# gene_counts <- Rsubread::featureCounts( +# bamfilesCount, annot.ext = gff_f, +# isGTFAnnotationFile = TRUE, +# GTF.featureType = "gene", +# GTF.attrType = "locus_tag", +# nthreads = threads, +# countMultiMappingReads = TRUE, +# fraction = TRUE, ## assign fractional counts to multimappers +# isPairedEnd = ispaired, +# strandSpecific = strand +# ) +# colnames(counts_mat) <- gsub(".bam", "", colnames(counts_mat)) +# colnames(counts_mat) <- gsub("\\.","_",colnames(counts_mat)) + + +# counts_mat <- counts_mat +# counts_mat <- tibble::rownames_to_column(as.data.frame(counts_mat), "feature_id") + + +# write.table( +# counts_mat, "gene_counts.tsv", col.names = TRUE, row.names = FALSE, +# sep = "\t", quote = FALSE +# ) + +# ## protein-coding genes only +# gene_counts_pc <- counts_mat[ref_gene_df$biotype=="protein_coding",] +# write.table( +# gene_counts_pc, "gene_counts_pc.tsv", col.names = TRUE, row.names = FALSE, +# sep = "\t", quote = FALSE +# ) + +gene_counts <- read_tsv(gene_counts_f) +counts_mat <- as.data.frame(gene_counts[,2:ncol(gene_counts)]) +rownames(counts_mat) <- gene_counts$feature_id + + +##------------------------------------------------------------------------------ +## Plot library composition +##------------------------------------------------------------------------------ +## set up plots +brewer_pallette1 <- brewer.pal(9,"Set1") +brewer_pallette3 <- brewer.pal(8,"Dark2") + +gg_color_hue <- function(n) { + hues = seq(15, 375, length = n + 1) + hcl(h = hues, l = 65, c = 100)[1:n] +} +ggColsDefault <- (gg_color_hue(4)) +ggCols <- brewer_pallette1[c(1,3,4,5,2,7,8)] + +## summarise counts per sample + +all_biotypes <- unique(ref_gene_df$biotype) + +biotype_counts <- data.frame(do.call(cbind, + lapply(all_biotypes, function(biotype){ + colSums(counts_mat[ref_gene_df$biotype==biotype,,drop=FALSE]) +}))) +colnames(biotype_counts) <- all_biotypes + + + +counts_summary <- data.frame( + sample = meta_tab$"sample", + group = meta_tab$"group", + rep = meta_tab$"rep_no"#, + # protein_coding = colSums( + # counts_mat[ref_gene_df$biotype=="protein_coding",]), + # tRNA = colSums( + # counts_mat[ref_gene_df$biotype=="tRNA",]), + # rRNA = colSums( + # counts_mat[ref_gene_df$biotype=="rRNA",]) +) + +counts_summary <- cbind(counts_summary,biotype_counts) + +## add total mapped counts +counts_summary <- merge(counts_summary,merged_total_counts, by = "sample") +# counts_summary$other <- counts_summary$mapped-counts_summary$rRNA + + +counts_summary$antisense_other <- counts_summary$mapped - rowSums(biotype_counts)#( + #counts_summary$rRNA + counts_summary$protein_coding) + +write.table( + counts_summary, "counts_summary.tsv", col.names = TRUE, row.names = FALSE, + sep = "\t", quote = FALSE +) + +write.table( + biotype_counts, "biotype_counts.tsv", col.names = TRUE, row.names = FALSE, + sep = "\t", quote = FALSE +) + + + +counts_summary <- counts_summary[rev(order(counts_summary$sample)),] + + +cc1 <- 12 + +all_biotypes <- c(all_biotypes, "antisense_other") +non_rRNA_btypes <- all_biotypes[!all_biotypes=="rRNA"] + + +############################# +## raw counts plot +############################# +counts_melt <- reshape2::melt( + counts_summary, id.vars = c("sample"), + measure.vars = c( + # "protein_coding", + # "antisense_other", + # "rRNA" + all_biotypes + ) +) +counts_melt$sample <- factor( + counts_melt$sample, levels = rev(unique(sort(counts_melt$sample))) +) +counts_melt$variable <- factor(counts_melt$variable, levels=c( + "rRNA", + non_rRNA_btypes + # "antisense_other", + # "protein_coding" + ) +) +# minUsable <- min(mergedDf$q15_dedup_reads) + + +ylabel <- ifelse(isTRUE(ispaired), "Million read pairs", "Million reads") + +p1 <- ggplot(counts_melt, + aes(x = sample, colour = variable, fill = variable, y = value) + ) + + geom_bar(position = "stack", stat = "identity", width = 0.7) + + coord_flip() + + xlab("Sample") + ylab(ylabel) + + scale_fill_manual( + "", + values = ggCols, + guide = guide_legend(reverse = TRUE) + ) + + scale_colour_manual(values = ggCols, guide = FALSE) + + scale_y_continuous(labels = unit_format(unit = "", scale = 1e-6)) + + ## add a dashed line at the min usable number of reads + # geom_hline(yintercept = 5e6, linetype="dashed") + + theme_bw(base_size = cc1*1.3) + + theme( + legend.position = "top", + legend.title = element_blank(), + legend.text=element_text(size=cc1), + axis.text.x = element_text(colour = "black"), + axis.text.y = element_text(colour = "black") + ) + +nsamps <- ncol(counts_mat) + +ggsave( + p1, file = paste0('library_composition.png'), + device = "png", + width = 8, height = (nsamps/2.2), + dpi = 300 +) + + +############################# +## proportions plot +############################# +## get the proportions of reads per library +# propCols <- (mergedDf[,c(3,13,14,5)] / mergedDf[,2]) + +propCols <- counts_summary[c( + # "protein_coding", + # "antisense_other", + # "rRNA" + all_biotypes + )] / counts_summary$mapped + +propCols$sample <- counts_summary$sample + +# rowSums(propCols) ## each row should sum to 1 +prop_melt <- melt( + propCols, id.vars = c("sample"), + measure.vars = c( + # "protein_coding", + # "antisense_other", + # "rRNA" + all_biotypes + ) +) +prop_melt$sample <- factor( + prop_melt$sample, levels = rev(unique(sort(prop_melt$sample))) +) +prop_melt$variable <- factor(prop_melt$variable, levels=c( + "rRNA", + non_rRNA_btypes + # "antisense_other", + # "protein_coding" + ) +) + +p2 <- ggplot(prop_melt, + aes(x = sample, colour = variable, fill = variable, y = value) + ) + geom_bar(stat = "identity", width = 0.7) + + coord_flip() + + xlab("Sample") + ylab("Proportion of reads") + + scale_fill_manual( + "", + values = ggCols, + guide = guide_legend(reverse = TRUE) + ) + + scale_colour_manual(values = ggCols, guide = FALSE) + + scale_y_continuous(labels = comma) + + theme_bw(base_size = cc1*1.3) + + theme( + legend.position="top", + legend.text=element_text(size=cc1), + axis.text.x = element_text(colour = "black"), + axis.text.y = element_text(colour = "black") + ) + +ggsave( + p2, file = 'library_composition_proportions.png', + device = "png", + width = 8, height = (nsamps/2.2), + dpi = 300 +) \ No newline at end of file diff --git a/bin/merge_kallisto_counts.py b/bin/merge_kallisto_counts.py index e26d545..7bbe35b 100755 --- a/bin/merge_kallisto_counts.py +++ b/bin/merge_kallisto_counts.py @@ -3,71 +3,75 @@ import argparse import pandas as pd import os.path -import gffpandas.gffpandas as gffpd + +# import gffpandas.gffpandas as gffpd from collections import Counter def parse(): parser = argparse.ArgumentParser() - parser.add_argument( - "--metadata_f", - help = "TSV file containing metadata" - ) - parser.add_argument( - "--gff_f", - help = "GFF annotation file" - ) + parser.add_argument("--metadata_f", help="TSV file containing metadata") + # parser.add_argument("--gff_f", help="GFF annotation file") parser.add_argument("--out_dir", help="Directory for results") args = parser.parse_args() merge_counts(**vars(args)) -def merge_counts(metadata_f, gff_f, out_dir): + +def merge_counts( + metadata_f, + # gff_f, + out_dir, +): # merge counts - metadata = pd.read_csv(metadata_f, sep = "\t") + metadata = pd.read_csv(metadata_f, sep="\t") quant_dfs = [] for index, row in metadata.iterrows(): - sample_name = row['sample'] - quant_file = os.path.join('kallisto_'+sample_name, 'abundance.tsv') - quant_dat = pd.read_csv(quant_file, sep = "\t") + sample_name = row["sample"] + quant_file = os.path.join("kallisto_" + sample_name, "abundance.tsv") + quant_dat = pd.read_csv(quant_file, sep="\t") quant_dat = quant_dat[["target_id", "est_counts", "length"]] - quant_dat = quant_dat.rename(columns={'est_counts': sample_name}) + quant_dat = quant_dat.rename(columns={"est_counts": sample_name}) quant_dfs.append(quant_dat) - quant_dfs = [df.set_index('target_id') for df in quant_dfs] + quant_dfs = [df.set_index("target_id") for df in quant_dfs] quant_merged = pd.concat(quant_dfs, axis=1) - quant_merged = quant_merged.loc[:,~quant_merged.columns.duplicated()] + quant_merged = quant_merged.loc[:, ~quant_merged.columns.duplicated()] # get gene lengths from kallisto results (ensures no divergence from GFF) - quant_merged = quant_merged.rename_axis('feature_id').reset_index() - gene_lengths = quant_merged[['feature_id', 'length']] - quant_merged = quant_merged[['feature_id'] + metadata['sample'].tolist()] + quant_merged = quant_merged.rename_axis("feature_id").reset_index() + gene_lengths = quant_merged[["feature_id", "length"]] + gene_lengths = gene_lengths.rename(columns={"feature_id": "locus_tag"}) + quant_merged = quant_merged[["feature_id"] + metadata["sample"].tolist()] # export merged counts - outf1 = os.path.join(out_dir, 'gene_counts.tsv') - quant_merged.to_csv(outf1, index=False, sep='\t') - # gene annotation df for extracting protein-coding genes - annot_dat = (gffpd.read_gff3(gff_f)).df - cds_annot = (annot_dat[annot_dat['type']=='CDS']).copy(deep=True) - # rRNA_annot = annot_dat[annot_dat['type']=='rRNA'] - annot_split = cds_annot['attributes'].str.split(";") - locus_tags = (annot_split.str[1]).str.replace(r'Parent=gene-','') - gene_names = ((annot_split.str[6]).str.split("=")).str[1] - cds_annot['locus_tag'] = locus_tags - cds_annot['gene_name'] = gene_names - cds_annot['gene_length'] = (cds_annot['end'] - cds_annot['start']) + 1 - cds_annot['biotype'] = 'protein_coding' - cds_annot = cds_annot[['locus_tag', 'biotype', 'gene_name', 'gene_length']] - # export PC counts - quant_merged_pc = quant_merged[quant_merged['feature_id'].isin( - cds_annot['locus_tag'].tolist() - )] - outf2 = os.path.join(out_dir, 'gene_counts_pc.tsv') - quant_merged_pc.to_csv(outf2, index=False, sep='\t') + outf1 = os.path.join(out_dir, "gene_counts.tsv") + quant_merged.to_csv(outf1, index=False, sep="\t") + # # gene annotation df for extracting protein-coding genes + # annot_dat = (gffpd.read_gff3(gff_f)).df + # cds_annot = (annot_dat[annot_dat['type']=='CDS']).copy(deep=True) + # # rRNA_annot = annot_dat[annot_dat['type']=='rRNA'] + # annot_split = cds_annot['attributes'].str.split(";") + # locus_tags = (annot_split.str[1]).str.replace(r'Parent=gene-','') + # gene_names = ((annot_split.str[6]).str.split("=")).str[1] + # cds_annot['locus_tag'] = locus_tags + # cds_annot['gene_name'] = gene_names + # cds_annot['gene_length'] = (cds_annot['end'] - cds_annot['start']) + 1 + # cds_annot['biotype'] = 'protein_coding' + # cds_annot = cds_annot[['locus_tag', 'biotype', 'gene_name', 'gene_length']] + # # export PC counts + # quant_merged_pc = quant_merged[ + # quant_merged["feature_id"].isin(cds_annot["locus_tag"].tolist()) + # ] + quant_merged_pc = quant_merged + outf2 = os.path.join(out_dir, "gene_counts_pc.tsv") + quant_merged_pc.to_csv(outf2, index=False, sep="\t") # export ref gene df (using kallisto gene lengths instead of GFF) - gene_lengths = gene_lengths.rename(columns={'feature_id': 'locus_tag'}) - cds_annot = pd.merge(cds_annot, gene_lengths, on = "locus_tag") - cds_annot = cds_annot[['locus_tag', 'biotype', 'gene_name', 'length']] - cds_annot = cds_annot.drop_duplicates(keep='first') - cds_annot = cds_annot.rename(columns={'length': 'gene_length'}) - outf3 = os.path.join(out_dir, 'ref_gene_df.tsv') - cds_annot.to_csv(outf3, index=False, sep='\t') + # cds_annot = pd.merge(cds_annot, gene_lengths, on="locus_tag") + # cds_annot = cds_annot[["locus_tag", "biotype", "gene_name", "length"]] + # cds_annot = cds_annot.drop_duplicates(keep="first") + # cds_annot = cds_annot.rename(columns={"length": "gene_length"}) + # when not using GFF annotation: + gene_lengths = gene_lengths.rename(columns={"length": "gene_length"}) + outf3 = os.path.join(out_dir, "ref_gene_df.tsv") + # cds_annot.to_csv(outf3, index=False, sep="\t") + gene_lengths.to_csv(outf3, index=False, sep="\t") if __name__ == "__main__": diff --git a/bin/normalise_counts.R b/bin/normalise_counts.R index 94acbff..f76f7c1 100755 --- a/bin/normalise_counts.R +++ b/bin/normalise_counts.R @@ -40,9 +40,10 @@ counts_tab[["feature_id"]] <- NULL counts_tab <- as.data.frame(sapply(counts_tab, as.numeric)) rownames(counts_tab) <- gene_names -## remove rRNA genes -ref_tab_sub <- ref_gene_tab[!ref_gene_tab$biotype == "rRNA", ] -non_rRNA_counts <- counts_tab[rownames(counts_tab) %in% ref_tab_sub$locus_tag, ] +# ## remove rRNA genes +# ref_tab_sub <- ref_gene_tab[!ref_gene_tab$biotype == "rRNA", ] +# non_rRNA_counts <- counts_tab[rownames(counts_tab) %in% ref_tab_sub$locus_tag, ] +ref_tab_sub <- ref_gene_tab ## ensure that ref gene annotations order matches counts table ref_tab_sub <- ref_tab_sub[match( diff --git a/main.nf b/main.nf index 751a846..064d3cf 100644 --- a/main.nf +++ b/main.nf @@ -214,11 +214,12 @@ workflow { */ MERGE_COUNTS ( ch_kallisto_out_dirs, - ch_gff_file, + // ch_gff_file, ch_metadata ) - ch_readcounts_df = MERGE_COUNTS.out.counts_df - ch_readcounts_df_pc = MERGE_COUNTS.out.counts_df_pc + // ch_readcounts_df = MERGE_COUNTS.out.counts_df + // ch_readcounts_df_pc = MERGE_COUNTS.out.counts_df_pc + ch_readcounts_df_pc = MERGE_COUNTS.out.counts_df ch_refgene_df = MERGE_COUNTS.out.ref_gene_df diff --git a/modules/kallisto.nf b/modules/kallisto.nf index 4111609..deab388 100644 --- a/modules/kallisto.nf +++ b/modules/kallisto.nf @@ -65,7 +65,7 @@ process MERGE_COUNTS { input: path kallisto_dirs - path gff + // path gff path meta output: @@ -77,8 +77,12 @@ process MERGE_COUNTS { """ merge_kallisto_counts.py \ --metadata_f=$meta \ - --gff_f=$gff \ --out_dir="./" + + #merge_kallisto_counts.py \ + # --metadata_f=$meta \ + # --gff_f=$gff \ + # --out_dir="./" """ }