From c8495a4154e06ff6e025692514457dd10488fde7 Mon Sep 17 00:00:00 2001
From: Adam Dinan <adam1989ie@gmail.com>
Date: Mon, 22 Jan 2024 13:55:24 -0600
Subject: [PATCH] Remove GFF requirement when kallisto pseudoaligner used

---
 .github/workflows/ci.yml     |   1 +
 README.md                    |   2 +-
 bin/count_reads_BIOTYPES.R   | 346 +++++++++++++++++++++++++++++++++++
 bin/merge_kallisto_counts.py |  98 +++++-----
 bin/normalise_counts.R       |   7 +-
 main.nf                      |   7 +-
 modules/kallisto.nf          |   8 +-
 7 files changed, 413 insertions(+), 56 deletions(-)
 create mode 100644 bin/count_reads_BIOTYPES.R

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 07462c7..a3b0213 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -43,6 +43,7 @@ jobs:
       - name: Run pipeline with test data
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
+          nextflow run ${GITHUB_WORKSPACE} -profile test,docker --aligner kallisto --outdir ./results
 
       - name: Login to Docker Hub
         uses: docker/login-action@v1
diff --git a/README.md b/README.md
index 855d18c..f2bd999 100644
--- a/README.md
+++ b/README.md
@@ -39,13 +39,13 @@ nextflow run BactSeq --data_dir [dir] --sample_file [file] --ref_genome [file] -
 Mandatory arguments:
   --data_dir [file]               Path to directory containing FastQ files.
   --ref_genome [file]             Path to FASTA file containing reference genome sequence (bwa) or multi-FASTA file containing coding gene sequences (kallisto).
-  --ref_ann [file]                Path to GFF file containing reference genome annotation.
   --sample_file [file]            Path to file containing sample information.
   -profile [str]                  Configuration profile to use.
                                   Available: conda, docker, singularity.
 
 Other options:
   --aligner [str]                 (Pseudo-)aligner to be used. Options: `bwa`, `kallisto`. Default = bwa.
+  --ref_ann [file]                Path to GFF file containing reference genome annotation. Required only if bwa aligner used.
   --cont_tabl [file]              Path to tsv file containing contrasts to be performed for differential expression.
   --fragment_len [str]            Estimated average fragment length for kallisto transcript quantification (only required for single-end reads). Default = 150.
   --fragment_sd [str]             Estimated standard deviation of fragment length for kallisto transcript quantification (only required for single-end reads). Default = 20.
diff --git a/bin/count_reads_BIOTYPES.R b/bin/count_reads_BIOTYPES.R
new file mode 100644
index 0000000..0d6508d
--- /dev/null
+++ b/bin/count_reads_BIOTYPES.R
@@ -0,0 +1,346 @@
+#!/usr/bin/env Rscript
+
+if (!require("optparse")){
+    install.packages("optparse",repos = "http://cran.us.r-project.org")
+}
+if (!require("tidyverse")){
+    install.packages("tidyverse",repos = "http://cran.us.r-project.org")
+}
+if (!require("scales")){
+    install.packages("scales",repos = "http://cran.us.r-project.org")
+}
+if (!require("RColorBrewer")){
+    install.packages("RColorBrewer",repos = "http://cran.us.r-project.org")
+} 
+
+
+library(optparse)
+library(tidyverse)
+library(scales)
+library(RColorBrewer)
+library(reshape2)
+
+
+option_list <- list(
+    make_option(c("-m", "--metadata"), type="character", default=NULL,
+        help="sample metadata tsv file", metavar="character"),
+    make_option(c("-r", "--ref_gene_f"), type="character", default=NULL,
+        help="Gene annotations in reference strain", 
+        metavar="character"),
+    make_option(c("-g", "--gene_counts_f"), type="character", default=NULL,
+        help="Gene annotations in reference strain", 
+        metavar="character"),
+    make_option(c("-p", "--is_paired"), type="character", default=NULL,
+        help="are the reads paired-end? default = FALSE", 
+        metavar="character")
+)
+
+opt_parser <- OptionParser(option_list=option_list)
+opt <- parse_args(opt_parser)
+
+meta_f <- opt$metadata
+ref_gene_f <- opt$ref_gene_f
+gene_counts_f <- opt$gene_counts_f
+ispaired <- if(opt$is_paired == "TRUE") TRUE else FALSE
+
+
+##------------------------------------------------------------------------------
+## Read data
+##------------------------------------------------------------------------------
+meta_tab <- read.table(meta_f, header = TRUE, sep = "\t")
+## columns: sample	 file1   file2	group	rep_no    paired
+
+
+## cat the counts files
+# system("cat *.counts > merged_counts.txt")
+
+total_counts_list <- lapply(meta_tab$sample, function(x){
+    # total_counts <- read.table(paste0(x,".counts"), header = FALSE, sep = "\t")
+    # total_counts$sample <- x
+    # colnames(total_counts) <- c("chr","chr_size","mapped","blank","sample")
+    # total_counts
+    mapped_count <- read.table(paste0(x,".counts"), header = FALSE)
+    colnames(mapped_count) <- "mapped"
+    mapped_count$sample <- x
+    mapped_count
+})
+merged_total_counts <- as.data.frame(do.call(rbind, total_counts_list))
+
+
+
+##------------------------------------------------------------------------------
+## Read genome annotation
+##------------------------------------------------------------------------------
+# ref_annot <- ape::read.gff(gff_f, na.strings = c(".", "?"), GFF3 = TRUE)
+
+# ref_annot <- subset(ref_annot, type=="gene")
+
+# gene_attr <- stringr::str_split(ref_annot$attributes,";")
+# locus_tags <- unlist(lapply(gene_attr, function(x){x[grepl("locus_tag", x)]}))
+# gene_biotypes <- unlist(lapply(gene_attr, function(x){x[grepl("gene_biotype", x)]}))
+# common_gene_names <- unlist(lapply(gene_attr, function(x){
+#     x <- x[grepl("gene=", x)]
+#     x[identical(x, character(0))] <- ""
+#     x
+# }))
+# gene_lengths <- (ref_annot$end - ref_annot$start) + 1
+
+# ref_gene_df <- data.frame(
+#     locus_tag = locus_tags,
+#     biotype = gene_biotypes,
+#     gene_name = common_gene_names,
+#     gene_length = gene_lengths
+# )
+# ref_gene_df$locus_tag <- gsub("locus_tag=","",ref_gene_df$locus_tag)
+# ref_gene_df$biotype <- gsub("gene_biotype=","",ref_gene_df$biotype)
+# ref_gene_df$gene_name <- gsub("gene=","",ref_gene_df$gene_name)
+
+# write.table(
+#     ref_gene_df, "ref_gene_df.tsv", col.names = TRUE, row.names = FALSE,
+#     sep = "\t", quote = FALSE
+# )
+
+ref_gene_df <- read_tsv(ref_gene_f)
+
+
+##------------------------------------------------------------------------------
+## Count reads mapping to genes
+##------------------------------------------------------------------------------
+# bamfilesCount <- paste0(meta_tab$sample, ".bam")
+
+# # 0 (unstranded), 1 (stranded) and 2 (reversely stranded)
+# strand <- switch(as.character(strandedness),
+#        "unstranded" = 0,
+#        "forward" = 1,
+#        "reverse" = 2,
+#        stop("Invalid input")
+# )
+
+# gene_counts <- Rsubread::featureCounts(
+#     bamfilesCount, annot.ext = gff_f,
+#     isGTFAnnotationFile = TRUE,
+#     GTF.featureType = "gene", 
+#     GTF.attrType = "locus_tag",
+#     nthreads = threads, 
+#     countMultiMappingReads = TRUE, 
+#     fraction = TRUE, ## assign fractional counts to multimappers
+#     isPairedEnd = ispaired, 
+#     strandSpecific = strand
+# )
+# colnames(counts_mat) <- gsub(".bam", "", colnames(counts_mat))
+# colnames(counts_mat) <- gsub("\\.","_",colnames(counts_mat))
+
+
+# counts_mat <- counts_mat
+# counts_mat <- tibble::rownames_to_column(as.data.frame(counts_mat), "feature_id")
+
+
+# write.table(
+#     counts_mat, "gene_counts.tsv", col.names = TRUE, row.names = FALSE,
+#     sep = "\t", quote = FALSE
+# )
+
+# ## protein-coding genes only
+# gene_counts_pc <- counts_mat[ref_gene_df$biotype=="protein_coding",]
+# write.table(
+#     gene_counts_pc, "gene_counts_pc.tsv", col.names = TRUE, row.names = FALSE,
+#     sep = "\t", quote = FALSE
+# )
+
+gene_counts <- read_tsv(gene_counts_f)
+counts_mat <- as.data.frame(gene_counts[,2:ncol(gene_counts)])
+rownames(counts_mat) <- gene_counts$feature_id
+
+
+##------------------------------------------------------------------------------
+## Plot library composition
+##------------------------------------------------------------------------------
+## set up plots
+brewer_pallette1 <- brewer.pal(9,"Set1")
+brewer_pallette3 <- brewer.pal(8,"Dark2")
+
+gg_color_hue <- function(n) {
+    hues = seq(15, 375, length = n + 1)
+    hcl(h = hues, l = 65, c = 100)[1:n]
+}
+ggColsDefault <- (gg_color_hue(4))
+ggCols <- brewer_pallette1[c(1,3,4,5,2,7,8)]
+
+## summarise counts per sample
+
+all_biotypes <- unique(ref_gene_df$biotype)
+
+biotype_counts <- data.frame(do.call(cbind,
+    lapply(all_biotypes, function(biotype){
+    colSums(counts_mat[ref_gene_df$biotype==biotype,,drop=FALSE])
+})))
+colnames(biotype_counts) <- all_biotypes
+
+
+
+counts_summary <- data.frame(
+    sample = meta_tab$"sample",
+    group = meta_tab$"group",
+    rep = meta_tab$"rep_no"#,
+    # protein_coding = colSums(
+    #     counts_mat[ref_gene_df$biotype=="protein_coding",]),
+    # tRNA = colSums(
+        # counts_mat[ref_gene_df$biotype=="tRNA",]),
+    # rRNA = colSums(
+    #     counts_mat[ref_gene_df$biotype=="rRNA",])
+)
+
+counts_summary <- cbind(counts_summary,biotype_counts)
+
+## add total mapped counts
+counts_summary <- merge(counts_summary,merged_total_counts, by = "sample")
+# counts_summary$other <- counts_summary$mapped-counts_summary$rRNA
+
+
+counts_summary$antisense_other <- counts_summary$mapped - rowSums(biotype_counts)#(
+    #counts_summary$rRNA + counts_summary$protein_coding)
+
+write.table(
+    counts_summary, "counts_summary.tsv", col.names = TRUE, row.names = FALSE,
+    sep = "\t", quote = FALSE
+)
+
+write.table(
+    biotype_counts, "biotype_counts.tsv", col.names = TRUE, row.names = FALSE,
+    sep = "\t", quote = FALSE
+)
+
+
+
+counts_summary <- counts_summary[rev(order(counts_summary$sample)),]
+
+
+cc1 <- 12
+
+all_biotypes <- c(all_biotypes, "antisense_other")
+non_rRNA_btypes <- all_biotypes[!all_biotypes=="rRNA"]
+
+
+#############################
+## raw counts plot
+#############################
+counts_melt <- reshape2::melt(
+    counts_summary, id.vars = c("sample"),
+    measure.vars = c(
+        # "protein_coding", 
+        # "antisense_other",
+        # "rRNA"
+        all_biotypes
+        )
+)
+counts_melt$sample <- factor(
+    counts_melt$sample, levels = rev(unique(sort(counts_melt$sample)))
+)
+counts_melt$variable <- factor(counts_melt$variable, levels=c(
+    "rRNA", 
+    non_rRNA_btypes
+    # "antisense_other", 
+    # "protein_coding"
+    )
+)
+# minUsable <- min(mergedDf$q15_dedup_reads)
+
+
+ylabel <- ifelse(isTRUE(ispaired), "Million read pairs", "Million reads")
+
+p1 <- ggplot(counts_melt,
+        aes(x = sample, colour = variable, fill = variable, y = value)
+    ) + 
+    geom_bar(position = "stack", stat = "identity", width = 0.7) +
+    coord_flip() +
+    xlab("Sample") + ylab(ylabel) +
+    scale_fill_manual(
+        "",
+        values = ggCols,
+        guide = guide_legend(reverse = TRUE)
+    ) +
+    scale_colour_manual(values = ggCols, guide = FALSE) +
+    scale_y_continuous(labels = unit_format(unit = "", scale = 1e-6)) +
+    ## add a dashed line at the min usable number of reads
+    # geom_hline(yintercept = 5e6, linetype="dashed") +
+    theme_bw(base_size = cc1*1.3) +
+    theme(
+        legend.position = "top",
+        legend.title = element_blank(),
+        legend.text=element_text(size=cc1),
+        axis.text.x = element_text(colour = "black"),
+        axis.text.y = element_text(colour = "black")
+    )
+
+nsamps <- ncol(counts_mat)
+
+ggsave(
+    p1, file = paste0('library_composition.png'),
+    device = "png",
+    width = 8, height = (nsamps/2.2),
+    dpi = 300
+)
+
+
+#############################
+## proportions plot
+#############################
+## get the proportions of reads per library
+# propCols <- (mergedDf[,c(3,13,14,5)] / mergedDf[,2])
+
+propCols <- counts_summary[c(
+    # "protein_coding", 
+    # "antisense_other", 
+    # "rRNA"
+    all_biotypes
+    )] / counts_summary$mapped
+
+propCols$sample <- counts_summary$sample
+
+# rowSums(propCols) ## each row should sum to 1
+prop_melt <- melt(
+    propCols, id.vars = c("sample"),
+    measure.vars = c(
+        # "protein_coding", 
+        # "antisense_other", 
+        # "rRNA"
+        all_biotypes
+        )
+)
+prop_melt$sample <- factor(
+    prop_melt$sample, levels = rev(unique(sort(prop_melt$sample)))
+)
+prop_melt$variable <- factor(prop_melt$variable, levels=c(
+    "rRNA", 
+    non_rRNA_btypes
+    # "antisense_other", 
+    # "protein_coding"
+    )
+)
+
+p2 <- ggplot(prop_melt,
+         aes(x = sample, colour = variable, fill = variable, y = value)
+    ) + geom_bar(stat = "identity",  width = 0.7) +
+    coord_flip()  +
+    xlab("Sample") + ylab("Proportion of reads") +
+    scale_fill_manual(
+        "",
+        values = ggCols,
+        guide = guide_legend(reverse = TRUE)
+    ) +
+    scale_colour_manual(values = ggCols, guide = FALSE) +
+    scale_y_continuous(labels = comma) +
+    theme_bw(base_size = cc1*1.3) +
+    theme(
+        legend.position="top", 
+        legend.text=element_text(size=cc1),
+        axis.text.x = element_text(colour = "black"),
+        axis.text.y = element_text(colour = "black")
+        )
+
+ggsave(
+    p2, file = 'library_composition_proportions.png',
+    device = "png",
+    width = 8, height = (nsamps/2.2),
+    dpi = 300
+)
\ No newline at end of file
diff --git a/bin/merge_kallisto_counts.py b/bin/merge_kallisto_counts.py
index e26d545..7bbe35b 100755
--- a/bin/merge_kallisto_counts.py
+++ b/bin/merge_kallisto_counts.py
@@ -3,71 +3,75 @@
 import argparse
 import pandas as pd
 import os.path
-import gffpandas.gffpandas as gffpd
+
+# import gffpandas.gffpandas as gffpd
 from collections import Counter
 
 
 def parse():
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--metadata_f",
-        help = "TSV file containing metadata"
-    )
-    parser.add_argument(
-        "--gff_f",
-        help = "GFF annotation file"
-    )
+    parser.add_argument("--metadata_f", help="TSV file containing metadata")
+    # parser.add_argument("--gff_f", help="GFF annotation file")
     parser.add_argument("--out_dir", help="Directory for results")
     args = parser.parse_args()
     merge_counts(**vars(args))
 
-def merge_counts(metadata_f, gff_f, out_dir):
+
+def merge_counts(
+    metadata_f,
+    # gff_f,
+    out_dir,
+):
     # merge counts
-    metadata = pd.read_csv(metadata_f, sep = "\t")
+    metadata = pd.read_csv(metadata_f, sep="\t")
     quant_dfs = []
     for index, row in metadata.iterrows():
-        sample_name = row['sample']
-        quant_file = os.path.join('kallisto_'+sample_name, 'abundance.tsv')
-        quant_dat = pd.read_csv(quant_file, sep = "\t")
+        sample_name = row["sample"]
+        quant_file = os.path.join("kallisto_" + sample_name, "abundance.tsv")
+        quant_dat = pd.read_csv(quant_file, sep="\t")
         quant_dat = quant_dat[["target_id", "est_counts", "length"]]
-        quant_dat = quant_dat.rename(columns={'est_counts': sample_name})
+        quant_dat = quant_dat.rename(columns={"est_counts": sample_name})
         quant_dfs.append(quant_dat)
-    quant_dfs = [df.set_index('target_id') for df in quant_dfs]
+    quant_dfs = [df.set_index("target_id") for df in quant_dfs]
     quant_merged = pd.concat(quant_dfs, axis=1)
-    quant_merged = quant_merged.loc[:,~quant_merged.columns.duplicated()]
+    quant_merged = quant_merged.loc[:, ~quant_merged.columns.duplicated()]
     # get gene lengths from kallisto results (ensures no divergence from GFF)
-    quant_merged = quant_merged.rename_axis('feature_id').reset_index()
-    gene_lengths = quant_merged[['feature_id', 'length']]
-    quant_merged = quant_merged[['feature_id'] + metadata['sample'].tolist()]
+    quant_merged = quant_merged.rename_axis("feature_id").reset_index()
+    gene_lengths = quant_merged[["feature_id", "length"]]
+    gene_lengths = gene_lengths.rename(columns={"feature_id": "locus_tag"})
+    quant_merged = quant_merged[["feature_id"] + metadata["sample"].tolist()]
     # export merged counts
-    outf1 = os.path.join(out_dir, 'gene_counts.tsv')
-    quant_merged.to_csv(outf1, index=False, sep='\t')
-    # gene annotation df for extracting protein-coding genes
-    annot_dat = (gffpd.read_gff3(gff_f)).df
-    cds_annot = (annot_dat[annot_dat['type']=='CDS']).copy(deep=True)
-    # rRNA_annot = annot_dat[annot_dat['type']=='rRNA']
-    annot_split = cds_annot['attributes'].str.split(";")
-    locus_tags = (annot_split.str[1]).str.replace(r'Parent=gene-','')
-    gene_names = ((annot_split.str[6]).str.split("=")).str[1]
-    cds_annot['locus_tag'] = locus_tags
-    cds_annot['gene_name'] = gene_names
-    cds_annot['gene_length'] = (cds_annot['end'] - cds_annot['start']) + 1
-    cds_annot['biotype'] = 'protein_coding'
-    cds_annot = cds_annot[['locus_tag', 'biotype', 'gene_name', 'gene_length']]
-    # export PC counts
-    quant_merged_pc = quant_merged[quant_merged['feature_id'].isin(
-        cds_annot['locus_tag'].tolist()
-    )]
-    outf2 = os.path.join(out_dir, 'gene_counts_pc.tsv')
-    quant_merged_pc.to_csv(outf2, index=False, sep='\t')
+    outf1 = os.path.join(out_dir, "gene_counts.tsv")
+    quant_merged.to_csv(outf1, index=False, sep="\t")
+    # # gene annotation df for extracting protein-coding genes
+    # annot_dat = (gffpd.read_gff3(gff_f)).df
+    # cds_annot = (annot_dat[annot_dat['type']=='CDS']).copy(deep=True)
+    # # rRNA_annot = annot_dat[annot_dat['type']=='rRNA']
+    # annot_split = cds_annot['attributes'].str.split(";")
+    # locus_tags = (annot_split.str[1]).str.replace(r'Parent=gene-','')
+    # gene_names = ((annot_split.str[6]).str.split("=")).str[1]
+    # cds_annot['locus_tag'] = locus_tags
+    # cds_annot['gene_name'] = gene_names
+    # cds_annot['gene_length'] = (cds_annot['end'] - cds_annot['start']) + 1
+    # cds_annot['biotype'] = 'protein_coding'
+    # cds_annot = cds_annot[['locus_tag', 'biotype', 'gene_name', 'gene_length']]
+    # # export PC counts
+    # quant_merged_pc = quant_merged[
+    #     quant_merged["feature_id"].isin(cds_annot["locus_tag"].tolist())
+    # ]
+    quant_merged_pc = quant_merged
+    outf2 = os.path.join(out_dir, "gene_counts_pc.tsv")
+    quant_merged_pc.to_csv(outf2, index=False, sep="\t")
     # export ref gene df (using kallisto gene lengths instead of GFF)
-    gene_lengths = gene_lengths.rename(columns={'feature_id': 'locus_tag'})
-    cds_annot = pd.merge(cds_annot, gene_lengths, on = "locus_tag")
-    cds_annot = cds_annot[['locus_tag', 'biotype', 'gene_name', 'length']]
-    cds_annot = cds_annot.drop_duplicates(keep='first')
-    cds_annot = cds_annot.rename(columns={'length': 'gene_length'})
-    outf3 = os.path.join(out_dir, 'ref_gene_df.tsv')
-    cds_annot.to_csv(outf3, index=False, sep='\t')
+    # cds_annot = pd.merge(cds_annot, gene_lengths, on="locus_tag")
+    # cds_annot = cds_annot[["locus_tag", "biotype", "gene_name", "length"]]
+    # cds_annot = cds_annot.drop_duplicates(keep="first")
+    # cds_annot = cds_annot.rename(columns={"length": "gene_length"})
+    # when not using GFF annotation:
+    gene_lengths = gene_lengths.rename(columns={"length": "gene_length"})
+    outf3 = os.path.join(out_dir, "ref_gene_df.tsv")
+    # cds_annot.to_csv(outf3, index=False, sep="\t")
+    gene_lengths.to_csv(outf3, index=False, sep="\t")
 
 
 if __name__ == "__main__":
diff --git a/bin/normalise_counts.R b/bin/normalise_counts.R
index 94acbff..f76f7c1 100755
--- a/bin/normalise_counts.R
+++ b/bin/normalise_counts.R
@@ -40,9 +40,10 @@ counts_tab[["feature_id"]] <- NULL
 counts_tab <- as.data.frame(sapply(counts_tab, as.numeric))
 rownames(counts_tab) <- gene_names
 
-## remove rRNA genes
-ref_tab_sub <- ref_gene_tab[!ref_gene_tab$biotype == "rRNA", ]
-non_rRNA_counts <- counts_tab[rownames(counts_tab) %in% ref_tab_sub$locus_tag, ]
+# ## remove rRNA genes
+# ref_tab_sub <- ref_gene_tab[!ref_gene_tab$biotype == "rRNA", ]
+# non_rRNA_counts <- counts_tab[rownames(counts_tab) %in% ref_tab_sub$locus_tag, ]
+ref_tab_sub <- ref_gene_tab
 
 ## ensure that ref gene annotations order matches counts table
 ref_tab_sub <- ref_tab_sub[match(
diff --git a/main.nf b/main.nf
index 751a846..064d3cf 100644
--- a/main.nf
+++ b/main.nf
@@ -214,11 +214,12 @@ workflow {
          */
         MERGE_COUNTS (
             ch_kallisto_out_dirs,
-            ch_gff_file,
+            // ch_gff_file,
             ch_metadata
         )
-        ch_readcounts_df = MERGE_COUNTS.out.counts_df
-        ch_readcounts_df_pc = MERGE_COUNTS.out.counts_df_pc
+        // ch_readcounts_df = MERGE_COUNTS.out.counts_df
+        // ch_readcounts_df_pc = MERGE_COUNTS.out.counts_df_pc
+        ch_readcounts_df_pc = MERGE_COUNTS.out.counts_df
         ch_refgene_df = MERGE_COUNTS.out.ref_gene_df
 
 
diff --git a/modules/kallisto.nf b/modules/kallisto.nf
index 4111609..deab388 100644
--- a/modules/kallisto.nf
+++ b/modules/kallisto.nf
@@ -65,7 +65,7 @@ process MERGE_COUNTS {
 
     input:
     path kallisto_dirs
-    path gff
+    // path gff
     path meta
 
     output:
@@ -77,8 +77,12 @@ process MERGE_COUNTS {
     """
     merge_kallisto_counts.py \
         --metadata_f=$meta \
-        --gff_f=$gff \
         --out_dir="./"
+
+    #merge_kallisto_counts.py \
+    #    --metadata_f=$meta \
+    #    --gff_f=$gff \
+    #    --out_dir="./"
     """
 }