Commit

Merge pull request #19 from PacificBiosciences/update/newGTDBTk
Update/new-gtdb-tk
dportik authored Apr 13, 2022
2 parents 678ae04 + 2b8a7cd commit dbc183d
Showing 7 changed files with 46 additions and 19 deletions.
4 changes: 2 additions & 2 deletions HiFi-MAG-Pipeline/config.yaml
@@ -64,7 +64,7 @@ gtdbtk:
max_contamination: 10

# The maximum number of contigs allowed in a genome bin.
- max_contigs: 10
+ max_contigs: 20

# The number of threads to use for gtdb-tk.
threads: 48
@@ -79,5 +79,5 @@ gtdbtk:
# Please note that the data for release 207 are ~66GB in size.
# The path below must point to the directory that contains the following folders:
# fastani, markers, masks, metadata, mrca_red, msa, pplacer, radii, taxonomy
gtdbtk_data: "/dept/appslab/datasets/dp_gtdb/release202/"
gtdbtk_data: "/dept/appslab/datasets/dp_gtdb/release207"

2 changes: 1 addition & 1 deletion HiFi-MAG-Pipeline/envs/gtdbtk.yml
@@ -4,5 +4,5 @@ channels:
- conda-forge
- defaults
dependencies:
- - gtdbtk == 1.5.0
+ - gtdbtk == 2.0.0
- python == 3.7
33 changes: 28 additions & 5 deletions Taxonomic-Functional-Profiling-Protein/Snakefile-taxprot
@@ -8,7 +8,9 @@ CHUNKS = [str(i) for i in list(range(0,config['diamond']['chunks']))]

rule all:
input:
expand("4-rma/{sample}.protein.{mode}.rma", sample = SAMPLES, mode = config['sam2rma']['readassignmentmode'])
expand("4-rma/{sample}_filtered.protein.{mode}.rma", sample = SAMPLES, mode = config['sam2rma']['readassignmentmode']),
expand("4-rma/{sample}_unfiltered.protein.{mode}.rma", sample = SAMPLES, mode = config['sam2rma']['readassignmentmode'])


rule SplitFasta:
input:
@@ -60,12 +62,12 @@ rule MergeSam:
shell:
"python scripts/sam-merger-screen-cigar.py -i {input} -o {output} -l {log}"

- rule MakeRMA:
+ rule MakeRMAfiltered:
input:
sam = "3-merged/{sample}.merged.sam",
reads = "inputs/{sample}.fasta"
output:
expand("4-rma/{{sample}}.protein.{mode}.rma", mode = config['sam2rma']['readassignmentmode'])
expand("4-rma/{{sample}}_filtered.protein.{mode}.rma", mode = config['sam2rma']['readassignmentmode'])
conda:
"envs/general.yml"
threads: config['sam2rma']['threads']
@@ -75,11 +77,32 @@ rule MakeRMA:
ram = config['sam2rma']['readassignmentmode'],
ms = config['sam2rma']['minSupportPercent']
log:
f"logs/{{sample}}.MakeRMA.{config['sam2rma']['readassignmentmode']}.log"
f"logs/{{sample}}.MakeRMA.filtered.{config['sam2rma']['readassignmentmode']}.log"
benchmark:
f"benchmarks/{{sample}}.MakeRMA.{config['sam2rma']['readassignmentmode']}.tsv"
f"benchmarks/{{sample}}.MakeRMA.filtered.{config['sam2rma']['readassignmentmode']}.tsv"
shell:
"{params.sam2rma} -i {input.sam} -r {input.reads} -o {output} -lg -alg longReads "
"-t {threads} -mdb {params.db} -ram {params.ram} --minSupportPercent {params.ms} "
"-v 2> {log}"

+ rule MakeRMAunfiltered:
+ input:
+ sam = "3-merged/{sample}.merged.sam",
+ reads = "inputs/{sample}.fasta"
+ output:
+ expand("4-rma/{{sample}}_unfiltered.protein.{mode}.rma", mode = config['sam2rma']['readassignmentmode'])
+ conda:
+ "envs/general.yml"
+ threads: config['sam2rma']['threads']
+ params:
+ sam2rma = config['sam2rma']['path'],
+ db = config['sam2rma']['db'],
+ ram = config['sam2rma']['readassignmentmode']
+ log:
+ f"logs/{{sample}}.MakeRMA.unfiltered.{config['sam2rma']['readassignmentmode']}.log"
+ benchmark:
+ f"benchmarks/{{sample}}.MakeRMA.unfiltered.{config['sam2rma']['readassignmentmode']}.tsv"
+ shell:
+ "{params.sam2rma} -i {input.sam} -r {input.reads} -o {output} -lg -alg longReads "
+ "-t {threads} -mdb {params.db} -ram {params.ram} --minSupportPercent 0 "
+ "-v 2> {log}"
2 changes: 1 addition & 1 deletion Taxonomic-Functional-Profiling-Protein/config.yaml
@@ -42,7 +42,7 @@ sam2rma:
# https://software-ab.informatik.uni-tuebingen.de/download/megan6/welcome.html
# please note that the name of this file will change depending on the version,
# and that this must be the version for the NCBI-nr accessions.
db: "/home/dportik/programs/megan/db/megan-map-Jan2021.db"
db: "/home/dportik/programs/megan/db/megan-map-Feb2022.db"

# Number of threads to use for sam2rma, 24 is generally sufficient
threads: 24
Binary file added docs/HiFi-MAG-Binning.png
12 changes: 8 additions & 4 deletions docs/Tutorial-HiFi-MAG-Pipeline.md
@@ -20,7 +20,11 @@ The purpose of this snakemake workflow is to obtain high-quality metagenome-asse

![GBSteps](https://github.com/PacificBiosciences/pb-metagenomics-tools/blob/master/docs/HiFi-MAG-Summary.png)

- HiFi reads are first mapped to contigs using minimap2 to generate BAM files. The BAM files are used to obtain coverage estimates for the contigs. The coverages and contigs are used as inputs to MetaBAT2, which constructs the genome bins. **New feature in v1.4+:** Bins are constructed in three ways: 1) using the full set of contigs for MetaBAT2 binning, 2) using only linear contigs for MetaBAT2 binning, and 3) assigning circular contigs to bins directly. These binning strategies are subsequently compared and merged using DAS_Tool. This method prevents improper binning of complete, circular contigs and improves MAG recovery by 10-30%:
+ HiFi reads are first mapped to contigs using minimap2 to generate BAM files. The BAM files are used to obtain coverage estimates for the contigs. The coverages and contigs are used as inputs to MetaBAT2, which constructs the genome bins. **New feature in v1.4+:** Bins are constructed in three ways: 1) using the full set of contigs for MetaBAT2 binning, 2) using only linear contigs for MetaBAT2 binning, and 3) assigning circular contigs to bins directly. These binning strategies are subsequently compared and merged using DAS_Tool.
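To make the merging step concrete, below is a minimal sketch of how DAS_Tool can reconcile multiple contig-to-bin assignments; the table and label names are illustrative assumptions, not the pipeline's exact file names:

```
# Hypothetical contig-to-bin tables from the three binning strategies (names are assumptions):
#   sample.allcontigs.tsv  -> MetaBAT2 bins from all contigs
#   sample.linear.tsv      -> MetaBAT2 bins from linear contigs only
#   sample.circular.tsv    -> circular contigs assigned directly to bins
DAS_Tool \
  -i sample.allcontigs.tsv,sample.linear.tsv,sample.circular.tsv \
  -l allcontigs,linear,circular \
  -c sample.contigs.fasta \
  -o sample_DASTool \
  -t 24
```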

+ ![Binning](https://github.com/PacificBiosciences/pb-metagenomics-tools/blob/master/docs/HiFi-MAG-Binning.png)

+ This improved method prevents improper binning of complete, circular contigs and provides an 8–28% increase in total MAGs and a 28–73% increase in single-contig MAGs.

![Improvement](https://github.com/PacificBiosciences/pb-metagenomics-tools/blob/master/docs/HiFi-MAG-Pipeline-Update.png)

@@ -38,7 +42,7 @@ The merged bin set is then screened with CheckM to assess the quality of the res
This workflow requires [Anaconda](https://docs.anaconda.com/anaconda/)/[Conda](https://docs.conda.io/projects/conda/en/latest/index.html) and [Snakemake](https://snakemake.readthedocs.io/en/stable/) to be installed, and will require 45-150GB memory and >250GB temporary disk space (see [Requirements section](#RFR)). All dependencies in the workflow are installed using conda and the environments are activated by snakemake for relevant steps. Snakemake v5+ is required, and the workflows have been tested using v5.19.3.

- Clone the HiFi-MAG-Pipeline directory.
- - Download and unpack the databases for CheckM (<2GB) and GTDB (~28GB). Specify paths to each database in `config.yaml`.
+ - Download and unpack the databases for CheckM (<2GB) and GTDB (~66GB). Specify paths to each database in `config.yaml`.
- Include all input HiFi fasta files (`SAMPLE.fasta`) and contig fasta files (`SAMPLE.contigs.fasta`) in the `inputs/` folder. These can be files or symlinks. If you assembled with metaFlye, you must also include the `SAMPLE-assembly_info.txt` file here.
- Edit sample names in `Sample-Config.yaml` configuration file in `configs/` for your project.
- Edit the assembly method used to generate the contigs in `config.yaml`. This choice will be used to identify which contigs are circular in the assembly. Default is `hifiasm-meta`, but other supported options include `hicanu` and `metaflye`.
@@ -146,7 +150,7 @@ The downloaded file must be decompressed to use it. The unpacked contents will b

Complete instructions for the GTDB-Tk database can be found at: https://ecogenomics.github.io/GTDBTk/installing/index.html

- This workflow uses GTDB-Tk v1.5.0, which requires GTDB R06-RS202 (release 202) or later.
+ **As of April 2022, this workflow now uses GTDB-Tk v2.0+, which requires GTDB R07-RS207 (release 207).**

The current GTDB release can be downloaded from:
https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/gtdbtk_data.tar.gz
@@ -156,7 +160,7 @@ wget https://data.gtdb.ecogenomic.org/releases/latest/auxillary_files/gtdbtk_dat
tar -xvzf gtdbtk_data.tar.gz
```
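As a quick sanity check after unpacking (a sketch; the name of the unpacked directory can vary by release):

```
# Verify the unpacked GTDB-Tk data contains the expected subfolders
ls release207/
# fastani  markers  masks  metadata  mrca_red  msa  pplacer  radii  taxonomy

# Then point gtdbtk_data in config.yaml at this directory, e.g.:
#   gtdbtk_data: "/path/to/release207"
```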

- It must also be decompressed prior to usage. The unpacked contents (of release 202) will be ~50GB in size. The path to the directory containing the decompressed contents must be specified in the main configuration file (`config.yaml`). The decompressed file should result in several folders (`fastani/`, `markers/`, `masks/`, `metadata/`, `mrca_red/`, `msa/`, `pplacer/`, `radii/`, `taxonomy/`).
+ It must also be decompressed prior to usage. The unpacked contents will be ~66GB in size. The path to the directory containing the decompressed contents must be specified in the main configuration file (`config.yaml`). The decompressed file should result in several folders (`fastani/`, `markers/`, `masks/`, `metadata/`, `mrca_red/`, `msa/`, `pplacer/`, `radii/`, `taxonomy/`).


[Back to top](#TOP)
12 changes: 6 additions & 6 deletions docs/Tutorial-Taxonomic-Functional-Profiling-Protein.md
@@ -16,11 +16,11 @@

# **Taxonomic-Functional-Profiling-Protein Overview** <a name="PO"></a>

- The purpose of this snakemake workflow is to perform translation alignment of HiFi reads to a protein database, and convert the resulting alignments to RMA input files for MEGAN6. This allows interactive taxonomic and functional analyses to be performed across samples. The general steps of the Taxonomic-Functional-Profiling-Protein pipeline are shown below:
+ The purpose of this snakemake workflow is to perform translation alignment of HiFi reads to a protein database, and convert the resulting alignments to RMA input files for MEGAN-LR. This allows interactive taxonomic and functional analyses to be performed across samples. The general steps of the Taxonomic-Functional-Profiling-Protein pipeline are shown below:

![GBSteps](https://github.com/PacificBiosciences/pb-metagenomics-tools/blob/master/docs/Taxonomic-Functional-Profiling-Protein-Steps.png)

- Translation alignments of the HiFi reads to the protein database are performed using long-read settings in DIAMOND, resulting in a protein-SAM file. The specific long-read settings involve the `--range-culling` option. DIAMOND can take a very long time to run with large files. In this pipeline, the process is parallelized by splitting each file of HiFi reads into *n* chunks (default = 4). This allows a speedup when jobs are run simultaneously, and also reduces the resources required to run the jobs sequentially. The chunked protein-SAM files per sample are filtered for illegal CIGAR strings (frameshift characters) and merged into a single protein-SAM file. The resulting protein-SAM file is converted into long-read RMA format using the sam2rma tool distributed with MEGAN6. The settings for the RMA file are optimized to balance precision and recall, but can be changed for different use-cases. Each input file of HiFi reads will produce a corresponding output RMA file, ready to use with MEGAN6.
+ Translation alignments of the HiFi reads to the protein database are performed using long-read settings in DIAMOND, resulting in a protein-SAM file. The specific long-read settings involve the `--range-culling` option. DIAMOND can take a very long time to run with large files. In this pipeline, the process is parallelized by splitting each file of HiFi reads into *n* chunks (default = 4). This allows a speedup when jobs are run simultaneously, and also reduces the resources required to run the jobs sequentially. The chunked protein-SAM files per sample are filtered for illegal CIGAR strings (frameshift characters) and merged into a single protein-SAM file. The resulting protein-SAM file is converted into long-read RMA format using the sam2rma tool distributed with MEGAN-LR. The settings for the RMA file are optimized to balance precision and recall, but can be changed for different use-cases. Each input file of HiFi reads will produce two corresponding output RMA files (one filtered, one unfiltered), ready to use with MEGAN-LR.
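For orientation, the per-sample flow that the pipeline automates looks roughly like the sketch below. The chunk file names and the database/mapping-file names are assumptions for illustration; the actual commands are templated in the Snakefile:

```
# 1) Split SAMPLE.fasta into chunks (handled by the SplitFasta rule)

# 2) Align each chunk with DIAMOND long-read settings
diamond blastx -d diamond_nr_db -q 1-chunks/SAMPLE.chunk0.fasta \
  -o 2-diamond/SAMPLE.chunk0.sam -f 101 -F 5000 --range-culling --top 5

# 3) Screen illegal CIGAR strings and merge the chunked protein-SAM files
python scripts/sam-merger-screen-cigar.py -i 2-diamond/SAMPLE.chunk*.sam \
  -o 3-merged/SAMPLE.merged.sam -l merge.log

# 4) Convert the merged protein-SAM to RMA for MEGAN (filtered output shown)
sam2rma -i 3-merged/SAMPLE.merged.sam -r inputs/SAMPLE.fasta \
  -o 4-rma/SAMPLE_filtered.protein.readCount.rma \
  -lg -alg longReads -mdb megan-map-Feb2022.db -ram readCount --minSupportPercent 0.01
```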


[Back to top](#TOP)
@@ -32,7 +32,7 @@ Translation alignments of the HiFi reads to the protein database are performed u
This workflow requires [Anaconda](https://docs.anaconda.com/anaconda/)/[Conda](https://docs.conda.io/projects/conda/en/latest/index.html) and [Snakemake](https://snakemake.readthedocs.io/en/stable/) to be installed, and will require ~60GB memory and 40-200GB disk space per sample (see [Requirements section](#RFR)). All dependencies in the workflow are installed using conda and the environments are activated by snakemake for relevant steps. Snakemake v5+ is required, and the workflows have been tested using v5.19.3.

- Clone the Taxonomic-Functional-Profiling-Protein directory.
- - Download MEGAN6 community edition from the [MEGAN download page](https://software-ab.informatik.uni-tuebingen.de/download/megan6/welcome.html) to obtain `sam2rma`. **Ensure you have at least version 6.19+.**
+ - Download MEGAN6/MEGAN-LR community edition from the [MEGAN download page](https://software-ab.informatik.uni-tuebingen.de/download/megan6/welcome.html) to obtain `sam2rma`. **Ensure you have at least version 6.19+.**
- Download and unpack the newest MEGAN mapping file for NCBI-nr accessions from the [MEGAN download page](https://software-ab.informatik.uni-tuebingen.de/download/megan6/welcome.html).
- Download the NCBI-nr database from: ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz*
- Index the NCBI-nr database with DIAMOND using `diamond makedb --in nr.gz --db diamond_nr_db --threads 24`. This will result in a `diamond_nr_db.dmnd` file that is ~150GB.
@@ -143,7 +143,7 @@ Depending on your system resources, you may choose to change the number of threa

The `hit_limit` argument allows you to specify the type of hit limit method and corresponding value. You can choose between the `--top` method or `-k` method, which are used with the range-culling mode (see [DIAMOND documentation](http://www.diamondsearch.org/index.php?pages/command_line_options/)). The default is `--top 5`, meaning a hit will only be deleted if its score is more than 5% lower than that of a higher scoring hit over at least 50% of its query range. Using `-k 5` instead means that a hit will only be deleted if at least 50% of its query range is spanned by at least 5 higher or equal scoring hits. In general, the `-k` method will keep far fewer hits, and specifying `-k 1` will keep a single hit per query range. This can be useful for 1) very simple metagenomic communities, or 2) reducing the output file size. If you choose to modify the `hit_limit` argument, you will want to supply the complete DIAMOND flag (e.g., `-k 3` or `--top 10`).
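For example, the two hit-limit choices translate to DIAMOND invocations that differ only in the final flag (a sketch based on the command shown in the DIAMOND section below; database and file names are placeholders):

```
# Default: --top 5 keeps hits scoring within 5% of the best hit over a query range
diamond blastx -d diamond_nr_db -q SAMPLE.fasta -o SAMPLE.sam \
  -f 101 -F 5000 --range-culling --top 5

# Stricter: -k 1 keeps a single hit per query range (smaller output files)
diamond blastx -d diamond_nr_db -q SAMPLE.fasta -o SAMPLE.sam \
  -f 101 -F 5000 --range-culling -k 1
```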

- Finally, consider the `minSupportPercent` argument, which is the minimum support as percent of assigned reads required to report a taxon. The default in MEGAN is 0.05, but with HiFi the best value appears to be 0.01. This provides an optimal trade-off between precision and recall, with near-perfect detection of species down to ~0.04% abundance. To avoid any filtering based on this threshold, use a value of 0 instead. This will report ALL assigned reads, which will potentially include thousands of false positives at ultra-low abundances (<0.01%), similar to results from short-read methods (e.g., Kraken2, Centrifuge, etc.). Make sure you filter such files after the analysis to reduce false positives!
+ Finally, consider the `minSupportPercent` argument, which is the minimum support as percent of assigned reads required to report a taxon. The default in MEGAN is 0.05, but with HiFi the best value appears to be 0.01. This provides an optimal trade-off between precision and recall, with near-perfect detection of species down to ~0.04% abundance. To avoid any filtering based on this threshold, use a value of 0 instead. This will report ALL assigned reads, which will potentially include thousands of false positives at ultra-low abundances (<0.01%), similar to results from short-read methods (e.g., Kraken2, Centrifuge, etc.). Make sure you filter such files after the analysis to reduce false positives! **Note:** This parameter will only affect the filtered RMA file; a second unfiltered RMA file is also produced by default.
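The filtered and unfiltered outputs correspond to the same sam2rma call run with two thresholds, roughly as follows (a sketch; file names are placeholders):

```
# Filtered RMA: taxa below the 0.01% read-support threshold are suppressed
sam2rma -i SAMPLE.merged.sam -r SAMPLE.fasta -o SAMPLE_filtered.protein.readCount.rma \
  -lg -alg longReads -mdb megan-map-Feb2022.db -ram readCount --minSupportPercent 0.01

# Unfiltered RMA: all assigned reads are reported (filter downstream in MEGAN)
sam2rma -i SAMPLE.merged.sam -r SAMPLE.fasta -o SAMPLE_unfiltered.protein.readCount.rma \
  -lg -alg longReads -mdb megan-map-Feb2022.db -ram readCount --minSupportPercent 0
```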

**You must also specify the full paths to `sam2rma`, the MEGAN mapping database file, and the indexed NCBI-nr database (`diamond_nr_db.dmnd`)**.

@@ -278,7 +278,7 @@ Taxonomic-Functional-Profiling-Protein
- `1-chunks/` temporarily holds the four chunks of an input reads file. Will be empty if no errors occur.
- `2-diamond/` temporarily holds the four protein-SAM files generated per sample. Will be empty if no errors occur.
- `3-merged/` contains the filtered and merged protein-SAM files for each sample. *These can be deleted if no other RMA files will be created.*
- - `4-rma/` contains final RMA files for MEGAN. **These are the main files of interest.**
+ - `4-rma/` contains final RMA files for MEGAN. **These are the main files of interest.** This includes `{sample}_filtered.protein.{mode}.rma` and `{sample}_unfiltered.protein.{mode}.rma`, which are the filtered and unfiltered RMA files, respectively.

If no additional RMA files are to be generated, you should remove all the large protein-SAM files from the `3-merged/` folder.
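For example (assuming the `{sample}.merged.sam` naming used by the pipeline):

```
# Reclaim disk space once all RMA files have been generated
rm 3-merged/*.merged.sam
```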

Expand Down Expand Up @@ -325,7 +325,7 @@ diamond blastx -d {params.db} -q {input} -o {output} -f 101 -F 5000 --range-cull

### sam2rma

- Run the sam2rma tool with long read settings (`-alg longReads`). The default readAssignmentMode (`-ram`) for long reads is `alignedBases`, and `readCount` for all else. This controls whether the abundance counts are based on the total number of aligned bases or the number of reads. I prefer the number of reads, but this option can be changed in the `config.yaml` file to use `alignedBases` instead. The `--minSupportPercent` argument controls the minimum percent of assigned reads required to report a taxon. The default for this pipeline is `0.01` (best tradeoff between precision and recall), but this can be lowered to `0.001` or `0.0001` to recover lots of species at ultra-low abundances (<0.01%).
+ Run the sam2rma tool with long read settings (`-alg longReads`). The default readAssignmentMode (`-ram`) for long reads is `alignedBases`, and `readCount` for all else. This controls whether the abundance counts are based on the total number of aligned bases or the number of reads. I prefer the number of reads, but this option can be changed in the `config.yaml` file to use `alignedBases` instead. The `--minSupportPercent` argument controls the minimum percent of assigned reads required to report a taxon. The default for this pipeline is `0.01` (best tradeoff between precision and recall). Note that a second unfiltered RMA file is produced using a value of `0` (enabling all read-hits to be reported).

```
sam2rma -i {input.sam} -r {input.reads} -o {output} -lg -alg longReads -t {threads} -mdb {params.db} -ram {params.ram} --minSupportPercent {params.ms} -v 2> {log}
