From 78bade674c253792c4dad725a4402de6aee0e3ad Mon Sep 17 00:00:00 2001
From: Leah Kemp
Date: Thu, 19 May 2022 15:24:46 +1200
Subject: [PATCH 01/20] port some dependent software to pipeline_run_env

---
 README.md                           |   2 +-
 docs/running_on_a_hpc.md            |  50 +++--
 docs/running_on_a_single_machine.md |  46 ++--
 workflow/pipeline_run_env.yml       | 322 ++++++++++++++--------------
 4 files changed, 210 insertions(+), 210 deletions(-)

diff --git a/README.md b/README.md
index 1200c87..206c4d4 100644
--- a/README.md
+++ b/README.md
@@ -66,7 +66,7 @@ Cohort samples:
 ## Prerequisites
 
 - **Prerequisite hardware:** [NVIDIA GPUs](https://www.nvidia.com/en-gb/graphics-cards/) (for GPU accelerated runs) (tested with NVIDIA V100)
-- **Prerequisite software:** [NVIDIA CLARA parabricks and dependencies](https://www.nvidia.com/en-us/docs/parabricks/local-installation/) (for GPU accelerated runs) (tested with parabricks version 3.6.1-1), [Git](https://git-scm.com/) (tested with version 1.8.3.1), [Mamba](https://github.com/TheSnakePit/mamba) (tested with version 0.19.1) with [Conda](https://docs.conda.io/projects/conda/en/latest/index.html) (tested with version 4.11.0), [gsutil](https://pypi.org/project/gsutil/) (tested with version 4.34), [gunzip](https://linux.die.net/man/1/gunzip) (tested with version 1.5), [R](https://www.r-project.org/) (tested with version 3.5.1)
+- **Prerequisite software:** [NVIDIA CLARA parabricks and dependencies](https://www.nvidia.com/en-us/docs/parabricks/local-installation/) (for GPU accelerated runs) (tested with parabricks version 3.6.1-1), [Git](https://git-scm.com/) (tested with version 1.8.3.1), [Conda](https://docs.conda.io/projects/conda/en/latest/index.html) (tested with version 4.11.0), [Mamba](https://github.com/TheSnakePit/mamba) (tested with version 0.19.1)
 
 ## Test vcf_annotation_pipeline
 
diff --git a/docs/running_on_a_hpc.md b/docs/running_on_a_hpc.md
index b5bdadc..7b3bc23 100644
--- a/docs/running_on_a_hpc.md
+++ b/docs/running_on_a_hpc.md
@@ -9,22 +9,22 @@
   - [3. Setup files and directories](#3-setup-files-and-directories)
     - [Test data](#test-data)
   - [4. Get prerequisite software/hardware](#4-get-prerequisite-softwarehardware)
-  - [5. Create a local copy of the GATK resource bundle (either b37 or hg38)](#5-create-a-local-copy-of-the-gatk-resource-bundle-either-b37-or-hg38)
+  - [5. Create and activate a conda environment with python, snakemake, gsutil and wget installed](#5-create-and-activate-a-conda-environment-with-python-snakemake-gsutil-and-wget-installed)
+  - [6. Create a local copy of the GATK resource bundle (either b37 or hg38)](#6-create-a-local-copy-of-the-gatk-resource-bundle-either-b37-or-hg38)
     - [b37](#b37)
     - [hg38](#hg38)
-  - [6. Create a local copy of other databases (either GRCh37 or GRCh38)](#6-create-a-local-copy-of-other-databases-either-grch37-or-grch38)
+  - [7. Create a local copy of other databases (either GRCh37 or GRCh38)](#7-create-a-local-copy-of-other-databases-either-grch37-or-grch38)
     - [GRCh37](#grch37)
    - [GRCh38](#grch38)
-  - [7. Modify the configuration file](#7-modify-the-configuration-file)
+  - [8. Modify the configuration file](#8-modify-the-configuration-file)
     - [Overall workflow](#overall-workflow)
     - [Pipeline resources](#pipeline-resources)
     - [Variant filtering](#variant-filtering)
      - [Single samples](#single-samples)
      - [Cohort samples](#cohort-samples)
     - [VCF annotation](#vcf-annotation)
-  - [8. Configure to run on a HPC](#8-configure-to-run-on-a-hpc)
-  - [9. Modify the run scripts](#9-modify-the-run-scripts)
-  - [10. 
Create and activate a conda environment with python and snakemake installed](#10-create-and-activate-a-conda-environment-with-python-and-snakemake-installed) + - [9. Configure to run on a HPC](#9-configure-to-run-on-a-hpc) + - [10. Modify the run scripts](#10-modify-the-run-scripts) - [11. Run the pipeline](#11-run-the-pipeline) - [12. Evaluate the pipeline run](#12-evaluate-the-pipeline-run) - [13. Commit and push to your forked version of the github repo](#13-commit-and-push-to-your-forked-version-of-the-github-repo) @@ -106,19 +106,25 @@ bash ./test/setup_test.sh -a cohort ## 4. Get prerequisite software/hardware -For GPU accelerated runs, you'll need [NVIDIA GPUs](https://www.nvidia.com/en-gb/graphics-cards/) and [NVIDIA CLARA PARABRICKS and dependencies](https://www.nvidia.com/en-us/docs/parabricks/local-installation/). Talk to your system administrator to see if the HPC has this hardware and software available. +For GPU accelerated runs, you'll need [NVIDIA GPUs](https://www.nvidia.com/en-gb/graphics-cards/) (tested with NVIDIA V100) and [NVIDIA CLARA PARABRICKS and dependencies](https://www.nvidia.com/en-us/docs/parabricks/local-installation/) (tested with parabricks version 3.6.1-1). Talk to your system administrator to see if the HPC has this hardware and software available. Other software required to get setup and run the pipeline: -- [Git](https://git-scm.com/) (tested with version 2.7.4) -- [Conda](https://docs.conda.io/projects/conda/en/latest/index.html) (tested with version 4.8.2) -- [Mamba](https://github.com/TheSnakePit/mamba) (tested with version 0.4.4) (note. [mamba can be installed via conda with a single command](https://mamba.readthedocs.io/en/latest/installation.html#existing-conda-install)) -- [gsutil](https://pypi.org/project/gsutil/) (tested with version 4.52) -- [gunzip](https://linux.die.net/man/1/gunzip) (tested with version 1.6) +- [Git](https://git-scm.com/) (tested with version 1.8.3.1) +- [Conda](https://docs.conda.io/projects/conda/en/latest/index.html) (tested with version 4.11.0) +- [Mamba](https://github.com/TheSnakePit/mamba) (tested with version 0.19.1) (note. [mamba can be installed via conda with a single command](https://mamba.readthedocs.io/en/latest/installation.html#existing-conda-install)) -Most of this software is commonly pre-installed on HPC's, likely available as modules that can be loaded. Talk to your system administrator if you need help with this. +This software is commonly pre-installed on HPC's, likely available as modules that can be loaded. Talk to your system administrator if you need help with this. -## 5. Create a local copy of the [GATK resource bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle) (either b37 or hg38) +## 5. Create and activate a conda environment with python, snakemake, gsutil and wget installed + +```bash +cd ./workflow/ +mamba env create -f pipeline_run_env.yml +conda activate pipeline_run_env +``` + +## 6. Create a local copy of the [GATK resource bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle) (either b37 or hg38) ### b37 @@ -136,7 +142,7 @@ Download from [Google Cloud Bucket](https://console.cloud.google.com/storage/bro gsutil cp -r gs://genomics-public-data/resources/broad/hg38 /where/to/download/ ``` -## 6. Create a local copy of other databases (either GRCh37 or GRCh38) +## 7. 
Create a local copy of other databases (either GRCh37 or GRCh38) ### GRCh37 @@ -180,7 +186,7 @@ wget https://krishna.gs.washington.edu/download/CADD/v1.5/GRCh38/whole_genome_SN Create a custom [dbNSFP database](https://sites.google.com/site/jpopgen/dbNSFP) build by following [this documentation](https://github.com/GenomicsAotearoa/dbNSFP_build) -## 7. Modify the configuration file +## 8. Modify the configuration file Edit 'config.yaml' found within the config directory. @@ -341,7 +347,7 @@ dbNSFP: "/scratch/publicData/dbNSFP/GRCh37/dbNSFPv4.0a.hg19.custombuild.gz" CADD: "/scratch/publicData/CADD/GRCh37/whole_genome_SNVs.tsv.gz" ``` -## 8. Configure to run on a HPC +## 9. Configure to run on a HPC *This will deploy the non-GPU accelerated rules to slurm and deploy the GPU accelerated rules locally (pbrun_cnnscorevariants). Therefore, if running the pipeline gpu accelerated, the pipeline should be deployed from the machine with the GPU's.* @@ -362,7 +368,7 @@ Configure `account:` and `partition:` in the default section of 'cluster.json' i There are a plethora of additional slurm parameters that can be configured (and can be configured per rule). If you set additional slurm parameters, remember to pass them to the `--cluster` flag in the runscripts. See [here](https://snakemake-on-nesi.sschmeier.com/snake.html#slurm-and-nesi-specific-setup) for a good working example of deploying a snakemake workflow to [NeSi](https://www.nesi.org.nz/). -## 9. Modify the run scripts +## 10. Modify the run scripts Set the singularity bind location to a directory that contains your pipeline working directory with the `--singularity-args '-B'` flag. Set the number maximum number of cores to be used with the `--cores` flag and the maximum amount of memory to be used (in megabytes) with the `resources mem_mb=` flag. If running GPU accelerated, also set the maximum number of GPU's to be used with the `--resources gpu=` flag. For example: @@ -407,14 +413,6 @@ snakemake \ See the [snakemake documentation](https://snakemake.readthedocs.io/en/v4.5.1/executable.html#all-options) for additional run parameters. -## 10. Create and activate a conda environment with python and snakemake installed - -```bash -cd ./workflow/ -mamba env create -f pipeline_run_env.yml -conda activate pipeline_run_env -``` - ## 11. Run the pipeline First carry out a dry run diff --git a/docs/running_on_a_single_machine.md b/docs/running_on_a_single_machine.md index 66a2577..6ae4df0 100644 --- a/docs/running_on_a_single_machine.md +++ b/docs/running_on_a_single_machine.md @@ -9,21 +9,21 @@ - [3. Setup files and directories](#3-setup-files-and-directories) - [Test data](#test-data) - [4. Get prerequisite software/hardware](#4-get-prerequisite-softwarehardware) - - [5. Create a local copy of the GATK resource bundle (either b37 or hg38)](#5-create-a-local-copy-of-the-gatk-resource-bundle-either-b37-or-hg38) + - [5. Create and activate a conda environment with python, snakemake, gsutil and wget installed](#5-create-and-activate-a-conda-environment-with-python-snakemake-gsutil-and-wget-installed) + - [6. Create a local copy of the GATK resource bundle (either b37 or hg38)](#6-create-a-local-copy-of-the-gatk-resource-bundle-either-b37-or-hg38) - [b37](#b37) - [hg38](#hg38) - - [6. Create a local copy of other databases (either GRCh37 or GRCh38)](#6-create-a-local-copy-of-other-databases-either-grch37-or-grch38) + - [7. 
Create a local copy of other databases (either GRCh37 or GRCh38)](#7-create-a-local-copy-of-other-databases-either-grch37-or-grch38) - [GRCh37](#grch37) - [GRCh38](#grch38) - - [7. Modify the configuration file](#7-modify-the-configuration-file) + - [8. Modify the configuration file](#8-modify-the-configuration-file) - [Overall workflow](#overall-workflow) - [Pipeline resources](#pipeline-resources) - [Variant filtering](#variant-filtering) - [Single samples](#single-samples) - [Cohort samples](#cohort-samples) - [VCF annotation](#vcf-annotation) - - [8. Modify the run scripts](#8-modify-the-run-scripts) - - [9. Create and activate a conda environment with python and snakemake installed](#9-create-and-activate-a-conda-environment-with-python-and-snakemake-installed) + - [9. Modify the run scripts](#9-modify-the-run-scripts) - [10. Run the pipeline](#10-run-the-pipeline) - [11. Evaluate the pipeline run](#11-evaluate-the-pipeline-run) - [12. Commit and push to your forked version of the github repo](#12-commit-and-push-to-your-forked-version-of-the-github-repo) @@ -105,19 +105,25 @@ bash ./test/setup_test.sh -a cohort ## 4. Get prerequisite software/hardware -For GPU accelerated runs, you'll need [NVIDIA GPUs](https://www.nvidia.com/en-gb/graphics-cards/) and [NVIDIA CLARA PARABRICKS and dependencies](https://www.nvidia.com/en-us/docs/parabricks/local-installation/). Talk to your system administrator to see if the HPC has this hardware and software available. +For GPU accelerated runs, you'll need [NVIDIA GPUs](https://www.nvidia.com/en-gb/graphics-cards/) (tested with NVIDIA V100) and [NVIDIA CLARA PARABRICKS and dependencies](https://www.nvidia.com/en-us/docs/parabricks/local-installation/) (tested with parabricks version 3.6.1-1). Talk to your system administrator to see if the HPC has this hardware and software available. Other software required to get setup and run the pipeline: -- [Git](https://git-scm.com/) (tested with version 2.7.4) -- [Conda](https://docs.conda.io/projects/conda/en/latest/index.html) (tested with version 4.8.2) -- [Mamba](https://github.com/TheSnakePit/mamba) (tested with version 0.4.4) (note. [mamba can be installed via conda with a single command](https://mamba.readthedocs.io/en/latest/installation.html#existing-conda-install)) -- [gsutil](https://pypi.org/project/gsutil/) (tested with version 4.52) -- [gunzip](https://linux.die.net/man/1/gunzip) (tested with version 1.6) +- [Git](https://git-scm.com/) (tested with version 1.8.3.1) +- [Conda](https://docs.conda.io/projects/conda/en/latest/index.html) (tested with version 4.11.0) +- [Mamba](https://github.com/TheSnakePit/mamba) (tested with version 0.19.1) (note. [mamba can be installed via conda with a single command](https://mamba.readthedocs.io/en/latest/installation.html#existing-conda-install)) -Most of this software is commonly pre-installed on HPC's, likely available as modules that can be loaded. Talk to your system administrator if you need help with this. +This software is commonly pre-installed on HPC's, likely available as modules that can be loaded. Talk to your system administrator if you need help with this. -## 5. Create a local copy of the [GATK resource bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle) (either b37 or hg38) +## 5. Create and activate a conda environment with python, snakemake, gsutil and wget installed + +```bash +cd ./workflow/ +mamba env create -f pipeline_run_env.yml +conda activate pipeline_run_env +``` + +## 6. 
Create a local copy of the [GATK resource bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle) (either b37 or hg38) ### b37 @@ -135,7 +141,7 @@ Download from [Google Cloud Bucket](https://console.cloud.google.com/storage/bro gsutil cp -r gs://genomics-public-data/resources/broad/hg38 /where/to/download/ ``` -## 6. Create a local copy of other databases (either GRCh37 or GRCh38) +## 7. Create a local copy of other databases (either GRCh37 or GRCh38) ### GRCh37 @@ -179,7 +185,7 @@ wget https://krishna.gs.washington.edu/download/CADD/v1.5/GRCh38/whole_genome_SN Create a custom [dbNSFP database](https://sites.google.com/site/jpopgen/dbNSFP) build by following [this documentation](https://github.com/GenomicsAotearoa/dbNSFP_build) -## 7. Modify the configuration file +## 8. Modify the configuration file Edit 'config.yaml' found within the config directory. @@ -340,7 +346,7 @@ dbNSFP: "/scratch/publicData/dbNSFP/GRCh37/dbNSFPv4.0a.hg19.custombuild.gz" CADD: "/scratch/publicData/CADD/GRCh37/whole_genome_SNVs.tsv.gz" ``` -## 8. Modify the run scripts +## 9. Modify the run scripts Set the singularity bind location to a directory that contains your pipeline working directory with the `--singularity-args '-B'` flag. Set the number maximum number of cores to be used with the `--cores` flag and the maximum amount of memory to be used (in megabytes) with the `resources mem_mb=` flag. If running GPU accelerated, also set the maximum number of GPU's to be used with the `--resources gpu=` flag. For example: @@ -385,14 +391,6 @@ snakemake \ See the [snakemake documentation](https://snakemake.readthedocs.io/en/v4.5.1/executable.html#all-options) for additional run parameters. -## 9. Create and activate a conda environment with python and snakemake installed - -```bash -cd ./workflow/ -mamba env create -f pipeline_run_env.yml -conda activate pipeline_run_env -``` - ## 10. 
Run the pipeline First carry out a dry run diff --git a/workflow/pipeline_run_env.yml b/workflow/pipeline_run_env.yml index 45126ca..e25b131 100644 --- a/workflow/pipeline_run_env.yml +++ b/workflow/pipeline_run_env.yml @@ -1,172 +1,176 @@ name: pipeline_run_env channels: - - bioconda - conda-forge + - bioconda - defaults dependencies: - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=1_gnu - - aioeasywebdav=2.4.0=py37_1000 - - aiohttp=3.6.2=py37h516909a_0 - - appdirs=1.4.4=py_0 - - async-timeout=3.0.1=py_1000 - - attrs=19.3.0=py_0 - - bcrypt=3.1.7=py37h8f50634_1 - - boto3=1.9.191=py_0 - - botocore=1.12.191=py_0 - - brotlipy=0.7.0=py37h516909a_1000 - - bzip2=1.0.8=h516909a_2 - - c-ares=1.16.1=h516909a_0 - - ca-certificates=2020.6.24=0 - - cachetools=4.1.1=py_0 - - cairo=1.16.0=h3fc0475_1005 - - certifi=2020.6.20=py37hc8dfbb8_0 - - cffi=1.14.1=py37h2b28604_0 - - chardet=3.0.4=py37hc8dfbb8_1006 - - configargparse=1.2.3=pyh9f0ad1d_0 - - cryptography=3.0=py37hb09aad4_0 - - datrie=0.8.2=py37h8f50634_0 - - decorator=4.4.2=py_0 - - docutils=0.16=py37hc8dfbb8_1 - - dropbox=10.2.0=py37_0 - - expat=2.2.9=he1b5a44_2 - - fftw=3.3.8=mpi_mpich_h3f9e1be_1011 + - _openmp_mutex=4.5=2_gnu + - aioeasywebdav=2.4.0=pyha770c72_0 + - aiohttp=3.8.1=py310h5764c6d_1 + - aiosignal=1.2.0=pyhd8ed1ab_0 + - amply=0.1.5=pyhd8ed1ab_0 + - appdirs=1.4.4=pyh9f0ad1d_0 + - argcomplete=2.0.0=pyhd8ed1ab_0 + - async-timeout=4.0.2=pyhd8ed1ab_0 + - attmap=0.13.2=pyhd8ed1ab_0 + - attrs=21.4.0=pyhd8ed1ab_0 + - backports=1.0=py_2 + - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 + - bcrypt=3.2.2=py310h5764c6d_0 + - boto=2.49.0=py_0 + - boto3=1.23.2=pyhd8ed1ab_0 + - botocore=1.26.2=pyhd8ed1ab_0 + - brotlipy=0.7.0=py310h5764c6d_1004 + - bzip2=1.0.8=h7f98852_4 + - c-ares=1.18.1=h7f98852_0 + - ca-certificates=2022.5.18=ha878542_0 + - cachetools=5.0.0=pyhd8ed1ab_0 + - certifi=2022.5.18=py310hff52083_0 + - cffi=1.15.0=py310hd667e15_1 + - charset-normalizer=2.0.12=pyhd8ed1ab_0 + - coincbc=2.10.5=hcee13e7_1 + - configargparse=1.5.3=pyhd8ed1ab_0 + - connection_pool=0.0.3=pyhd3deb0d_0 + - coverage=6.3.3=py310h5764c6d_0 + - crcmod=1.7=py310h5764c6d_1008 + - cryptography=37.0.1=py310h9ce1e76_0 + - datrie=0.8.2=py310h6acc77f_3 + - decorator=5.1.1=pyhd8ed1ab_0 + - defusedxml=0.7.1=pyhd8ed1ab_0 + - docutils=0.18.1=py310hff52083_1 + - dpath=2.0.6=py310hff52083_1 + - dropbox=11.30.0=pyhd8ed1ab_0 + - fasteners=0.17.3=pyhd8ed1ab_0 - filechunkio=1.8=py_2 - - fontconfig=2.13.1=h1056068_1002 - - freetype=2.10.2=he06d7ca_0 - - ftputil=4.0.0=py_0 - - gdk-pixbuf=2.36.12=h3f25603_1005 - - gettext=0.19.8.1=hc5be6a0_1002 - - ghostscript=9.22=hf484d3e_1001 - - giflib=5.1.7=h516909a_1 - - gitdb=4.0.5=py_0 - - gitpython=3.1.7=py_0 - - glib=2.65.0=h6f030ca_0 - - gobject-introspection=1.64.1=py37h619baee_1 - - google-api-core=1.22.1=py37hc8dfbb8_0 - - google-api-python-client=1.10.0=pyh9f0ad1d_0 - - google-auth=1.20.1=py_0 - - google-auth-httplib2=0.0.3=py_3 - - google-cloud-core=1.4.1=pyh9f0ad1d_0 - - google-cloud-storage=1.30.0=pyh9f0ad1d_0 - - google-crc32c=1.0.0=py37h193935f_0 - - google-resumable-media=0.7.0=pyh9f0ad1d_0 - - googleapis-common-protos=1.51.0=py37hc8dfbb8_2 - - graphite2=1.3.14=h23475e2_0 - - graphviz=2.38.0=hf68f40c_1011 - - grpcio=1.31.0=py37hb0870dc_0 - - harfbuzz=2.4.0=hee91db6_5 - - httplib2=0.18.1=pyh9f0ad1d_0 - - icu=67.1=he1b5a44_0 - - idna=2.10=pyh9f0ad1d_0 - - imagemagick=7.0.8_54=pl526h39023e4_0 - - importlib-metadata=1.7.0=py37hc8dfbb8_0 - - importlib_metadata=1.7.0=0 - - ipython_genutils=0.2.0=py_1 - - jbig=2.1=h516909a_2002 - - 
jinja2=2.11.2=pyh9f0ad1d_0 - - jmespath=0.10.0=pyh9f0ad1d_0 - - jpeg=9d=h516909a_0 - - jsonschema=3.2.0=py37hc8dfbb8_1 - - jupyter_core=4.6.3=py37hc8dfbb8_1 - - ld_impl_linux-64=2.34=hc38a660_9 - - libblas=3.8.0=17_openblas - - libcblas=3.8.0=17_openblas - - libcrc32c=1.1.1=he1b5a44_2 - - libcroco=0.6.13=h8d621e5_1 - - libffi=3.2.1=he1b5a44_1007 - - libgcc-ng=9.3.0=h24d8f2e_14 - - libgfortran-ng=7.5.0=hdf63c60_14 - - libgomp=9.3.0=h24d8f2e_14 - - libiconv=1.15=h516909a_1006 - - liblapack=3.8.0=17_openblas - - libopenblas=0.3.10=pthreads_hb3c22a3_4 - - libpng=1.6.37=hed695b0_2 - - libprotobuf=3.12.4=h8b12597_0 - - librsvg=2.44.14=h11c8777_0 - - libsodium=1.0.18=h516909a_0 - - libstdcxx-ng=9.3.0=hdf63c60_14 - - libtiff=4.1.0=hc3755c2_3 - - libtool=2.4.6=h516909a_1003 - - libuuid=2.32.1=h14c3975_1000 - - libwebp=1.0.2=hf4e8a37_4 - - libxcb=1.14=h7b6447c_0 - - libxml2=2.9.10=h72b56ed_2 - - lz4-c=1.9.2=he1b5a44_1 - - markupsafe=1.1.1=py37h8f50634_1 - - mpi=1.0=mpich - - mpich=3.3.2=hc856adb_0 - - multidict=4.7.5=py37h8f50634_1 - - nbformat=5.0.7=py_0 - - ncurses=6.2=he1b5a44_1 - - networkx=2.4=py_1 - - numpy=1.19.1=py37h8960a57_0 + - filelock=3.7.0=pyhd8ed1ab_0 + - frozenlist=1.3.0=py310h5764c6d_1 + - ftputil=5.0.4=pyhd8ed1ab_0 + - gcs-oauth2-boto-plugin=3.0=pyhd8ed1ab_0 + - gettext=0.19.8.1=h0b5b191_1005 + - gitdb=4.0.9=pyhd8ed1ab_0 + - gitpython=3.1.27=pyhd8ed1ab_0 + - google-api-core=2.7.1=pyhd8ed1ab_0 + - google-api-python-client=2.48.0=pyhd8ed1ab_0 + - google-apitools=0.5.32=pyhd8ed1ab_0 + - google-auth=2.6.6=pyh6c4a22f_0 + - google-auth-httplib2=0.1.0=pyhd8ed1ab_0 + - google-cloud-core=2.2.2=pyh6c4a22f_0 + - google-cloud-storage=2.1.0=pyh6c4a22f_0 + - google-crc32c=1.1.2=py310he8fe98e_3 + - google-reauth=0.1.1=pyhd3deb0d_0 + - google-resumable-media=2.1.0=pyh6c4a22f_0 + - googleapis-common-protos=1.56.1=py310hff52083_0 + - grpcio=1.35.0=py310hce63b2e_0 + - gsutil=5.10=pyhd8ed1ab_0 + - httplib2=0.20.4=pyhd8ed1ab_0 + - idna=3.3=pyhd8ed1ab_0 + - importlib-metadata=4.11.3=py310hff52083_1 + - importlib_metadata=4.11.3=hd8ed1ab_1 + - importlib_resources=5.7.1=pyhd8ed1ab_1 + - iniconfig=1.1.1=pyh9f0ad1d_0 + - jinja2=3.1.2=pyhd8ed1ab_0 + - jmespath=1.0.0=pyhd8ed1ab_0 + - jsonschema=4.5.1=pyhd8ed1ab_0 + - jupyter_core=4.10.0=py310hff52083_0 + - ld_impl_linux-64=2.36.1=hea4e1c9_2 + - libblas=3.9.0=14_linux64_openblas + - libcblas=3.9.0=14_linux64_openblas + - libcrc32c=1.1.2=h9c3ff4c_0 + - libffi=3.3=h58526e2_2 + - libgcc-ng=12.1.0=h8d9b700_16 + - libgfortran-ng=12.1.0=h69a702a_16 + - libgfortran5=12.1.0=hdcd56e2_16 + - libgomp=12.1.0=h8d9b700_16 + - libidn2=2.3.2=h7f98852_0 + - liblapack=3.9.0=14_linux64_openblas + - libnsl=2.0.0=h7f98852_0 + - libopenblas=0.3.20=pthreads_h78a6416_0 + - libprotobuf=3.20.1=h4ff587b_0 + - libsodium=1.0.18=h36c2ea0_1 + - libstdcxx-ng=12.1.0=ha89aaad_16 + - libunistring=0.9.10=h7f98852_0 + - libuuid=1.0.3=h7f8727e_2 + - logmuse=0.2.6=pyh8c360ce_0 + - markupsafe=2.1.1=py310h5764c6d_1 + - monotonic=1.5=py_0 + - multidict=6.0.2=py310h5764c6d_1 + - nbformat=5.4.0=pyhd8ed1ab_0 + - ncurses=6.3=h27087fc_1 + - numpy=1.22.3=py310h4ef5377_2 - oauth2client=4.1.3=py_0 - - openjpeg=2.3.1=h981e76c_3 - - openssl=1.1.1g=h516909a_1 - - pandas=1.1.0=py37h3340039_0 - - pango=1.40.14=he7ab937_1005 - - paramiko=2.7.1=py37_0 - - pcre=8.44=he1b5a44_0 - - perl=5.26.2=h516909a_1006 - - pip=20.2.2=py_0 - - pixman=0.38.0=h516909a_1003 - - pkg-config=0.29.2=h516909a_1006 - - prettytable=0.7.2=py_3 - - protobuf=3.12.4=py37h3340039_0 - - psutil=5.7.2=py37h8f50634_0 + - openssl=1.1.1o=h166bdaf_0 + 
- packaging=21.3=pyhd8ed1ab_0 + - pandas=1.4.2=py310h769672d_1 + - paramiko=2.11.0=pyhd8ed1ab_0 + - peppy=0.31.2=pyhd8ed1ab_2 + - pip=22.1=pyhd8ed1ab_0 + - plac=1.3.5=pyhd8ed1ab_0 + - pluggy=1.0.0=py310hff52083_3 + - ply=3.11=py_1 + - prettytable=3.3.0=pyhd8ed1ab_0 + - protobuf=3.20.1=py310hd8f1fbe_0 + - psutil=5.9.0=py310h5764c6d_1 + - pulp=2.6.0=py310hff52083_1 + - py=1.11.0=pyh6c4a22f_0 - pyasn1=0.4.8=py_0 - pyasn1-modules=0.2.7=py_0 - - pycparser=2.20=pyh9f0ad1d_2 - - pygments=2.6.1=py_0 - - pygraphviz=1.5=py37h8f50634_1002 - - pynacl=1.4.0=py37h7b6447c_1 - - pyopenssl=19.1.0=py37_0 - - pyrsistent=0.16.0=py37h8f50634_0 + - pycparser=2.21=pyhd8ed1ab_0 + - pygments=2.12.0=pyhd8ed1ab_0 + - pynacl=1.5.0=py310h5764c6d_1 + - pyopenssl=22.0.0=pyhd8ed1ab_0 + - pyparsing=3.0.9=pyhd8ed1ab_0 + - pyrsistent=0.18.1=py310h5764c6d_1 - pysftp=0.2.9=py_1 - - pysocks=1.7.1=py37hc8dfbb8_1 - - python=3.7.6=cpython_he5300dc_6 - - python-dateutil=2.8.1=py_0 - - python-irodsclient=0.8.2=py_0 - - python_abi=3.7=1_cp37m - - pytz=2020.1=pyh9f0ad1d_0 - - pyyaml=5.3.1=py37h8f50634_0 - - ratelimiter=1.2.0=py37hc8dfbb8_1001 - - readline=8.0=he28a2e2_2 - - requests=2.24.0=pyh9f0ad1d_0 - - rsa=4.6=pyh9f0ad1d_0 - - s3transfer=0.2.1=py37_0 - - setuptools=49.5.0=py37hc8dfbb8_0 - - simplejson=3.17.2=py37h8f50634_0 - - six=1.15.0=pyh9f0ad1d_0 + - pysocks=1.7.1=py310hff52083_5 + - pytest=7.1.2=py310hff52083_0 + - python=3.10.4=h12debd9_0 + - python-dateutil=2.8.2=pyhd8ed1ab_0 + - python-fastjsonschema=2.15.3=pyhd8ed1ab_0 + - python-irodsclient=1.1.3=pyhd8ed1ab_0 + - python_abi=3.10=2_cp310 + - pytz=2022.1=pyhd8ed1ab_0 + - pyu2f=0.1.5=pyhd8ed1ab_0 + - pyyaml=6.0=py310h5764c6d_4 + - ratelimiter=1.2.0=py_1002 + - readline=8.1=h46c0cb4_0 + - requests=2.27.1=pyhd8ed1ab_0 + - retry=0.9.2=py_0 + - retry_decorator=1.1.1=pyh9f0ad1d_0 + - rsa=4.8=pyhd8ed1ab_0 + - s3transfer=0.5.2=pyhd8ed1ab_0 + - setuptools=62.3.1=py310hff52083_0 + - setuptools-scm=6.4.2=pyhd8ed1ab_0 + - six=1.16.0=pyh6c4a22f_0 - slacker=0.14.0=py_0 - - smmap=3.0.4=pyh9f0ad1d_0 - - snakemake=5.22.0=0 - - snakemake-minimal=5.22.0=py_0 - - sqlite=3.32.3=hcee41ef_1 - - tk=8.6.10=hed695b0_0 - - toposort=1.5=py_3 - - traitlets=4.3.3=py37hc8dfbb8_1 - - uritemplate=3.0.1=py_0 - - urllib3=1.25.10=py_0 - - wheel=0.34.2=py37_0 - - wrapt=1.12.1=py37h8f50634_1 - - xmlrunner=1.7.7=py_0 - - xorg-kbproto=1.0.7=h14c3975_1002 - - xorg-libice=1.0.10=h516909a_0 - - xorg-libsm=1.2.3=h84519dc_1000 - - xorg-libx11=1.6.11=h516909a_0 - - xorg-libxext=1.3.4=h516909a_0 - - xorg-libxpm=3.5.13=h516909a_0 - - xorg-libxrender=0.9.10=h516909a_1002 - - xorg-libxt=1.1.5=h516909a_1003 - - xorg-renderproto=0.11.1=h14c3975_1002 - - xorg-xextproto=7.3.0=h14c3975_1002 - - xorg-xproto=7.0.31=h14c3975_1007 + - smart_open=6.0.0=pyhd8ed1ab_0 + - smmap=3.0.5=pyh44b312d_0 + - snakemake=7.7.0=hdfd78af_0 + - snakemake-minimal=7.7.0=pyhdfd78af_0 + - socksipy-branch=1.01=pyh9f0ad1d_0 + - sqlite=3.38.3=hc218d9a_0 + - stone=3.3.1=pyhd8ed1ab_0 + - stopit=1.1.2=py_0 + - tabulate=0.8.9=pyhd8ed1ab_0 + - tk=8.6.11=h1ccaba5_1 + - tomli=2.0.1=pyhd8ed1ab_0 + - toposort=1.7=pyhd8ed1ab_0 + - traitlets=5.2.1.post0=pyhd8ed1ab_0 + - typing-extensions=4.2.0=hd8ed1ab_1 + - typing_extensions=4.2.0=pyha770c72_1 + - tzdata=2022a=h191b570_0 + - ubiquerg=0.6.1=pyh9f0ad1d_0 + - uritemplate=4.1.1=pyhd8ed1ab_0 + - urllib3=1.26.9=pyhd8ed1ab_0 + - veracitools=0.1.3=py_0 + - wcwidth=0.2.5=pyh9f0ad1d_2 + - wget=1.21.3=h0b77cf5_0 + - wheel=0.37.1=pyhd8ed1ab_0 + - wrapt=1.14.1=py310h5764c6d_0 - xz=5.2.5=h516909a_1 - - yaml=0.2.5=h516909a_0 - - 
yarl=1.4.2=py37h7b6447c_0
- - zipp=3.1.0=py_0
- - zlib=1.2.11=h516909a_1007
- - zstd=1.4.5=h6597ccf_2
-prefix: /home/lkemp/anaconda3/envs/pipeline_run_env
+ - yaml=0.2.5=h7f98852_2
+ - yarl=1.7.2=py310h5764c6d_2
+ - yte=1.4.0=py310hff52083_0
+ - zipp=3.8.0=pyhd8ed1ab_0
+ - zlib=1.2.12=h7f8727e_2
+prefix: /home/lkemp/miniconda3/envs/pipeline_run_env

From 2d52d6d34af01270bc9f157e5ae142a58ddb112e Mon Sep 17 00:00:00 2001
From: Leah Kemp
Date: Thu, 19 May 2022 15:34:55 +1200
Subject: [PATCH 02/20] -j flag changed in newer version of snakemake

---
 docs/running_on_a_hpc.md            | 12 ++++++++----
 docs/running_on_a_single_machine.md | 24 ++++++++++--------------
 workflow/dryrun.sh                  |  2 +-
 workflow/dryrun_hpc.sh              |  2 +-
 workflow/run.sh                     |  2 +-
 workflow/run_hpc.sh                 |  2 +-
 6 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/docs/running_on_a_hpc.md b/docs/running_on_a_hpc.md
index 7b3bc23..2e378c4 100644
--- a/docs/running_on_a_hpc.md
+++ b/docs/running_on_a_hpc.md
@@ -268,7 +268,7 @@ Set the maximum number of GPU's to be used per rule/sample for gpu-accelerated r
 GPU: 1
 ```
 
-It is a good idea to consider the number of samples that you are processing. For example, if you set `THREADS: "8"` and set the maximum number of cores to be used by the pipeline in the run script to `-j/--cores 32` (see [step 9](#9-modify-the-run-scripts)), a maximum of 3 samples will be able to run at one time for these rules (if they are deployed at the same time), but each sample will complete faster. In contrast, if you set `THREADS: "1"` and `-j/--cores 32`, a maximum of 32 samples could be run at one time, but each sample will take longer to complete. This also needs to be considered when setting `MAXMEMORY` + `--resources mem_mb` and `GPU` + `--resources gpu`.
+It is a good idea to consider the number of samples that you are processing. For example, if you set `THREADS: "8"` and set the maximum number of jobs to be run in the run script to `-j/--jobs 32` (see [step 10](#10-modify-the-run-scripts)), a maximum of 4 samples will be able to run at one time for these rules (if they are deployed at the same time), but each sample will complete faster. In contrast, if you set `THREADS: "1"` and `-j/--jobs 32`, a maximum of 32 samples could be run at one time, but each sample will take longer to complete. This also needs to be considered when setting `MAXMEMORY` + `--resources mem_mb` and `GPU` + `--resources gpu`.
 
 ### Variant filtering
 
@@ -370,14 +370,16 @@ There are a plethora of additional slurm parameters that can be configured (and
 
 ## 10. Modify the run scripts
 
-Set the singularity bind location to a directory that contains your pipeline working directory with the `--singularity-args '-B'` flag. Set the number maximum number of cores to be used with the `--cores` flag and the maximum amount of memory to be used (in megabytes) with the `resources mem_mb=` flag. If running GPU accelerated, also set the maximum number of GPU's to be used with the `--resources gpu=` flag. For example:
+Set the singularity bind location to a directory that contains your pipeline working directory with the `--singularity-args '-B'` flag. Set the maximum number of jobs to be deployed with the `--jobs` flag and the maximum amount of memory to be used (in megabytes) with the `--resources mem_mb=` flag. If running GPU accelerated, also set the maximum number of GPUs to be used with the `--resources gpu=` flag.
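+
+Before settling on these numbers, it can help to check what the cluster actually offers. A quick look might be (a sketch, assuming a slurm scheduler, which is what the cluster configuration above targets):
+
+```bash
+# Per-node CPU and memory visible to slurm
+sinfo --Node --long
+# Generic resources (GRES) per node, which is where GPUs are declared
+sinfo -N -o "%N %G"
+```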
For example:
 
 Dry run (dryrun_hpc.sh):
 
 ```bash
+#!/bin/bash -x
+
 snakemake \
 --dryrun \
---cores 32 \
+--jobs 32 \
 --resources mem_mb=150000 \
 --resources gpu=2 \
 --use-conda \
@@ -395,8 +397,10 @@ snakemake \
 Full run (run_hpc.sh):
 
 ```bash
+#!/bin/bash -x
+
 snakemake \
---cores 32 \
+--jobs 32 \
 --resources mem_mb=150000 \
 --resources gpu=2 \
 --use-conda \
diff --git a/docs/running_on_a_single_machine.md b/docs/running_on_a_single_machine.md
index 6ae4df0..107c3d0 100644
--- a/docs/running_on_a_single_machine.md
+++ b/docs/running_on_a_single_machine.md
@@ -267,7 +267,7 @@ Set the maximum number of GPU's to be used per rule/sample for gpu-accelerated r
 GPU: 1
 ```
 
-It is a good idea to consider the number of samples that you are processing. For example, if you set `THREADS: "8"` and set the maximum number of cores to be used by the pipeline in the run script to `-j/--cores 32` (see [step 8](#8-modify-the-run-scripts)), a maximum of 3 samples will be able to run at one time for these rules (if they are deployed at the same time), but each sample will complete faster. In contrast, if you set `THREADS: "1"` and `-j/--cores 32`, a maximum of 32 samples could be run at one time, but each sample will take longer to complete. This also needs to be considered when setting `MAXMEMORY` + `--resources mem_mb` and `GPU` + `--resources gpu`.
+It is a good idea to consider the number of samples that you are processing. For example, if you set `THREADS: "8"` and set the maximum number of jobs to be run in the run script to `-j/--jobs 32` (see [step 9](#9-modify-the-run-scripts)), a maximum of 4 samples will be able to run at one time for these rules (if they are deployed at the same time), but each sample will complete faster. In contrast, if you set `THREADS: "1"` and `-j/--jobs 32`, a maximum of 32 samples could be run at one time, but each sample will take longer to complete. This also needs to be considered when setting `MAXMEMORY` + `--resources mem_mb` and `GPU` + `--resources gpu`.
 
 ### Variant filtering
 
@@ -348,14 +348,16 @@ CADD: "/scratch/publicData/CADD/GRCh37/whole_genome_SNVs.tsv.gz"
 
 ## 9. Modify the run scripts
 
-Set the singularity bind location to a directory that contains your pipeline working directory with the `--singularity-args '-B'` flag. Set the number maximum number of cores to be used with the `--cores` flag and the maximum amount of memory to be used (in megabytes) with the `resources mem_mb=` flag. If running GPU accelerated, also set the maximum number of GPU's to be used with the `--resources gpu=` flag. For example:
+Set the singularity bind location to a directory that contains your pipeline working directory with the `--singularity-args '-B'` flag. Set the maximum number of jobs to be deployed with the `--jobs` flag and the maximum amount of memory to be used (in megabytes) with the `--resources mem_mb=` flag. If running GPU accelerated, also set the maximum number of GPUs to be used with the `--resources gpu=` flag.
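+
+On a single machine the same sanity check is against the host itself (a sketch, assuming a typical Linux box; `nvidia-smi` is only present when the NVIDIA driver is installed):
+
+```bash
+# CPU cores available for --jobs
+nproc
+# Total and free memory in megabytes, matching the units of --resources mem_mb=
+free -m
+# GPUs visible to the driver, for --resources gpu=
+nvidia-smi -L
+```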
For example:
 
 Dry run (dryrun.sh):
 
 ```bash
+#!/bin/bash -x
+
 snakemake \
 --dryrun \
---cores 32 \
+--jobs 32 \
 --resources mem_mb=150000 \
 --resources gpu=2 \
 --use-conda \
@@ -363,18 +365,16 @@ snakemake \
 --latency-wait 120 \
 --use-singularity \
 --singularity-args '-B /scratch/' \
---configfile ../config/config.yaml \
---cluster-config ../config/cluster.json \
---cluster "sbatch -A {cluster.account} \
--p {cluster.partition} \
--o {cluster.output}"
+--configfile ../config/config.yaml
 ```
 
 Full run (run.sh):
 
 ```bash
+#!/bin/bash -x
+
 snakemake \
---cores 32 \
+--jobs 32 \
 --resources mem_mb=150000 \
 --resources gpu=2 \
 --use-conda \
@@ -382,11 +382,7 @@ snakemake \
 --latency-wait 120 \
 --use-singularity \
 --singularity-args '-B /scratch/' \
---configfile ../config/config.yaml \
---cluster-config ../config/cluster.json \
---cluster "sbatch -A {cluster.account} \
--p {cluster.partition} \
--o {cluster.output}"
+--configfile ../config/config.yaml
 ```
 
 See the [snakemake documentation](https://snakemake.readthedocs.io/en/v4.5.1/executable.html#all-options) for additional run parameters.
 
diff --git a/workflow/dryrun.sh b/workflow/dryrun.sh
index f0d62c6..83888d9 100644
--- a/workflow/dryrun.sh
+++ b/workflow/dryrun.sh
@@ -2,7 +2,7 @@
 
 snakemake \
 --dryrun \
---cores 32 \
+--jobs 32 \
 --resources mem_mb=150000 \
 --resources gpu=2 \
 --use-conda \
diff --git a/workflow/dryrun_hpc.sh b/workflow/dryrun_hpc.sh
index 9e0db27..ca6c6fc 100644
--- a/workflow/dryrun_hpc.sh
+++ b/workflow/dryrun_hpc.sh
@@ -2,7 +2,7 @@
 
 snakemake \
 --dryrun \
---cores 32 \
+--jobs 32 \
 --resources mem_mb=150000 \
 --resources gpu=2 \
 --use-conda \
diff --git a/workflow/run.sh b/workflow/run.sh
index 62af075..d2b682b 100644
--- a/workflow/run.sh
+++ b/workflow/run.sh
@@ -1,7 +1,7 @@
 #!/bin/bash -x
 
 snakemake \
---cores 32 \
+--jobs 32 \
 --resources mem_mb=150000 \
 --resources gpu=2 \
 --use-conda \
diff --git a/workflow/run_hpc.sh b/workflow/run_hpc.sh
index 4269d3e..eacc222 100644
--- a/workflow/run_hpc.sh
+++ b/workflow/run_hpc.sh
@@ -1,7 +1,7 @@
 #!/bin/bash -x
 
 snakemake \
---cores 32 \
+--jobs 32 \
 --resources mem_mb=150000 \
 --resources gpu=2 \
 --use-conda \

From 6e748098b325732251f48201dc0af78f7bd07c78 Mon Sep 17 00:00:00 2001
From: Leah Kemp
Date: Thu, 19 May 2022 15:43:18 +1200
Subject: [PATCH 03/20] port gatk to containers

---
 workflow/envs/gatk4.yaml                            | 12 ------------
 workflow/rules/gatk_ApplyVQSR_indel.smk             |  4 ++--
 workflow/rules/gatk_ApplyVQSR_snp.smk               |  4 ++--
 workflow/rules/gatk_CNNScoreVariants.smk            |  2 +-
 workflow/rules/gatk_CalculateGenotypePosteriors.smk |  4 ++--
 workflow/rules/gatk_FilterVariantTranches.smk       |  2 +-
 .../rules/gatk_VariantAnnotator_PossibleDeNovo.smk  |  4 ++--
 workflow/rules/gatk_VariantRecalibrator_indel.smk   |  4 ++--
 workflow/rules/gatk_VariantRecalibrator_snp.smk     |  4 ++--
 9 files changed, 14 insertions(+), 26 deletions(-)
 delete mode 100644 workflow/envs/gatk4.yaml

diff --git a/workflow/envs/gatk4.yaml b/workflow/envs/gatk4.yaml
deleted file mode 100644
index db397c7..0000000
--- a/workflow/envs/gatk4.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-channels:
-  - bioconda
-  - conda-forge
-  - defaults
-dependencies:
-  - bioconda::gatk4 =4.1.6.0
-  - bioconda::gatktool =0.0.1
-  - conda-forge::numpy =1.18.1
-  - conda-forge::blas =1.0
-  - main::certifi =2019.11.28
-  - main::openssl =1.0.2
-  - main::scipy =1.3.2
\ No newline at end of file

diff --git a/workflow/rules/gatk_ApplyVQSR_indel.smk b/workflow/rules/gatk_ApplyVQSR_indel.smk
index 5f639d5..137a76e 100644
--- a/workflow/rules/gatk_ApplyVQSR_indel.smk
+++ 
b/workflow/rules/gatk_ApplyVQSR_indel.smk @@ -16,8 +16,8 @@ rule gatk_ApplyVQSR_indel: "logs/gatk_VQSR_indel/{sample}.log" benchmark: "benchmarks/gatk_VQSR_indel/{sample}.tsv" - conda: - "../envs/gatk4.yaml" + singularity: + "docker://broadinstitute/gatk:4.2.6.1" message: "Using machine learning to filter out probable artifacts from the variant callset (indels)" shell: diff --git a/workflow/rules/gatk_ApplyVQSR_snp.smk b/workflow/rules/gatk_ApplyVQSR_snp.smk index 06b734f..fbbfc1b 100644 --- a/workflow/rules/gatk_ApplyVQSR_snp.smk +++ b/workflow/rules/gatk_ApplyVQSR_snp.smk @@ -15,8 +15,8 @@ rule gatk_ApplyVQSR_snp: "logs/gatk_VQSR_snp/{sample}.log" benchmark: "benchmarks/gatk_VQSR_snp/{sample}.tsv" - conda: - "../envs/gatk4.yaml" + singularity: + "docker://broadinstitute/gatk:4.2.6.1" message: "Using machine learning to filter out probable artifacts from the variant callset (snps)" shell: diff --git a/workflow/rules/gatk_CNNScoreVariants.smk b/workflow/rules/gatk_CNNScoreVariants.smk index 8587afe..4a2458b 100644 --- a/workflow/rules/gatk_CNNScoreVariants.smk +++ b/workflow/rules/gatk_CNNScoreVariants.smk @@ -16,7 +16,7 @@ rule gatk_CNNScoreVariants: benchmark: "benchmarks/gatk_CNNScoreVariants/{sample}.tsv" singularity: - "docker://broadinstitute/gatk:4.1.7.0" + "docker://broadinstitute/gatk:4.2.6.1" threads: config['THREADS'] message: "Annotating {input.vcf} with scores from a Convolutional Neural Network (CNN) (2D model with pre-trained architecture)" diff --git a/workflow/rules/gatk_CalculateGenotypePosteriors.smk b/workflow/rules/gatk_CalculateGenotypePosteriors.smk index 73c3b08..6d2eda0 100644 --- a/workflow/rules/gatk_CalculateGenotypePosteriors.smk +++ b/workflow/rules/gatk_CalculateGenotypePosteriors.smk @@ -13,8 +13,8 @@ rule gatk_CalculateGenotypePosteriors: "logs/gatk_CalculateGenotypePosteriors/{sample}.log" benchmark: "benchmarks/gatk_CalculateGenotypePosteriors/{sample}.tsv" - conda: - "../envs/gatk4.yaml" + singularity: + "docker://broadinstitute/gatk:4.2.6.1" message: "Calculating genotype posterior probabilities given family and/or known population genotypes for {input.vcf}" shell: diff --git a/workflow/rules/gatk_FilterVariantTranches.smk b/workflow/rules/gatk_FilterVariantTranches.smk index a7d8f15..608366e 100644 --- a/workflow/rules/gatk_FilterVariantTranches.smk +++ b/workflow/rules/gatk_FilterVariantTranches.smk @@ -16,7 +16,7 @@ rule gatk_FilterVariantTranches: benchmark: "benchmarks/gatk_FilterVariantTranches/{sample}.tsv" singularity: - "docker://broadinstitute/gatk:4.1.7.0" + "docker://broadinstitute/gatk:4.2.6.1" message: "Applying tranche filtering to variant calls in {input}" shell: diff --git a/workflow/rules/gatk_VariantAnnotator_PossibleDeNovo.smk b/workflow/rules/gatk_VariantAnnotator_PossibleDeNovo.smk index 67948d8..062a527 100644 --- a/workflow/rules/gatk_VariantAnnotator_PossibleDeNovo.smk +++ b/workflow/rules/gatk_VariantAnnotator_PossibleDeNovo.smk @@ -13,8 +13,8 @@ rule gatk_VariantAnnotator_PossibleDeNovo: "logs/gatk_VariantAnnotator_PossibleDeNovo/{sample}.log" benchmark: "benchmarks/gatk_VariantAnnotator_PossibleDeNovo/{sample}.tsv" - conda: - "../envs/gatk4.yaml" + singularity: + "docker://broadinstitute/gatk:4.2.6.1" message: "Marking variants in {input.vcf} that are possible denovo mutations" shell: diff --git a/workflow/rules/gatk_VariantRecalibrator_indel.smk b/workflow/rules/gatk_VariantRecalibrator_indel.smk index 7a53db6..6ca56a9 100644 --- a/workflow/rules/gatk_VariantRecalibrator_indel.smk +++ 
b/workflow/rules/gatk_VariantRecalibrator_indel.smk
@@ -19,8 +19,8 @@ rule gatk_VariantRecalibrator_indel:
         "logs/gatk_VariantRecalibrator_indel/{sample}.log"
     benchmark:
         "benchmarks/gatk_VariantRecalibrator_indel/{sample}.tsv"
-    conda:
-        "../envs/gatk4.yaml"
+    singularity:
+        "docker://broadinstitute/gatk:4.2.6.1"
     message:
         "Building a recalibration model to score variant quality (indels)"
     shell:
diff --git a/workflow/rules/gatk_VariantRecalibrator_snp.smk b/workflow/rules/gatk_VariantRecalibrator_snp.smk
index 1fccbd9..4744298 100644
--- a/workflow/rules/gatk_VariantRecalibrator_snp.smk
+++ b/workflow/rules/gatk_VariantRecalibrator_snp.smk
@@ -20,8 +20,8 @@ rule gatk_VariantRecalibrator_snp:
         "logs/gatk_VariantRecalibrator_snp/{sample}.log"
     benchmark:
         "benchmarks/gatk_VariantRecalibrator_snp/{sample}.tsv"
-    conda:
-        "../envs/gatk4.yaml"
+    singularity:
+        "docker://broadinstitute/gatk:4.2.6.1"
     message:
         "Building a recalibration model to score variant quality (snps)"
     shell:

From 003f72890fe677be681ac8bd8385860d1f118a9c Mon Sep 17 00:00:00 2001
From: Leah Kemp
Date: Thu, 19 May 2022 17:54:41 +1200
Subject: [PATCH 04/20] port vep to containers + provide conda env for downloading vep database

---
 docs/running_on_a_hpc.md            |  18 ++--
 docs/running_on_a_single_machine.md |  20 ++--
 workflow/envs/vep.yaml              |   6 --
 workflow/rules/vep.smk              |   6 +-
 workflow/vep_database_install.yml   | 151 ++++++++++++++++++++++
 5 files changed, 171 insertions(+), 30 deletions(-)
 delete mode 100644 workflow/envs/vep.yaml
 create mode 100644 workflow/vep_database_install.yml

diff --git a/docs/running_on_a_hpc.md b/docs/running_on_a_hpc.md
index 2e378c4..65e2ce3 100644
--- a/docs/running_on_a_hpc.md
+++ b/docs/running_on_a_hpc.md
@@ -146,14 +146,13 @@ gsutil cp -r gs://genomics-public-data/resources/broad/hg38 /where/to/download/
 
 ### GRCh37
 
-Download the [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database using a [conda version of Ensembl-VEP](https://anaconda.org/bioconda/ensembl-vep)
+Download the [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database
 
 ```bash
-conda create -n download_data_env python=3.7
-conda activate download_data_env
-conda install -c bioconda ensembl-vep=99.2
+mamba env create -f vep_database_install.yml
+conda activate vep_database_install
 vep_install -a cf -s homo_sapiens -y GRCh37 -c /output/file/path/GRCh37 --CONVERT
-conda deactivate
+conda activate pipeline_run_env
 ```
 
 Download the [CADD database](https://cadd.gs.washington.edu/download) and its associated index file.
@@ -167,14 +166,13 @@ Create a custom [dbNSFP database](https://sites.google.com/site/jpopgen/dbNSFP) 
 
 ### GRCh38
 
-Download [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database using a [conda install of Ensembl-VEP](https://anaconda.org/bioconda/ensembl-vep)
+Download the [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database
 
 ```bash
-mamba create -n download_data_env python=3.7
-conda activate download_data_env
-mamba install -c bioconda ensembl-vep=99.2
+mamba env create -f vep_database_install.yml
+conda activate vep_database_install
 vep_install -a cf -s homo_sapiens -y GRCh38 -c /output/file/path/GRCh38 --CONVERT
-conda deactivate
+conda activate pipeline_run_env
 ```
 
 Download the [CADD database](https://cadd.gs.washington.edu/download) and its associated index file.
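+
+Once both files have downloaded, a cheap way to confirm the index is usable is to list the indexed sequence names (a sketch; assumes `tabix` is available on your PATH and that the `.tsv.gz` and `.tsv.gz.tbi` files sit side by side):
+
+```bash
+# Should print the contig names covered by the CADD database
+tabix -l whole_genome_SNVs.tsv.gz | head
+```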
diff --git a/docs/running_on_a_single_machine.md b/docs/running_on_a_single_machine.md
index 107c3d0..cab350d 100644
--- a/docs/running_on_a_single_machine.md
+++ b/docs/running_on_a_single_machine.md
@@ -145,14 +145,13 @@ gsutil cp -r gs://genomics-public-data/resources/broad/hg38 /where/to/download/
 
 ### GRCh37
 
-Download the [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database using a [conda version of Ensembl-VEP](https://anaconda.org/bioconda/ensembl-vep)
+Download the [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database
 
 ```bash
-conda create -n download_data_env python=3.7
-conda activate download_data_env
-conda install -c bioconda ensembl-vep=99.2
-vep_install -a cf -s homo_sapiens -y GRCh37 -c /output/file/path/GRCh37 --CONVERT
-conda deactivate
+mamba env create -f vep_database_install.yml
+conda activate vep_database_install
+vep_install -a cf -s homo_sapiens -y GRCh37 -c ./GRCh37 --CONVERT
+conda activate pipeline_run_env
 ```
 
 Download the [CADD database](https://cadd.gs.washington.edu/download) and its associated index file.
@@ -166,14 +165,13 @@ Create a custom [dbNSFP database](https://sites.google.com/site/jpopgen/dbNSFP) 
 
 ### GRCh38
 
-Download [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database using a [conda install of Ensembl-VEP](https://anaconda.org/bioconda/ensembl-vep)
+Download the [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database
 
 ```bash
-mamba create -n download_data_env python=3.7
-conda activate download_data_env
-mamba install -c bioconda ensembl-vep=99.2
+mamba env create -f vep_database_install.yml
+conda activate vep_database_install
 vep_install -a cf -s homo_sapiens -y GRCh38 -c /output/file/path/GRCh38 --CONVERT
-conda deactivate
+conda activate pipeline_run_env
 ```
 
 Download the [CADD database](https://cadd.gs.washington.edu/download) and its associated index file.
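+
+As a quick check that the VEP cache from the blocks above landed where the pipeline expects it, list the cache directory (a sketch; the release number in the subdirectory name tracks the ensembl-vep version that `vep_install` used):
+
+```bash
+# vep_install lays the cache out as <cache_dir>/homo_sapiens/<release>_<assembly>
+ls /output/file/path/GRCh38/homo_sapiens/
+```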
diff --git a/workflow/envs/vep.yaml b/workflow/envs/vep.yaml deleted file mode 100644 index eb889c2..0000000 --- a/workflow/envs/vep.yaml +++ /dev/null @@ -1,6 +0,0 @@ -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - bioconda::ensembl-vep =99.2 diff --git a/workflow/rules/vep.smk b/workflow/rules/vep.smk index e40e1c6..484a3c0 100644 --- a/workflow/rules/vep.smk +++ b/workflow/rules/vep.smk @@ -9,13 +9,13 @@ rule vep: vcf = temp("../results/annotated/{sample}_filtered_dbnsfp_vep.vcf.gz") params: build = config['BUILD'], - other = "--compress_output bgzip --cache --offline --stats_text --everything --vcf --force_overwrite" + other = "--compress_output bgzip --offline --stats_text --everything --vcf --force_overwrite" log: "logs/vep/{sample}.log" benchmark: "benchmarks/vep/{sample}.tsv" - conda: - "../envs/vep.yaml" + singularity: + "docker://ensemblorg/ensembl-vep:release_106.1" threads: config['THREADS'] message: "Using the VEP database to determine the effect of the variants in {input.vcf}" diff --git a/workflow/vep_database_install.yml b/workflow/vep_database_install.yml new file mode 100644 index 0000000..5614825 --- /dev/null +++ b/workflow/vep_database_install.yml @@ -0,0 +1,151 @@ +name: vep_database_install +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - binutils_impl_linux-64=2.36.1=h193b22a_2 + - binutils_linux-64=2.36=hf3e587d_9 + - bzip2=1.0.8=h7f98852_4 + - c-ares=1.18.1=h7f98852_0 + - ca-certificates=2022.5.18=ha878542_0 + - clustalw=2.1=h9f5acd7_7 + - curl=7.83.1=h7bff187_0 + - ensembl-vep=106.1=pl5321h4a94de4_0 + - expat=2.4.8=h27087fc_0 + - gcc_impl_linux-64=10.3.0=hf2f2afa_16 + - gcc_linux-64=10.3.0=hc39de41_9 + - htslib=1.14=h9753748_2 + - kernel-headers_linux-64=2.6.32=he073ed8_15 + - keyutils=1.6.1=h166bdaf_0 + - krb5=1.19.3=h3790be6_0 + - ld_impl_linux-64=2.36.1=hea4e1c9_2 + - libcurl=7.83.1=h7bff187_0 + - libdb=6.2.32=h9c3ff4c_0 + - libdeflate=1.10=h7f98852_0 + - libedit=3.1.20191231=he28a2e2_2 + - libev=4.33=h516909a_1 + - libffi=3.3=h58526e2_2 + - libgcc-devel_linux-64=10.3.0=he6cfe16_16 + - libgcc-ng=12.1.0=h8d9b700_16 + - libgomp=12.1.0=h8d9b700_16 + - libnghttp2=1.47.0=h727a467_0 + - libnsl=2.0.0=h7f98852_0 + - libsanitizer=10.3.0=h26c7422_16 + - libssh2=1.10.0=ha56f1ee_2 + - libstdcxx-ng=12.1.0=ha89aaad_16 + - libzlib=1.2.11=h166bdaf_1014 + - mysql-connector-c=6.1.11=h6eb9d5d_1007 + - ncurses=6.3=h27087fc_1 + - openssl=1.1.1o=h166bdaf_0 + - paml=4.9=hec16e2b_7 + - perl=5.32.1=2_h7f98852_perl5 + - perl-algorithm-diff=1.201=pl5321hd8ed1ab_0 + - perl-base=2.23=pl5321hdfd78af_2 + - perl-bio-asn1-entrezgene=1.73=pl5321hdfd78af_3 + - perl-bio-coordinate=1.007001=pl5321hdfd78af_3 + - perl-bio-db-hts=3.01=pl5321h6233b05_5 + - perl-bio-featureio=1.6.905=pl5321hdfd78af_4 + - perl-bio-samtools=1.43=pl5321h7132678_3 + - perl-bio-searchio-hmmer=1.7.3=pl5321hdfd78af_0 + - perl-bio-tools-phylo-paml=1.7.3=pl5321hdfd78af_3 + - perl-bio-tools-run-alignment-clustalw=1.7.4=pl5321hdfd78af_3 + - perl-bio-tools-run-alignment-tcoffee=1.7.4=pl5321hdfd78af_4 + - perl-bioperl=1.7.8=hdfd78af_1 + - perl-bioperl-core=1.7.8=pl5321hdfd78af_1 + - perl-bioperl-run=1.007003=pl5321hdfd78af_0 + - perl-business-isbn=3.007=pl5321hdfd78af_0 + - perl-business-isbn-data=20210112.006=pl5321hdfd78af_0 + - perl-capture-tiny=0.48=pl5321ha770c72_1 + - perl-carp=1.50=pl5321hd8ed1ab_0 + - perl-class-data-inheritable=0.09=pl5321hdfd78af_0 + - perl-common-sense=3.75=pl5321hdfd78af_0 + - 
perl-compress-raw-bzip2=2.103=pl5321h87f3376_0 + - perl-compress-raw-zlib=2.103=pl5321h87f3376_0 + - perl-constant=1.33=pl5321hd8ed1ab_0 + - perl-data-dumper=2.183=pl5321hec16e2b_1 + - perl-db_file=1.858=pl5321h166bdaf_0 + - perl-dbd-mysql=4.050=pl5321h9f5acd7_0 + - perl-dbi=1.643=pl5321hec16e2b_1 + - perl-devel-checklib=1.14=pl5321hec16e2b_1 + - perl-devel-stacktrace=2.04=pl5321hdfd78af_1 + - perl-digest-hmac=1.04=pl5321hdfd78af_0 + - perl-digest-md5=2.58=pl5321hec16e2b_1 + - perl-encode=3.17=pl5321hec16e2b_0 + - perl-encode-locale=1.05=pl5321hdfd78af_7 + - perl-exception-class=1.45=pl5321hdfd78af_0 + - perl-exporter=5.74=pl5321hd8ed1ab_0 + - perl-extutils-makemaker=7.64=pl5321hd8ed1ab_0 + - perl-file-listing=6.15=pl5321hdfd78af_0 + - perl-file-slurp-tiny=0.004=pl5321hdfd78af_2 + - perl-file-sort=1.01=pl5321hdfd78af_3 + - perl-file-spec=3.48_01=pl5321hdfd78af_2 + - perl-getopt-long=2.52=pl5321hdfd78af_0 + - perl-html-parser=3.78=pl5321h9f5acd7_0 + - perl-html-tagset=3.20=pl5321hdfd78af_4 + - perl-http-cookies=6.10=pl5321hdfd78af_0 + - perl-http-daemon=6.14=pl5321hdfd78af_0 + - perl-http-date=6.05=pl5321hdfd78af_0 + - perl-http-message=6.36=pl5321hdfd78af_0 + - perl-http-negotiate=6.01=pl5321hdfd78af_4 + - perl-io-compress=2.106=pl5321h87f3376_0 + - perl-io-html=1.004=pl5321hdfd78af_0 + - perl-io-socket-ssl=2.074=pl5321hdfd78af_0 + - perl-io-string=1.08=pl5321hdfd78af_4 + - perl-io-tty=1.16=pl5321hec16e2b_1 + - perl-ipc-run=20200505.0=pl5321hdfd78af_0 + - perl-json=4.05=pl5321hdfd78af_0 + - perl-json-xs=2.34=pl5321h9f5acd7_5 + - perl-libwww-perl=6.64=pl5321hdfd78af_0 + - perl-libxml-perl=0.08=pl5321hdfd78af_3 + - perl-lwp-mediatypes=6.04=pl5321hdfd78af_1 + - perl-mime-base64=3.16=pl5321hec16e2b_2 + - perl-net-http=6.22=pl5321hdfd78af_0 + - perl-net-ssleay=1.92=pl5321h0e0aaa8_1 + - perl-ntlm=1.09=pl5321hdfd78af_5 + - perl-parent=0.238=pl5321hd8ed1ab_0 + - perl-perlio-gzip=0.20=pl5321h7132678_3 + - perl-scalar-list-utils=1.62=pl5321hec16e2b_0 + - perl-sereal=4.019=pl5321hdfd78af_0 + - perl-sereal-decoder=4.023=pl5321hec16e2b_1 + - perl-sereal-encoder=4.023=pl5321hec16e2b_0 + - perl-set-intervaltree=0.12=pl5321h87f3376_2 + - perl-socket=2.027=pl5321hec16e2b_3 + - perl-sub-uplevel=0.2800=pl5321hec16e2b_4 + - perl-test-deep=1.130=pl5321hdfd78af_0 + - perl-test-differences=0.69=pl5321hdfd78af_0 + - perl-test-exception=0.43=pl5321hdfd78af_3 + - perl-test-harness=3.44=pl5321hdfd78af_0 + - perl-test-most=0.37=pl5321hdfd78af_0 + - perl-test-warn=0.36=pl5321hdfd78af_2 + - perl-text-csv=2.01=pl5321hdfd78af_0 + - perl-text-diff=1.45=pl5321hdfd78af_1 + - perl-time-local=1.30=pl5321hdfd78af_0 + - perl-timedate=2.33=pl5321hdfd78af_2 + - perl-tree-dag_node=1.32=pl5321hdfd78af_0 + - perl-try-tiny=0.31=pl5321hdfd78af_0 + - perl-types-serialiser=1.01=pl5321hdfd78af_0 + - perl-uri=5.10=pl5321hdfd78af_0 + - perl-url-encode=0.03=pl5321h9ee0642_0 + - perl-www-robotrules=6.02=pl5321hdfd78af_4 + - perl-xml-dom=1.46=pl5321hdfd78af_1 + - perl-xml-dom-xpath=0.14=pl5321hdfd78af_2 + - perl-xml-parser=2.44_01=pl5321hc3e0081_1003 + - perl-xml-regexp=0.04=pl5321hdfd78af_3 + - perl-xml-xpathengine=0.14=pl5321hdfd78af_3 + - pip=22.1=pyhd8ed1ab_0 + - python=3.7.13=h12debd9_0 + - python_abi=3.7=2_cp37m + - readline=8.1=h46c0cb4_0 + - setuptools=62.3.1=py37h89c1867_0 + - sqlite=3.38.5=h4ff8645_0 + - sysroot_linux-64=2.12=he073ed8_15 + - t_coffee=11.0.8=py37hea885bf_8 + - tk=8.6.12=h27826a3_0 + - unzip=6.0=h7f98852_3 + - wheel=0.37.1=pyhd8ed1ab_0 + - xz=5.2.5=h516909a_1 + - zlib=1.2.11=h166bdaf_1014 +prefix: 
/home/lkemp/miniconda3/envs/vep_database_install From 7dea68f72a80e2f40b14ba9e6a111271da63566c Mon Sep 17 00:00:00 2001 From: Leah Kemp Date: Thu, 19 May 2022 17:57:04 +1200 Subject: [PATCH 05/20] port more software to containers --- workflow/envs/bcftools.yaml | 6 ------ workflow/envs/bgzip.yaml | 6 ------ workflow/envs/genmod.yaml | 8 -------- workflow/envs/tabix.yaml | 6 ------ workflow/rules/bcftools_view_multiallelicsites.smk | 4 ++-- workflow/rules/bgzip.smk | 4 ++-- workflow/rules/genmod_annotate_CADD.smk | 4 ++-- workflow/rules/genmod_models.smk | 4 ++-- workflow/rules/genmod_score_cohort.smk | 4 ++-- workflow/rules/genmod_score_single.smk | 4 ++-- workflow/rules/tabix.smk | 4 ++-- 11 files changed, 14 insertions(+), 40 deletions(-) delete mode 100644 workflow/envs/bcftools.yaml delete mode 100644 workflow/envs/bgzip.yaml delete mode 100644 workflow/envs/genmod.yaml delete mode 100644 workflow/envs/tabix.yaml diff --git a/workflow/envs/bcftools.yaml b/workflow/envs/bcftools.yaml deleted file mode 100644 index 1718462..0000000 --- a/workflow/envs/bcftools.yaml +++ /dev/null @@ -1,6 +0,0 @@ -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - bioconda::bcftools =1.10.2 \ No newline at end of file diff --git a/workflow/envs/bgzip.yaml b/workflow/envs/bgzip.yaml deleted file mode 100644 index 569f58e..0000000 --- a/workflow/envs/bgzip.yaml +++ /dev/null @@ -1,6 +0,0 @@ -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - bioconda::htslib = 1.10.2 diff --git a/workflow/envs/genmod.yaml b/workflow/envs/genmod.yaml deleted file mode 100644 index a86c9ac..0000000 --- a/workflow/envs/genmod.yaml +++ /dev/null @@ -1,8 +0,0 @@ -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - conda-forge::pip =20.2.2 - - pip: - - genmod==3.7.3 \ No newline at end of file diff --git a/workflow/envs/tabix.yaml b/workflow/envs/tabix.yaml deleted file mode 100644 index 534984c..0000000 --- a/workflow/envs/tabix.yaml +++ /dev/null @@ -1,6 +0,0 @@ -channels: - - bioconda - - conda-forge - - defaults -dependencies: - - bioconda::tabix =0.2.6 \ No newline at end of file diff --git a/workflow/rules/bcftools_view_multiallelicsites.smk b/workflow/rules/bcftools_view_multiallelicsites.smk index b618de2..937b925 100644 --- a/workflow/rules/bcftools_view_multiallelicsites.smk +++ b/workflow/rules/bcftools_view_multiallelicsites.smk @@ -9,8 +9,8 @@ rule bcftools_view_multiallelicsites: "logs/bcftools_view_multiallelicsites/{sample}.log" benchmark: "benchmarks/bcftools_view_multiallelicsites/{sample}.tsv" - conda: - "../envs/bcftools.yaml" + singularity: + "docker://biocontainers/bcftools:v1.9-1-deb_cv1" message: "Filtering out multiallelic sites in {input}" shell: diff --git a/workflow/rules/bgzip.smk b/workflow/rules/bgzip.smk index a27c21d..9c912e3 100644 --- a/workflow/rules/bgzip.smk +++ b/workflow/rules/bgzip.smk @@ -3,8 +3,8 @@ rule bgzip: "../results/annotated/{sample}_filtered_dbnsfp.vcf" output: temp("../results/annotated/{sample}_filtered_dbnsfp.vcf.gz") - conda: - "../envs/bgzip.yaml" + singularity: + "docker://staphb/samtools:1.15" message: "Bgzipping {input}" shell: diff --git a/workflow/rules/genmod_annotate_CADD.smk b/workflow/rules/genmod_annotate_CADD.smk index 731bc6b..749920b 100644 --- a/workflow/rules/genmod_annotate_CADD.smk +++ b/workflow/rules/genmod_annotate_CADD.smk @@ -8,8 +8,8 @@ rule genmod_annotate_CADD: "logs/genmod_annotate_CADD/{sample}.log" benchmark: "benchmarks/genmod_annotate_CADD/{sample}.tsv" - conda: - "../envs/genmod.yaml" + 
singularity: + "docker://clinicalgenomics/genmod:3.7.4" message: "Using genmod to annotate {input.vcf} with CADD" shell: diff --git a/workflow/rules/genmod_models.smk b/workflow/rules/genmod_models.smk index 9b71291..c6ba674 100644 --- a/workflow/rules/genmod_models.smk +++ b/workflow/rules/genmod_models.smk @@ -11,8 +11,8 @@ rule genmod_models: "logs/genmod_models/{sample}.log" benchmark: "benchmarks/genmod_models/{sample}.tsv" - conda: - "../envs/genmod.yaml" + singularity: + "docker://clinicalgenomics/genmod:3.7.4" threads: config['THREADS'] message: "Annotating {input.vcf} with patterns of inheritance" diff --git a/workflow/rules/genmod_score_cohort.smk b/workflow/rules/genmod_score_cohort.smk index b96390e..302316a 100644 --- a/workflow/rules/genmod_score_cohort.smk +++ b/workflow/rules/genmod_score_cohort.smk @@ -10,8 +10,8 @@ rule genmod_score_cohort: "logs/genmod_score/{sample}.log" benchmark: "benchmarks/genmod_score/{sample}.tsv" - conda: - "../envs/genmod.yaml" + singularity: + "docker://clinicalgenomics/genmod:3.7.4" message: "Scoring the variants in {input.vcf} based on several annotations" shell: diff --git a/workflow/rules/genmod_score_single.smk b/workflow/rules/genmod_score_single.smk index 05dc9b8..c8a7e5b 100644 --- a/workflow/rules/genmod_score_single.smk +++ b/workflow/rules/genmod_score_single.smk @@ -10,8 +10,8 @@ rule genmod_score_single: "logs/genmod_score/{sample}.log" benchmark: "benchmarks/genmod_score/{sample}.tsv" - conda: - "../envs/genmod.yaml" + singularity: + "docker://clinicalgenomics/genmod:3.7.4" message: "Scoring the variants in {input.vcf} based on several annotations" shell: diff --git a/workflow/rules/tabix.smk b/workflow/rules/tabix.smk index f77b5b0..55d8720 100644 --- a/workflow/rules/tabix.smk +++ b/workflow/rules/tabix.smk @@ -3,8 +3,8 @@ rule tabix: "../results/annotated/{sample}_filtered_dbnsfp.vcf.gz" output: temp("../results/annotated/{sample}_filtered_dbnsfp.vcf.gz.tbi") - conda: - "../envs/bgzip.yaml" + singularity: + "docker://staphb/samtools:1.15" message: "Tabix indexing {input}" shell: From ff8f1380b0eed2dd0052cae277709040204a89fe Mon Sep 17 00:00:00 2001 From: Leah Kemp Date: Thu, 19 May 2022 18:01:56 +1200 Subject: [PATCH 06/20] tidy up conda envs --- docs/running_on_a_hpc.md | 6 +++--- docs/running_on_a_single_machine.md | 6 +++--- workflow/{ => envs}/pipeline_run_env.yml | 0 workflow/{ => envs}/vep_database_install.yml | 0 4 files changed, 6 insertions(+), 6 deletions(-) rename workflow/{ => envs}/pipeline_run_env.yml (100%) rename workflow/{ => envs}/vep_database_install.yml (100%) diff --git a/docs/running_on_a_hpc.md b/docs/running_on_a_hpc.md index 65e2ce3..2cf852c 100644 --- a/docs/running_on_a_hpc.md +++ b/docs/running_on_a_hpc.md @@ -120,7 +120,7 @@ This software is commonly pre-installed on HPC's, likely available as modules th ```bash cd ./workflow/ -mamba env create -f pipeline_run_env.yml +mamba env create -f ./envs/pipeline_run_env.yml conda activate pipeline_run_env ``` @@ -149,7 +149,7 @@ gsutil cp -r gs://genomics-public-data/resources/broad/hg38 /where/to/download/ Download the [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database ```bash -mamba env create -f vep_database_install.yml +mamba env create -f ./envs/vep_database_install.yml conda activate vep_database_install vep_install -a cf -s homo_sapiens -y GRCh37 -c /output/file/path/GRCh37 --CONVERT conda activate pipeline_run_env @@ -169,7 +169,7 @@ Create a custom [dbNSFP database](https://sites.google.com/site/jpopgen/dbNSFP) Download 
[Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database ```bash -mamba env create -f vep_database_install.yml +mamba env create -f ./envs/vep_database_install.yml conda activate vep_database_install vep_install -a cf -s homo_sapiens -y GRCh38 -c /output/file/path/GRCh38 --CONVERT conda activate pipeline_run_env diff --git a/docs/running_on_a_single_machine.md b/docs/running_on_a_single_machine.md index cab350d..1174a60 100644 --- a/docs/running_on_a_single_machine.md +++ b/docs/running_on_a_single_machine.md @@ -119,7 +119,7 @@ This software is commonly pre-installed on HPC's, likely available as modules th ```bash cd ./workflow/ -mamba env create -f pipeline_run_env.yml +mamba env create -f ./envs/pipeline_run_env.yml conda activate pipeline_run_env ``` @@ -148,7 +148,7 @@ gsutil cp -r gs://genomics-public-data/resources/broad/hg38 /where/to/download/ Download the [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database ```bash -mamba env create -f vep_database_install.yml +mamba env create -f ./envs/vep_database_install.yml conda activate vep_database_install vep_install -a cf -s homo_sapiens -y GRCh37 -c ./GRCh37 --CONVERT conda activate pipeline_run_env @@ -168,7 +168,7 @@ Create a custom [dbNSFP database](https://sites.google.com/site/jpopgen/dbNSFP) Download [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database ```bash -mamba env create -f vep_database_install.yml +mamba env create -f ./envs/vep_database_install.yml conda activate vep_database_install vep_install -a cf -s homo_sapiens -y GRCh38 -c /output/file/path/GRCh38 --CONVERT conda activate pipeline_run_env diff --git a/workflow/pipeline_run_env.yml b/workflow/envs/pipeline_run_env.yml similarity index 100% rename from workflow/pipeline_run_env.yml rename to workflow/envs/pipeline_run_env.yml diff --git a/workflow/vep_database_install.yml b/workflow/envs/vep_database_install.yml similarity index 100% rename from workflow/vep_database_install.yml rename to workflow/envs/vep_database_install.yml From 7488badaf5880de13adb80e7258df967d78c0257 Mon Sep 17 00:00:00 2001 From: Leah Kemp Date: Fri, 20 May 2022 14:16:25 +1200 Subject: [PATCH 07/20] use newer versions of software and be even more specific about version --- workflow/envs/SnpSift.yaml | 2 +- workflow/envs/gunzip.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/envs/SnpSift.yaml b/workflow/envs/SnpSift.yaml index 88c4076..100fffd 100644 --- a/workflow/envs/SnpSift.yaml +++ b/workflow/envs/SnpSift.yaml @@ -3,4 +3,4 @@ channels: - conda-forge - defaults dependencies: - - bioconda::snpsift =4.3.1t + - bioconda::snpsift=5.1=hdfd78af_0 diff --git a/workflow/envs/gunzip.yaml b/workflow/envs/gunzip.yaml index bedd5b1..80a4d87 100644 --- a/workflow/envs/gunzip.yaml +++ b/workflow/envs/gunzip.yaml @@ -3,6 +3,6 @@ channels: - conda-forge - defaults dependencies: - - conda-forge::pip =20.2.2 + - conda-forge::pip=22.0.4=pyhd8ed1ab_0 - pip: - gunzip==0.1.10 \ No newline at end of file From 38aec8a15f0bb26327052086251c03eb5f54d130 Mon Sep 17 00:00:00 2001 From: Leah Kemp Date: Fri, 20 May 2022 14:18:23 +1200 Subject: [PATCH 08/20] used more reputable containers --- workflow/rules/bgzip.smk | 2 +- workflow/rules/tabix.smk | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/bgzip.smk b/workflow/rules/bgzip.smk index 9c912e3..7df48bc 100644 --- a/workflow/rules/bgzip.smk +++ b/workflow/rules/bgzip.smk @@ -4,7 +4,7 @@ rule bgzip: output: 
temp("../results/annotated/{sample}_filtered_dbnsfp.vcf.gz") singularity: - "docker://staphb/samtools:1.15" + "docker://biocontainers/samtools:v1.9-4-deb_cv1" message: "Bgzipping {input}" shell: diff --git a/workflow/rules/tabix.smk b/workflow/rules/tabix.smk index 55d8720..0c45478 100644 --- a/workflow/rules/tabix.smk +++ b/workflow/rules/tabix.smk @@ -4,7 +4,7 @@ rule tabix: output: temp("../results/annotated/{sample}_filtered_dbnsfp.vcf.gz.tbi") singularity: - "docker://staphb/samtools:1.15" + "docker://biocontainers/tabix:v1.9-11-deb_cv1" message: "Tabix indexing {input}" shell: From 66f32bdd8bdca5f0393500633bcaeda03d4aa021 Mon Sep 17 00:00:00 2001 From: Leah Kemp Date: Fri, 20 May 2022 14:23:12 +1200 Subject: [PATCH 09/20] add -f flag to gsutil to multithread --- docs/running_on_a_hpc.md | 4 ++-- docs/running_on_a_single_machine.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/running_on_a_hpc.md b/docs/running_on_a_hpc.md index 2cf852c..a07ab9d 100644 --- a/docs/running_on_a_hpc.md +++ b/docs/running_on_a_hpc.md @@ -131,7 +131,7 @@ conda activate pipeline_run_env Download from [Google Cloud Bucket](https://console.cloud.google.com/storage/browser/gatk-legacy-bundles/b37?prefix=) ```bash -gsutil cp -r gs://gatk-legacy-bundles/b37 /where/to/download/ +gsutil -m cp -r gs://gatk-legacy-bundles/b37 /where/to/download/ ``` ### hg38 @@ -139,7 +139,7 @@ gsutil cp -r gs://gatk-legacy-bundles/b37 /where/to/download/ Download from [Google Cloud Bucket](https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0) ```bash -gsutil cp -r gs://genomics-public-data/resources/broad/hg38 /where/to/download/ +gsutil -m cp -r gs://genomics-public-data/resources/broad/hg38 /where/to/download/ ``` ## 7. Create a local copy of other databases (either GRCh37 or GRCh38) diff --git a/docs/running_on_a_single_machine.md b/docs/running_on_a_single_machine.md index 1174a60..c7896c7 100644 --- a/docs/running_on_a_single_machine.md +++ b/docs/running_on_a_single_machine.md @@ -130,7 +130,7 @@ conda activate pipeline_run_env Download from [Google Cloud Bucket](https://console.cloud.google.com/storage/browser/gatk-legacy-bundles/b37?prefix=) ```bash -gsutil cp -r gs://gatk-legacy-bundles/b37 /where/to/download/ +gsutil -m cp -r gs://gatk-legacy-bundles/b37 /where/to/download/ ``` ### hg38 @@ -138,7 +138,7 @@ gsutil cp -r gs://gatk-legacy-bundles/b37 /where/to/download/ Download from [Google Cloud Bucket](https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0) ```bash -gsutil cp -r gs://genomics-public-data/resources/broad/hg38 /where/to/download/ +gsutil -m cp -r gs://genomics-public-data/resources/broad/hg38 /where/to/download/ ``` ## 7. 
Create a local copy of other databases (either GRCh37 or GRCh38) From 7bcc61a5715f5bcd9238fa9bf60abd23b812750f Mon Sep 17 00:00:00 2001 From: Leah Kemp Date: Fri, 20 May 2022 15:48:38 +1200 Subject: [PATCH 10/20] create single download db conda env --- docs/running_on_a_hpc.md | 40 +++++++---- docs/running_on_a_single_machine.md | 40 +++++++---- ...line_run_env.yml => pipeline_run_env.yaml} | 51 +++++--------- ...e_install.yml => vap_download_db_env.yaml} | 69 +++++++++++++++---- 4 files changed, 123 insertions(+), 77 deletions(-) rename workflow/envs/{pipeline_run_env.yml => pipeline_run_env.yaml} (80%) rename workflow/envs/{vep_database_install.yml => vap_download_db_env.yaml} (74%) diff --git a/docs/running_on_a_hpc.md b/docs/running_on_a_hpc.md index a07ab9d..96410e4 100644 --- a/docs/running_on_a_hpc.md +++ b/docs/running_on_a_hpc.md @@ -9,7 +9,7 @@ - [3. Setup files and directories](#3-setup-files-and-directories) - [Test data](#test-data) - [4. Get prerequisite software/hardware](#4-get-prerequisite-softwarehardware) - - [5. Create and activate a conda environment with python, snakemake, gsutil and wget installed](#5-create-and-activate-a-conda-environment-with-python-snakemake-gsutil-and-wget-installed) + - [5. Create and activate a conda environment with software for downloading databases](#5-create-and-activate-a-conda-environment-with-software-for-downloading-databases) - [6. Create a local copy of the GATK resource bundle (either b37 or hg38)](#6-create-a-local-copy-of-the-gatk-resource-bundle-either-b37-or-hg38) - [b37](#b37) - [hg38](#hg38) @@ -25,11 +25,12 @@ - [VCF annotation](#vcf-annotation) - [9. Configure to run on a HPC](#9-configure-to-run-on-a-hpc) - [10. Modify the run scripts](#10-modify-the-run-scripts) - - [11. Run the pipeline](#11-run-the-pipeline) - - [12. Evaluate the pipeline run](#12-evaluate-the-pipeline-run) - - [13. Commit and push to your forked version of the github repo](#13-commit-and-push-to-your-forked-version-of-the-github-repo) - - [14. Repeat step 13 each time you re-run the analysis with different parameters](#14-repeat-step-13-each-time-you-re-run-the-analysis-with-different-parameters) - - [15. Raise issues, create feature requests or create a pull request with the upstream repo to merge any useful changes to the pipeline (optional)](#15-raise-issues-create-feature-requests-or-create-a-pull-request-with-the-upstream-repo-to-merge-any-useful-changes-to-the-pipeline-optional) + - [11. Create and activate a conda environment with software for running the pipeline](#11-create-and-activate-a-conda-environment-with-software-for-running-the-pipeline) + - [12. Run the pipeline](#12-run-the-pipeline) + - [13. Evaluate the pipeline run](#13-evaluate-the-pipeline-run) + - [14. Commit and push to your forked version of the github repo](#14-commit-and-push-to-your-forked-version-of-the-github-repo) + - [15. Repeat step 14 each time you re-run the analysis with different parameters](#15-repeat-step-14-each-time-you-re-run-the-analysis-with-different-parameters) + - [16. Raise issues, create feature requests or create a pull request with the upstream repo to merge any useful changes to the pipeline (optional)](#16-raise-issues-create-feature-requests-or-create-a-pull-request-with-the-upstream-repo-to-merge-any-useful-changes-to-the-pipeline-optional) ## 1. 
Fork the pipeline repo to a personal or lab account
 
@@ -116,12 +117,14 @@ Other software required to get setup and run the pipeline:
 
 This software is commonly pre-installed on HPC's, likely available as modules that can be loaded. Talk to your system administrator if you need help with this.
 
-## 5. Create and activate a conda environment with python, snakemake, gsutil and wget installed
+## 5. Create and activate a conda environment with software for downloading databases
+
+This installs [gsutil](https://cloud.google.com/storage/docs/gsutil), [ensembl-vep](https://grch37.ensembl.org/info/docs/tools/vep/index.html), [wget](https://www.gnu.org/software/wget/) and their dependencies
 
 ```bash
 cd ./workflow/
-mamba env create -f ./envs/pipeline_run_env.yml
-conda activate pipeline_run_env
+mamba env create -f ./envs/vap_download_db_env.yaml
+conda activate vap_download_db_env
 ```
 
 ## 6. Create a local copy of the [GATK resource bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle) (either b37 or hg38)
@@ -415,7 +418,16 @@ snakemake \
 
 See the [snakemake documentation](https://snakemake.readthedocs.io/en/v4.5.1/executable.html#all-options) for additional run parameters.
 
-## 11. Run the pipeline
+## 11. Create and activate a conda environment with software for running the pipeline
+
+This installs [snakemake](https://snakemake.github.io/) and its dependencies
+
+```bash
+mamba env create -f ./envs/pipeline_run_env.yaml
+conda activate pipeline_run_env
+```
+
+## 12. Run the pipeline
 
 First carry out a dry run
 
@@ -429,7 +441,7 @@ If there are no issues, start a full run
 bash run_hpc.sh
 ```
 
-## 12. Evaluate the pipeline run
+## 13. Evaluate the pipeline run
 
 Generate an interactive html report
 
@@ -437,7 +449,7 @@ Generate an interactive html report
 bash report.sh
 ```
 
-## 13. Commit and push to your forked version of the github repo
+## 14. Commit and push to your forked version of the github repo
 
 To maintain reproducibility, commit and push:
 
@@ -445,8 +457,8 @@ To maintain reproducibility, commit and push:
 - All run scripts
 - The final report
 
-## 14. Repeat step 13 each time you re-run the analysis with different parameters
+## 15. Repeat step 14 each time you re-run the analysis with different parameters
 
-## 15. Raise issues, create feature requests or create a pull request with the [upstream repo](https://github.com/ESR-NZ/vcf_annotation_pipeline) to merge any useful changes to the pipeline (optional)
+## 16. Raise issues, create feature requests or create a pull request with the [upstream repo](https://github.com/ESR-NZ/vcf_annotation_pipeline) to merge any useful changes to the pipeline (optional)
 
 See [the README](https://github.com/ESR-NZ/vcf_annotation_pipeline/blob/dev/README.md#contribute-back) for info on how to contribute back to the pipeline!
diff --git a/docs/running_on_a_single_machine.md b/docs/running_on_a_single_machine.md
index c7896c7..f54a445 100644
--- a/docs/running_on_a_single_machine.md
+++ b/docs/running_on_a_single_machine.md
@@ -9,7 +9,7 @@
 - [3. Setup files and directories](#3-setup-files-and-directories)
   - [Test data](#test-data)
 - [4. Get prerequisite software/hardware](#4-get-prerequisite-softwarehardware)
- - [5. 
Create and activate a conda environment with software for downloading databases](#5-create-and-activate-a-conda-environment-with-software-for-downloading-databases) - [6. Create a local copy of the GATK resource bundle (either b37 or hg38)](#6-create-a-local-copy-of-the-gatk-resource-bundle-either-b37-or-hg38) - [b37](#b37) - [hg38](#hg38) @@ -24,11 +24,12 @@ - [Cohort samples](#cohort-samples) - [VCF annotation](#vcf-annotation) - [9. Modify the run scripts](#9-modify-the-run-scripts) - - [10. Run the pipeline](#10-run-the-pipeline) - - [11. Evaluate the pipeline run](#11-evaluate-the-pipeline-run) - - [12. Commit and push to your forked version of the github repo](#12-commit-and-push-to-your-forked-version-of-the-github-repo) - - [13. Repeat step 12 each time you re-run the analysis with different parameters](#13-repeat-step-12-each-time-you-re-run-the-analysis-with-different-parameters) - - [14. Raise issues, create feature requests or create a pull request with the upstream repo to merge any useful changes to the pipeline (optional)](#14-raise-issues-create-feature-requests-or-create-a-pull-request-with-the-upstream-repo-to-merge-any-useful-changes-to-the-pipeline-optional) + - [10. Create and activate a conda environment with software for running the pipeline](#10-create-and-activate-a-conda-environment-with-software-for-running-the-pipeline) + - [11. Run the pipeline](#11-run-the-pipeline) + - [12. Evaluate the pipeline run](#12-evaluate-the-pipeline-run) + - [13. Commit and push to your forked version of the github repo](#13-commit-and-push-to-your-forked-version-of-the-github-repo) + - [14. Repeat step 13 each time you re-run the analysis with different parameters](#14-repeat-step-13-each-time-you-re-run-the-analysis-with-different-parameters) + - [15. Raise issues, create feature requests or create a pull request with the upstream repo to merge any useful changes to the pipeline (optional)](#15-raise-issues-create-feature-requests-or-create-a-pull-request-with-the-upstream-repo-to-merge-any-useful-changes-to-the-pipeline-optional) ## 1. Fork the pipeline repo to a personal or lab account @@ -115,12 +116,14 @@ Other software required to get setup and run the pipeline: This software is commonly pre-installed on HPC's, likely available as modules that can be loaded. Talk to your system administrator if you need help with this. -## 5. Create and activate a conda environment with python, snakemake, gsutil and wget installed +## 5. Create and activate a conda environment with software for downloading databases + +This installs [gsutil](https://cloud.google.com/storage/docs/gsutil), [ensembl-vep](https://grch37.ensembl.org/info/docs/tools/vep/index.html), [wget](https://www.gnu.org/software/wget/) and their dependencies ```bash cd ./workflow/ -mamba env create -f ./envs/pipeline_run_env.yml -conda activate pipeline_run_env +mamba env create -f ./envs/vap_download_db_env.yaml +conda activate vap_download_db_env ``` ## 6. Create a local copy of the [GATK resource bundle](https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle) (either b37 or hg38) @@ -385,7 +388,16 @@ snakemake \ See the [snakemake documentation](https://snakemake.readthedocs.io/en/v4.5.1/executable.html#all-options) for additional run parameters. -## 10. Run the pipeline +## 10. 
Create and activate a conda environment with software for running the pipeline
+
+This installs [snakemake](https://snakemake.github.io/) and its dependencies
+
+```bash
+mamba env create -f ./envs/pipeline_run_env.yaml
+conda activate pipeline_run_env
+```
+
+## 11. Run the pipeline
 
 First carry out a dry run
 
@@ -399,7 +411,7 @@ If there are no issues, start a full run
 bash run_hpc.sh
 ```
 
-## 11. Evaluate the pipeline run
+## 12. Evaluate the pipeline run
 
 Generate an interactive html report
 
@@ -407,7 +419,7 @@ Generate an interactive html report
 bash report.sh
 ```
 
-## 12. Commit and push to your forked version of the github repo
+## 13. Commit and push to your forked version of the github repo
 
 To maintain reproducibility, commit and push:
 
@@ -415,8 +427,8 @@ To maintain reproducibility, commit and push:
 - All run scripts
 - The final report
 
-## 13. Repeat step 12 each time you re-run the analysis with different parameters
+## 14. Repeat step 13 each time you re-run the analysis with different parameters
 
-## 14. Raise issues, create feature requests or create a pull request with the [upstream repo](https://github.com/ESR-NZ/vcf_annotation_pipeline) to merge any useful changes to the pipeline (optional)
+## 15. Raise issues, create feature requests or create a pull request with the [upstream repo](https://github.com/ESR-NZ/vcf_annotation_pipeline) to merge any useful changes to the pipeline (optional)
 
 See [the README](https://github.com/ESR-NZ/vcf_annotation_pipeline/blob/dev/README.md#contribute-back) for info on how to contribute back to the pipeline!
diff --git a/workflow/envs/pipeline_run_env.yml b/workflow/envs/pipeline_run_env.yaml
similarity index 80%
rename from workflow/envs/pipeline_run_env.yml
rename to workflow/envs/pipeline_run_env.yaml
index e25b131..a6f5e24 100644
--- a/workflow/envs/pipeline_run_env.yml
+++ b/workflow/envs/pipeline_run_env.yaml
@@ -11,62 +11,51 @@ dependencies:
   - aiosignal=1.2.0=pyhd8ed1ab_0
   - amply=0.1.5=pyhd8ed1ab_0
   - appdirs=1.4.4=pyh9f0ad1d_0
-  - argcomplete=2.0.0=pyhd8ed1ab_0
   - async-timeout=4.0.2=pyhd8ed1ab_0
   - attmap=0.13.2=pyhd8ed1ab_0
   - attrs=21.4.0=pyhd8ed1ab_0
   - backports=1.0=py_2
   - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0
   - bcrypt=3.2.2=py310h5764c6d_0
-  - boto=2.49.0=py_0
-  - boto3=1.23.2=pyhd8ed1ab_0
-  - botocore=1.26.2=pyhd8ed1ab_0
+  - boto3=1.23.3=pyhd8ed1ab_0
+  - botocore=1.26.3=pyhd8ed1ab_0
   - brotlipy=0.7.0=py310h5764c6d_1004
   - bzip2=1.0.8=h7f98852_4
   - c-ares=1.18.1=h7f98852_0
-  - ca-certificates=2022.5.18=ha878542_0
+  - ca-certificates=2022.5.18.1=ha878542_0
   - cachetools=5.0.0=pyhd8ed1ab_0
   - certifi=2022.5.18=py310hff52083_0
-  - cffi=1.15.0=py310hd667e15_1
+  - cffi=1.15.0=py310h0fdd8cc_0
   - charset-normalizer=2.0.12=pyhd8ed1ab_0
   - coincbc=2.10.5=hcee13e7_1
   - configargparse=1.5.3=pyhd8ed1ab_0
   - connection_pool=0.0.3=pyhd3deb0d_0
-  - coverage=6.3.3=py310h5764c6d_0
-  - crcmod=1.7=py310h5764c6d_1008
   - cryptography=37.0.1=py310h9ce1e76_0
   - datrie=0.8.2=py310h6acc77f_3
   - decorator=5.1.1=pyhd8ed1ab_0
   - defusedxml=0.7.1=pyhd8ed1ab_0
   - docutils=0.18.1=py310hff52083_1
   - dpath=2.0.6=py310hff52083_1
-  - dropbox=11.30.0=pyhd8ed1ab_0
-  - fasteners=0.17.3=pyhd8ed1ab_0
+  - dropbox=11.31.0=pyhd8ed1ab_0
   - filechunkio=1.8=py_2
   - filelock=3.7.0=pyhd8ed1ab_0
   - frozenlist=1.3.0=py310h5764c6d_1
   - ftputil=5.0.4=pyhd8ed1ab_0
-  - gcs-oauth2-boto-plugin=3.0=pyhd8ed1ab_0
-  - gettext=0.19.8.1=h0b5b191_1005
   - gitdb=4.0.9=pyhd8ed1ab_0
   - gitpython=3.1.27=pyhd8ed1ab_0
   - google-api-core=2.7.1=pyhd8ed1ab_0
   - google-api-python-client=2.48.0=pyhd8ed1ab_0
-  - 
google-apitools=0.5.32=pyhd8ed1ab_0 - google-auth=2.6.6=pyh6c4a22f_0 - google-auth-httplib2=0.1.0=pyhd8ed1ab_0 - google-cloud-core=2.2.2=pyh6c4a22f_0 - google-cloud-storage=2.1.0=pyh6c4a22f_0 - google-crc32c=1.1.2=py310he8fe98e_3 - - google-reauth=0.1.1=pyhd3deb0d_0 - google-resumable-media=2.1.0=pyh6c4a22f_0 - googleapis-common-protos=1.56.1=py310hff52083_0 - - grpcio=1.35.0=py310hce63b2e_0 - - gsutil=5.10=pyhd8ed1ab_0 + - grpcio=1.46.1=py310hba10ccf_0 - httplib2=0.20.4=pyhd8ed1ab_0 - idna=3.3=pyhd8ed1ab_0 - importlib-metadata=4.11.3=py310hff52083_1 - - importlib_metadata=4.11.3=hd8ed1ab_1 - importlib_resources=5.7.1=pyhd8ed1ab_1 - iniconfig=1.1.1=pyh9f0ad1d_0 - jinja2=3.1.2=pyhd8ed1ab_0 @@ -77,34 +66,32 @@ dependencies: - libblas=3.9.0=14_linux64_openblas - libcblas=3.9.0=14_linux64_openblas - libcrc32c=1.1.2=h9c3ff4c_0 - - libffi=3.3=h58526e2_2 + - libffi=3.4.2=h7f98852_5 - libgcc-ng=12.1.0=h8d9b700_16 - libgfortran-ng=12.1.0=h69a702a_16 - libgfortran5=12.1.0=hdcd56e2_16 - libgomp=12.1.0=h8d9b700_16 - - libidn2=2.3.2=h7f98852_0 - liblapack=3.9.0=14_linux64_openblas - libnsl=2.0.0=h7f98852_0 - libopenblas=0.3.20=pthreads_h78a6416_0 - - libprotobuf=3.20.1=h4ff587b_0 + - libprotobuf=3.20.1=h6239696_0 - libsodium=1.0.18=h36c2ea0_1 - libstdcxx-ng=12.1.0=ha89aaad_16 - - libunistring=0.9.10=h7f98852_0 - - libuuid=1.0.3=h7f8727e_2 + - libuuid=2.32.1=h7f98852_1000 + - libzlib=1.2.11=h166bdaf_1014 - logmuse=0.2.6=pyh8c360ce_0 - markupsafe=2.1.1=py310h5764c6d_1 - - monotonic=1.5=py_0 - multidict=6.0.2=py310h5764c6d_1 - nbformat=5.4.0=pyhd8ed1ab_0 - ncurses=6.3=h27087fc_1 - numpy=1.22.3=py310h4ef5377_2 - oauth2client=4.1.3=py_0 - - openssl=1.1.1o=h166bdaf_0 + - openssl=3.0.3=h166bdaf_0 - packaging=21.3=pyhd8ed1ab_0 - pandas=1.4.2=py310h769672d_1 - paramiko=2.11.0=pyhd8ed1ab_0 - peppy=0.31.2=pyhd8ed1ab_2 - - pip=22.1=pyhd8ed1ab_0 + - pip=22.0.4=pyhd8ed1ab_0 - plac=1.3.5=pyhd8ed1ab_0 - pluggy=1.0.0=py310hff52083_3 - ply=3.11=py_1 @@ -124,7 +111,7 @@ dependencies: - pysftp=0.2.9=py_1 - pysocks=1.7.1=py310hff52083_5 - pytest=7.1.2=py310hff52083_0 - - python=3.10.4=h12debd9_0 + - python=3.10.4=h2660328_0_cpython - python-dateutil=2.8.2=pyhd8ed1ab_0 - python-fastjsonschema=2.15.3=pyhd8ed1ab_0 - python-irodsclient=1.1.3=pyhd8ed1ab_0 @@ -136,10 +123,9 @@ dependencies: - readline=8.1=h46c0cb4_0 - requests=2.27.1=pyhd8ed1ab_0 - retry=0.9.2=py_0 - - retry_decorator=1.1.1=pyh9f0ad1d_0 - rsa=4.8=pyhd8ed1ab_0 - s3transfer=0.5.2=pyhd8ed1ab_0 - - setuptools=62.3.1=py310hff52083_0 + - setuptools=62.3.2=py310hff52083_0 - setuptools-scm=6.4.2=pyhd8ed1ab_0 - six=1.16.0=pyh6c4a22f_0 - slacker=0.14.0=py_0 @@ -147,12 +133,11 @@ dependencies: - smmap=3.0.5=pyh44b312d_0 - snakemake=7.7.0=hdfd78af_0 - snakemake-minimal=7.7.0=pyhdfd78af_0 - - socksipy-branch=1.01=pyh9f0ad1d_0 - - sqlite=3.38.3=hc218d9a_0 + - sqlite=3.38.5=h4ff8645_0 - stone=3.3.1=pyhd8ed1ab_0 - stopit=1.1.2=py_0 - tabulate=0.8.9=pyhd8ed1ab_0 - - tk=8.6.11=h1ccaba5_1 + - tk=8.6.12=h27826a3_0 - tomli=2.0.1=pyhd8ed1ab_0 - toposort=1.7=pyhd8ed1ab_0 - traitlets=5.2.1.post0=pyhd8ed1ab_0 @@ -164,7 +149,6 @@ dependencies: - urllib3=1.26.9=pyhd8ed1ab_0 - veracitools=0.1.3=py_0 - wcwidth=0.2.5=pyh9f0ad1d_2 - - wget=1.21.3=h0b77cf5_0 - wheel=0.37.1=pyhd8ed1ab_0 - wrapt=1.14.1=py310h5764c6d_0 - xz=5.2.5=h516909a_1 @@ -172,5 +156,4 @@ dependencies: - yarl=1.7.2=py310h5764c6d_2 - yte=1.4.0=py310hff52083_0 - zipp=3.8.0=pyhd8ed1ab_0 - - zlib=1.2.12=h7f8727e_2 -prefix: /home/lkemp/miniconda3/envs/pipeline_run_env + - zlib=1.2.11=h166bdaf_1014 diff --git 
a/workflow/envs/vep_database_install.yml b/workflow/envs/vap_download_db_env.yaml similarity index 74% rename from workflow/envs/vep_database_install.yml rename to workflow/envs/vap_download_db_env.yaml index 5614825..18cf94b 100644 --- a/workflow/envs/vep_database_install.yml +++ b/workflow/envs/vap_download_db_env.yaml @@ -1,4 +1,4 @@ -name: vep_database_install +name: vap_download_db_env channels: - conda-forge - bioconda @@ -6,19 +6,40 @@ channels: dependencies: - _libgcc_mutex=0.1=conda_forge - _openmp_mutex=4.5=2_gnu - - binutils_impl_linux-64=2.36.1=h193b22a_2 - - binutils_linux-64=2.36=hf3e587d_9 + - aiohttp=3.8.1=py37h540881e_1 + - aiosignal=1.2.0=pyhd8ed1ab_0 + - argcomplete=2.0.0=pyhd8ed1ab_0 + - async-timeout=4.0.2=pyhd8ed1ab_0 + - asynctest=0.13.0=py_0 + - attrs=21.4.0=pyhd8ed1ab_0 + - boto=2.49.0=py_0 + - brotlipy=0.7.0=py37h540881e_1004 - bzip2=1.0.8=h7f98852_4 - c-ares=1.18.1=h7f98852_0 - - ca-certificates=2022.5.18=ha878542_0 + - ca-certificates=2022.5.18.1=ha878542_0 + - cachetools=5.0.0=pyhd8ed1ab_0 + - certifi=2022.5.18.1=py37h89c1867_0 + - cffi=1.15.0=py37hd667e15_1 + - charset-normalizer=2.0.12=pyhd8ed1ab_0 - clustalw=2.1=h9f5acd7_7 + - crcmod=1.7=py37h540881e_1008 + - cryptography=37.0.2=py37h38fbfac_0 - curl=7.83.1=h7bff187_0 - ensembl-vep=106.1=pl5321h4a94de4_0 - expat=2.4.8=h27087fc_0 - - gcc_impl_linux-64=10.3.0=hf2f2afa_16 - - gcc_linux-64=10.3.0=hc39de41_9 + - fasteners=0.17.3=pyhd8ed1ab_0 + - frozenlist=1.3.0=py37h540881e_1 + - gcs-oauth2-boto-plugin=3.0=pyhd8ed1ab_0 + - gettext=0.19.8.1=h0b5b191_1005 + - google-apitools=0.5.32=pyhd8ed1ab_0 + - google-auth=2.6.6=pyh6c4a22f_0 + - google-reauth=0.1.1=pyhd3deb0d_0 + - gsutil=5.10=pyhd8ed1ab_0 - htslib=1.14=h9753748_2 - - kernel-headers_linux-64=2.6.32=he073ed8_15 + - httplib2=0.20.4=pyhd8ed1ab_0 + - idna=3.3=pyhd8ed1ab_0 + - importlib-metadata=4.11.3=py37h89c1867_1 + - importlib_metadata=4.11.3=hd8ed1ab_1 - keyutils=1.6.1=h166bdaf_0 - krb5=1.19.3=h3790be6_0 - ld_impl_linux-64=2.36.1=hea4e1c9_2 @@ -28,17 +49,20 @@ dependencies: - libedit=3.1.20191231=he28a2e2_2 - libev=4.33=h516909a_1 - libffi=3.3=h58526e2_2 - - libgcc-devel_linux-64=10.3.0=he6cfe16_16 - libgcc-ng=12.1.0=h8d9b700_16 - libgomp=12.1.0=h8d9b700_16 + - libidn2=2.3.2=h7f98852_0 - libnghttp2=1.47.0=h727a467_0 - libnsl=2.0.0=h7f98852_0 - - libsanitizer=10.3.0=h26c7422_16 - libssh2=1.10.0=ha56f1ee_2 - libstdcxx-ng=12.1.0=ha89aaad_16 + - libunistring=0.9.10=h7f98852_0 - libzlib=1.2.11=h166bdaf_1014 + - monotonic=1.5=py_0 + - multidict=6.0.2=py37h540881e_1 - mysql-connector-c=6.1.11=h6eb9d5d_1007 - ncurses=6.3=h27087fc_1 + - oauth2client=4.1.3=py_0 - openssl=1.1.1o=h166bdaf_0 - paml=4.9=hec16e2b_7 - perl=5.32.1=2_h7f98852_perl5 @@ -67,9 +91,8 @@ dependencies: - perl-constant=1.33=pl5321hd8ed1ab_0 - perl-data-dumper=2.183=pl5321hec16e2b_1 - perl-db_file=1.858=pl5321h166bdaf_0 - - perl-dbd-mysql=4.050=pl5321h9f5acd7_0 + - perl-dbd-mysql=4.046=pl5321h9f5acd7_4 - perl-dbi=1.643=pl5321hec16e2b_1 - - perl-devel-checklib=1.14=pl5321hec16e2b_1 - perl-devel-stacktrace=2.04=pl5321hdfd78af_1 - perl-digest-hmac=1.04=pl5321hdfd78af_0 - perl-digest-md5=2.58=pl5321hec16e2b_1 @@ -135,17 +158,33 @@ dependencies: - perl-xml-parser=2.44_01=pl5321hc3e0081_1003 - perl-xml-regexp=0.04=pl5321hdfd78af_3 - perl-xml-xpathengine=0.14=pl5321hdfd78af_3 - - pip=22.1=pyhd8ed1ab_0 + - pip=22.0.4=pyhd8ed1ab_0 + - pyasn1=0.4.8=py_0 + - pyasn1-modules=0.2.7=py_0 + - pycparser=2.21=pyhd8ed1ab_0 + - pyopenssl=22.0.0=pyhd8ed1ab_0 + - pyparsing=3.0.9=pyhd8ed1ab_0 + - pysocks=1.7.1=py37h89c1867_5 
- python=3.7.13=h12debd9_0 - python_abi=3.7=2_cp37m + - pyu2f=0.1.5=pyhd8ed1ab_0 - readline=8.1=h46c0cb4_0 - - setuptools=62.3.1=py37h89c1867_0 + - requests=2.27.1=pyhd8ed1ab_0 + - retry_decorator=1.1.1=pyh9f0ad1d_0 + - rsa=4.8=pyhd8ed1ab_0 + - setuptools=62.3.2=py37h89c1867_0 + - six=1.16.0=pyh6c4a22f_0 + - socksipy-branch=1.01=pyh9f0ad1d_0 - sqlite=3.38.5=h4ff8645_0 - - sysroot_linux-64=2.12=he073ed8_15 - t_coffee=11.0.8=py37hea885bf_8 - tk=8.6.12=h27826a3_0 + - typing-extensions=4.2.0=hd8ed1ab_1 + - typing_extensions=4.2.0=pyha770c72_1 - unzip=6.0=h7f98852_3 + - urllib3=1.26.9=pyhd8ed1ab_0 + - wget=1.20.3=ha56f1ee_1 - wheel=0.37.1=pyhd8ed1ab_0 - xz=5.2.5=h516909a_1 + - yarl=1.7.2=py37h540881e_2 + - zipp=3.8.0=pyhd8ed1ab_0 - zlib=1.2.11=h166bdaf_1014 -prefix: /home/lkemp/miniconda3/envs/vep_database_install From 92571af991e76c212e01447512ca206398df230a Mon Sep 17 00:00:00 2001 From: Leah Kemp Date: Fri, 20 May 2022 15:50:08 +1200 Subject: [PATCH 11/20] add names to conda envs --- workflow/envs/SnpSift.yaml | 1 + workflow/envs/gunzip.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/workflow/envs/SnpSift.yaml b/workflow/envs/SnpSift.yaml index 100fffd..def6bca 100644 --- a/workflow/envs/SnpSift.yaml +++ b/workflow/envs/SnpSift.yaml @@ -1,3 +1,4 @@ +name: snpsift_env channels: - bioconda - conda-forge diff --git a/workflow/envs/gunzip.yaml b/workflow/envs/gunzip.yaml index 80a4d87..53e6537 100644 --- a/workflow/envs/gunzip.yaml +++ b/workflow/envs/gunzip.yaml @@ -1,3 +1,4 @@ +name: gunzip_env channels: - bioconda - conda-forge From 54b7ab4fcbbf6d98dc10cd79c0c8452e1ebf6c1e Mon Sep 17 00:00:00 2001 From: Leah Kemp Date: Mon, 23 May 2022 10:37:04 +1200 Subject: [PATCH 12/20] minor change to docs --- docs/running_on_a_hpc.md | 1 + docs/running_on_a_single_machine.md | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/running_on_a_hpc.md b/docs/running_on_a_hpc.md index 96410e4..bb0348a 100644 --- a/docs/running_on_a_hpc.md +++ b/docs/running_on_a_hpc.md @@ -453,6 +453,7 @@ bash report.sh To maintain reproducibility, commit and push: +- All documentation - All configuration files - All run scripts - The final report diff --git a/docs/running_on_a_single_machine.md b/docs/running_on_a_single_machine.md index f54a445..31f7204 100644 --- a/docs/running_on_a_single_machine.md +++ b/docs/running_on_a_single_machine.md @@ -423,6 +423,7 @@ bash report.sh To maintain reproducibility, commit and push: +- All documentation - All configuration files - All run scripts - The final report From 924f13e187cf77437262cb9ca00ff1bca49d44f6 Mon Sep 17 00:00:00 2001 From: Leah Kemp Date: Mon, 23 May 2022 10:52:46 +1200 Subject: [PATCH 13/20] update docs to use download databases conda env --- docs/running_on_a_hpc.md | 6 ------ docs/running_on_a_single_machine.md | 6 ------ 2 files changed, 12 deletions(-) diff --git a/docs/running_on_a_hpc.md b/docs/running_on_a_hpc.md index bb0348a..0625187 100644 --- a/docs/running_on_a_hpc.md +++ b/docs/running_on_a_hpc.md @@ -152,10 +152,7 @@ gsutil -m cp -r gs://genomics-public-data/resources/broad/hg38 /where/to/downloa Download the [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database ```bash -mamba env create -f ./envs/vep_database_install.yml -conda activate vep_database_install vep_install -a cf -s homo_sapiens -y GRCh37 -c /output/file/path/GRCh37 --CONVERT -conda activate pipeline_run_env ``` Download the [CADD database](https://cadd.gs.washington.edu/download) and it's associated index file. 
@@ -172,10 +169,7 @@ Create a custom [dbNSFP database](https://sites.google.com/site/jpopgen/dbNSFP) Download [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database ```bash -mamba env create -f ./envs/vep_database_install.yml -conda activate vep_database_install vep_install -a cf -s homo_sapiens -y GRCh38 -c /output/file/path/GRCh38 --CONVERT -conda activate pipeline_run_env ``` Download the [CADD database](https://cadd.gs.washington.edu/download) and it's associated index file. diff --git a/docs/running_on_a_single_machine.md b/docs/running_on_a_single_machine.md index 31f7204..2ddbbd6 100644 --- a/docs/running_on_a_single_machine.md +++ b/docs/running_on_a_single_machine.md @@ -151,10 +151,7 @@ gsutil -m cp -r gs://genomics-public-data/resources/broad/hg38 /where/to/downloa Download the [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database ```bash -mamba env create -f ./envs/vep_database_install.yml -conda activate vep_database_install vep_install -a cf -s homo_sapiens -y GRCh37 -c ./GRCh37 --CONVERT -conda activate pipeline_run_env ``` Download the [CADD database](https://cadd.gs.washington.edu/download) and it's associated index file. @@ -171,10 +168,7 @@ Create a custom [dbNSFP database](https://sites.google.com/site/jpopgen/dbNSFP) Download [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database ```bash -mamba env create -f ./envs/vep_database_install.yml -conda activate vep_database_install vep_install -a cf -s homo_sapiens -y GRCh38 -c /output/file/path/GRCh38 --CONVERT -conda activate pipeline_run_env ``` Download the [CADD database](https://cadd.gs.washington.edu/download) and it's associated index file. From 1c7cfc50b14c12a23dd077f743f7582f543ac3c1 Mon Sep 17 00:00:00 2001 From: Leah Kemp Date: Mon, 23 May 2022 10:59:17 +1200 Subject: [PATCH 14/20] add disclaimer about using correct vep version when creating db --- docs/running_on_a_hpc.md | 22 ++++++++++++++++++++++ docs/running_on_a_single_machine.md | 22 ++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/docs/running_on_a_hpc.md b/docs/running_on_a_hpc.md index 0625187..4cf953d 100644 --- a/docs/running_on_a_hpc.md +++ b/docs/running_on_a_hpc.md @@ -155,6 +155,17 @@ Download the [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.ht vep_install -a cf -s homo_sapiens -y GRCh37 -c /output/file/path/GRCh37 --CONVERT ``` +The same version of [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) that is run in the pipeline needs to be used to create the [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database, therefore, if prompted to install a newer version of ensembl-vep, choose `continue (n)`. For example: + +```bash + +Version check reports a newer release of 'ensembl-vep' is available (installed: 105, available: 106) + +You should exit this installer and re-download 'ensembl-vep' if you wish to update + +Do you wish to exit so you can get updates (y) or continue (n): n +``` + Download the [CADD database](https://cadd.gs.washington.edu/download) and it's associated index file. 
```bash
@@ -183,6 +194,17 @@ Download [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html)
 vep_install -a cf -s homo_sapiens -y GRCh38 -c /output/file/path/GRCh38 --CONVERT
 ```
 
+The same version of [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) that is run in the pipeline needs to be used to create the [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database; therefore, if prompted to install a newer version of ensembl-vep, choose `continue (n)`. For example:
+
+```bash
+
+Version check reports a newer release of 'ensembl-vep' is available (installed: 105, available: 106)
+
+You should exit this installer and re-download 'ensembl-vep' if you wish to update
+
+Do you wish to exit so you can get updates (y) or continue (n): n
+```
+
 Download the [CADD database](https://cadd.gs.washington.edu/download) and it's associated index file.
diff --git a/docs/running_on_a_single_machine.md b/docs/running_on_a_single_machine.md
index 2ddbbd6..37d868e 100644
--- a/docs/running_on_a_single_machine.md
+++ b/docs/running_on_a_single_machine.md
@@ -154,6 +154,17 @@ Download the [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.ht
 vep_install -a cf -s homo_sapiens -y GRCh37 -c ./GRCh37 --CONVERT
 ```
 
+The same version of [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) that is run in the pipeline needs to be used to create the [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database; therefore, if prompted to install a newer version of ensembl-vep, choose `continue (n)`. For example:
+
+```bash
+
+Version check reports a newer release of 'ensembl-vep' is available (installed: 105, available: 106)
+
+You should exit this installer and re-download 'ensembl-vep' if you wish to update
+
+Do you wish to exit so you can get updates (y) or continue (n): n
+```
+
 Download the [CADD database](https://cadd.gs.washington.edu/download) and it's associated index file.
@@ -182,6 +193,17 @@ Download [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html)
 vep_install -a cf -s homo_sapiens -y GRCh38 -c /output/file/path/GRCh38 --CONVERT
 ```
 
+The same version of [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) that is run in the pipeline needs to be used to create the [Ensembl-VEP](https://asia.ensembl.org/info/docs/tools/vep/index.html) database; therefore, if prompted to install a newer version of ensembl-vep, choose `continue (n)`. For example:
+
+```bash
+
+Version check reports a newer release of 'ensembl-vep' is available (installed: 105, available: 106)
+
+You should exit this installer and re-download 'ensembl-vep' if you wish to update
+
+Do you wish to exit so you can get updates (y) or continue (n): n
+```
+
 Download the [CADD database](https://cadd.gs.washington.edu/download) and it's associated index file.
 
```bash From 98e0aaa1c55346a8bfac65370fd8de050071324d Mon Sep 17 00:00:00 2001 From: Leah Kemp Date: Mon, 23 May 2022 17:50:59 +1200 Subject: [PATCH 15/20] workaround for error downloading vep databases --- workflow/envs/vap_download_db_env.yaml | 2 +- workflow/rules/vep.smk | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/envs/vap_download_db_env.yaml b/workflow/envs/vap_download_db_env.yaml index 18cf94b..59f059f 100644 --- a/workflow/envs/vap_download_db_env.yaml +++ b/workflow/envs/vap_download_db_env.yaml @@ -25,7 +25,7 @@ dependencies: - crcmod=1.7=py37h540881e_1008 - cryptography=37.0.2=py37h38fbfac_0 - curl=7.83.1=h7bff187_0 - - ensembl-vep=106.1=pl5321h4a94de4_0 + - ensembl-vep=105.0=pl5321h4a94de4_1 - expat=2.4.8=h27087fc_0 - fasteners=0.17.3=pyhd8ed1ab_0 - frozenlist=1.3.0=py37h540881e_1 diff --git a/workflow/rules/vep.smk b/workflow/rules/vep.smk index 484a3c0..11a5a21 100644 --- a/workflow/rules/vep.smk +++ b/workflow/rules/vep.smk @@ -15,7 +15,7 @@ rule vep: benchmark: "benchmarks/vep/{sample}.tsv" singularity: - "docker://ensemblorg/ensembl-vep:release_106.1" + "docker://ensemblorg/ensembl-vep:release_105.0" threads: config['THREADS'] message: "Using the VEP database to determine the effect of the variants in {input.vcf}" From 4a11c45c56c57601adad4982e32fb90958cd588e Mon Sep 17 00:00:00 2001 From: Leah Kemp Date: Tue, 24 May 2022 11:27:55 +1200 Subject: [PATCH 16/20] revert to conda env for genmod because of non-functioning container --- workflow/envs/genmod.yaml | 8 ++++++++ workflow/rules/genmod_annotate_CADD.smk | 4 ++-- workflow/rules/genmod_models.smk | 4 ++-- workflow/rules/genmod_score_cohort.smk | 4 ++-- workflow/rules/genmod_score_single.smk | 4 ++-- 5 files changed, 16 insertions(+), 8 deletions(-) create mode 100644 workflow/envs/genmod.yaml diff --git a/workflow/envs/genmod.yaml b/workflow/envs/genmod.yaml new file mode 100644 index 0000000..0514014 --- /dev/null +++ b/workflow/envs/genmod.yaml @@ -0,0 +1,8 @@ +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - conda-forge::pip=22.1.1=pyhd8ed1ab_0 + - pip: + - genmod==3.7.4 \ No newline at end of file diff --git a/workflow/rules/genmod_annotate_CADD.smk b/workflow/rules/genmod_annotate_CADD.smk index 749920b..731bc6b 100644 --- a/workflow/rules/genmod_annotate_CADD.smk +++ b/workflow/rules/genmod_annotate_CADD.smk @@ -8,8 +8,8 @@ rule genmod_annotate_CADD: "logs/genmod_annotate_CADD/{sample}.log" benchmark: "benchmarks/genmod_annotate_CADD/{sample}.tsv" - singularity: - "docker://clinicalgenomics/genmod:3.7.4" + conda: + "../envs/genmod.yaml" message: "Using genmod to annotate {input.vcf} with CADD" shell: diff --git a/workflow/rules/genmod_models.smk b/workflow/rules/genmod_models.smk index c6ba674..9b71291 100644 --- a/workflow/rules/genmod_models.smk +++ b/workflow/rules/genmod_models.smk @@ -11,8 +11,8 @@ rule genmod_models: "logs/genmod_models/{sample}.log" benchmark: "benchmarks/genmod_models/{sample}.tsv" - singularity: - "docker://clinicalgenomics/genmod:3.7.4" + conda: + "../envs/genmod.yaml" threads: config['THREADS'] message: "Annotating {input.vcf} with patterns of inheritance" diff --git a/workflow/rules/genmod_score_cohort.smk b/workflow/rules/genmod_score_cohort.smk index 302316a..b96390e 100644 --- a/workflow/rules/genmod_score_cohort.smk +++ b/workflow/rules/genmod_score_cohort.smk @@ -10,8 +10,8 @@ rule genmod_score_cohort: "logs/genmod_score/{sample}.log" benchmark: "benchmarks/genmod_score/{sample}.tsv" - singularity: - 
"docker://clinicalgenomics/genmod:3.7.4" + conda: + "../envs/genmod.yaml" message: "Scoring the variants in {input.vcf} based on several annotations" shell: diff --git a/workflow/rules/genmod_score_single.smk b/workflow/rules/genmod_score_single.smk index c8a7e5b..05dc9b8 100644 --- a/workflow/rules/genmod_score_single.smk +++ b/workflow/rules/genmod_score_single.smk @@ -10,8 +10,8 @@ rule genmod_score_single: "logs/genmod_score/{sample}.log" benchmark: "benchmarks/genmod_score/{sample}.tsv" - singularity: - "docker://clinicalgenomics/genmod:3.7.4" + conda: + "../envs/genmod.yaml" message: "Scoring the variants in {input.vcf} based on several annotations" shell: From a4e58af97d57ce51f901630182afadd97276edb8 Mon Sep 17 00:00:00 2001 From: Leah Kemp Date: Tue, 24 May 2022 11:33:26 +1200 Subject: [PATCH 17/20] update pip version --- workflow/envs/gunzip.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/gunzip.yaml b/workflow/envs/gunzip.yaml index 53e6537..4654fe9 100644 --- a/workflow/envs/gunzip.yaml +++ b/workflow/envs/gunzip.yaml @@ -4,6 +4,6 @@ channels: - conda-forge - defaults dependencies: - - conda-forge::pip=22.0.4=pyhd8ed1ab_0 + - conda-forge::pip=22.1.1=pyhd8ed1ab_0 - pip: - gunzip==0.1.10 \ No newline at end of file From 32e92f579d22aa62c01fcaee6b17c5a3a92906f4 Mon Sep 17 00:00:00 2001 From: Leah Kemp Date: Tue, 24 May 2022 11:34:24 +1200 Subject: [PATCH 18/20] use htslib instead --- workflow/rules/bgzip.smk | 2 +- workflow/rules/tabix.smk | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/bgzip.smk b/workflow/rules/bgzip.smk index 7df48bc..3f740c6 100644 --- a/workflow/rules/bgzip.smk +++ b/workflow/rules/bgzip.smk @@ -4,7 +4,7 @@ rule bgzip: output: temp("../results/annotated/{sample}_filtered_dbnsfp.vcf.gz") singularity: - "docker://biocontainers/samtools:v1.9-4-deb_cv1" + "docker://staphb/htslib:1.15" message: "Bgzipping {input}" shell: diff --git a/workflow/rules/tabix.smk b/workflow/rules/tabix.smk index 0c45478..ef6be71 100644 --- a/workflow/rules/tabix.smk +++ b/workflow/rules/tabix.smk @@ -4,7 +4,7 @@ rule tabix: output: temp("../results/annotated/{sample}_filtered_dbnsfp.vcf.gz.tbi") singularity: - "docker://biocontainers/tabix:v1.9-11-deb_cv1" + "docker://staphb/htslib:1.15" message: "Tabix indexing {input}" shell: From a8a246caa9c3e96f80c312e1d27b4d897fbc1584 Mon Sep 17 00:00:00 2001 From: Leah Kemp Date: Tue, 24 May 2022 11:36:55 +1200 Subject: [PATCH 19/20] add name to genmod conda env --- workflow/envs/genmod.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/workflow/envs/genmod.yaml b/workflow/envs/genmod.yaml index 0514014..ed59b24 100644 --- a/workflow/envs/genmod.yaml +++ b/workflow/envs/genmod.yaml @@ -1,3 +1,4 @@ +name: genmod_env channels: - bioconda - conda-forge From 7c70a4f084eb4abc7ae2e9039a742895b1e7a547 Mon Sep 17 00:00:00 2001 From: Leah Kemp Date: Tue, 31 May 2022 11:51:39 +1200 Subject: [PATCH 20/20] fix for genmod erorr --- workflow/envs/genmod.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/genmod.yaml b/workflow/envs/genmod.yaml index ed59b24..2c3b12f 100644 --- a/workflow/envs/genmod.yaml +++ b/workflow/envs/genmod.yaml @@ -6,4 +6,4 @@ channels: dependencies: - conda-forge::pip=22.1.1=pyhd8ed1ab_0 - pip: - - genmod==3.7.4 \ No newline at end of file + - genmod==3.7.3 \ No newline at end of file