diff --git a/.github/workflows/preprocess-gisaid.yml b/.github/workflows/preprocess-gisaid.yml deleted file mode 100644 index 9e021c212..000000000 --- a/.github/workflows/preprocess-gisaid.yml +++ /dev/null @@ -1,70 +0,0 @@ -name: preprocess-gisaid - -on: - ## This workflow is intended to be run via a repository dispatch event sent from - ## https://github.com/nextstrain/ncov-ingest/blob/master/.github/workflows/fetch-and-ingest-gisaid-master.yml - repository_dispatch: - types: - - preprocess-gisaid - ## Manual running (via GitHub UI) is also be available, if needed, via workflow dispatch - workflow_dispatch: - inputs: - trial_name: - description: 'Short name to use as a prefix for the uploaded data. WARNING: without this we will overwrite the files in s3://nextstrain-ncov-private.' - required: false - -env: - S3_DST_BUCKET: ${{ github.event.inputs.trial_name && format('nextstrain-ncov-private/trial/{0}', github.event.inputs.trial_name) || 'nextstrain-ncov-private' }} - - -jobs: - gisaid: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - - name: Setup - run: ./scripts/setup-github-workflow - - - name: Launch preprocess build - run: | - set -x - nextstrain build \ - --aws-batch \ - --cpus 16 \ - --detach \ - --memory 31GiB \ - . \ - upload \ - ${{ github.event.inputs.trial_name=='' && 'trigger_phylogenetic_rebuild' || '' }} \ - --set-threads align=16 \ - --profile nextstrain_profiles/nextstrain-gisaid-preprocess \ - --config \ - S3_DST_BUCKET=${S3_DST_BUCKET} \ - ${{ github.event.inputs.trial_name=='' && format('slack_token={0}', env.SLACK_TOKEN) || '' }} \ - github_token=${PAT_GITHUB_DISPATCH} \ - |& tee build-launch.log - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} - PAT_GITHUB_DISPATCH: ${{ secrets.PAT_GITHUB_DISPATCH }} - - - name: Build info - run: | - echo "--> Preprocessed files will be uploaded as:" - echo " s3://${S3_DST_BUCKET}/aligned.fasta.xz" - echo " s3://${S3_DST_BUCKET}/filtered.fasta.xz" - echo " s3://${S3_DST_BUCKET}/mutation-summary.tsv.xz" - echo - echo "--> Attach command" - tail -n1 build-launch.log - echo - JOBID=$( tail -n1 build-launch.log | sed -E 's/.+attach ([-a-f0-9]+).+/\1/' ) - echo "--> View this job in the AWS console via" - echo " https://console.aws.amazon.com/batch/home?region=us-east-1#jobs/detail/${JOBID}" - echo - echo ${{ github.event.inputs.trial_name=='' && '--> When this job completes, a phylogenetic rebuild will be triggered' || '--> This is a trial build, and as such won\''t trigger a phylogenetic rebuild' }} diff --git a/.github/workflows/preprocess-open.yml b/.github/workflows/preprocess-open.yml deleted file mode 100644 index 336207e93..000000000 --- a/.github/workflows/preprocess-open.yml +++ /dev/null @@ -1,70 +0,0 @@ -name: preprocess-open - -on: - ## This workflow is intended to be run via a repository dispatch event sent from - ## https://github.com/nextstrain/ncov-ingest/blob/master/.github/workflows/fetch-and-ingest-genbank-master.yml - repository_dispatch: - types: - - preprocess-open - - preprocess-genbank - ## Manual running (via GitHub UI) is also be available, if needed, via workflow dispatch. - workflow_dispatch: - inputs: - trial_name: - description: 'Short name to use as a prefix for the uploaded data. WARNING: without this we will overwrite the files in s3://nextstrain-data/files/ncov/open.' 
- required: false - -env: - S3_DST_BUCKET: ${{ github.event.inputs.trial_name && format('nextstrain-staging/files/ncov/open/trial/{0}', github.event.inputs.trial_name) || 'nextstrain-data/files/ncov/open' }} - -jobs: - open: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - with: - python-version: 3.9 - - - name: Setup - run: ./scripts/setup-github-workflow - - - name: Launch preprocess build - run: | - set -x - nextstrain build \ - --aws-batch \ - --cpus 16 \ - --detach \ - --memory 31GiB \ - . \ - upload \ - ${{ github.event.inputs.trial_name=='' && 'trigger_phylogenetic_rebuild' || '' }} \ - --set-threads align=16 \ - --profile nextstrain_profiles/nextstrain-open-preprocess \ - --config \ - S3_DST_BUCKET=${S3_DST_BUCKET} \ - ${{ github.event.inputs.trial_name=='' && format('slack_token={0}', env.SLACK_TOKEN) || '' }} \ - github_token=${PAT_GITHUB_DISPATCH} \ - |& tee build-launch.log - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} - PAT_GITHUB_DISPATCH: ${{ secrets.PAT_GITHUB_DISPATCH }} - - - name: Build info - run: | - echo "--> Preprocessed files will be uploaded as:" - echo " s3://${S3_DST_BUCKET}/aligned.fasta.xz" - echo " s3://${S3_DST_BUCKET}/filtered.fasta.xz" - echo " s3://${S3_DST_BUCKET}/mutation-summary.tsv.xz" - echo - echo "--> Attach command" - tail -n1 build-launch.log - echo - JOBID=$( tail -n1 build-launch.log | sed -E 's/.+attach ([-a-f0-9]+).+/\1/' ) - echo "--> View this job in the AWS console via" - echo " https://console.aws.amazon.com/batch/home?region=us-east-1#jobs/detail/${JOBID}" - echo - echo ${{ github.event.inputs.trial_name=='' && '--> When this job completes, a phylogenetic rebuild will be triggered' || '--> This is a trial build, and as such won\''t trigger a phylogenetic rebuild' }} diff --git a/docs/dev_docs.md b/docs/dev_docs.md index 258bbc3ea..23c165604 100644 --- a/docs/dev_docs.md +++ b/docs/dev_docs.md @@ -73,27 +73,9 @@ We do not release new minor versions for new features, but you should document n The "core" nextstrain builds consist of a global analysis and six regional analyses, performed independently for GISAID data and open data (currently open data is GenBank data). Stepping back, the process can be broken into three steps: 1. Ingest and curation of raw data. This is performed by the [ncov-ingest](https://github.com/nextstrain/ncov-ingest/) repo and resulting files are uploaded to S3 buckets. -2. Preprocessing of data (alignment, masking and QC filtering). This is performed by the profiles `nextstrain_profiles/nextstrain-open-preprocess` and `nextstrain_profiles/nextstrain-gisaid-preprocess`. The resulting files are uploaded to S3 buckets by the `upload` rule. -3. Phylogenetic builds, which start from the files produced by the previous step. This is performed by the profiles `nextstrain_profiles/nextstrain-open` and `nextstrain_profiles/nextstrain-gisaid`. The resulting files are uploaded to S3 buckets by the `upload` rule. +2. Phylogenetic builds, which start from the files produced by the previous step. This is performed by the profiles `nextstrain_profiles/nextstrain-open` and `nextstrain_profiles/nextstrain-gisaid`. The resulting files are uploaded to S3 buckets by the `upload` rule. 
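As a quick orientation to the two-step layout described above, here is a minimal sketch of launching one of the remaining phylogenetic profiles locally without the upload/deploy rules. The `auspice/ncov_open_global.json` target is illustrative (targets follow the `auspice/<auspice_json_prefix>_<build_name>.json` pattern used by the upload rule); the authoritative commands are those documented under "Manually running phylogenetic builds" below.

```sh
# Sketch only: build a single open ("GenBank") target with the phylogenetic
# profile, skipping the upload and deploy rules.
snakemake -p auspice/ncov_open_global.json \
    --profile nextstrain_profiles/nextstrain-open
```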
-### Manually running preprocessing - -To run these pipelines without uploading the results: -```sh -snakemake -pf results/filtered_open.fasta.xz --profile nextstrain_profiles/nextstrain-open-preprocess -snakemake -pf results/filtered_gisaid.fasta.xz --profile nextstrain_profiles/nextstrain-gisaid-preprocess -``` - -If you wish to upload the resulting information, you should run the `upload` rule. -Optionally, you may wish to define a specific `S3_DST_BUCKET` to avoid overwriting the files already present on the S3 buckets: -```sh -snakemake -pf upload --profile nextstrain_profiles/nextstrain-open-preprocess \ - --config S3_DST_BUCKET=nextstrain-staging/files/ncov/open/trial/TRIAL_NAME -snakemake -pf upload --profile nextstrain_profiles/nextstrain-gisaid-preprocess \ - --config S3_DST_BUCKET=nextstrain-ncov-private/trial/TRIAL_NAME -``` - ### Manually running phylogenetic builds To run these pipelines locally, without uploading the results: @@ -111,13 +93,13 @@ You may wish to overwrite these parameters for your local runs to avoid overwrit For instance, here are the commands used by the trial builds action (see below): ```sh snakemake -pf upload deploy \ - --profile nextstrain_profiles/nextstrain-open-preprocess \ + --profile nextstrain_profiles/nextstrain-open \ --config \ S3_DST_BUCKET=nextstrain-staging/files/ncov/open/trial/TRIAL_NAME \ deploy_url=s3://nextstrain-staging/ \ auspice_json_prefix=ncov_open_trial_TRIAL_NAME snakemake -pf upload deploy \ - --profile nextstrain_profiles/nextstrain-gisaid-preprocess \ + --profile nextstrain_profiles/nextstrain-gisaid \ --config \ S3_DST_BUCKET=nextstrain-ncov-private/trial/TRIAL_NAME \ deploy_url=s3://nextstrain-staging/ \ diff --git a/docs/src/analysis/orientation-files.md b/docs/src/analysis/orientation-files.md index 5c36a6079..17a8fcb5d 100644 --- a/docs/src/analysis/orientation-files.md +++ b/docs/src/analysis/orientation-files.md @@ -26,7 +26,7 @@ We'll walk through all of the files one by one, but here are the most important ## Output files and directories * `auspice/.json`: output file for visualization in Auspice where `` is the name of your build in the workflow configuration file. - * `results/aligned.fasta`, `results/filtered.fasta`, etc.: raw results files (dependencies) that are shared across all builds. + * `results/aligned.fasta`, etc.: raw results files (dependencies) that are shared across all builds. * `results//`: raw results files (dependencies) that are specific to a single build. * `logs/`: Log files with error messages and other information about the run. * `benchmarks/`: Run-times (and memory usage on Linux systems) for each rule in the workflow. diff --git a/docs/src/reference/change_log.md b/docs/src/reference/change_log.md index 72749865e..30a26a019 100644 --- a/docs/src/reference/change_log.md +++ b/docs/src/reference/change_log.md @@ -3,6 +3,10 @@ As of April 2021, we use major version numbers (e.g. v2) to reflect backward incompatible changes to the workflow that likely require you to update your Nextstrain installation. We also use this change log to document new features that maintain backward compatibility, indicating these features by the date they were added. +## v10 (January 2022) + + - Move filter and diagnostic steps after subsampling. 
For workflows with subsampling that does not depend on priority calculations, these changes allow the workflow to start subsampling from the metadata, skipping sequence alignment of the full input sequences and only looping through these input sequences once per build when subsampled sequences are extracted. To skip the alignment step, define your input sequences with the `aligned` directive. If you use priority-based subsampling, define your input sequences with the `sequences` directive. This reorganization of the workflow causes a breaking change in that the workflow no longer supports input-specific filtering with the `exclude_where`, `min_date`, and `exclude_ambiguous_dates_by` parameters. The workflow continues to support input-specific filtering by `min_length` and skipping of diagnostic filters with `skip_diagnostics`. [PR #814](https://github.com/nextstrain/ncov/pull/814). + ## New features since last version update - 20 December 2021: Surface the crowding penalty parameter via the config file: [PR #828](https://github.com/nextstrain/ncov/pull/827), [Issue #708](https://github.com/nextstrain/ncov/issues/708). The crowding penalty, used when calculating `priority scores` during subsampling, decreases the number of identical samples that are included in the tree during random subsampling to provide a broader picture of the viral diversity in your dataset. However, you may wish to set `crowding_penalty = 0.0` (default value = `0.1`) if you are interested in seeing as many samples as possible that are closely related to your `focal` set. You can change this parameter via `config['priorities']['crowding_penalty']`. There is no change to default behavior. diff --git a/docs/src/reference/configuration.md b/docs/src/reference/configuration.md index 89a450de8..f637ba962 100644 --- a/docs/src/reference/configuration.md +++ b/docs/src/reference/configuration.md @@ -273,7 +273,7 @@ Builds support any named attributes that can be referenced by subsampling scheme * required * `name` * `metadata` - * `sequences` or `aligned` or `filtered` + * `sequences` or `aligned` * examples: ```yaml inputs: @@ -283,9 +283,6 @@ inputs: - name: prealigned-data metadata: data/other_metadata.tsv.xz aligned: data/other_aligned.fasta.xz - - name: prealigned-and-filtered-data - metadata: data/other_metadata.tsv.xz - filtered: data/other_filtered.fasta.xz ``` Valid attributes for list entries in `inputs` are provided below. @@ -310,7 +307,7 @@ Valid attributes for list entries in `inputs` are provided below. ### sequences * type: string -* description: Path to a local or remote (S3, HTTP(S), GS) FASTA file with **_un_aligned and _un_filtered** genome sequences. Sequences can be uncompressed or compressed. +* description: Path to a local or remote (S3, HTTP(S), GS) FASTA file with **_un_aligned** genome sequences. Sequences can be uncompressed or compressed. * examples: * `data/example_sequences.fasta` * `data/example_sequences.fasta.xz` @@ -319,22 +316,13 @@ Valid attributes for list entries in `inputs` are provided below. ### aligned * type: string -* description: Path to a local or remote (S3, HTTP(S), GS) FASTA file with **aligned and _un_filtered** genome sequences. Sequences can be uncompressed or compressed. +* description: Path to a local or remote (S3, HTTP(S), GS) FASTA file with **aligned** genome sequences. Sequences can be uncompressed or compressed. 
* examples: * `data/aligned.fasta` * `data/aligned.fasta.xz` * `s3://your-bucket/aligned.fasta.gz` * `https://data.nextstrain.org/files/ncov/open/aligned.fasta.xz` -### filtered -* type: string -* description: Path to a local or remote (S3, HTTP(S), GS) FASTA file with **aligned and filtered** genome sequences. Sequences can be uncompressed or compressed. -* examples: - * `data/filtered.fasta` - * `data/filtered.fasta.xz` - * `s3://your-bucket/filtered.fasta.gz` - * `https://data.nextstrain.org/files/ncov/open/filtered.fasta.xz` - ## localrules * type: string * description: Path to a Snakemake file to include in the workflow. This parameter is redundant with `custom_rules` and may be deprecated soon. diff --git a/docs/src/reference/remote_inputs.md b/docs/src/reference/remote_inputs.md index b94979eb2..feda9980e 100644 --- a/docs/src/reference/remote_inputs.md +++ b/docs/src/reference/remote_inputs.md @@ -41,7 +41,6 @@ A side-effect of this is the creation and upload of processed versions of the en * `aligned.fasta.xz` alignment via [nextalign](https://github.com/nextstrain/nextclade/tree/master/packages/nextalign_cli). The default reference genome is [MN908947](https://www.ncbi.nlm.nih.gov/nuccore/MN908947) (Wuhan-Hu-1). * `mutation-summary.tsv.xz` A summary of the data in `aligned.fasta.xz`. -* `filtered.fasta.xz` The alignment excluding data with incomplete / invalid dates, unexpected genome lengths, missing metadata etc. We also maintain a [list of sequences to exclude](https://github.com/nextstrain/ncov/blob/master/defaults/exclude.txt) which are removed at this step. These sequences represent duplicates, outliers in terms of divergence or sequences with faulty metadata. ## Subsampled datasets @@ -71,7 +70,6 @@ This means that the full GenBank metadata and sequences are typically updated a | Full GenBank data | metadata | https://data.nextstrain.org/files/ncov/open/metadata.tsv.gz | | | sequences | https://data.nextstrain.org/files/ncov/open/sequences.fasta.xz | | | aligned | https://data.nextstrain.org/files/ncov/open/aligned.fasta.xz | -| | filtered | https://data.nextstrain.org/files/ncov/open/filtered.fasta.xz | | Global sample | metadata | https://data.nextstrain.org/files/ncov/open/global/metadata.tsv.xz | | | sequences | https://data.nextstrain.org/files/ncov/open/global/sequences.fasta.xz | | | aligned | https://data.nextstrain.org/files/ncov/open/global/aligned.fasta.xz | @@ -138,8 +136,6 @@ inputs: The following starting points are available: * replace `sequences` with `aligned` (skips alignment) -* replace `sequences` with `filtered` (skips alignment and basic filtering steps) - ## Compressed vs uncompressed starting points diff --git a/nextstrain_profiles/nextstrain-gisaid-preprocess/builds.yaml b/nextstrain_profiles/nextstrain-gisaid-preprocess/builds.yaml deleted file mode 100644 index 00f6715c1..000000000 --- a/nextstrain_profiles/nextstrain-gisaid-preprocess/builds.yaml +++ /dev/null @@ -1,22 +0,0 @@ -custom_rules: - - workflow/snakemake_rules/export_for_nextstrain.smk - -# These parameters are only used by the `export_for_nextstrain` rule and shouldn't need to be modified. -# To modify the s3 _source_ bucket, specify this directly in the `inputs` section of the config. -# P.S. These are intentionally set as top-level keys as this allows command-line overrides. 
-S3_DST_BUCKET: "nextstrain-ncov-private" -S3_DST_COMPRESSION: "xz" -S3_DST_ORIGINS: ["gisaid"] - -upload: - - preprocessing-files - -inputs: - - name: gisaid - metadata: "s3://nextstrain-ncov-private/metadata.tsv.gz" - sequences: "s3://nextstrain-ncov-private/sequences.fasta.xz" - -# Deploy and Slack options are related to Nextstrain live builds and don't need to be modified for local builds -deploy_url: s3://nextstrain-data -slack_token: ~ -slack_channel: "#ncov-gisaid-updates" \ No newline at end of file diff --git a/nextstrain_profiles/nextstrain-gisaid-preprocess/config.yaml b/nextstrain_profiles/nextstrain-gisaid-preprocess/config.yaml deleted file mode 100644 index cec6981d6..000000000 --- a/nextstrain_profiles/nextstrain-gisaid-preprocess/config.yaml +++ /dev/null @@ -1,10 +0,0 @@ -configfile: - - defaults/parameters.yaml - - nextstrain_profiles/nextstrain-gisaid-preprocess/builds.yaml - -keep-going: True -printshellcmds: True -show-failed-logs: True -restart-times: 1 -reason: True -stats: stats.json diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml index cefd5b8d1..931334361 100644 --- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml +++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml @@ -17,13 +17,12 @@ upload: genes: ["ORF1a", "ORF1b", "S", "ORF3a", "E", "M", "ORF6", "ORF7a", "ORF7b", "ORF8", "N", "ORF9b"] use_nextalign: true -# Note: we have a separate profile for aligning GISAID sequences. This is triggered -# as soon as new sequences are available. This workflow is thus intended to be -# started from the filtered alignment. james, sept 2021 +# Note: unaligned sequences are provided as "aligned" sequences to avoid an initial full-DB alignment +# as we re-align everything after subsampling. inputs: - name: gisaid metadata: "s3://nextstrain-ncov-private/metadata.tsv.gz" - filtered: "s3://nextstrain-ncov-private/filtered.fasta.xz" + aligned: "s3://nextstrain-ncov-private/sequences.fasta.xz" # Define locations for which builds should be created. # For each build we specify a subsampling scheme via an explicit key. diff --git a/nextstrain_profiles/nextstrain-open-preprocess/builds.yaml b/nextstrain_profiles/nextstrain-open-preprocess/builds.yaml deleted file mode 100644 index 88381f2e2..000000000 --- a/nextstrain_profiles/nextstrain-open-preprocess/builds.yaml +++ /dev/null @@ -1,22 +0,0 @@ -custom_rules: - - workflow/snakemake_rules/export_for_nextstrain.smk - -# These parameters are only used by the `export_for_nextstrain` rule and shouldn't need to be modified. -# To modify the s3 _source_ bucket, specify this directly in the `inputs` section of the config. -# P.S. These are intentionally set as top-level keys as this allows command-line overrides. 
-S3_DST_BUCKET: "nextstrain-data/files/ncov/open" -S3_DST_COMPRESSION: "xz" -S3_DST_ORIGINS: ["open"] - -upload: - - preprocessing-files - -inputs: - - name: open - metadata: "s3://nextstrain-data/files/ncov/open/metadata.tsv.gz" - sequences: "s3://nextstrain-data/files/ncov/open/sequences.fasta.xz" - -# Deploy and Slack options are related to Nextstrain live builds and don't need to be modified for local builds -deploy_url: s3://nextstrain-data -slack_token: ~ -slack_channel: "#ncov-genbank-updates" \ No newline at end of file diff --git a/nextstrain_profiles/nextstrain-open-preprocess/config.yaml b/nextstrain_profiles/nextstrain-open-preprocess/config.yaml deleted file mode 100644 index 22d00a1e6..000000000 --- a/nextstrain_profiles/nextstrain-open-preprocess/config.yaml +++ /dev/null @@ -1,10 +0,0 @@ -configfile: - - defaults/parameters.yaml - - nextstrain_profiles/nextstrain-open-preprocess/builds.yaml - -keep-going: True -printshellcmds: True -show-failed-logs: True -restart-times: 1 -reason: True -stats: stats.json diff --git a/nextstrain_profiles/nextstrain-open/builds.yaml b/nextstrain_profiles/nextstrain-open/builds.yaml index 12dfad05d..0193c70cb 100644 --- a/nextstrain_profiles/nextstrain-open/builds.yaml +++ b/nextstrain_profiles/nextstrain-open/builds.yaml @@ -14,13 +14,12 @@ S3_DST_ORIGINS: ["open"] upload: - build-files -# Note: we have a separate profile for aligning open sequences. This is triggered -# as soon as new sequences are available. This workflow is thus intended to be -# started from the filtered alignment. james, sept 2021 +# Note: unaligned sequences are provided as "aligned" sequences to avoid an initial full-DB alignment +# as we re-align everything after subsampling. inputs: - name: open metadata: "s3://nextstrain-data/files/ncov/open/metadata.tsv.gz" - filtered: "s3://nextstrain-data/files/ncov/open/filtered.fasta.xz" + aligned: "s3://nextstrain-data/files/ncov/open/sequences.fasta.xz" # Define locations for which builds should be created. # For each build we specify a subsampling scheme via an explicit key. diff --git a/scripts/annotate_metadata_with_index.py b/scripts/annotate_metadata_with_index.py new file mode 100644 index 000000000..d7a77b517 --- /dev/null +++ b/scripts/annotate_metadata_with_index.py @@ -0,0 +1,39 @@ +"""Annotate a metadata file with the given sequence index. 
+""" +import argparse +from augur.io import read_metadata +import pandas as pd + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("--metadata", required=True, help="metadata to annotate") + parser.add_argument("--sequence-index", required=True, help="sequence index from augur index") + parser.add_argument("--output", required=True, help="metadata annotated with sequence index columns including a 'length' column based on the number of A, C, G, and T bases.") + + args = parser.parse_args() + + metadata = read_metadata(args.metadata) + + index = pd.read_csv( + args.sequence_index, + sep="\t", + ).drop( + columns=["length"], + ) + index["length"] = index.loc[:, ["A", "C", "G", "T"]].sum(axis=1) + new_columns = { + column: f"_{column}" + for column in index.columns + if column != "strain" + } + index = index.rename(columns=new_columns) + + metadata.merge( + index, + on="strain", + ).to_csv( + args.output, + sep="\t", + index=False, + ) diff --git a/scripts/diagnostic.py b/scripts/diagnostic.py index 203b96a3a..59e4a10a6 100644 --- a/scripts/diagnostic.py +++ b/scripts/diagnostic.py @@ -39,11 +39,19 @@ def earliest_clade_date(Nextstrain_clade, clade_emergence_dates_filename, window parser.add_argument("--rare-mutations", type=int, default=15, help="maximal allowed rare mutations as defined by nextclade") parser.add_argument("--clock-plus-rare", type=int, default=25, help="maximal allowed rare mutations as defined by nextclade") parser.add_argument("--clade-emergence-window", type=int, default=2, help="number of weeks before official emergence of clade at which sequences can safely be excluded") + parser.add_argument("--skip-inputs", type=str, nargs="*", help="names of inputs to skip diagnostics for based on presence of metadata fields named like '{input}' with a value of 'yes'") parser.add_argument("--output-exclusion-list", type=str, required=True, help="Output to-be-reviewed addition to exclude.txt") args = parser.parse_args() metadata = pd.read_csv(args.metadata, sep='\t') + # If any inputs should be skipped for diagnostics, remove their records from + # metadata prior to analysis. + if args.skip_inputs: + for input_name in args.skip_inputs: + if input_name in metadata.columns: + metadata = metadata.loc[metadata[input_name] != "yes"].copy() + check_recency = "date_submitted" in metadata if check_recency: recency_cutoff = (datetime.today() - timedelta(weeks=4)).toordinal() diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index c0c5072c7..a23d8c3b9 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -31,9 +31,6 @@ properties: aligned: type: string minLength: 1 - filtered: - type: string - minLength: 1 additionalProperties: false builds: @@ -59,11 +56,3 @@ properties: # A similar pattern is used in the workflow's wildcard constraints. 
pattern: "^[a-zA-Z0-9-]+$" - upload: - type: array - minItems: 1 - items: - type: string - enum: - - preprocessing-files - - build-files diff --git a/workflow/snakemake_rules/common.smk b/workflow/snakemake_rules/common.smk index b0b75a79f..b835f9149 100644 --- a/workflow/snakemake_rules/common.smk +++ b/workflow/snakemake_rules/common.smk @@ -28,11 +28,64 @@ def numeric_date(dt=None): def _get_subsampling_scheme_by_build_name(build_name): return config["builds"][build_name].get("subsampling_scheme", build_name) +def _get_skipped_inputs_for_diagnostic(wildcards): + """Build an argument for the diagnostic script with a list of inputs to skip. + """ + inputs = config["inputs"] + diagnostics_key = "skip_diagnostics" + + arg_parts = [] + for input_name in inputs.keys(): + skip_diagnostics = config["filter"].get(diagnostics_key, False) + + if input_name in config["filter"] and diagnostics_key in config["filter"][input_name]: + skip_diagnostics = config["filter"][input_name][diagnostics_key] + + if skip_diagnostics: + arg_parts.append(input_name) + + if len(arg_parts) > 0: + argument = f"--skip-inputs {' '.join(arg_parts)}" + else: + argument = "" + + return argument + +def _get_filter_min_length_query(wildcards): + """Build a sequence length filter query for each input, checking for + input-specific length requirements. + """ + inputs = config["inputs"] + length_key = "min_length" + + query_parts = [] + for input_name in inputs.keys(): + min_length = config["filter"][length_key] + + if input_name in config["filter"] and length_key in config["filter"][input_name]: + min_length = config["filter"][input_name][length_key] + + # We only annotate input-specific columns on metadata when there are + # multiple inputs. + if len(inputs) > 1: + query_parts.append(f"({input_name} == 'yes' & _length >= {min_length})") + else: + query_parts.append(f"(_length >= {min_length})") + + query = " | ".join(query_parts) + return f"--query \"{query}\"" + def _get_filter_value(wildcards, key): - default = config["filter"].get(key, "") - if wildcards["origin"] == "": - return default - return config["filter"].get(wildcards["origin"], {}).get(key, default) + for input_name in config["inputs"].keys(): + if input_name in config["filter"] and key in config["filter"][input_name]: + print( + f"ERROR: We no longer support input-specific filtering with the '{key}' parameter.", + "Remove this parameter from your configuration file and try running the workflow again.", + file=sys.stderr, + ) + sys.exit(1) + + return config["filter"].get(key, "") def _get_path_for_input(stage, origin_wildcard): """ @@ -47,7 +100,7 @@ def _get_path_for_input(stage, origin_wildcard): if stage in {"metadata", "sequences"}: raise Exception(f"ERROR: config->input->{origin_wildcard}->{stage} is not defined.") - elif stage in {"aligned", "filtered"}: + elif stage in {"aligned"}: return f"results/{stage}_{origin_wildcard}.fasta.xz" else: raise Exception(f"_get_path_for_input with unknown stage \"{stage}\"") @@ -67,7 +120,7 @@ def _get_unified_metadata(wildcards): def _get_unified_alignment(wildcards): if len(list(config["inputs"].keys()))==1: - return _get_path_for_input("filtered", list(config["inputs"].keys())[0]) + return _get_path_for_input("aligned", list(config["inputs"].keys())[0]) return "results/combined_sequences_for_subsampling.fasta.xz", def _get_metadata_by_build_name(build_name): @@ -126,7 +179,7 @@ def _get_max_date_for_frequencies(wildcards): def _get_upload_inputs(wildcards): # The main workflow supports multiple inputs/origins, but our 
desired file # structure under data.nextstrain.org/files/ncov/open/… is designed around - # a single input/origin. Intermediates (aligned, filtered, etc) + # a single input/origin. Intermediates (aligned, etc) # are specific to each input/origin and thus do not match our desired # structure, while builds (global, europe, africa, etc) span all # inputs/origins (and thus do). In our desired outcome, the two kinds of @@ -140,12 +193,6 @@ def _get_upload_inputs(wildcards): origin = config["S3_DST_ORIGINS"][0] # mapping of remote → local filenames - preprocessing_files = { - f"aligned.fasta.xz": f"results/aligned_{origin}.fasta.xz", # from `rule align` - f"filtered.fasta.xz": f"results/filtered_{origin}.fasta.xz", # from `rule filter` - f"mutation-summary.tsv.xz": f"results/mutation_summary_{origin}.tsv.xz", # from `rule mutation_summary` - } - build_files = {} for build_name in config["builds"]: build_files.update({ @@ -157,14 +204,4 @@ def _get_upload_inputs(wildcards): f"{build_name}/{build_name}_tip-frequencies.json": f"auspice/{config['auspice_json_prefix']}_{build_name}_tip-frequencies.json", f"{build_name}/{build_name}_root-sequence.json": f"auspice/{config['auspice_json_prefix']}_{build_name}_root-sequence.json" }) - - req_upload = config.get("upload", []) - if "preprocessing-files" in req_upload and "build-files" in req_upload: - return {**preprocessing_files, **build_files} - elif "preprocessing-files" in req_upload: - return preprocessing_files - elif "build-files" in req_upload: - return build_files - else: - raise Exception("The upload rule requires an 'upload' parameter in the config.") - + return build_files diff --git a/workflow/snakemake_rules/export_for_nextstrain.smk b/workflow/snakemake_rules/export_for_nextstrain.smk index ed17ab476..eede8bcba 100644 --- a/workflow/snakemake_rules/export_for_nextstrain.smk +++ b/workflow/snakemake_rules/export_for_nextstrain.smk @@ -190,37 +190,6 @@ rule upload: message += f"\n\ts3://{params.s3_bucket}/{remote}" send_slack_message(message) -rule trigger_phylogenetic_rebuild: - message: "Triggering nextstrain/ncov rebuild action (via repository dispatch)" - input: - "results/upload.done" - log: - "logs/trigger_phylogenetic_rebuild.txt" - run: - # some basic checks here before we proceed - if "preprocessing-files" not in config.get("upload", []) or len(config.get("upload", []))!=1: - raise Exception("Rule trigger_phylogenetic_rebuild should only be called when you are only uploading preprocessing-files") - if "github_token" not in config: - raise Exception("Rule trigger_phylogenetic_rebuild requires a config-provided 'github_token'") - if config["S3_DST_BUCKET"] == "nextstrain-data/files/ncov/open": - dispatch_type = "open/rebuild" - elif config["S3_DST_BUCKET"] == "nextstrain-ncov-private": - dispatch_type = "gisaid/rebuild" - else: - raise Exception("Rule trigger_phylogenetic_rebuild cannot be run with a non-default S3_DST_BUCKET") - headers = { - 'Content-type': 'application/json', - 'authorization': f"Bearer {config['github_token']}", - 'Accept': 'application/vnd.github.v3+json' - } - data = json.dumps( - {"event_type": dispatch_type} - ) - send_slack_message(f"This pipeline is now triggering a GitHub action (via dispatch type {dispatch_type}) to run the corresponding phylogenetic builds.") - print(f"Triggering ncov rebuild GitHub action via repository dispatch type: {dispatch_type}") - response = requests.post("https://api.github.com/repos/nextstrain/ncov/dispatches", headers=headers, data=data) - response.raise_for_status() - storage = 
PersistentDict("slack") def send_slack_message(message, broadcast=False): @@ -260,10 +229,8 @@ def send_slack_message(message, broadcast=False): # onstart handler will be executed before the workflow starts. onstart: - # note config["upload"] = array of enum{"preprocessing-files", "build-files"} message = [ - "Pipeline starting which will upload " + - ' & '.join([x.replace("-", " ") for x in config.get('upload', ['nothing'])]), + "Pipeline starting which will run the phylogenetics and upload build assets", f"Deployed {deploy_origin}" ] if environ.get("AWS_BATCH_JOB_ID"): diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index 809d11d38..9a3e07080 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -104,93 +104,6 @@ rule align: xz -2 {params.outdir}/{params.basename}*.fasta """ -rule diagnostic: - message: "Scanning metadata {input.metadata} for problematic sequences. Removing sequences with >{params.clock_filter} deviation from the clock and with more than {params.snp_clusters}." - input: - metadata = "results/sanitized_metadata_{origin}.tsv.xz" - output: - to_exclude = "results/to-exclude_{origin}.txt" - params: - clock_filter = 20, - snp_clusters = 1, - rare_mutations = 100, - clock_plus_rare = 100, - log: - "logs/diagnostics_{origin}.txt" - benchmark: - "benchmarks/diagnostics_{origin}.txt" - resources: - # Memory use scales primarily with the size of the metadata file. - mem_mb=12000 - conda: config["conda_environment"] - shell: - """ - python3 scripts/diagnostic.py \ - --metadata {input.metadata} \ - --clock-filter {params.clock_filter} \ - --rare-mutations {params.rare_mutations} \ - --clock-plus-rare {params.clock_plus_rare} \ - --snp-clusters {params.snp_clusters} \ - --output-exclusion-list {output.to_exclude} 2>&1 | tee {log} - """ - -def _collect_exclusion_files(wildcards): - # This rule creates a per-input exclude file for `rule filter`. 
This file contains one or both of the following: - # (1) a config-defined exclude file - # (2) a dynamically created file (`rule diagnostic`) which scans the alignment for potential errors - # The second file is optional - it may be opted out via config → skip_diagnostics - if config["filter"].get(wildcards["origin"], {}).get("skip_diagnostics", False): - return [ config["files"]["exclude"] ] - return [ config["files"]["exclude"], f"results/to-exclude_{wildcards['origin']}.txt" ] - - -rule filter: - message: - """ - Filtering alignment {input.sequences} -> {output.sequences} - - excluding strains in {input.exclude} - - including strains in {input.include} - - min length: {params.min_length} - """ - input: - sequences = lambda wildcards: _get_path_for_input("aligned", wildcards.origin), - metadata = "results/sanitized_metadata_{origin}.tsv.xz", - # TODO - currently the include / exclude files are not input (origin) specific, but this is possible if we want - include = config["files"]["include"], - exclude = _collect_exclusion_files, - output: - sequences = "results/filtered_{origin}.fasta.xz" - log: - "logs/filtered_{origin}.txt" - benchmark: - "benchmarks/filter_{origin}.txt" - params: - min_length = lambda wildcards: _get_filter_value(wildcards, "min_length"), - exclude_where = lambda wildcards: _get_filter_value(wildcards, "exclude_where"), - min_date = lambda wildcards: _get_filter_value(wildcards, "min_date"), - ambiguous = lambda wildcards: f"--exclude-ambiguous-dates-by {_get_filter_value(wildcards, 'exclude_ambiguous_dates_by')}" if _get_filter_value(wildcards, "exclude_ambiguous_dates_by") else "", - date = (date.today() + datetime.timedelta(days=1)).strftime("%Y-%m-%d"), - intermediate_output=lambda wildcards, output: Path(output.sequences).with_suffix("") - resources: - # Memory use scales primarily with the size of the metadata file. - mem_mb=12000 - conda: config["conda_environment"] - shell: - """ - augur filter \ - --sequences {input.sequences} \ - --metadata {input.metadata} \ - --include {input.include} \ - --max-date {params.date} \ - --min-date {params.min_date} \ - {params.ambiguous} \ - --exclude {input.exclude} \ - --exclude-where {params.exclude_where}\ - --min-length {params.min_length} \ - --output {params.intermediate_output} 2>&1 | tee {log}; - xz -2 {params.intermediate_output} - """ - def _get_subsampling_settings(wildcards): # Allow users to override default subsampling with their own settings keyed # by location type and name. For example, "region_europe" or @@ -300,10 +213,10 @@ rule combine_sequences_for_subsampling: # Similar to rule combine_input_metadata, this rule should only be run if multiple inputs are being used (i.e. multiple origins) message: """ - Combine and deduplicate aligned & filtered FASTAs from multiple origins in preparation for subsampling. + Combine and deduplicate aligned FASTAs from multiple origins in preparation for subsampling. 
""" input: - lambda w: [_get_path_for_input("filtered", origin) for origin in config.get("inputs", {})] + lambda w: [_get_path_for_input("aligned", origin) for origin in config.get("inputs", {})] output: "results/combined_sequences_for_subsampling.fasta.xz" benchmark: @@ -360,14 +273,11 @@ rule subsample: - priority: {params.priority_argument} """ input: - sequences = _get_unified_alignment, metadata = _get_unified_metadata, - sequence_index = rules.index_sequences.output.sequence_index, include = config["files"]["include"], priorities = get_priorities, exclude = config["files"]["exclude"] output: - sequences = "results/{build_name}/sample-{subsample}.fasta", strains="results/{build_name}/sample-{subsample}.txt", log: "logs/subsample_{build_name}_{subsample}.txt" @@ -392,9 +302,7 @@ rule subsample: shell: """ augur filter \ - --sequences {input.sequences} \ --metadata {input.metadata} \ - --sequence-index {input.sequence_index} \ --include {input.include} \ --exclude {input.exclude} \ {params.min_date} \ @@ -408,10 +316,36 @@ rule subsample: {params.sequences_per_group} \ {params.subsample_max_sequences} \ {params.sampling_scheme} \ - --output {output.sequences} \ --output-strains {output.strains} 2>&1 | tee {log} """ +rule extract_subsampled_sequences: + input: + metadata = _get_unified_metadata, + alignment = _get_unified_alignment, + sequence_index = rules.index_sequences.output.sequence_index, + strains = "results/{build_name}/sample-{subsample}.txt", + output: + subsampled_sequences = "results/{build_name}/sample-{subsample}.fasta", + log: + "logs/extract_subsampled_sequences_{build_name}_{subsample}.txt" + benchmark: + "benchmarks/extract_subsampled_sequences_{build_name}_{subsample}.txt" + resources: + # Memory use scales primarily with the size of the metadata file. + mem_mb=12000 + conda: config["conda_environment"] + shell: + """ + augur filter \ + --metadata {input.metadata} \ + --sequences {input.alignment} \ + --sequence-index {input.sequence_index} \ + --exclude-all \ + --include {input.strains} \ + --output-sequences {output.subsampled_sequences} 2>&1 | tee {log} + """ + rule proximity_score: message: """ @@ -484,7 +418,6 @@ rule combine_samples: """ input: sequences=_get_unified_alignment, - sequence_index=rules.index_sequences.output.sequence_index, metadata=_get_unified_metadata, include=_get_subsampled_files, output: @@ -499,7 +432,6 @@ rule combine_samples: """ augur filter \ --sequences {input.sequences} \ - --sequence-index {input.sequence_index} \ --metadata {input.metadata} \ --exclude-all \ --include {input.include} \ @@ -547,6 +479,47 @@ rule build_align: --output-insertions {output.insertions} > {log} 2>&1 """ +rule diagnostic: + message: "Scanning metadata {input.metadata} for problematic sequences. Removing sequences with >{params.clock_filter} deviation from the clock and with more than {params.snp_clusters}." + input: + metadata = "results/{build_name}/{build_name}_subsampled_metadata.tsv.xz", + output: + to_exclude = "results/{build_name}/excluded_by_diagnostics.txt" + params: + clock_filter = 20, + snp_clusters = 1, + rare_mutations = 100, + clock_plus_rare = 100, + skip_inputs_arg=_get_skipped_inputs_for_diagnostic, + log: + "logs/diagnostics_{build_name}.txt" + benchmark: + "benchmarks/diagnostics_{build_name}.txt" + resources: + # Memory use scales primarily with the size of the metadata file. 
+ mem_mb=12000 + conda: config["conda_environment"] + shell: + """ + python3 scripts/diagnostic.py \ + --metadata {input.metadata} \ + --clock-filter {params.clock_filter} \ + --rare-mutations {params.rare_mutations} \ + --clock-plus-rare {params.clock_plus_rare} \ + --snp-clusters {params.snp_clusters} \ + {params.skip_inputs_arg} \ + --output-exclusion-list {output.to_exclude} 2>&1 | tee {log} + """ + +def _collect_exclusion_files(wildcards): + # This rule creates a per-input exclude file for `rule filter`. This file contains one or both of the following: + # (1) a config-defined exclude file + # (2) a dynamically created file (`rule diagnostic`) which scans the alignment for potential errors + # The second file is optional - it may be opted out via config → skip_diagnostics + if config["filter"].get("skip_diagnostics", False): + return [ config["files"]["exclude"] ] + return [ config["files"]["exclude"], f"results/{wildcards.build_name}/excluded_by_diagnostics.txt" ] + rule mask: message: """ @@ -597,6 +570,93 @@ rule compress_build_align: xz -c {input} > {output} 2> {log} """ +rule index: + message: + """ + Index sequence composition. + """ + input: + sequences = "results/{build_name}/masked.fasta" + output: + sequence_index = "results/{build_name}/sequence_index.tsv", + log: + "logs/index_sequences_{build_name}.txt" + benchmark: + "benchmarks/index_sequences_{build_name}.txt" + conda: config["conda_environment"] + shell: + """ + augur index \ + --sequences {input.sequences} \ + --output {output.sequence_index} 2>&1 | tee {log} + """ + +rule annotate_metadata_with_index: + input: + metadata="results/{build_name}/{build_name}_subsampled_metadata.tsv.xz", + sequence_index = "results/{build_name}/sequence_index.tsv", + output: + metadata="results/{build_name}/metadata_with_index.tsv", + log: + "logs/annotate_metadata_with_index_{build_name}.txt", + benchmark: + "benchmarks/annotate_metadata_with_index_{build_name}.txt", + conda: config["conda_environment"] + shell: + """ + python3 scripts/annotate_metadata_with_index.py \ + --metadata {input.metadata} \ + --sequence-index {input.sequence_index} \ + --output {output.metadata} + """ + +rule filter: + message: + """ + Filtering alignment {input.sequences} -> {output.sequences} + - excluding strains in {input.exclude} + - including strains in {input.include} + - min length: {params.min_length_query} + """ + input: + sequences = "results/{build_name}/masked.fasta", + metadata = "results/{build_name}/metadata_with_index.tsv", + # TODO - currently the include / exclude files are not input (origin) specific, but this is possible if we want + include = config["files"]["include"], + exclude = _collect_exclusion_files, + output: + sequences = "results/{build_name}/filtered.fasta", + filter_log = "results/{build_name}/filtered_log.tsv", + log: + "logs/filtered_{build_name}.txt" + benchmark: + "benchmarks/filter_{build_name}.txt" + params: + min_length_query = _get_filter_min_length_query, + exclude_where = lambda wildcards: _get_filter_value(wildcards, "exclude_where"), + min_date = lambda wildcards: _get_filter_value(wildcards, "min_date"), + ambiguous = lambda wildcards: f"--exclude-ambiguous-dates-by {_get_filter_value(wildcards, 'exclude_ambiguous_dates_by')}" if _get_filter_value(wildcards, "exclude_ambiguous_dates_by") else "", + date = (date.today() + datetime.timedelta(days=1)).strftime("%Y-%m-%d"), + resources: + # Memory use scales primarily with the size of the metadata file. 
+ mem_mb=500 + conda: config["conda_environment"] + shell: + """ + augur filter \ + --sequences {input.sequences} \ + --metadata {input.metadata} \ + --include {input.include} \ + {params.min_length_query} \ + --max-date {params.date} \ + --min-date {params.min_date} \ + {params.ambiguous} \ + --exclude {input.exclude} \ + --exclude-where {params.exclude_where}\ + --output {output.sequences} \ + --output-log {output.filter_log} 2>&1 | tee {log}; + """ + if "run_pangolin" in config and config["run_pangolin"]: rule run_pangolin: message: @@ -605,7 +665,7 @@ if "run_pangolin" in config and config["run_pangolin"]: Please remember to update your installation of pangolin regularly to ensure the most up-to-date classifications. """ input: - alignment = rules.build_align.output.alignment, + alignment = "results/{build_name}/aligned.fasta", output: lineages = "results/{build_name}/pangolineages.csv", params: @@ -676,7 +736,7 @@ rule adjust_metadata_regions: rule tree: message: "Building tree" input: - alignment = rules.mask.output.alignment + alignment = "results/{build_name}/filtered.fasta", output: tree = "results/{build_name}/tree_raw.nwk" params: @@ -713,7 +773,7 @@ rule refine: """ input: tree = rules.tree.output.tree, - alignment = rules.mask.output.alignment, + alignment = "results/{build_name}/filtered.fasta", metadata="results/{build_name}/metadata_adjusted.tsv.xz", output: tree = "results/{build_name}/tree.nwk", @@ -768,7 +828,7 @@ rule ancestral: """ input: tree = rules.refine.output.tree, - alignment = rules.build_align.output.alignment + alignment = "results/{build_name}/aligned.fasta", output: node_data = "results/{build_name}/nt_muts.json" log: @@ -997,7 +1057,6 @@ rule rename_emerging_lineages: with open(output.clade_data, "w") as fh: json.dump({"nodes": new_data}, fh, indent=2) - rule colors: message: "Constructing colors file" input:
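Taken together, the changes above replace the standalone preprocessing pipeline with per-build, post-subsampling steps: `rule subsample` now selects strains from metadata alone, `rule extract_subsampled_sequences` pulls just those strains out of the full sequence file, and minimum-length filtering is applied later via a `--query` against the `_length` column added by `scripts/annotate_metadata_with_index.py` (for a single input the generated query has the form `--query "(_length >= 27000)"`, with the threshold taken from the `filter` config). Below is a minimal sketch of the two-pass subsampling pattern outside of Snakemake; file names and grouping options are illustrative, not the workflow's exact invocation.

```sh
# Pass 1 (cf. rule subsample): select strains from metadata alone, writing
# only a strain list so the full sequence file is never read here.
augur filter \
    --metadata metadata.tsv.xz \
    --group-by region year month \
    --subsample-max-sequences 2000 \
    --output-strains sample-region.txt

# Pass 2 (cf. rule extract_subsampled_sequences): index the full sequence
# file once, then extract just the selected strains in a single pass.
augur index --sequences sequences.fasta.xz --output sequence_index.tsv
augur filter \
    --metadata metadata.tsv.xz \
    --sequences sequences.fasta.xz \
    --sequence-index sequence_index.tsv \
    --exclude-all \
    --include sample-region.txt \
    --output-sequences sample-region.fasta
```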