Merge pull request #1132 from nextstrain/gray-out-early-clades

Gray out early clades
nextstrain · Oct 3, 2024 · 7db9229 · 7db9229
2 parents faa85a7 + fe10e58
commit 7db9229
Show file tree

Hide file tree

Showing 9 changed files with 273 additions and 8 deletions.
diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml
@@ -134,6 +134,12 @@ refine:
 ancestral:
   inference: "joint"
 
+colors:
+  default:
+    # How far back in time (ISO 8601 duration, e.g. "6M") to look for sampled
+    # strains when coloring clades; if "all", then all clades are colored.
+    # Can be overridden per build in builds.yaml.
+    clade_recency: "all"
+
 # Frequencies settings
 frequencies:
 

diff --git a/docs/src/reference/change_log.md b/docs/src/reference/change_log.md
@@ -5,6 +5,8 @@ We also use this change log to document new features that maintain backward comp
 
 ## New features since last version update
 
+- 2 October 2024: Include a new parameter for `clade_recency` under `colors`. This parameter is used to define which clades should receive a color from the standard rainbow palette. A value of `6M` will cause clades with strains in the tree sampled within the last 6 months to be colored and earlier clades to not receive a color (and be colored in a palette of grays by Auspice). This `clade_recency` parameter is used in `builds.yaml` in `nextstrain_profiles` to color clades according to the `1m`, `2m`, `6m` and `all-time` timepoints. If `clade_recency` is not supplied then all clades will be colored. [PR 1132](https://github.com/nextstrain/ncov/pull/1132)
+
 - 30 September 2024: Use population-based weighted sampling for `nextstrain_profiles`. This requires a minimum Augur version of 25.3.0. PRs [1106](https://github.com/nextstrain/ncov/pull/1106), [1150](https://github.com/nextstrain/ncov/pull/1150), [1151](https://github.com/nextstrain/ncov/pull/1151)
 
 - 31 January 2024: Remove RBD-level related rules and files since this feature has been broken since May 2023 and is no longer relevant. [PR 1097](https://github.com/nextstrain/ncov/pull/1097)

diff --git a/docs/src/reference/workflow-config-file.rst b/docs/src/reference/workflow-config-file.rst
@@ -941,6 +941,34 @@ no_timetree
 -  description: Do not produce a time tree.
 -  default: ``false``
 
+colors
+------
+
+-  type: object
+-  description: Parameters for assigning colors in ``scripts/assign-colors.py``
+-  examples:
+
+.. code:: yaml
+
+   colors:
+     default:
+       clade_recency: "all"
+     global-6m:
+       # Override clade recency colors for "global-6m" build
+       clade_recency: "6M"
+
+Each named colors configuration (``default`` or build-named) supports the following attributes:
+
+.. contents::
+   :local:
+
+clade_recency
+~~~~~~~~~~~~~
+
+-  type: string
+-  format: `ISO 8601 <https://en.wikipedia.org/wiki/ISO_8601#Durations>`__ duration with optional ``P`` prefix (e.g. ``2M``, ``18M``, ``1Y6M``)
+-  description: only assign colors to clades that have at least one strain in the tree sampled within this duration before the present
+-  default: ``all`` (no restriction)
 
 traits
 ------

diff --git a/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml b/nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml
@@ -338,6 +338,54 @@ subsampling:
 refine:
   root: "21L"
 
+# if clades should be colored with a different recency for some builds, specify here
+# otherwise the default colors config in defaults/parameters.yaml will be used
+colors:
+  default:
+    clade_recency: "all"
+  global_1m:
+    clade_recency: "1M"
+  global_2m:
+    clade_recency: "2M"
+  global_6m:
+    clade_recency: "6M"
+  africa_1m:
+    clade_recency: "1M"
+  africa_2m:
+    clade_recency: "2M"
+  africa_6m:
+    clade_recency: "6M"
+  asia_1m:
+    clade_recency: "1M"
+  asia_2m:
+    clade_recency: "2M"
+  asia_6m:
+    clade_recency: "6M"
+  europe_1m:
+    clade_recency: "1M"
+  europe_2m:
+    clade_recency: "2M"
+  europe_6m:
+    clade_recency: "6M"
+  north-america_1m:
+    clade_recency: "1M"
+  north-america_2m:
+    clade_recency: "2M"
+  north-america_6m:
+    clade_recency: "6M"
+  oceania_1m:
+    clade_recency: "1M"
+  oceania_2m:
+    clade_recency: "2M"
+  oceania_6m:
+    clade_recency: "6M"
+  south-america_1m:
+    clade_recency: "1M"
+  south-america_2m:
+    clade_recency: "2M"
+  south-america_6m:
+    clade_recency: "6M"
+
 # if different traits should be reconstructed for some builds, specify here
 # otherwise the default trait config in defaults/parameters.yaml will used
 traits:

diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
@@ -326,6 +326,54 @@ subsampling:
       group_by_weights: "defaults/population_weights.tsv"
       max_sequences: 4320
 
+# if clades should be colored with a different recency for some builds, specify here
+# otherwise the default colors config in defaults/parameters.yaml will be used
+colors:
+  default:
+    clade_recency: "all"
+  global_1m:
+    clade_recency: "1M"
+  global_2m:
+    clade_recency: "2M"
+  global_6m:
+    clade_recency: "6M"
+  africa_1m:
+    clade_recency: "1M"
+  africa_2m:
+    clade_recency: "2M"
+  africa_6m:
+    clade_recency: "6M"
+  asia_1m:
+    clade_recency: "1M"
+  asia_2m:
+    clade_recency: "2M"
+  asia_6m:
+    clade_recency: "6M"
+  europe_1m:
+    clade_recency: "1M"
+  europe_2m:
+    clade_recency: "2M"
+  europe_6m:
+    clade_recency: "6M"
+  north-america_1m:
+    clade_recency: "1M"
+  north-america_2m:
+    clade_recency: "2M"
+  north-america_6m:
+    clade_recency: "6M"
+  oceania_1m:
+    clade_recency: "1M"
+  oceania_2m:
+    clade_recency: "2M"
+  oceania_6m:
+    clade_recency: "6M"
+  south-america_1m:
+    clade_recency: "1M"
+  south-america_2m:
+    clade_recency: "2M"
+  south-america_6m:
+    clade_recency: "6M"
+
 # if different traits should be reconstructed for some builds, specify here
 # otherwise the default trait config in defaults/parameters.yaml will used
 traits:

diff --git a/nextstrain_profiles/nextstrain-open/builds.yaml b/nextstrain_profiles/nextstrain-open/builds.yaml
@@ -331,6 +331,54 @@ subsampling:
 refine:
   root: "Wuhan-Hu-1/2019"
 
+# if clades should be colored with a different recency for some builds, specify here
+# otherwise the default colors config in defaults/parameters.yaml will be used
+colors:
+  default:
+    clade_recency: "all"
+  global_1m:
+    clade_recency: "1M"
+  global_2m:
+    clade_recency: "2M"
+  global_6m:
+    clade_recency: "6M"
+  africa_1m:
+    clade_recency: "1M"
+  africa_2m:
+    clade_recency: "2M"
+  africa_6m:
+    clade_recency: "6M"
+  asia_1m:
+    clade_recency: "1M"
+  asia_2m:
+    clade_recency: "2M"
+  asia_6m:
+    clade_recency: "6M"
+  europe_1m:
+    clade_recency: "1M"
+  europe_2m:
+    clade_recency: "2M"
+  europe_6m:
+    clade_recency: "6M"
+  north-america_1m:
+    clade_recency: "1M"
+  north-america_2m:
+    clade_recency: "2M"
+  north-america_6m:
+    clade_recency: "6M"
+  oceania_1m:
+    clade_recency: "1M"
+  oceania_2m:
+    clade_recency: "2M"
+  oceania_6m:
+    clade_recency: "6M"
+  south-america_1m:
+    clade_recency: "1M"
+  south-america_2m:
+    clade_recency: "2M"
+  south-america_6m:
+    clade_recency: "6M"
+
 # if different traits should be reconstructed for some builds, specify here
 # otherwise the default trait config in defaults/parameters.yaml will used
 traits:

diff --git a/scripts/assign-colors.py b/scripts/assign-colors.py
@@ -1,10 +1,37 @@
 import argparse
+import datetime
+import isodate
 import pandas as pd
 
 # Forced colours MUST NOT appear in the ordering TSV
 forced_colors = {
 }
 
+def date_within_last_n_months(date_str, cutoff_date):
+    """
+    Return True when `date_str` is a fully specified date on or after
+    `cutoff_date`.
+
+    `date_str` is expected in YYYY-MM-DD format and `cutoff_date` is a
+    `datetime.date`. Ambiguous dates (containing 'XX'), dates in any other
+    format, and non-string values (e.g. the NaN that pandas produces for an
+    empty metadata field) are all treated as not recent.
+    """
+    if not isinstance(date_str, str) or 'XX' in date_str:
+        return False  # Ignore missing and uncertain dates
+    try:
+        date = datetime.datetime.strptime(date_str, "%Y-%m-%d").date()
+    except ValueError:
+        # Not a complete YYYY-MM-DD date (e.g. "2024-05")
+        return False
+    return date >= cutoff_date
+
+
+def relative_date(duration: str):
+    """
+    Turn a backwards-looking ISO 8601 duration into an absolute date by
+    counting back from the current date.
+
+    The leading 'P' designator is optional, so '1W' and 'P1W' are treated
+    the same.
+    """
+    iso_duration = duration if duration.startswith('P') else 'P' + duration
+    return datetime.date.today() - isodate.parse_duration(iso_duration)
+
+
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
         description="Assign colors based on ordering",
@@ -15,6 +42,10 @@
     parser.add_argument('--color-schemes', type=str, required=True, help="input color schemes file")
     parser.add_argument('--metadata', type=str, help="if provided, restrict colors to only those found in metadata")
     parser.add_argument('--clade-node-data', type=str, help="if provided, restrict to only those clades found in tree")
+    parser.add_argument('--clade-recency', type=relative_date, metavar='DURATION',
+        help="""if provided, restrict to clades found in tree within this time
+             frame. Format: ISO 8601 duration with optional P prefix (e.g. '1W',
+             'P1W')""")
     parser.add_argument('--output', type=str, required=True, help="output colors tsv")
     args = parser.parse_args()
 
@@ -36,26 +67,38 @@
     if args.metadata:
         metadata = pd.read_csv(args.metadata, delimiter='\t')
         for name, trait in assignment.items():
-            if name in metadata:
+            # `name` is a trait, i.e. a metadata column name (it is used as
+            # `metadata[name]` below), so it must be looked up among the
+            # columns and not among the strain names.
+            if name in metadata.columns:
                 subset_present = [x for x in assignment[name] if x in metadata[name].unique()]
                 assignment[name] = subset_present
-            if name in metadata and 'focal' in metadata:
+            if name in metadata.columns and 'focal' in metadata.columns:
                 focal_list = metadata.loc[metadata['focal'] == True, name].unique()
                 subset_focal = [x for x in assignment[name] if x in focal_list]
                 assignment[name] = subset_focal
 
-    # if node json is supplied, restrict to clades names in the tree
+    # if node json is supplied, restrict to clade names in the tree within the specified recency
     if args.clade_node_data and "clade_membership" in assignment:
         with open(args.clade_node_data) as fh:
             import json
             clades = json.load(fh)['nodes']
 
-        # generate a set of present values
-        subset_present = set([x["clade_membership"] for x in clades.values()])
-        # restrict to only those present while maintaining order
-        assignment["clade_membership"] = [x for x in assignment["clade_membership"]
-                                          if x in subset_present]
+        if args.clade_recency is not None and args.metadata:
+            # `metadata` was already loaded above because args.metadata is set.
+            # Index the sampling dates by strain once (first record wins for
+            # duplicated strains) instead of scanning the table for every node.
+            date_by_strain = (
+                metadata.drop_duplicates(subset='strain')
+                .set_index('strain')['date']
+                .to_dict()
+            )
+            # Clades with at least one strain in the tree sampled on or after
+            # the cutoff date held in args.clade_recency
+            subset_present = {
+                info["clade_membership"]
+                for strain, info in clades.items()
+                if strain in date_by_strain
+                and date_within_last_n_months(date_by_strain[strain], args.clade_recency)
+            }
+        else:
+            # If no clade_recency is provided, look for all clades present in the tree
+            subset_present = {x["clade_membership"] for x in clades.values()}
 
+        # Restrict to only those present while maintaining order
+        assignment["clade_membership"] = [x for x in assignment["clade_membership"]
+                                          if x in subset_present]
 
     schemes = {}
     counter = 0

diff --git a/workflow/snakemake_rules/common.smk b/workflow/snakemake_rules/common.smk
@@ -1,13 +1,31 @@
 """Small, shared functions used to generate inputs and parameters.
 """
 import datetime
+import isodate
 from itertools import product
 from shlex import (
     quote as shquote,       # shquote() is used in this file and also other workflow files
     split as shsplitwords,
 )
 from urllib.parse import urlsplit
 
+# TODO: deduplicate this with the same function in scripts/assign-colors.py.
+# There is no easy way to share functions between the workflow and that file at
+# the moment. One approach would be to surface it via Augur's Python API.
+def relative_date(duration: str):
+    """
+    Turn a backwards-looking ISO 8601 duration into an absolute date by
+    counting back from the current date.
+
+    The leading 'P' designator is optional, so '1W' and 'P1W' are treated
+    the same.
+    """
+    iso_duration = duration if duration.startswith('P') else 'P' + duration
+    return datetime.date.today() - isodate.parse_duration(iso_duration)
+
 def shquotewords(s: str) -> str:
     """
     Split string *s* into (POSIX) shell words, quote each word, and join them
@@ -168,6 +186,27 @@ def _get_metadata_by_wildcards(wildcards):
     """
     return _get_metadata_by_build_name(wildcards.build_name)
 
+def _get_clade_recency_for_wildcards(wildcards):
+    """
+    Return the `clade_recency` setting for the build in `wildcards.build_name`.
+
+    Lookup order: colors:{build_name}:clade_recency, then
+    colors:default:clade_recency, then the fallback "all". A missing or empty
+    `colors` section (or entry within it) falls through to the next option
+    instead of raising a KeyError.
+    """
+    colors_config = config.get("colors") or {}
+    # check if builds.yaml contains colors:{build_name}:clade_recency
+    build_colors = colors_config.get(wildcards.build_name) or {}
+    if "clade_recency" in build_colors:
+        return build_colors["clade_recency"]
+    # check if builds.yaml or parameters.yaml contains colors:default:clade_recency
+    default_colors = colors_config.get("default") or {}
+    if "clade_recency" in default_colors:
+        return default_colors["clade_recency"]
+    # else return sensible default value
+    return "all"
+
+
+def _get_clade_recency_argument(wildcards):
+    """
+    Build the `--clade-recency` command line argument for the colors rule.
+
+    Returns an empty string when the configured value is "all", otherwise the
+    shell-quoted option. Raises an Exception when the configured value cannot
+    be parsed as a duration.
+    """
+    clade_recency_setting = _get_clade_recency_for_wildcards(wildcards)
+    if clade_recency_setting == "all":
+        return ""
+    try:
+        # Parsed only to validate the value early; the raw setting is what
+        # gets passed on the command line.
+        relative_date(clade_recency_setting)
+    except Exception as error:
+        # `except Exception` (not a bare except) so KeyboardInterrupt and
+        # SystemExit still propagate; chain the cause for easier debugging.
+        raise Exception(f'clade_recency must be "all" or a duration string (e.g. "6M", "1Y"). Got: {clade_recency_setting!r}') from error
+    return "--clade-recency " + shquote(clade_recency_setting)
+
 def _get_trait_columns_by_wildcards(wildcards):
     if wildcards.build_name in config["traits"]:
         return config["traits"][wildcards.build_name]["columns"]

diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
@@ -1105,6 +1105,8 @@ rule colors:
         color_schemes = config["files"]["color_schemes"],
         metadata="results/{build_name}/metadata_adjusted.tsv.xz",
         clades = rules.clades.output.clade_data
+    params:
+        clade_recency_argument = _get_clade_recency_argument
     output:
         colors = "results/{build_name}/colors.tsv"
     log:
@@ -1124,6 +1126,7 @@ rule colors:
             --color-schemes {input.color_schemes} \
             --output {output.colors} \
             --clade-node-data {input.clades} \
+            {params.clade_recency_argument} \
             --metadata {input.metadata} 2>&1 | tee {log}
         """