# test_example/data/func_effects_config.yml

# Configurations for determining functional effects of mutations

# --------------------------------------------------------------------------------------
# Configuration for computing functional scores and fitting global epistasis models
# to each individual selection experiment
# --------------------------------------------------------------------------------------

# Define default parameters for computing functional scores.
# Can be provided to individual selections using the alias operator (*)
func_scores_params_default: &func_scores_params_default
  # Pseudocount parameter for the functional score calculation.
  pseudocount: 0.5
  # `min_wt_count` and `min_wt_frac` are designed to raise errors if a selection experiment
  # has excessively low wildtype counts (which will lead to inaccurate estimates).
  min_wt_count: 1000  # error if not at least this many wildtype counts
  min_wt_frac: 0.001  # error if not at least this fraction of counts for wildtype
  # `min_pre_selection_count` and `min_pre_selection_frac` drop variants with low counts
  # or frequencies that may not have sufficient statistics for good estimates.
  min_pre_selection_count: 10  # drop variants with < this many pre-selection counts
  # !!! Make min_pre_selection_frac SMALLER for larger libraries, such as 0.1 / (library size) !!!
  min_pre_selection_frac: 0.00005  # drop variants that are < this fraction of all counts

# Define default parameters for fitting global epistasis models to each
# individual selection to get mutational effects.
global_epistasis_params: &global_epistasis_params
  # How to clip functional scores at upper / lower bounds. Allowable values:
  #  - median_stop: median func score of all variants with stop codons
  #  - null: no clipping
  #  - a number: clip at this number
  clip_lower: median_stop
  clip_upper: null
  # Whether to collapse barcodes that share the same variant, and if so how.
  collapse_identical_variants: false  # one of {false, mean, median}

# Define how to compute functional scores. Each key is a selection experiment.
# A global epistasis model is used to get mutational effects for each selection.
# Recommended naming of selection experiments is:
#   <library>-<post-selection sample date as YYMMDD>-<description>-<replicate>
# Each selection experiment should in turn provide the following keys:
#  - `post_selection_sample`: sample after selection (eg, infection of cells)
#  - `pre_selection_sample`: sample pre selection (eg, initial variant frequencies)
#  - `func_score_params`: parameters for computing functional scores
#  - `global_epistasis_params`: parameters for fitting global epistasis model
func_scores:
  # Every selection below uses the shared default parameter sets via aliases.
  # Both LibA post-selection dates (220210 and 220302) reuse the same
  # pre-selection samples (LibA-211028-VSVG-1 / LibA-211028-VSVG-2).

  # LibA, post-selection samples dated 220210, replicates 1 and 2
  LibA-220210-293T_ACE2-1:
    post_selection_sample: LibA-220210-no_antibody-1
    pre_selection_sample: LibA-211028-VSVG-1
    func_score_params: *func_scores_params_default
    global_epistasis_params: *global_epistasis_params
  LibA-220210-293T_ACE2-2:
    post_selection_sample: LibA-220210-no_antibody-2
    pre_selection_sample: LibA-211028-VSVG-2
    func_score_params: *func_scores_params_default
    global_epistasis_params: *global_epistasis_params
  # LibA, post-selection samples dated 220302, replicates 1 and 2
  LibA-220302-293T_ACE2-1:
    post_selection_sample: LibA-220302-no_antibody-1
    pre_selection_sample: LibA-211028-VSVG-1
    func_score_params: *func_scores_params_default
    global_epistasis_params: *global_epistasis_params
  LibA-220302-293T_ACE2-2:
    post_selection_sample: LibA-220302-no_antibody-2
    pre_selection_sample: LibA-211028-VSVG-2
    func_score_params: *func_scores_params_default
    global_epistasis_params: *global_epistasis_params
  # LibB, post-selection sample dated 220302, single replicate
  LibB-220302-293T_ACE2-1:
    post_selection_sample: LibB-220302-no_antibody-1
    pre_selection_sample: LibB-211028-VSVG-1
    func_score_params: *func_scores_params_default
    global_epistasis_params: *global_epistasis_params

# --------------------------------------------------------------------------------------
# Configuration for averaging mutation functional effects across selections and plotting
# them.
# --------------------------------------------------------------------------------------

# Average/plot mutation functional effects from different selections w `avg_func_effects`
# Each key is a condition which has the following values:
#  - `selections`: list of selections for which we average mutation functional effects
#                  or if you only want to average some sites for a given selection then
#                  a dict keyed by the selection name and with the value being the sites
#                  to include from this selection. Sites should be specified as *sequential*
#                  sites as a list of site numbers or lists of inclusive ranges (so
#                  [1, [3, 5]] means [1, 3, 4, 5])
#  - `avg_method`: how to average across the selections, options are "median" and "mean"
#  - `per_selection_tooltips`: whether to show per-selection effects via tooltips
#  - `floor_for_effect_std`: before computing effect std, floor effects at this value.
#  - `plot_kwargs`: keyword arguments passed to `polyclonal.plot.lineplot_and_heatmap`

# Define some defaults for each condition, used via the merge (<<) operator.
avg_func_effects_default: &avg_func_effects_default
  # Settings shared by every condition that merges in these defaults.
  avg_method: median
  per_selection_tooltips: true
  floor_for_effect_std: -3
  # Everything under `plot_kwargs` is passed as keyword arguments to
  # `polyclonal.plot.lineplot_and_heatmap`.
  plot_kwargs:
    addtl_slider_stats:
      times_seen: 3
      effect_std: 2
      'nt changes to codon': 3
      n_selections: 1
    addtl_slider_stats_as_max:
      - effect_std
      - 'nt changes to codon'
    addtl_slider_stats_hide_not_filter:
      - 'nt changes to codon'
    heatmap_max_at_least: 1
    heatmap_min_at_least: -1
    init_floor_at_zero: false
    init_site_statistic: mean
    # The `region` column is supplied in the `site_numbering_map`.
    site_zoom_bar_color_col: region
    slider_binding_range_kwargs:
      times_seen: {step: 1, min: 1, max: 25}
      n_selections: {step: 1}
      'nt changes to codon': {step: 1, min: 1, max: 3}

# Define the functional effect conditions to average
avg_func_effects:
  # Average over every listed selection, using all sites from each.
  293T_ACE2_entry:
    <<: *avg_func_effects_default
    selections:
      - LibA-220210-293T_ACE2-1
      - LibA-220210-293T_ACE2-2
      - LibA-220302-293T_ACE2-1
      - LibA-220302-293T_ACE2-2
      - LibB-220302-293T_ACE2-1
  # Same selections, but given as a dict so that each selection contributes
  # only the listed sequential sites (different regions for LibA and LibB).
  293T_ACE2_entry_by_region:
    <<: *avg_func_effects_default
    selections:
      LibA-220210-293T_ACE2-1:
        - [1, 539]
      LibA-220210-293T_ACE2-2:
        - [1, 539]
      LibA-220302-293T_ACE2-1:
        - [1, 539]
      LibA-220302-293T_ACE2-2:
        - [1, 539]
      # Written as two single sites plus a range, rather than the single
      # range [540, 1251], just for testing.
      LibB-220302-293T_ACE2-1:
        - 540
        - 541
        - [542, 1251]


# --------------------------------------------------------------------------------------
# Configuration for the simple difference of func effects from different conditions
# --------------------------------------------------------------------------------------

# Compute simple difference between average functional effects for different conditions.
# The difference is condition_1 minus condition_2. If you do not want to do this comparison,
# set `func_effect_diffs` to `null` or just leave it out altogether.
#
# Each key is a comparison. Under that name, you should have the following keys:
#  - `condition_1`: first condition, keys are `name` and `selections` (which gives list of selections,
#                   or if you want to take specific regions for a selection then a dict with values
#                   giving the sequential-site regions as for `avg_func_effects`)
#  - `condition_2`: second condition, keys are `name` and `selections` (same format as `condition_1`)
#  - `avg_method`: how to average across selections for a condition, "median" or "mean"
#  - `plot_kwargs`: keyword arguments passed to `polyclonal.plot.lineplot_and_heatmap`
func_effect_diffs_default: &func_effect_diffs_default
  avg_method: median
  # Everything under `plot_kwargs` is passed as keyword arguments to
  # `polyclonal.plot.lineplot_and_heatmap`.
  plot_kwargs:
    addtl_slider_stats:
      times_seen: 3
      # standard deviation of difference across pairwise comparisons for mutation
      difference_std: 2
      # fraction of all pairs between conditions w mutation
      fraction_pairs_w_mutation: 1
      # effect must be >= this for at least one condition
      best_effect: -2
      # sliders on the effect in each condition, but with no initial value
      '220210 effect': null
      '220302 effect': null
      'nt changes to codon': 3
    addtl_slider_stats_hide_not_filter:
      - best_effect
      - '220210 effect'
      - '220302 effect'
      - 'nt changes to codon'
    addtl_slider_stats_as_max:
      - difference_std
      - 'nt changes to codon'
    heatmap_max_at_least: 1
    heatmap_min_at_least: -1
    init_floor_at_zero: false
    init_site_statistic: mean_abs
    # The `region` column is supplied in the `site_numbering_map`.
    site_zoom_bar_color_col: region
    slider_binding_range_kwargs:
      times_seen: {step: 1, min: 1, max: 25}
      'nt changes to codon': {step: 1, min: 1, max: 3}

func_effect_diffs:
  # NOTE(review): the condition names below are unquoted, so YAML loads them
  # as integers (220210, 220302) rather than strings; confirm the consumer
  # expects that before quoting them.

  # Compare the two dates using all sites of each LibA selection.
  220210_vs_220302_comparison:
    <<: *func_effect_diffs_default
    condition_1:
      name: 220210
      selections:
        - LibA-220210-293T_ACE2-1
        - LibA-220210-293T_ACE2-2
    condition_2:
      name: 220302
      selections:
        - LibA-220302-293T_ACE2-1
        - LibA-220302-293T_ACE2-2

  # Same comparison, but only specific sequential-site regions of each
  # selection are considered. The second selection of each condition is
  # written as two single sites plus a range, rather than the single range
  # [540, 1251], just for testing.
  220210_vs_220302_comparison_by_region:
    <<: *func_effect_diffs_default
    condition_1:
      name: 220210
      selections:
        LibA-220210-293T_ACE2-1:
          - [1, 539]
        LibA-220210-293T_ACE2-2:
          - 540
          - 541
          - [542, 1251]
    condition_2:
      name: 220302
      selections:
        LibA-220302-293T_ACE2-1:
          - [1, 539]
        LibA-220302-293T_ACE2-2:
          - 540
          - 541
          - [542, 1251]

# --------------------------------------------------------------------------------------
# Configuration for func effect shifts from `multidms` models comparing conditions.
# --------------------------------------------------------------------------------------

# Define `func_effect_shifts` comparisons of different conditions. Each key is a comparison.
# Each comparison should have the following keys:
#  - `conditions` : dict keyed by condition names with values func scores from above.
#  - `reference` : name of the reference condition, must be in `conditions`
#  - `clip_lower` : how to clip functional scores at lower bound
#  - `clip_upper` : how to clip functional scores at upper bound
#  - `collapse_identical_variants` : do we collapse barcodes with the same variant?
#  - `latent_offset` : is there a condition specific offset in latent effects
#  - `lasso_shifts` : list of strength of lasso regularization on shifts
# If you are not doing comparisons, just set `func_effect_shifts` to `null` or
# leave it out altogether.

# default settings for `func_effect_shifts` comparisons
func_effect_shifts_default: &func_effect_shifts_default
  # Clipping of functional scores at the lower / upper bound. Each of
  # `clip_lower` and `clip_upper` accepts one of:
  #  - median_stop: median func score of all variants with stop codons
  #  - null: no clipping
  #  - a number: clip at this number
  clip_lower: median_stop
  clip_upper: null
  # Whether to collapse barcodes that share the same variant, and if so how:
  # one of {false, mean, median}.
  collapse_identical_variants: false
  # Whether there is an offset in latent effects for different conditions
  # (`alpha_d` in `multidms`).
  latent_offset: true
  # Strengths of lasso regularization on shifts in `multidms`. You can try a
  # range of values and then pick a best one at the averaging step.
  lasso_shifts:
    - 0.00001
    - 0.00005
    - 0.0001
    - 0.0002
    - 0.001

func_effect_shifts:
  # Each comparison merges the shared defaults, names its reference condition,
  # and maps each condition name to a selection defined in `func_scores`.
  # NOTE(review): the unquoted condition names (220210, 220302) load as
  # integers, both as `conditions` keys and as `reference`; confirm the
  # consumer expects that before quoting them.
  LibA-date_comparison-1:
    <<: *func_effect_shifts_default
    reference: 220210
    conditions:
      220210: LibA-220210-293T_ACE2-1
      220302: LibA-220302-293T_ACE2-1
  LibA-date_comparison-2:
    <<: *func_effect_shifts_default
    reference: 220210
    conditions:
      220210: LibA-220210-293T_ACE2-2
      220302: LibA-220302-293T_ACE2-2

# --------------------------------------------------------------------------------------
# Configuration for averaging func effect shifts from multiple comparisons.
# --------------------------------------------------------------------------------------

# Define `func_effect_shifts` comparisons to average. `avg_func_effect_shifts` is keyed
# by name of averaged comparisons. Keys within that are:
#  - `comparisons`: list of comparisons from `func_effect_shifts`, must have same
#                   condition names, lasso shifts, and reference. If you want to take
#                   only specific regions from a comparison in the averages, then
#                   make this a dict with values the sequential site regions as
#                   for `avg_func_effects`.
#  - `lasso_shift`: the single lasso shift to use for the final averaged values.
#  - `avg_method`: how to average across the selections, options are "median" and "mean"
#  - `plot_kwargs`: keyword arguments passed to `polyclonal.plot.lineplot_and_heatmap`

# Define some defaults for each averaged comparison, used via the merge (<<) operator.
avg_func_effect_shifts_default: &avg_func_effect_shifts_default
  # Settings shared by every averaged comparison that merges in these defaults.
  avg_method: median
  per_comparison_tooltips: true
  # Everything under `plot_kwargs` is passed as keyword arguments to
  # `polyclonal.plot.lineplot_and_heatmap`.
  plot_kwargs:
    addtl_slider_stats:
      times_seen: 3
      'nt changes to codon': 3
    addtl_slider_stats_as_max:
      - 'nt changes to codon'
    addtl_slider_stats_hide_not_filter:
      - 'nt changes to codon'
    heatmap_max_at_least: 0.5
    heatmap_min_at_least: -0.5
    init_floor_at_zero: false
    init_site_statistic: mean
    # The `region` column is supplied in the `site_numbering_map`.
    site_zoom_bar_color_col: region
    slider_binding_range_kwargs:
      times_seen: {step: 1, min: 1, max: 25}
      n_comparisons: {step: 1}
      'nt changes to codon': {step: 1, min: 1, max: 3}

# Define the func effect shift comparisons to average
avg_func_effect_shifts:
  # Average both comparisons over all sites at a single lasso shift.
  date_comparison:
    <<: *avg_func_effect_shifts_default
    comparisons:
      - LibA-date_comparison-1
      - LibA-date_comparison-2
    lasso_shift: 0.0001

  # Same comparisons, given as a dict so that each one contributes only the
  # listed sequential sites. The second is written as two single sites plus a
  # range, rather than the single range [540, 1251], just for testing.
  date_comparison_by_region:
    <<: *avg_func_effect_shifts_default
    comparisons:
      LibA-date_comparison-1:
        - [1, 539]
      LibA-date_comparison-2:
        - 540
        - 541
        - [542, 1251]
    lasso_shift: 0.0001